OpenDAS / Fairseq · Commits · 7df61696

Commit 7df61696 authored Jul 28, 2023 by Sugon_ldc

    add fairseq0.10.2

Pipeline #471 failed with stages in 0 seconds
Changes: 595 · Pipelines: 3

Showing 20 changed files with 3830 additions and 0 deletions (+3830, -0)
Changed files:

    examples/speech_recognition/__init__.py                         +1     -0
    examples/speech_recognition/criterions/ASG_loss.py              +170   -0
    examples/speech_recognition/criterions/__init__.py              +17    -0
    examples/speech_recognition/criterions/cross_entropy_acc.py     +130   -0
    examples/speech_recognition/data/__init__.py                    +11    -0
    examples/speech_recognition/data/asr_dataset.py                 +122   -0
    examples/speech_recognition/data/collaters.py                   +131   -0
    examples/speech_recognition/data/data_utils.py                  +100   -0
    examples/speech_recognition/data/replabels.py                   +70    -0
    examples/speech_recognition/datasets/asr_prep_json.py           +125   -0
    examples/speech_recognition/datasets/prepare-librispeech.sh     +88    -0
    examples/speech_recognition/infer.py                            +464   -0
    examples/speech_recognition/models/__init__.py                  +8     -0
    examples/speech_recognition/models/vggtransformer.py            +1019  -0
    examples/speech_recognition/models/w2l_conv_glu_enc.py          +177   -0
    examples/speech_recognition/tasks/__init__.py                   +8     -0
    examples/speech_recognition/tasks/speech_recognition.py         +157   -0
    examples/speech_recognition/utils/wer_utils.py                  +381   -0
    examples/speech_recognition/w2l_decoder.py                      +435   -0
    examples/speech_to_text/README.md                               +216   -0
examples/speech_recognition/__init__.py (new file, mode 100644)

```python
from . import criterions, models, tasks  # noqa
```
examples/speech_recognition/criterions/ASG_loss.py (new file, mode 100644)

```python
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch
from examples.speech_recognition.data.replabels import pack_replabels
from fairseq import utils
from fairseq.criterions import FairseqCriterion, register_criterion


@register_criterion("asg_loss")
class ASGCriterion(FairseqCriterion):
    @staticmethod
    def add_args(parser):
        group = parser.add_argument_group("ASG Loss")
        group.add_argument(
            "--asg-transitions-init",
            help="initial diagonal value of transition matrix",
            type=float,
            default=0.0,
        )
        group.add_argument(
            "--max-replabel", help="maximum # of replabels", type=int, default=2
        )
        group.add_argument(
            "--linseg-updates",
            help="# of training updates to use LinSeg initialization",
            type=int,
            default=0,
        )
        group.add_argument(
            "--hide-linseg-messages",
            help="hide messages about LinSeg initialization",
            action="store_true",
        )

    def __init__(
        self,
        task,
        silence_token,
        asg_transitions_init,
        max_replabel,
        linseg_updates,
        hide_linseg_messages,
    ):
        from wav2letter.criterion import ASGLoss, CriterionScaleMode

        super().__init__(task)
        self.tgt_dict = task.target_dictionary
        self.eos = self.tgt_dict.eos()
        self.silence = (
            self.tgt_dict.index(silence_token)
            if silence_token in self.tgt_dict
            else None
        )
        self.max_replabel = max_replabel

        num_labels = len(self.tgt_dict)
        self.asg = ASGLoss(num_labels, scale_mode=CriterionScaleMode.TARGET_SZ_SQRT)
        self.asg.trans = torch.nn.Parameter(
            asg_transitions_init * torch.eye(num_labels), requires_grad=True
        )

        self.linseg_progress = torch.nn.Parameter(
            torch.tensor([0], dtype=torch.int), requires_grad=False
        )
        self.linseg_maximum = linseg_updates
        self.linseg_message_state = "none" if hide_linseg_messages else "start"

    @classmethod
    def build_criterion(cls, args, task):
        return cls(
            task,
            args.silence_token,
            args.asg_transitions_init,
            args.max_replabel,
            args.linseg_updates,
            args.hide_linseg_messages,
        )

    def linseg_step(self):
        if not self.training:
            return False
        if self.linseg_progress.item() < self.linseg_maximum:
            if self.linseg_message_state == "start":
                print("| using LinSeg to initialize ASG")
                self.linseg_message_state = "finish"
            self.linseg_progress.add_(1)
            return True
        elif self.linseg_message_state == "finish":
            print("| finished LinSeg initialization")
            self.linseg_message_state = "none"
        return False

    def replace_eos_with_silence(self, tgt):
        if tgt[-1] != self.eos:
            return tgt
        elif self.silence is None or (len(tgt) > 1 and tgt[-2] == self.silence):
            return tgt[:-1]
        else:
            return tgt[:-1] + [self.silence]

    def forward(self, model, sample, reduce=True):
        """Compute the loss for the given sample.

        Returns a tuple with three elements:
        1) the loss
        2) the sample size, which is used as the denominator for the gradient
        3) logging outputs to display while training
        """

        net_output = model(**sample["net_input"])
        emissions = net_output["encoder_out"].transpose(0, 1).contiguous()
        B = emissions.size(0)
        T = emissions.size(1)
        device = emissions.device

        target = torch.IntTensor(B, T)
        target_size = torch.IntTensor(B)
        using_linseg = self.linseg_step()

        for b in range(B):
            initial_target_size = sample["target_lengths"][b].item()
            if initial_target_size == 0:
                raise ValueError("target size cannot be zero")

            tgt = sample["target"][b, :initial_target_size].tolist()
            tgt = self.replace_eos_with_silence(tgt)
            tgt = pack_replabels(tgt, self.tgt_dict, self.max_replabel)
            tgt = tgt[:T]

            if using_linseg:
                tgt = [tgt[t * len(tgt) // T] for t in range(T)]

            target[b][: len(tgt)] = torch.IntTensor(tgt)
            target_size[b] = len(tgt)

        loss = self.asg.forward(emissions, target.to(device), target_size.to(device))

        if reduce:
            loss = torch.sum(loss)

        sample_size = (
            sample["target"].size(0) if self.args.sentence_avg else sample["ntokens"]
        )
        logging_output = {
            "loss": utils.item(loss.data) if reduce else loss.data,
            "ntokens": sample["ntokens"],
            "nsentences": sample["target"].size(0),
            "sample_size": sample_size,
        }
        return loss, sample_size, logging_output

    @staticmethod
    def aggregate_logging_outputs(logging_outputs):
        """Aggregate logging outputs from data parallel training."""
        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
        ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
        nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
        agg_output = {
            "loss": loss_sum / nsentences,
            "ntokens": ntokens,
            "nsentences": nsentences,
            "sample_size": sample_size,
        }
        return agg_output
```
examples/speech_recognition/criterions/__init__.py (new file, mode 100644)

```python
import importlib
import os


# ASG loss requires wav2letter
files_to_skip = set()
try:
    import wav2letter
except ImportError:
    files_to_skip.add("ASG_loss.py")


for file in os.listdir(os.path.dirname(__file__)):
    if file.endswith(".py") and not file.startswith("_") and file not in files_to_skip:
        criterion_name = file[: file.find(".py")]
        importlib.import_module(
            "examples.speech_recognition.criterions." + criterion_name
        )
```
examples/speech_recognition/criterions/cross_entropy_acc.py (new file, mode 100644)

```python
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from __future__ import absolute_import, division, print_function, unicode_literals

import logging
import math

import torch
import torch.nn.functional as F
from fairseq import utils
from fairseq.criterions import FairseqCriterion, register_criterion


@register_criterion("cross_entropy_acc")
class CrossEntropyWithAccCriterion(FairseqCriterion):
    def __init__(self, task, sentence_avg):
        super().__init__(task)
        self.sentence_avg = sentence_avg

    def compute_loss(self, model, net_output, target, reduction, log_probs):
        # N, T -> N * T
        target = target.view(-1)
        lprobs = model.get_normalized_probs(net_output, log_probs=log_probs)
        if not hasattr(lprobs, "batch_first"):
            logging.warning(
                "ERROR: we need to know whether "
                "batch first for the net output; "
                "you need to set batch_first attribute for the return value of "
                "model.get_normalized_probs. Now, we assume this is true, but "
                "in the future, we will raise exception instead. "
            )
        batch_first = getattr(lprobs, "batch_first", True)
        if not batch_first:
            lprobs = lprobs.transpose(0, 1)

        # N, T, D -> N * T, D
        lprobs = lprobs.view(-1, lprobs.size(-1))
        loss = F.nll_loss(
            lprobs, target, ignore_index=self.padding_idx, reduction=reduction
        )
        return lprobs, loss

    def get_logging_output(self, sample, target, lprobs, loss):
        target = target.view(-1)
        mask = target != self.padding_idx
        correct = torch.sum(
            lprobs.argmax(1).masked_select(mask) == target.masked_select(mask)
        )
        total = torch.sum(mask)
        sample_size = (
            sample["target"].size(0) if self.sentence_avg else sample["ntokens"]
        )

        logging_output = {
            "loss": utils.item(loss.data),  # * sample['ntokens'],
            "ntokens": sample["ntokens"],
            "nsentences": sample["target"].size(0),
            "sample_size": sample_size,
            "correct": utils.item(correct.data),
            "total": utils.item(total.data),
            "nframes": torch.sum(sample["net_input"]["src_lengths"]).item(),
        }

        return sample_size, logging_output

    def forward(self, model, sample, reduction="sum", log_probs=True):
        """Computes the cross entropy with accuracy metric for the given sample.

        This is similar to CrossEntropyCriterion in fairseq, but also
        computes accuracy metrics as part of logging

        Args:
            logprobs (Torch.tensor) of shape N, T, D i.e.
                batchsize, timesteps, dimensions
            targets (Torch.tensor) of shape N, T i.e batchsize, timesteps

        Returns:
            tuple: With three elements:
                1) the loss
                2) the sample size, which is used as the denominator for the gradient
                3) logging outputs to display while training

        TODO:
            * Currently this Criterion will only work with LSTMEncoderModels or
            FairseqModels which have decoder, or Models which return TorchTensor
            as net_output.
            We need to make a change to support all FairseqEncoder models.
        """
        net_output = model(**sample["net_input"])
        target = model.get_targets(sample, net_output)
        lprobs, loss = self.compute_loss(
            model, net_output, target, reduction, log_probs
        )
        sample_size, logging_output = self.get_logging_output(
            sample, target, lprobs, loss
        )
        return loss, sample_size, logging_output

    @staticmethod
    def aggregate_logging_outputs(logging_outputs):
        """Aggregate logging outputs from data parallel training."""
        correct_sum = sum(log.get("correct", 0) for log in logging_outputs)
        total_sum = sum(log.get("total", 0) for log in logging_outputs)
        loss_sum = sum(log.get("loss", 0) for log in logging_outputs)
        ntokens = sum(log.get("ntokens", 0) for log in logging_outputs)
        nsentences = sum(log.get("nsentences", 0) for log in logging_outputs)
        sample_size = sum(log.get("sample_size", 0) for log in logging_outputs)
        nframes = sum(log.get("nframes", 0) for log in logging_outputs)
        agg_output = {
            "loss": loss_sum / sample_size / math.log(2) if sample_size > 0 else 0.0,
            # if args.sentence_avg, then sample_size is nsentences, then loss
            # is per-sentence loss; else sample_size is ntokens, the loss
            # becomes per-output token loss
            "ntokens": ntokens,
            "nsentences": nsentences,
            "nframes": nframes,
            "sample_size": sample_size,
            "acc": correct_sum * 100.0 / total_sum if total_sum > 0 else 0.0,
            "correct": correct_sum,
            "total": total_sum,
            # total is the number of validate tokens
        }
        if sample_size != ntokens:
            agg_output["nll_loss"] = loss_sum / ntokens / math.log(2)
        # loss: per output token loss
        # nll_loss: per sentence loss
        return agg_output
```
examples/speech_recognition/data/__init__.py (new file, mode 100644)

```python
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from .asr_dataset import AsrDataset


__all__ = [
    "AsrDataset",
]
```
examples/speech_recognition/data/asr_dataset.py (new file, mode 100644)

```python
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import os

import numpy as np
from fairseq.data import FairseqDataset

from . import data_utils
from .collaters import Seq2SeqCollater


class AsrDataset(FairseqDataset):
    """
    A dataset representing speech and corresponding transcription.

    Args:
        aud_paths: (List[str]): A list of str with paths to audio files.
        aud_durations_ms (List[int]): A list of int containing the durations of
            audio files.
        tgt (List[torch.LongTensor]): A list of LongTensors containing the indices
            of target transcriptions.
        tgt_dict (~fairseq.data.Dictionary): target vocabulary.
        ids (List[str]): A list of utterance IDs.
        speakers (List[str]): A list of speakers corresponding to utterances.
        num_mel_bins (int): Number of triangular mel-frequency bins (default: 80)
        frame_length (float): Frame length in milliseconds (default: 25.0)
        frame_shift (float): Frame shift in milliseconds (default: 10.0)
    """

    def __init__(
        self,
        aud_paths,
        aud_durations_ms,
        tgt,
        tgt_dict,
        ids,
        speakers,
        num_mel_bins=80,
        frame_length=25.0,
        frame_shift=10.0,
    ):
        assert frame_length > 0
        assert frame_shift > 0
        assert all(x > frame_length for x in aud_durations_ms)
        self.frame_sizes = [
            int(1 + (d - frame_length) / frame_shift) for d in aud_durations_ms
        ]

        assert len(aud_paths) > 0
        assert len(aud_paths) == len(aud_durations_ms)
        assert len(aud_paths) == len(tgt)
        assert len(aud_paths) == len(ids)
        assert len(aud_paths) == len(speakers)
        self.aud_paths = aud_paths
        self.tgt_dict = tgt_dict
        self.tgt = tgt
        self.ids = ids
        self.speakers = speakers
        self.num_mel_bins = num_mel_bins
        self.frame_length = frame_length
        self.frame_shift = frame_shift

        self.s2s_collater = Seq2SeqCollater(
            0,
            1,
            pad_index=self.tgt_dict.pad(),
            eos_index=self.tgt_dict.eos(),
            move_eos_to_beginning=True,
        )

    def __getitem__(self, index):
        import torchaudio
        import torchaudio.compliance.kaldi as kaldi

        tgt_item = self.tgt[index] if self.tgt is not None else None

        path = self.aud_paths[index]
        if not os.path.exists(path):
            raise FileNotFoundError("Audio file not found: {}".format(path))
        sound, sample_rate = torchaudio.load_wav(path)
        output = kaldi.fbank(
            sound,
            num_mel_bins=self.num_mel_bins,
            frame_length=self.frame_length,
            frame_shift=self.frame_shift,
        )
        output_cmvn = data_utils.apply_mv_norm(output)

        return {"id": index, "data": [output_cmvn.detach(), tgt_item]}

    def __len__(self):
        return len(self.aud_paths)

    def collater(self, samples):
        """Merge a list of samples to form a mini-batch.

        Args:
            samples (List[int]): sample indices to collate

        Returns:
            dict: a mini-batch suitable for forwarding with a Model
        """
        return self.s2s_collater.collate(samples)

    def num_tokens(self, index):
        return self.frame_sizes[index]

    def size(self, index):
        """Return an example's size as a float or tuple. This value is used when
        filtering a dataset with ``--max-positions``."""
        return (
            self.frame_sizes[index],
            len(self.tgt[index]) if self.tgt is not None else 0,
        )

    def ordered_indices(self):
        """Return an ordered list of indices. Batches will be constructed based
        on this order."""
        return np.arange(len(self))
```
examples/speech_recognition/data/collaters.py (new file, mode 100644)

```python
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
    This module contains collection of classes which implement
    collate functionalities for various tasks.

    Collaters should know what data to expect for each sample
    and they should pack / collate them into batches
"""


from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import torch
from fairseq.data import data_utils as fairseq_data_utils


class Seq2SeqCollater(object):
    """
    Implements collate function mainly for seq2seq tasks
    This expects each sample to contain feature (src_tokens) and
    targets.
    This collator is also used for aligned training task.
    """

    def __init__(
        self,
        feature_index=0,
        label_index=1,
        pad_index=1,
        eos_index=2,
        move_eos_to_beginning=True,
    ):
        self.feature_index = feature_index
        self.label_index = label_index
        self.pad_index = pad_index
        self.eos_index = eos_index
        self.move_eos_to_beginning = move_eos_to_beginning

    def _collate_frames(self, frames):
        """Convert a list of 2d frames into a padded 3d tensor
        Args:
            frames (list): list of 2d frames of size L[i]*f_dim. Where L[i] is
                length of i-th frame and f_dim is static dimension of features
        Returns:
            3d tensor of size len(frames)*len_max*f_dim where len_max is max of L[i]
        """
        len_max = max(frame.size(0) for frame in frames)
        f_dim = frames[0].size(1)
        res = frames[0].new(len(frames), len_max, f_dim).fill_(0.0)

        for i, v in enumerate(frames):
            res[i, : v.size(0)] = v

        return res

    def collate(self, samples):
        """
        utility function to collate samples into batch for speech recognition.
        """
        if len(samples) == 0:
            return {}

        # parse samples into torch tensors
        parsed_samples = []
        for s in samples:
            # skip invalid samples
            if s["data"][self.feature_index] is None:
                continue
            source = s["data"][self.feature_index]
            if isinstance(source, (np.ndarray, np.generic)):
                source = torch.from_numpy(source)
            target = s["data"][self.label_index]
            if isinstance(target, (np.ndarray, np.generic)):
                target = torch.from_numpy(target).long()
            elif isinstance(target, list):
                target = torch.LongTensor(target)

            parsed_sample = {"id": s["id"], "source": source, "target": target}
            parsed_samples.append(parsed_sample)
        samples = parsed_samples

        id = torch.LongTensor([s["id"] for s in samples])
        frames = self._collate_frames([s["source"] for s in samples])
        # sort samples by descending number of frames
        frames_lengths = torch.LongTensor([s["source"].size(0) for s in samples])
        frames_lengths, sort_order = frames_lengths.sort(descending=True)
        id = id.index_select(0, sort_order)
        frames = frames.index_select(0, sort_order)

        target = None
        target_lengths = None
        prev_output_tokens = None
        if samples[0].get("target", None) is not None:
            ntokens = sum(len(s["target"]) for s in samples)
            target = fairseq_data_utils.collate_tokens(
                [s["target"] for s in samples],
                self.pad_index,
                self.eos_index,
                left_pad=False,
                move_eos_to_beginning=False,
            )
            target = target.index_select(0, sort_order)
            target_lengths = torch.LongTensor(
                [s["target"].size(0) for s in samples]
            ).index_select(0, sort_order)
            prev_output_tokens = fairseq_data_utils.collate_tokens(
                [s["target"] for s in samples],
                self.pad_index,
                self.eos_index,
                left_pad=False,
                move_eos_to_beginning=self.move_eos_to_beginning,
            )
            prev_output_tokens = prev_output_tokens.index_select(0, sort_order)
        else:
            ntokens = sum(len(s["source"]) for s in samples)

        batch = {
            "id": id,
            "ntokens": ntokens,
            "net_input": {"src_tokens": frames, "src_lengths": frames_lengths},
            "target": target,
            "target_lengths": target_lengths,
            "nsentences": len(samples),
        }
        if prev_output_tokens is not None:
            batch["net_input"]["prev_output_tokens"] = prev_output_tokens
        return batch
```
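A minimal usage sketch of the collater above, assuming fairseq is installed and the repo root is on PYTHONPATH; the feature/target shapes below are made up for illustration and the pad/eos indices 1 and 2 simply mirror the constructor defaults.

```python
# Hypothetical sketch, not part of the commit: feed two toy samples through
# Seq2SeqCollater and inspect the padded, length-sorted batch it produces.
import torch

from examples.speech_recognition.data.collaters import Seq2SeqCollater

collater = Seq2SeqCollater(feature_index=0, label_index=1, pad_index=1, eos_index=2)
samples = [
    {"id": 0, "data": [torch.randn(12, 80), [4, 5, 6, 2]]},  # 12 frames of 80-dim fbank
    {"id": 1, "data": [torch.randn(7, 80), [7, 8, 2]]},      # shorter utterance
]
batch = collater.collate(samples)
print(batch["net_input"]["src_tokens"].shape)  # torch.Size([2, 12, 80]), zero-padded
print(batch["net_input"]["src_lengths"])       # tensor([12, 7]), sorted descending
print(batch["target"].shape)                   # torch.Size([2, 4]), padded with pad_index
```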
examples/speech_recognition/data/data_utils.py (new file, mode 100644)

```python
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch


def calc_mean_invstddev(feature):
    if len(feature.size()) != 2:
        raise ValueError("We expect the input feature to be 2-D tensor")
    mean = feature.mean(0)
    var = feature.var(0)
    # avoid division by ~zero
    eps = 1e-8
    if (var < eps).any():
        return mean, 1.0 / (torch.sqrt(var) + eps)
    return mean, 1.0 / torch.sqrt(var)


def apply_mv_norm(features):
    # If there is less than 2 spectrograms, the variance cannot be computed (is NaN)
    # and normalization is not possible, so return the item as it is
    if features.size(0) < 2:
        return features
    mean, invstddev = calc_mean_invstddev(features)
    res = (features - mean) * invstddev
    return res


def lengths_to_encoder_padding_mask(lengths, batch_first=False):
    """
    convert lengths (a 1-D Long/Int tensor) to 2-D binary tensor

    Args:
        lengths: a (B, )-shaped tensor

    Return:
        max_length: maximum length of B sequences

        encoder_padding_mask: a (max_length, B) binary mask, where
        [t, b] = 0 for t < lengths[b] and 1 otherwise

    TODO:
        kernelize this function if benchmarking shows this function is slow
    """
    max_lengths = torch.max(lengths).item()
    bsz = lengths.size(0)
    encoder_padding_mask = torch.arange(
        max_lengths
    ).to(  # a (T, ) tensor with [0, ..., T-1]
        lengths.device
    ).view(  # move to the right device
        1, max_lengths
    ).expand(  # reshape to (1, T)-shaped tensor
        bsz, -1
    ) >= lengths.view(  # expand to (B, T)-shaped tensor
        bsz, 1
    ).expand(-1, max_lengths)
    if not batch_first:
        return encoder_padding_mask.t(), max_lengths
    else:
        return encoder_padding_mask, max_lengths


def encoder_padding_mask_to_lengths(
    encoder_padding_mask, max_lengths, batch_size, device
):
    """
    convert encoder_padding_mask (2-D binary tensor) to a 1-D tensor

    Conventionally, encoder output contains a encoder_padding_mask, which is
    a 2-D mask in a shape (T, B), whose (t, b) element indicate whether
    encoder_out[t, b] is a valid output (=0) or not (=1). Occasionally, we
    need to convert this mask tensor to a 1-D tensor in shape (B, ), where
    [b] denotes the valid length of b-th sequence

    Args:
        encoder_padding_mask: a (T, B)-shaped binary tensor or None; if None,
        indicating all are valid
    Return:
        seq_lengths: a (B,)-shaped tensor, where its (b, )-th element is the
        number of valid elements of b-th sequence

        max_lengths: maximum length of all sequence, if encoder_padding_mask is
        not None, max_lengths must equal to encoder_padding_mask.size(0)

        batch_size: batch size; if encoder_padding_mask is
        not None, max_lengths must equal to encoder_padding_mask.size(1)

        device: which device to put the result on
    """
    if encoder_padding_mask is None:
        return torch.Tensor([max_lengths] * batch_size).to(torch.int32).to(device)

    assert encoder_padding_mask.size(0) == max_lengths, "max_lengths does not match"
    assert encoder_padding_mask.size(1) == batch_size, "batch_size does not match"

    return max_lengths - torch.sum(encoder_padding_mask, dim=0)
```
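A short illustrative check of the two helpers above, assuming the fairseq repo root is on PYTHONPATH; the lengths and feature sizes are arbitrary toy values.

```python
# Hypothetical sketch, not part of the commit: mask semantics and CMVN behaviour.
import torch

from examples.speech_recognition.data.data_utils import (
    apply_mv_norm,
    lengths_to_encoder_padding_mask,
)

lengths = torch.LongTensor([4, 2, 3])
mask, max_len = lengths_to_encoder_padding_mask(lengths, batch_first=True)
# max_len == 4; mask[b, t] is True (i.e. padding) wherever t >= lengths[b]:
# tensor([[False, False, False, False],
#         [False, False,  True,  True],
#         [False, False, False,  True]])

feats = torch.randn(10, 80)          # 10 frames of 80-dim features
normed = apply_mv_norm(feats)        # per-dimension mean/variance normalization
print(normed.mean(0).abs().max())    # ~0 after normalization
```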
examples/speech_recognition/data/replabels.py (new file, mode 100644)

```python
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

"""
Replabel transforms for use with wav2letter's ASG criterion.
"""


def replabel_symbol(i):
    """
    Replabel symbols used in wav2letter, currently just "1", "2", ...
    This prevents training with numeral tokens, so this might change in the future
    """
    return str(i)


def pack_replabels(tokens, dictionary, max_reps):
    """
    Pack a token sequence so that repeated symbols are replaced by replabels
    """
    if len(tokens) == 0 or max_reps <= 0:
        return tokens

    replabel_value_to_idx = [0] * (max_reps + 1)
    for i in range(1, max_reps + 1):
        replabel_value_to_idx[i] = dictionary.index(replabel_symbol(i))

    result = []
    prev_token = -1
    num_reps = 0
    for token in tokens:
        if token == prev_token and num_reps < max_reps:
            num_reps += 1
        else:
            if num_reps > 0:
                result.append(replabel_value_to_idx[num_reps])
                num_reps = 0
            result.append(token)
            prev_token = token
    if num_reps > 0:
        result.append(replabel_value_to_idx[num_reps])
    return result


def unpack_replabels(tokens, dictionary, max_reps):
    """
    Unpack a token sequence so that replabels are replaced by repeated symbols
    """
    if len(tokens) == 0 or max_reps <= 0:
        return tokens

    replabel_idx_to_value = {}
    for i in range(1, max_reps + 1):
        replabel_idx_to_value[dictionary.index(replabel_symbol(i))] = i

    result = []
    prev_token = -1
    for token in tokens:
        try:
            for _ in range(replabel_idx_to_value[token]):
                result.append(prev_token)
            prev_token = -1
        except KeyError:
            result.append(token)
            prev_token = token
    return result
```
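A round-trip sketch of the replabel transforms above. `DummyDict` is a hypothetical stand-in for `fairseq.data.Dictionary` (in practice the ASG criterion passes the real target dictionary); it only needs an `index()` method mapping the replabel symbols "1", "2", ... to ids.

```python
# Hypothetical sketch, not part of the commit: pack/unpack replabels round trip.
from examples.speech_recognition.data.replabels import pack_replabels, unpack_replabels


class DummyDict:
    """Minimal stand-in exposing only the index() lookup used by the transforms."""

    def __init__(self, symbols):
        self.symbols = symbols

    def index(self, sym):
        return self.symbols.index(sym)


d = DummyDict(["a", "b", "1", "2"])      # replabel "1" -> id 2, replabel "2" -> id 3
tokens = [0, 0, 0, 1, 1]                 # "a a a b b" as dictionary ids
packed = pack_replabels(tokens, d, max_reps=2)
print(packed)                            # [0, 3, 1, 2]: a + replabel(2), b + replabel(1)
assert unpack_replabels(packed, d, max_reps=2) == tokens
```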
examples/speech_recognition/datasets/asr_prep_json.py (new file, mode 100644)

```python
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import concurrent.futures
import json
import multiprocessing
import os
from collections import namedtuple
from itertools import chain

import sentencepiece as spm
from fairseq.data import Dictionary


MILLISECONDS_TO_SECONDS = 0.001


def process_sample(aud_path, lable, utt_id, sp, tgt_dict):
    import torchaudio

    input = {}
    output = {}
    si, ei = torchaudio.info(aud_path)
    input["length_ms"] = int(
        si.length / si.channels / si.rate / MILLISECONDS_TO_SECONDS
    )
    input["path"] = aud_path

    token = " ".join(sp.EncodeAsPieces(lable))
    ids = tgt_dict.encode_line(token, append_eos=False)

    output["text"] = lable
    output["token"] = token
    output["tokenid"] = ", ".join(map(str, [t.tolist() for t in ids]))
    return {utt_id: {"input": input, "output": output}}


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--audio-dirs",
        nargs="+",
        default=["-"],
        required=True,
        help="input directories with audio files",
    )
    parser.add_argument(
        "--labels",
        required=True,
        help="aggregated input labels with format <ID LABEL> per line",
        type=argparse.FileType("r", encoding="UTF-8"),
    )
    parser.add_argument(
        "--spm-model",
        required=True,
        help="sentencepiece model to use for encoding",
        type=argparse.FileType("r", encoding="UTF-8"),
    )
    parser.add_argument(
        "--dictionary",
        required=True,
        help="file to load fairseq dictionary from",
        type=argparse.FileType("r", encoding="UTF-8"),
    )
    parser.add_argument("--audio-format", choices=["flac", "wav"], default="wav")
    parser.add_argument(
        "--output",
        required=True,
        type=argparse.FileType("w"),
        help="path to save json output",
    )
    args = parser.parse_args()

    sp = spm.SentencePieceProcessor()
    sp.Load(args.spm_model.name)

    tgt_dict = Dictionary.load(args.dictionary)

    labels = {}
    for line in args.labels:
        (utt_id, label) = line.split(" ", 1)
        labels[utt_id] = label
    if len(labels) == 0:
        raise Exception("No labels found in ", args.labels_path)

    Sample = namedtuple("Sample", "aud_path utt_id")
    samples = []
    for path, _, files in chain.from_iterable(
        os.walk(path) for path in args.audio_dirs
    ):
        for f in files:
            if f.endswith(args.audio_format):
                if len(os.path.splitext(f)) != 2:
                    raise Exception("Expect <utt_id.extension> file name. Got: ", f)
                utt_id = os.path.splitext(f)[0]
                if utt_id not in labels:
                    continue
                samples.append(Sample(os.path.join(path, f), utt_id))

    utts = {}
    num_cpu = multiprocessing.cpu_count()
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_cpu) as executor:
        future_to_sample = {
            executor.submit(
                process_sample, s.aud_path, labels[s.utt_id], s.utt_id, sp, tgt_dict
            ): s
            for s in samples
        }
        for future in concurrent.futures.as_completed(future_to_sample):
            try:
                data = future.result()
            except Exception as exc:
                print("generated an exception: ", exc)
            else:
                utts.update(data)
    json.dump({"utts": utts}, args.output, indent=4)


if __name__ == "__main__":
    main()
```
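For orientation, the structure of the JSON this script writes follows directly from `process_sample()` above; the utterance id, path, transcript, and token ids in this sketch are made-up illustrations, not values from the commit.

```python
# Hypothetical illustration, not part of the commit: shape of one entry in the
# {"utts": {...}} JSON that asr_prep_json.py dumps.
example = {
    "utts": {
        "1272-128104-0000": {                      # made-up utterance id
            "input": {
                "length_ms": 5855,                 # duration derived via torchaudio.info
                "path": "/data/LibriSpeech/dev-clean/.../1272-128104-0000.flac",
            },
            "output": {
                "text": "MISTER QUILTER ...",      # raw label line
                "token": "▁MISTER ▁QUILTER ...",   # sentencepiece pieces joined by spaces
                "tokenid": "142, 2351, 18",        # fairseq dictionary indices
            },
        }
    }
}
```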
examples/speech_recognition/datasets/prepare-librispeech.sh (new file, mode 100755)

```bash
#!/usr/bin/env bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

# Prepare librispeech dataset

base_url=www.openslr.org/resources/12
train_dir=train_960

if [ "$#" -ne 2 ]; then
  echo "Usage: $0 <download_dir> <out_dir>"
  echo "e.g.: $0 /tmp/librispeech_raw/ ~/data/librispeech_final"
  exit 1
fi

download_dir=${1%/}
out_dir=${2%/}

fairseq_root=~/fairseq-py/

mkdir -p ${out_dir}
cd ${out_dir} || exit

nbpe=5000
bpemode=unigram

if [ ! -d "$fairseq_root" ]; then
  echo "$0: Please set correct fairseq_root"
  exit 1
fi

echo "Data Download"
for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do
  url=$base_url/$part.tar.gz
  if ! wget -P $download_dir $url; then
    echo "$0: wget failed for $url"
    exit 1
  fi
  if ! tar -C $download_dir -xvzf $download_dir/$part.tar.gz; then
    echo "$0: error un-tarring archive $download_dir/$part.tar.gz"
    exit 1
  fi
done

echo "Merge all train packs into one"
mkdir -p ${download_dir}/LibriSpeech/${train_dir}/
for part in train-clean-100 train-clean-360 train-other-500; do
  mv ${download_dir}/LibriSpeech/${part}/* $download_dir/LibriSpeech/${train_dir}/
done
echo "Merge train text"
find ${download_dir}/LibriSpeech/${train_dir}/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/${train_dir}/text

# Use combined dev-clean and dev-other as validation set
find ${download_dir}/LibriSpeech/dev-clean/ ${download_dir}/LibriSpeech/dev-other/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/valid_text
find ${download_dir}/LibriSpeech/test-clean/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/test-clean/text
find ${download_dir}/LibriSpeech/test-other/ -name '*.txt' -exec cat {} \; >> ${download_dir}/LibriSpeech/test-other/text

dict=data/lang_char/${train_dir}_${bpemode}${nbpe}_units.txt
encoded=data/lang_char/${train_dir}_${bpemode}${nbpe}_encoded.txt
fairseq_dict=data/lang_char/${train_dir}_${bpemode}${nbpe}_fairseq_dict.txt
bpemodel=data/lang_char/${train_dir}_${bpemode}${nbpe}
echo "dictionary: ${dict}"
echo "Dictionary preparation"
mkdir -p data/lang_char/
echo "<unk> 3" > ${dict}
echo "</s> 2" >> ${dict}
echo "<pad> 1" >> ${dict}
cut -f 2- -d " " ${download_dir}/LibriSpeech/${train_dir}/text > data/lang_char/input.txt
spm_train --input=data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 --unk_id=3 --eos_id=2 --pad_id=1 --bos_id=-1 --character_coverage=1
spm_encode --model=${bpemodel}.model --output_format=piece < data/lang_char/input.txt > ${encoded}
cat ${encoded} | tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+3}' >> ${dict}
cat ${encoded} | tr ' ' '\n' | sort | uniq -c | awk '{print $2 " " $1}' > ${fairseq_dict}
wc -l ${dict}

echo "Prepare train and test jsons"
for part in train_960 test-other test-clean; do
  python ${fairseq_root}/examples/speech_recognition/datasets/asr_prep_json.py --audio-dirs ${download_dir}/LibriSpeech/${part} --labels ${download_dir}/LibriSpeech/${part}/text --spm-model ${bpemodel}.model --audio-format flac --dictionary ${fairseq_dict} --output ${part}.json
done
# fairseq expects to find train.json and valid.json during training
mv train_960.json train.json

echo "Prepare valid json"
python ${fairseq_root}/examples/speech_recognition/datasets/asr_prep_json.py --audio-dirs ${download_dir}/LibriSpeech/dev-clean ${download_dir}/LibriSpeech/dev-other --labels ${download_dir}/LibriSpeech/valid_text --spm-model ${bpemodel}.model --audio-format flac --dictionary ${fairseq_dict} --output valid.json

cp ${fairseq_dict} ./dict.txt
cp ${bpemodel}.model ./spm.model
```
examples/speech_recognition/infer.py (new file, mode 100644)

```python
#!/usr/bin/env python3 -u
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""
Run inference for pre-processed data with a trained model.
"""

import logging
import math
import os
import sys

import editdistance
import numpy as np
import torch
from fairseq import checkpoint_utils, options, progress_bar, tasks, utils
from fairseq.data.data_utils import post_process
from fairseq.logging.meters import StopwatchMeter, TimeMeter


logging.basicConfig()
logging.root.setLevel(logging.INFO)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def add_asr_eval_argument(parser):
    parser.add_argument("--kspmodel", default=None, help="sentence piece model")
    parser.add_argument(
        "--wfstlm", default=None, help="wfstlm on dictonary output units"
    )
    parser.add_argument(
        "--rnnt_decoding_type",
        default="greedy",
        help="wfstlm on dictonary\
output units",
    )
    try:
        parser.add_argument(
            "--lm-weight",
            "--lm_weight",
            type=float,
            default=0.2,
            help="weight for lm while interpolating with neural score",
        )
    except:
        pass
    parser.add_argument(
        "--rnnt_len_penalty", default=-0.5, help="rnnt length penalty on word level"
    )
    parser.add_argument(
        "--w2l-decoder",
        choices=["viterbi", "kenlm", "fairseqlm"],
        help="use a w2l decoder",
    )
    parser.add_argument("--lexicon", help="lexicon for w2l decoder")
    parser.add_argument("--unit-lm", action="store_true", help="if using a unit lm")
    parser.add_argument("--kenlm-model", "--lm-model", help="lm model for w2l decoder")
    parser.add_argument("--beam-threshold", type=float, default=25.0)
    parser.add_argument("--beam-size-token", type=float, default=100)
    parser.add_argument("--word-score", type=float, default=1.0)
    parser.add_argument("--unk-weight", type=float, default=-math.inf)
    parser.add_argument("--sil-weight", type=float, default=0.0)
    parser.add_argument(
        "--dump-emissions",
        type=str,
        default=None,
        help="if present, dumps emissions into this file and exits",
    )
    parser.add_argument(
        "--dump-features",
        type=str,
        default=None,
        help="if present, dumps features into this file and exits",
    )
    parser.add_argument(
        "--load-emissions",
        type=str,
        default=None,
        help="if present, loads emissions from this file",
    )
    return parser


def check_args(args):
    # assert args.path is not None, "--path required for generation!"
    # assert args.results_path is not None, "--results_path required for generation!"
    assert (
        not args.sampling or args.nbest == args.beam
    ), "--sampling requires --nbest to be equal to --beam"
    assert (
        args.replace_unk is None or args.raw_text
    ), "--replace-unk requires a raw text dataset (--raw-text)"


def get_dataset_itr(args, task, models):
    return task.get_batch_iterator(
        dataset=task.dataset(args.gen_subset),
        max_tokens=args.max_tokens,
        max_sentences=args.batch_size,
        max_positions=(sys.maxsize, sys.maxsize),
        ignore_invalid_inputs=args.skip_invalid_size_inputs_valid_test,
        required_batch_size_multiple=args.required_batch_size_multiple,
        num_shards=args.num_shards,
        shard_id=args.shard_id,
        num_workers=args.num_workers,
        data_buffer_size=args.data_buffer_size,
    ).next_epoch_itr(shuffle=False)


def process_predictions(
    args, hypos, sp, tgt_dict, target_tokens, res_files, speaker, id
):
    for hypo in hypos[: min(len(hypos), args.nbest)]:
        hyp_pieces = tgt_dict.string(hypo["tokens"].int().cpu())

        if "words" in hypo:
            hyp_words = " ".join(hypo["words"])
        else:
            hyp_words = post_process(hyp_pieces, args.remove_bpe)

        if res_files is not None:
            print(
                "{} ({}-{})".format(hyp_pieces, speaker, id),
                file=res_files["hypo.units"],
            )
            print(
                "{} ({}-{})".format(hyp_words, speaker, id),
                file=res_files["hypo.words"],
            )

        tgt_pieces = tgt_dict.string(target_tokens)
        tgt_words = post_process(tgt_pieces, args.remove_bpe)

        if res_files is not None:
            print(
                "{} ({}-{})".format(tgt_pieces, speaker, id),
                file=res_files["ref.units"],
            )
            print(
                "{} ({}-{})".format(tgt_words, speaker, id), file=res_files["ref.words"]
            )
        # only score top hypothesis
        if not args.quiet:
            logger.debug("HYPO:" + hyp_words)
            logger.debug("TARGET:" + tgt_words)
            logger.debug("___________________")

        hyp_words = hyp_words.split()
        tgt_words = tgt_words.split()
        return editdistance.eval(hyp_words, tgt_words), len(tgt_words)


def prepare_result_files(args):
    def get_res_file(file_prefix):
        if args.num_shards > 1:
            file_prefix = f"{args.shard_id}_{file_prefix}"
        path = os.path.join(
            args.results_path,
            "{}-{}-{}.txt".format(
                file_prefix, os.path.basename(args.path), args.gen_subset
            ),
        )
        return open(path, "w", buffering=1)

    if not args.results_path:
        return None

    return {
        "hypo.words": get_res_file("hypo.word"),
        "hypo.units": get_res_file("hypo.units"),
        "ref.words": get_res_file("ref.word"),
        "ref.units": get_res_file("ref.units"),
    }


def load_models_and_criterions(
    filenames, data_path, arg_overrides=None, task=None, model_state=None
):
    models = []
    criterions = []

    if arg_overrides is None:
        arg_overrides = {}

    arg_overrides["wer_args"] = None
    arg_overrides["data"] = data_path

    if filenames is None:
        assert model_state is not None
        filenames = [0]
    else:
        filenames = filenames.split(":")

    for filename in filenames:
        if model_state is None:
            if not os.path.exists(filename):
                raise IOError("Model file not found: {}".format(filename))
            state = checkpoint_utils.load_checkpoint_to_cpu(filename, arg_overrides)
        else:
            state = model_state

        args = state["args"]
        if task is None:
            task = tasks.setup_task(args)
        model = task.build_model(args)
        model.load_state_dict(state["model"], strict=True)
        models.append(model)

        criterion = task.build_criterion(args)
        if "criterion" in state:
            criterion.load_state_dict(state["criterion"], strict=True)
        criterions.append(criterion)
    return models, criterions, args


def optimize_models(args, use_cuda, models):
    """Optimize ensemble for generation"""
    for model in models:
        model.make_generation_fast_(
            beamable_mm_beam_size=None if args.no_beamable_mm else args.beam,
            need_attn=args.print_alignment,
        )
        if args.fp16:
            model.half()
        if use_cuda:
            model.cuda()


class ExistingEmissionsDecoder(object):
    def __init__(self, decoder, emissions):
        self.decoder = decoder
        self.emissions = emissions

    def generate(self, models, sample, **unused):
        ids = sample["id"].cpu().numpy()
        try:
            emissions = np.stack(self.emissions[ids])
        except:
            print([x.shape for x in self.emissions[ids]])
            raise Exception("invalid sizes")
        emissions = torch.from_numpy(emissions)
        return self.decoder.decode(emissions)


def main(args, task=None, model_state=None):
    check_args(args)

    if args.max_tokens is None and args.batch_size is None:
        args.max_tokens = 4000000
    logger.info(args)

    use_cuda = torch.cuda.is_available() and not args.cpu

    if task is None:
        # Load dataset splits
        task = tasks.setup_task(args)
        task.load_dataset(args.gen_subset)
        logger.info(
            "| {} {} {} examples".format(
                args.data, args.gen_subset, len(task.dataset(args.gen_subset))
            )
        )

    # Set dictionary
    tgt_dict = task.target_dictionary

    logger.info("| decoding with criterion {}".format(args.criterion))

    # Load ensemble
    if args.load_emissions:
        models, criterions = [], []
    else:
        logger.info("| loading model(s) from {}".format(args.path))
        models, criterions, _ = load_models_and_criterions(
            args.path,
            data_path=args.data,
            arg_overrides=eval(args.model_overrides),  # noqa
            task=task,
            model_state=model_state,
        )
        optimize_models(args, use_cuda, models)

    # hack to pass transitions to W2lDecoder
    if args.criterion == "asg_loss":
        trans = criterions[0].asg.trans.data
        args.asg_transitions = torch.flatten(trans).tolist()

    # Load dataset (possibly sharded)
    itr = get_dataset_itr(args, task, models)

    # Initialize generator
    gen_timer = StopwatchMeter()

    def build_generator(args):
        w2l_decoder = getattr(args, "w2l_decoder", None)
        if w2l_decoder == "viterbi":
            from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder

            return W2lViterbiDecoder(args, task.target_dictionary)
        elif w2l_decoder == "kenlm":
            from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder

            return W2lKenLMDecoder(args, task.target_dictionary)
        elif w2l_decoder == "fairseqlm":
            from examples.speech_recognition.w2l_decoder import W2lFairseqLMDecoder

            return W2lFairseqLMDecoder(args, task.target_dictionary)
        else:
            print(
                "only wav2letter decoders with (viterbi, kenlm, fairseqlm) options are supported at the moment"
            )

    # please do not touch this unless you test both generate.py and infer.py with audio_pretraining task
    generator = build_generator(args)

    if args.load_emissions:
        generator = ExistingEmissionsDecoder(
            generator, np.load(args.load_emissions, allow_pickle=True)
        )
        logger.info("loaded emissions from " + args.load_emissions)

    num_sentences = 0

    if args.results_path is not None and not os.path.exists(args.results_path):
        os.makedirs(args.results_path)

    max_source_pos = (
        utils.resolve_max_positions(
            task.max_positions(), *[model.max_positions() for model in models]
        ),
    )

    if max_source_pos is not None:
        max_source_pos = max_source_pos[0]
        if max_source_pos is not None:
            max_source_pos = max_source_pos[0] - 1

    if args.dump_emissions:
        emissions = {}
    if args.dump_features:
        features = {}
        models[0].bert.proj = None
    else:
        res_files = prepare_result_files(args)
    errs_t = 0
    lengths_t = 0
    with progress_bar.build_progress_bar(args, itr) as t:
        wps_meter = TimeMeter()
        for sample in t:
            sample = utils.move_to_cuda(sample) if use_cuda else sample
            if "net_input" not in sample:
                continue

            prefix_tokens = None
            if args.prefix_size > 0:
                prefix_tokens = sample["target"][:, : args.prefix_size]

            gen_timer.start()
            if args.dump_emissions:
                with torch.no_grad():
                    encoder_out = models[0](**sample["net_input"])
                    emm = models[0].get_normalized_probs(encoder_out, log_probs=True)
                    emm = emm.transpose(0, 1).cpu().numpy()
                    for i, id in enumerate(sample["id"]):
                        emissions[id.item()] = emm[i]
                    continue
            elif args.dump_features:
                with torch.no_grad():
                    encoder_out = models[0](**sample["net_input"])
                    feat = encoder_out["encoder_out"].transpose(0, 1).cpu().numpy()
                    for i, id in enumerate(sample["id"]):
                        padding = (
                            encoder_out["encoder_padding_mask"][i].cpu().numpy()
                            if encoder_out["encoder_padding_mask"] is not None
                            else None
                        )
                        features[id.item()] = (feat[i], padding)
                    continue
            hypos = task.inference_step(generator, models, sample, prefix_tokens)
            num_generated_tokens = sum(len(h[0]["tokens"]) for h in hypos)
            gen_timer.stop(num_generated_tokens)

            for i, sample_id in enumerate(sample["id"].tolist()):
                speaker = None
                # id = task.dataset(args.gen_subset).ids[int(sample_id)]
                id = sample_id
                toks = (
                    sample["target"][i, :]
                    if "target_label" not in sample
                    else sample["target_label"][i, :]
                )
                target_tokens = utils.strip_pad(toks, tgt_dict.pad()).int().cpu()
                # Process top predictions
                errs, length = process_predictions(
                    args,
                    hypos[i],
                    None,
                    tgt_dict,
                    target_tokens,
                    res_files,
                    speaker,
                    id,
                )
                errs_t += errs
                lengths_t += length

            wps_meter.update(num_generated_tokens)
            t.log({"wps": round(wps_meter.avg)})
            num_sentences += (
                sample["nsentences"] if "nsentences" in sample else sample["id"].numel()
            )

    wer = None
    if args.dump_emissions:
        emm_arr = []
        for i in range(len(emissions)):
            emm_arr.append(emissions[i])
        np.save(args.dump_emissions, emm_arr)
        logger.info(f"saved {len(emissions)} emissions to {args.dump_emissions}")
    elif args.dump_features:
        feat_arr = []
        for i in range(len(features)):
            feat_arr.append(features[i])
        np.save(args.dump_features, feat_arr)
        logger.info(f"saved {len(features)} emissions to {args.dump_features}")
    else:
        if lengths_t > 0:
            wer = errs_t * 100.0 / lengths_t
            logger.info(f"WER: {wer}")

        logger.info(
            "| Processed {} sentences ({} tokens) in {:.1f}s ({:.2f}"
            "sentences/s, {:.2f} tokens/s)".format(
                num_sentences,
                gen_timer.n,
                gen_timer.sum,
                num_sentences / gen_timer.sum,
                1.0 / gen_timer.avg,
            )
        )
        logger.info("| Generate {} with beam={}".format(args.gen_subset, args.beam))
    return task, wer


def make_parser():
    parser = options.get_generation_parser()
    parser = add_asr_eval_argument(parser)
    return parser


def cli_main():
    parser = make_parser()
    args = options.parse_args_and_arch(parser)
    main(args)


if __name__ == "__main__":
    cli_main()
```
examples/speech_recognition/models/__init__.py (new file, mode 100644)

```python
import importlib
import os


for file in os.listdir(os.path.dirname(__file__)):
    if file.endswith(".py") and not file.startswith("_"):
        model_name = file[: file.find(".py")]
        importlib.import_module("examples.speech_recognition.models." + model_name)
```
examples/speech_recognition/models/vggtransformer.py
0 → 100644
View file @
7df61696
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import
argparse
import
math
from
collections.abc
import
Iterable
import
torch
import
torch.nn
as
nn
from
examples.speech_recognition.data.data_utils
import
lengths_to_encoder_padding_mask
from
fairseq
import
utils
from
fairseq.models
import
(
FairseqEncoder
,
FairseqEncoderDecoderModel
,
FairseqEncoderModel
,
FairseqIncrementalDecoder
,
register_model
,
register_model_architecture
,
)
from
fairseq.modules
import
(
LinearizedConvolution
,
TransformerDecoderLayer
,
TransformerEncoderLayer
,
VGGBlock
,
)
@
register_model
(
"asr_vggtransformer"
)
class
VGGTransformerModel
(
FairseqEncoderDecoderModel
):
"""
Transformers with convolutional context for ASR
https://arxiv.org/abs/1904.11660
"""
def
__init__
(
self
,
encoder
,
decoder
):
super
().
__init__
(
encoder
,
decoder
)
@
staticmethod
def
add_args
(
parser
):
"""Add model-specific arguments to the parser."""
parser
.
add_argument
(
"--input-feat-per-channel"
,
type
=
int
,
metavar
=
"N"
,
help
=
"encoder input dimension per input channel"
,
)
parser
.
add_argument
(
"--vggblock-enc-config"
,
type
=
str
,
metavar
=
"EXPR"
,
help
=
"""
an array of tuples each containing the configuration of one vggblock:
[(out_channels,
conv_kernel_size,
pooling_kernel_size,
num_conv_layers,
use_layer_norm), ...])
"""
,
)
parser
.
add_argument
(
"--transformer-enc-config"
,
type
=
str
,
metavar
=
"EXPR"
,
help
=
""""
a tuple containing the configuration of the encoder transformer layers
configurations:
[(input_dim,
num_heads,
ffn_dim,
normalize_before,
dropout,
attention_dropout,
relu_dropout), ...]')
"""
,
)
parser
.
add_argument
(
"--enc-output-dim"
,
type
=
int
,
metavar
=
"N"
,
help
=
"""
encoder output dimension, can be None. If specified, projecting the
transformer output to the specified dimension"""
,
)
parser
.
add_argument
(
"--in-channels"
,
type
=
int
,
metavar
=
"N"
,
help
=
"number of encoder input channels"
,
)
parser
.
add_argument
(
"--tgt-embed-dim"
,
type
=
int
,
metavar
=
"N"
,
help
=
"embedding dimension of the decoder target tokens"
,
)
parser
.
add_argument
(
"--transformer-dec-config"
,
type
=
str
,
metavar
=
"EXPR"
,
help
=
"""
a tuple containing the configuration of the decoder transformer layers
configurations:
[(input_dim,
num_heads,
ffn_dim,
normalize_before,
dropout,
attention_dropout,
relu_dropout), ...]
"""
,
)
parser
.
add_argument
(
"--conv-dec-config"
,
type
=
str
,
metavar
=
"EXPR"
,
help
=
"""
an array of tuples for the decoder 1-D convolution config
[(out_channels, conv_kernel_size, use_layer_norm), ...]"""
,
)
@
classmethod
def
build_encoder
(
cls
,
args
,
task
):
return
VGGTransformerEncoder
(
input_feat_per_channel
=
args
.
input_feat_per_channel
,
vggblock_config
=
eval
(
args
.
vggblock_enc_config
),
transformer_config
=
eval
(
args
.
transformer_enc_config
),
encoder_output_dim
=
args
.
enc_output_dim
,
in_channels
=
args
.
in_channels
,
)
@
classmethod
def
build_decoder
(
cls
,
args
,
task
):
return
TransformerDecoder
(
dictionary
=
task
.
target_dictionary
,
embed_dim
=
args
.
tgt_embed_dim
,
transformer_config
=
eval
(
args
.
transformer_dec_config
),
conv_config
=
eval
(
args
.
conv_dec_config
),
encoder_output_dim
=
args
.
enc_output_dim
,
)
@
classmethod
def
build_model
(
cls
,
args
,
task
):
"""Build a new model instance."""
# make sure that all args are properly defaulted
# (in case there are any new ones)
base_architecture
(
args
)
encoder
=
cls
.
build_encoder
(
args
,
task
)
decoder
=
cls
.
build_decoder
(
args
,
task
)
return
cls
(
encoder
,
decoder
)
def
get_normalized_probs
(
self
,
net_output
,
log_probs
,
sample
=
None
):
# net_output['encoder_out'] is a (B, T, D) tensor
lprobs
=
super
().
get_normalized_probs
(
net_output
,
log_probs
,
sample
)
lprobs
.
batch_first
=
True
return
lprobs
DEFAULT_ENC_VGGBLOCK_CONFIG
=
((
32
,
3
,
2
,
2
,
False
),)
*
2
DEFAULT_ENC_TRANSFORMER_CONFIG
=
((
256
,
4
,
1024
,
True
,
0.2
,
0.2
,
0.2
),)
*
2
# 256: embedding dimension
# 4: number of heads
# 1024: FFN
# True: apply layerNorm before (dropout + resiaul) instead of after
# 0.2 (dropout): dropout after MultiheadAttention and second FC
# 0.2 (attention_dropout): dropout in MultiheadAttention
# 0.2 (relu_dropout): dropout after ReLu
DEFAULT_DEC_TRANSFORMER_CONFIG
=
((
256
,
2
,
1024
,
True
,
0.2
,
0.2
,
0.2
),)
*
2
DEFAULT_DEC_CONV_CONFIG
=
((
256
,
3
,
True
),)
*
2
# TODO: repace transformer encoder config from one liner
# to explicit args to get rid of this transformation
def
prepare_transformer_encoder_params
(
input_dim
,
num_heads
,
ffn_dim
,
normalize_before
,
dropout
,
attention_dropout
,
relu_dropout
,
):
args
=
argparse
.
Namespace
()
args
.
encoder_embed_dim
=
input_dim
args
.
encoder_attention_heads
=
num_heads
args
.
attention_dropout
=
attention_dropout
args
.
dropout
=
dropout
args
.
activation_dropout
=
relu_dropout
args
.
encoder_normalize_before
=
normalize_before
args
.
encoder_ffn_embed_dim
=
ffn_dim
return
args
def
prepare_transformer_decoder_params
(
input_dim
,
num_heads
,
ffn_dim
,
normalize_before
,
dropout
,
attention_dropout
,
relu_dropout
,
):
args
=
argparse
.
Namespace
()
args
.
decoder_embed_dim
=
input_dim
args
.
decoder_attention_heads
=
num_heads
args
.
attention_dropout
=
attention_dropout
args
.
dropout
=
dropout
args
.
activation_dropout
=
relu_dropout
args
.
decoder_normalize_before
=
normalize_before
args
.
decoder_ffn_embed_dim
=
ffn_dim
return
args
class
VGGTransformerEncoder(FairseqEncoder):
    """VGG + Transformer encoder"""

    def __init__(
        self,
        input_feat_per_channel,
        vggblock_config=DEFAULT_ENC_VGGBLOCK_CONFIG,
        transformer_config=DEFAULT_ENC_TRANSFORMER_CONFIG,
        encoder_output_dim=512,
        in_channels=1,
        transformer_context=None,
        transformer_sampling=None,
    ):
        """constructor for VGGTransformerEncoder

        Args:
            - input_feat_per_channel: feature dim (not including stacked,
              just base feature)
            - in_channel: # input channels (e.g., if stack 8 feature vector
              together, this is 8)
            - vggblock_config: configuration of vggblock, see comments on
              DEFAULT_ENC_VGGBLOCK_CONFIG
            - transformer_config: configuration of transformer layer, see comments
              on DEFAULT_ENC_TRANSFORMER_CONFIG
            - encoder_output_dim: final transformer output embedding dimension
            - transformer_context: (left, right) if set, self-attention will be focused
              on (t-left, t+right)
            - transformer_sampling: an iterable of int, must match with
              len(transformer_config), transformer_sampling[i] indicates sampling
              factor for i-th transformer layer, after multihead att and feedforward
              part
        """
        super().__init__(None)

        self.num_vggblocks = 0
        if vggblock_config is not None:
            if not isinstance(vggblock_config, Iterable):
                raise ValueError("vggblock_config is not iterable")
            self.num_vggblocks = len(vggblock_config)

        self.conv_layers = nn.ModuleList()
        self.in_channels = in_channels
        self.input_dim = input_feat_per_channel
        self.pooling_kernel_sizes = []

        if vggblock_config is not None:
            for _, config in enumerate(vggblock_config):
                (
                    out_channels,
                    conv_kernel_size,
                    pooling_kernel_size,
                    num_conv_layers,
                    layer_norm,
                ) = config
                self.conv_layers.append(
                    VGGBlock(
                        in_channels,
                        out_channels,
                        conv_kernel_size,
                        pooling_kernel_size,
                        num_conv_layers,
                        input_dim=input_feat_per_channel,
                        layer_norm=layer_norm,
                    )
                )
                self.pooling_kernel_sizes.append(pooling_kernel_size)
                in_channels = out_channels
                input_feat_per_channel = self.conv_layers[-1].output_dim

        transformer_input_dim = self.infer_conv_output_dim(
            self.in_channels, self.input_dim
        )
        # transformer_input_dim is the output dimension of VGG part

        self.validate_transformer_config(transformer_config)
        self.transformer_context = self.parse_transformer_context(transformer_context)
        self.transformer_sampling = self.parse_transformer_sampling(
            transformer_sampling, len(transformer_config)
        )

        self.transformer_layers = nn.ModuleList()

        if transformer_input_dim != transformer_config[0][0]:
            self.transformer_layers.append(
                Linear(transformer_input_dim, transformer_config[0][0])
            )
        self.transformer_layers.append(
            TransformerEncoderLayer(
                prepare_transformer_encoder_params(*transformer_config[0])
            )
        )

        for i in range(1, len(transformer_config)):
            if transformer_config[i - 1][0] != transformer_config[i][0]:
                self.transformer_layers.append(
                    Linear(transformer_config[i - 1][0], transformer_config[i][0])
                )
            self.transformer_layers.append(
                TransformerEncoderLayer(
                    prepare_transformer_encoder_params(*transformer_config[i])
                )
            )

        self.encoder_output_dim = encoder_output_dim
        self.transformer_layers.extend(
            [
                Linear(transformer_config[-1][0], encoder_output_dim),
                LayerNorm(encoder_output_dim),
            ]
        )

    def forward(self, src_tokens, src_lengths, **kwargs):
        """
        src_tokens: padded tensor (B, T, C * feat)
        src_lengths: tensor of original lengths of input utterances (B,)
        """
        bsz, max_seq_len, _ = src_tokens.size()
        x = src_tokens.view(bsz, max_seq_len, self.in_channels, self.input_dim)
        x = x.transpose(1, 2).contiguous()
        # (B, C, T, feat)

        for layer_idx in range(len(self.conv_layers)):
            x = self.conv_layers[layer_idx](x)

        bsz, _, output_seq_len, _ = x.size()

        # (B, C, T, feat) -> (B, T, C, feat) -> (T, B, C, feat) -> (T, B, C * feat)
        x = x.transpose(1, 2).transpose(0, 1)
        x = x.contiguous().view(output_seq_len, bsz, -1)

        input_lengths = src_lengths.clone()
        for s in self.pooling_kernel_sizes:
            input_lengths = (input_lengths.float() / s).ceil().long()

        encoder_padding_mask, _ = lengths_to_encoder_padding_mask(
            input_lengths, batch_first=True
        )
        if not encoder_padding_mask.any():
            encoder_padding_mask = None

        subsampling_factor = int(max_seq_len * 1.0 / output_seq_len + 0.5)
        attn_mask = self.lengths_to_attn_mask(input_lengths, subsampling_factor)

        transformer_layer_idx = 0

        for layer_idx in range(len(self.transformer_layers)):

            if isinstance(self.transformer_layers[layer_idx], TransformerEncoderLayer):
                x = self.transformer_layers[layer_idx](
                    x, encoder_padding_mask, attn_mask
                )

                if self.transformer_sampling[transformer_layer_idx] != 1:
                    sampling_factor = self.transformer_sampling[transformer_layer_idx]
                    x, encoder_padding_mask, attn_mask = self.slice(
                        x, encoder_padding_mask, attn_mask, sampling_factor
                    )

                transformer_layer_idx += 1

            else:
                x = self.transformer_layers[layer_idx](x)

        # encoder_padding_mask is a (T x B) tensor, its [t, b] elements indicate
        # whether encoder_output[t, b] is valid or not (valid=0, invalid=1)

        return {
            "encoder_out": x,  # (T, B, C)
            "encoder_padding_mask": encoder_padding_mask.t()
            if encoder_padding_mask is not None
            else None,
            # (B, T) --> (T, B)
        }
    def infer_conv_output_dim(self, in_channels, input_dim):
        sample_seq_len = 200
        sample_bsz = 10
        x = torch.randn(sample_bsz, in_channels, sample_seq_len, input_dim)
        for i, _ in enumerate(self.conv_layers):
            x = self.conv_layers[i](x)
        x = x.transpose(1, 2)
        mb, seq = x.size()[:2]
        return x.contiguous().view(mb, seq, -1).size(-1)

    def validate_transformer_config(self, transformer_config):
        for config in transformer_config:
            input_dim, num_heads = config[:2]
            if input_dim % num_heads != 0:
                msg = (
                    "ERROR in transformer config {}: ".format(config)
                    + "input dimension {} ".format(input_dim)
                    + "not dividable by number of heads {}".format(num_heads)
                )
                raise ValueError(msg)

    def parse_transformer_context(self, transformer_context):
        """
        transformer_context can be the following:
        -  None; indicates no context is used, i.e.,
           transformer can access full context
        -  a tuple/list of two int; indicates left and right context,
           any number <0 indicates infinite context
                * e.g., (5, 6) indicates that for query at x_t, transformer can
                  access [t-5, t+6] (inclusive)
                * e.g., (-1, 6) indicates that for query at x_t, transformer can
                  access [0, t+6] (inclusive)
        """
        if transformer_context is None:
            return None

        if not isinstance(transformer_context, Iterable):
            raise ValueError("transformer context must be Iterable if it is not None")

        if len(transformer_context) != 2:
            raise ValueError("transformer context must have length 2")

        left_context = transformer_context[0]
        if left_context < 0:
            left_context = None

        right_context = transformer_context[1]
        if right_context < 0:
            right_context = None

        if left_context is None and right_context is None:
            return None

        return (left_context, right_context)

    def parse_transformer_sampling(self, transformer_sampling, num_layers):
        """
        parsing transformer sampling configuration

        Args:
            - transformer_sampling, accepted input:
                * None, indicating no sampling
                * an Iterable with int (>0) as element
            - num_layers, expected number of transformer layers, must match with
              the length of transformer_sampling if it is not None

        Returns:
            - A tuple with length num_layers
        """
        if transformer_sampling is None:
            return (1,) * num_layers

        if not isinstance(transformer_sampling, Iterable):
            raise ValueError(
                "transformer_sampling must be an iterable if it is not None"
            )

        if len(transformer_sampling) != num_layers:
            raise ValueError(
                "transformer_sampling {} does not match with the number "
                "of layers {}".format(transformer_sampling, num_layers)
            )

        for layer, value in enumerate(transformer_sampling):
            if not isinstance(value, int):
                raise ValueError("Invalid value in transformer_sampling: ")
            if value < 1:
                raise ValueError(
                    "{} layer's subsampling is {}.".format(layer, value)
                    + " This is not allowed! "
                )
        return transformer_sampling

    def slice(self, embedding, padding_mask, attn_mask, sampling_factor):
        """
        embedding is a (T, B, D) tensor
        padding_mask is a (B, T) tensor or None
        attn_mask is a (T, T) tensor or None
        """
        embedding = embedding[::sampling_factor, :, :]
        if padding_mask is not None:
            padding_mask = padding_mask[:, ::sampling_factor]
        if attn_mask is not None:
            attn_mask = attn_mask[::sampling_factor, ::sampling_factor]

        return embedding, padding_mask, attn_mask

    def lengths_to_attn_mask(self, input_lengths, subsampling_factor=1):
        """
        create attention mask according to sequence lengths and transformer
        context

        Args:
            - input_lengths: (B, )-shape Int/Long tensor; input_lengths[b] is
              the length of b-th sequence
            - subsampling_factor: int
                * Note that the left_context and right_context is specified in
                  the input frame-level while input to transformer may already
                  go through subsampling (e.g., the use of striding in vggblock)
                  we use subsampling_factor to scale the left/right context

        Return:
            - a (T, T) binary tensor or None, where T is max(input_lengths)
                * if self.transformer_context is None, None
                * if left_context is None,
                    * attn_mask[t, t + right_context + 1:] = 1
                    * others = 0
                * if right_context is None,
                    * attn_mask[t, 0:t - left_context] = 1
                    * others = 0
                * elsif
                    * attn_mask[t, t - left_context: t + right_context + 1] = 0
                    * others = 1
        """
        if self.transformer_context is None:
            return None

        maxT = torch.max(input_lengths).item()
        attn_mask = torch.zeros(maxT, maxT)

        left_context = self.transformer_context[0]
        right_context = self.transformer_context[1]
        if left_context is not None:
            left_context = math.ceil(self.transformer_context[0] / subsampling_factor)
        if right_context is not None:
            right_context = math.ceil(self.transformer_context[1] / subsampling_factor)

        for t in range(maxT):
            if left_context is not None:
                st = 0
                en = max(st, t - left_context)
                attn_mask[t, st:en] = 1
            if right_context is not None:
                st = t + right_context + 1
                st = min(st, maxT - 1)
                attn_mask[t, st:] = 1

        return attn_mask.to(input_lengths.device)

    def reorder_encoder_out(self, encoder_out, new_order):
        encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select(
            1, new_order
        )
        if encoder_out["encoder_padding_mask"] is not None:
            encoder_out["encoder_padding_mask"] = encoder_out[
                "encoder_padding_mask"
            ].index_select(1, new_order)
        return encoder_out
class TransformerDecoder(FairseqIncrementalDecoder):
    """
    Transformer decoder consisting of *args.decoder_layers* layers. Each layer
    is a :class:`TransformerDecoderLayer`.

    Args:
        args (argparse.Namespace): parsed command-line arguments
        dictionary (~fairseq.data.Dictionary): decoding dictionary
        embed_tokens (torch.nn.Embedding): output embedding
        no_encoder_attn (bool, optional): whether to attend to encoder outputs.
            Default: ``False``
        left_pad (bool, optional): whether the input is left-padded. Default:
            ``False``
    """

    def __init__(
        self,
        dictionary,
        embed_dim=512,
        transformer_config=DEFAULT_ENC_TRANSFORMER_CONFIG,
        conv_config=DEFAULT_DEC_CONV_CONFIG,
        encoder_output_dim=512,
    ):
        super().__init__(dictionary)
        vocab_size = len(dictionary)
        self.padding_idx = dictionary.pad()
        self.embed_tokens = Embedding(vocab_size, embed_dim, self.padding_idx)

        self.conv_layers = nn.ModuleList()
        for i in range(len(conv_config)):
            out_channels, kernel_size, layer_norm = conv_config[i]
            if i == 0:
                conv_layer = LinearizedConv1d(
                    embed_dim, out_channels, kernel_size, padding=kernel_size - 1
                )
            else:
                conv_layer = LinearizedConv1d(
                    conv_config[i - 1][0],
                    out_channels,
                    kernel_size,
                    padding=kernel_size - 1,
                )
            self.conv_layers.append(conv_layer)
            if layer_norm:
                self.conv_layers.append(nn.LayerNorm(out_channels))
            self.conv_layers.append(nn.ReLU())

        self.layers = nn.ModuleList()
        if conv_config[-1][0] != transformer_config[0][0]:
            self.layers.append(Linear(conv_config[-1][0], transformer_config[0][0]))
        self.layers.append(
            TransformerDecoderLayer(
                prepare_transformer_decoder_params(*transformer_config[0])
            )
        )

        for i in range(1, len(transformer_config)):
            if transformer_config[i - 1][0] != transformer_config[i][0]:
                self.layers.append(
                    Linear(transformer_config[i - 1][0], transformer_config[i][0])
                )
            self.layers.append(
                TransformerDecoderLayer(
                    prepare_transformer_decoder_params(*transformer_config[i])
                )
            )
        self.fc_out = Linear(transformer_config[-1][0], vocab_size)

    def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None):
        """
        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for input feeding/teacher forcing
            encoder_out (Tensor, optional): output from the encoder, used for
                encoder-side attention
            incremental_state (dict): dictionary used for storing state during
                :ref:`Incremental decoding`
        Returns:
            tuple:
                - the last decoder layer's output of shape `(batch, tgt_len,
                  vocab)`
                - the last decoder layer's attention weights of shape `(batch,
                  tgt_len, src_len)`
        """
        target_padding_mask = (
            (prev_output_tokens == self.padding_idx).to(prev_output_tokens.device)
            if incremental_state is None
            else None
        )

        if incremental_state is not None:
            prev_output_tokens = prev_output_tokens[:, -1:]

        # embed tokens
        x = self.embed_tokens(prev_output_tokens)

        # B x T x C -> T x B x C
        x = self._transpose_if_training(x, incremental_state)

        for layer in self.conv_layers:
            if isinstance(layer, LinearizedConvolution):
                x = layer(x, incremental_state)
            else:
                x = layer(x)

        # B x T x C -> T x B x C
        x = self._transpose_if_inference(x, incremental_state)

        # decoder layers
        for layer in self.layers:
            if isinstance(layer, TransformerDecoderLayer):
                x, *_ = layer(
                    x,
                    (encoder_out["encoder_out"] if encoder_out is not None else None),
                    (
                        encoder_out["encoder_padding_mask"].t()
                        if encoder_out["encoder_padding_mask"] is not None
                        else None
                    ),
                    incremental_state,
                    self_attn_mask=(
                        self.buffered_future_mask(x)
                        if incremental_state is None
                        else None
                    ),
                    self_attn_padding_mask=(
                        target_padding_mask if incremental_state is None else None
                    ),
                )
            else:
                x = layer(x)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        x = self.fc_out(x)

        return x, None

    def buffered_future_mask(self, tensor):
        dim = tensor.size(0)
        if (
            not hasattr(self, "_future_mask")
            or self._future_mask is None
            or self._future_mask.device != tensor.device
        ):
            self._future_mask = torch.triu(
                utils.fill_with_neg_inf(tensor.new(dim, dim)), 1
            )
        if self._future_mask.size(0) < dim:
            self._future_mask = torch.triu(
                utils.fill_with_neg_inf(self._future_mask.resize_(dim, dim)), 1
            )
        return self._future_mask[:dim, :dim]

    def _transpose_if_training(self, x, incremental_state):
        if incremental_state is None:
            x = x.transpose(0, 1)
        return x

    def _transpose_if_inference(self, x, incremental_state):
        if incremental_state:
            x = x.transpose(0, 1)
        return x
@register_model("asr_vggtransformer_encoder")
class VGGTransformerEncoderModel(FairseqEncoderModel):
    def __init__(self, encoder):
        super().__init__(encoder)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        parser.add_argument(
            "--input-feat-per-channel",
            type=int,
            metavar="N",
            help="encoder input dimension per input channel",
        )
        parser.add_argument(
            "--vggblock-enc-config",
            type=str,
            metavar="EXPR",
            help="""
    an array of tuples each containing the configuration of one vggblock
    [(out_channels, conv_kernel_size, pooling_kernel_size,num_conv_layers), ...]
            """,
        )
        parser.add_argument(
            "--transformer-enc-config",
            type=str,
            metavar="EXPR",
            help="""
    a tuple containing the configuration of the Transformer layers
    configurations:
    [(input_dim,
      num_heads,
      ffn_dim,
      normalize_before,
      dropout,
      attention_dropout,
      relu_dropout), ]""",
        )
        parser.add_argument(
            "--enc-output-dim",
            type=int,
            metavar="N",
            help="encoder output dimension, projecting the LSTM output",
        )
        parser.add_argument(
            "--in-channels",
            type=int,
            metavar="N",
            help="number of encoder input channels",
        )
        parser.add_argument(
            "--transformer-context",
            type=str,
            metavar="EXPR",
            help="""
    either None or a tuple of two ints, indicating left/right context a
    transformer can have access to""",
        )
        parser.add_argument(
            "--transformer-sampling",
            type=str,
            metavar="EXPR",
            help="""
    either None or a tuple of ints, indicating sampling factor in each layer""",
        )

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""
        base_architecture_enconly(args)
        encoder = VGGTransformerEncoderOnly(
            vocab_size=len(task.target_dictionary),
            input_feat_per_channel=args.input_feat_per_channel,
            vggblock_config=eval(args.vggblock_enc_config),
            transformer_config=eval(args.transformer_enc_config),
            encoder_output_dim=args.enc_output_dim,
            in_channels=args.in_channels,
            transformer_context=eval(args.transformer_context),
            transformer_sampling=eval(args.transformer_sampling),
        )
        return cls(encoder)

    def get_normalized_probs(self, net_output, log_probs, sample=None):
        # net_output['encoder_out'] is a (T, B, D) tensor
        lprobs = super().get_normalized_probs(net_output, log_probs, sample)
        # lprobs is a (T, B, D) tensor
        # we need to transpose to get (B, T, D) tensor
        lprobs = lprobs.transpose(0, 1).contiguous()
        lprobs.batch_first = True
        return lprobs
class VGGTransformerEncoderOnly(VGGTransformerEncoder):
    def __init__(
        self,
        vocab_size,
        input_feat_per_channel,
        vggblock_config=DEFAULT_ENC_VGGBLOCK_CONFIG,
        transformer_config=DEFAULT_ENC_TRANSFORMER_CONFIG,
        encoder_output_dim=512,
        in_channels=1,
        transformer_context=None,
        transformer_sampling=None,
    ):
        super().__init__(
            input_feat_per_channel=input_feat_per_channel,
            vggblock_config=vggblock_config,
            transformer_config=transformer_config,
            encoder_output_dim=encoder_output_dim,
            in_channels=in_channels,
            transformer_context=transformer_context,
            transformer_sampling=transformer_sampling,
        )
        self.fc_out = Linear(self.encoder_output_dim, vocab_size)

    def forward(self, src_tokens, src_lengths, **kwargs):
        """
        src_tokens: padded tensor (B, T, C * feat)
        src_lengths: tensor of original lengths of input utterances (B,)
        """

        enc_out = super().forward(src_tokens, src_lengths)
        x = self.fc_out(enc_out["encoder_out"])
        # x = F.log_softmax(x, dim=-1)
        # Note: no need for this line, because model.get_normalized_prob will
        # call log_softmax
        return {
            "encoder_out": x,  # (T, B, C)
            "encoder_padding_mask": enc_out["encoder_padding_mask"],  # (T, B)
        }

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        return (1e6, 1e6)  # an arbitrary large number
def Embedding(num_embeddings, embedding_dim, padding_idx):
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    # nn.init.uniform_(m.weight, -0.1, 0.1)
    # nn.init.constant_(m.weight[padding_idx], 0)
    return m


def Linear(in_features, out_features, bias=True, dropout=0):
    """Linear layer (input: N x T x C)"""
    m = nn.Linear(in_features, out_features, bias=bias)
    # m.weight.data.uniform_(-0.1, 0.1)
    # if bias:
    #     m.bias.data.uniform_(-0.1, 0.1)
    return m


def LinearizedConv1d(in_channels, out_channels, kernel_size, dropout=0, **kwargs):
    """Weight-normalized Conv1d layer optimized for decoding"""
    m = LinearizedConvolution(in_channels, out_channels, kernel_size, **kwargs)
    std = math.sqrt((4 * (1.0 - dropout)) / (m.kernel_size[0] * in_channels))
    nn.init.normal_(m.weight, mean=0, std=std)
    nn.init.constant_(m.bias, 0)
    return nn.utils.weight_norm(m, dim=2)


def LayerNorm(embedding_dim):
    m = nn.LayerNorm(embedding_dim)
    return m
# seq2seq models
def base_architecture(args):
    args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 40)
    args.vggblock_enc_config = getattr(
        args, "vggblock_enc_config", DEFAULT_ENC_VGGBLOCK_CONFIG
    )
    args.transformer_enc_config = getattr(
        args, "transformer_enc_config", DEFAULT_ENC_TRANSFORMER_CONFIG
    )
    args.enc_output_dim = getattr(args, "enc_output_dim", 512)
    args.in_channels = getattr(args, "in_channels", 1)
    args.tgt_embed_dim = getattr(args, "tgt_embed_dim", 128)
    args.transformer_dec_config = getattr(
        args, "transformer_dec_config", DEFAULT_ENC_TRANSFORMER_CONFIG
    )
    args.conv_dec_config = getattr(args, "conv_dec_config", DEFAULT_DEC_CONV_CONFIG)
    args.transformer_context = getattr(args, "transformer_context", "None")


@register_model_architecture("asr_vggtransformer", "vggtransformer_1")
def vggtransformer_1(args):
    args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 80)
    args.vggblock_enc_config = getattr(
        args, "vggblock_enc_config", "[(64, 3, 2, 2, True), (128, 3, 2, 2, True)]"
    )
    args.transformer_enc_config = getattr(
        args,
        "transformer_enc_config",
        "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 14",
    )
    args.enc_output_dim = getattr(args, "enc_output_dim", 1024)
    args.tgt_embed_dim = getattr(args, "tgt_embed_dim", 128)
    args.conv_dec_config = getattr(args, "conv_dec_config", "((256, 3, True),) * 4")
    args.transformer_dec_config = getattr(
        args,
        "transformer_dec_config",
        "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 4",
    )


@register_model_architecture("asr_vggtransformer", "vggtransformer_2")
def vggtransformer_2(args):
    args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 80)
    args.vggblock_enc_config = getattr(
        args, "vggblock_enc_config", "[(64, 3, 2, 2, True), (128, 3, 2, 2, True)]"
    )
    args.transformer_enc_config = getattr(
        args,
        "transformer_enc_config",
        "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 16",
    )
    args.enc_output_dim = getattr(args, "enc_output_dim", 1024)
    args.tgt_embed_dim = getattr(args, "tgt_embed_dim", 512)
    args.conv_dec_config = getattr(args, "conv_dec_config", "((256, 3, True),) * 4")
    args.transformer_dec_config = getattr(
        args,
        "transformer_dec_config",
        "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 6",
    )


@register_model_architecture("asr_vggtransformer", "vggtransformer_base")
def vggtransformer_base(args):
    args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 80)
    args.vggblock_enc_config = getattr(
        args, "vggblock_enc_config", "[(64, 3, 2, 2, True), (128, 3, 2, 2, True)]"
    )
    args.transformer_enc_config = getattr(
        args, "transformer_enc_config", "((512, 8, 2048, True, 0.15, 0.15, 0.15),) * 12"
    )
    args.enc_output_dim = getattr(args, "enc_output_dim", 512)
    args.tgt_embed_dim = getattr(args, "tgt_embed_dim", 512)
    args.conv_dec_config = getattr(args, "conv_dec_config", "((256, 3, True),) * 4")
    args.transformer_dec_config = getattr(
        args, "transformer_dec_config", "((512, 8, 2048, True, 0.15, 0.15, 0.15),) * 6"
    )
    # Size estimations:
    # Encoder:
    #   - vggblock param: 64*1*3*3 + 64*64*3*3 + 128*64*3*3 + 128*128*3 = 258K
    #   Transformer:
    #   - input dimension adapter: 2560 x 512 -> 1.31M
    #   - transformer_layers (x12) --> 37.74M
    #       * MultiheadAttention: 512*512*3 (in_proj) + 512*512 (out_proj) = 1.048M
    #       * FFN weight: 512*2048*2 = 2.097M
    #   - output dimension adapter: 512 x 512 -> 0.26 M
    # Decoder:
    #   - LinearizedConv1d: 512 * 256 * 3 + 256 * 256 * 3 * 3
    #   - transformer_layer: (x6) --> 25.16M
    #       * MultiheadAttention (self-attention): 512*512*3 + 512*512 = 1.048M
    #       * MultiheadAttention (encoder-attention): 512*512*3 + 512*512 = 1.048M
    #       * FFN: 512*2048*2 = 2.097M
    # Final FC:
    #   - FC: 512*5000 = 256K (assuming vocab size 5K)
    # In total:
    #   ~65 M


# CTC models
def base_architecture_enconly(args):
    args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 40)
    args.vggblock_enc_config = getattr(
        args, "vggblock_enc_config", "[(32, 3, 2, 2, True)] * 2"
    )
    args.transformer_enc_config = getattr(
        args, "transformer_enc_config", "((256, 4, 1024, True, 0.2, 0.2, 0.2),) * 2"
    )
    args.enc_output_dim = getattr(args, "enc_output_dim", 512)
    args.in_channels = getattr(args, "in_channels", 1)
    args.transformer_context = getattr(args, "transformer_context", "None")
    args.transformer_sampling = getattr(args, "transformer_sampling", "None")


@register_model_architecture("asr_vggtransformer_encoder", "vggtransformer_enc_1")
def vggtransformer_enc_1(args):
    # vggtransformer_1 is the same as vggtransformer_enc_big, except the number
    # of layers is increased to 16
    # keep it here for backward compatibility purposes
    args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 80)
    args.vggblock_enc_config = getattr(
        args, "vggblock_enc_config", "[(64, 3, 2, 2, True), (128, 3, 2, 2, True)]"
    )
    args.transformer_enc_config = getattr(
        args,
        "transformer_enc_config",
        "((1024, 16, 4096, True, 0.15, 0.15, 0.15),) * 16",
    )
    args.enc_output_dim = getattr(args, "enc_output_dim", 1024)
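
A minimal standalone sketch (not part of the file above) of the (left, right) context rule described in the `lengths_to_attn_mask` docstring: positions marked 1 are blocked for the query at time t, and boundary clipping follows the method above. The helper name `toy_attn_mask` is made up for illustration.

# Sketch only, assuming the same masking convention as lengths_to_attn_mask.
import torch

def toy_attn_mask(max_len, left_context=None, right_context=None):
    mask = torch.zeros(max_len, max_len)
    for t in range(max_len):
        if left_context is not None:
            mask[t, : max(0, t - left_context)] = 1              # too far in the past
        if right_context is not None:
            mask[t, min(t + right_context + 1, max_len - 1):] = 1  # too far ahead
    return mask

print(toy_attn_mask(5, left_context=1, right_context=1))
# each row t keeps roughly columns [t-1, t+1] unmasked, clipped at the edges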
examples/speech_recognition/models/w2l_conv_glu_enc.py
0 → 100644
View file @
7df61696
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from fairseq.models import (
    FairseqEncoder,
    FairseqEncoderModel,
    register_model,
    register_model_architecture,
)
from fairseq.modules.fairseq_dropout import FairseqDropout


default_conv_enc_config = """[
    (400, 13, 170, 0.2),
    (440, 14, 0, 0.214),
    (484, 15, 0, 0.22898),
    (532, 16, 0, 0.2450086),
    (584, 17, 0, 0.262159202),
    (642, 18, 0, 0.28051034614),
    (706, 19, 0, 0.30014607037),
    (776, 20, 0, 0.321156295296),
    (852, 21, 0, 0.343637235966),
    (936, 22, 0, 0.367691842484),
    (1028, 23, 0, 0.393430271458),
    (1130, 24, 0, 0.42097039046),
    (1242, 25, 0, 0.450438317792),
    (1366, 26, 0, 0.481969000038),
    (1502, 27, 0, 0.51570683004),
    (1652, 28, 0, 0.551806308143),
    (1816, 29, 0, 0.590432749713),
]"""


@register_model("asr_w2l_conv_glu_encoder")
class W2lConvGluEncoderModel(FairseqEncoderModel):
    def __init__(self, encoder):
        super().__init__(encoder)

    @staticmethod
    def add_args(parser):
        """Add model-specific arguments to the parser."""
        parser.add_argument(
            "--input-feat-per-channel",
            type=int,
            metavar="N",
            help="encoder input dimension per input channel",
        )
        parser.add_argument(
            "--in-channels",
            type=int,
            metavar="N",
            help="number of encoder input channels",
        )
        parser.add_argument(
            "--conv-enc-config",
            type=str,
            metavar="EXPR",
            help="""
    an array of tuples each containing the configuration of one conv layer
    [(out_channels, kernel_size, padding, dropout), ...]
            """,
        )

    @classmethod
    def build_model(cls, args, task):
        """Build a new model instance."""
        conv_enc_config = getattr(args, "conv_enc_config", default_conv_enc_config)
        encoder = W2lConvGluEncoder(
            vocab_size=len(task.target_dictionary),
            input_feat_per_channel=args.input_feat_per_channel,
            in_channels=args.in_channels,
            conv_enc_config=eval(conv_enc_config),
        )
        return cls(encoder)

    def get_normalized_probs(self, net_output, log_probs, sample=None):
        lprobs = super().get_normalized_probs(net_output, log_probs, sample)
        lprobs.batch_first = False
        return lprobs
class W2lConvGluEncoder(FairseqEncoder):
    def __init__(
        self, vocab_size, input_feat_per_channel, in_channels, conv_enc_config
    ):
        super().__init__(None)

        self.input_dim = input_feat_per_channel
        if in_channels != 1:
            raise ValueError("only 1 input channel is currently supported")

        self.conv_layers = nn.ModuleList()
        self.linear_layers = nn.ModuleList()
        self.dropouts = []
        cur_channels = input_feat_per_channel

        for out_channels, kernel_size, padding, dropout in conv_enc_config:
            layer = nn.Conv1d(cur_channels, out_channels, kernel_size, padding=padding)
            layer.weight.data.mul_(math.sqrt(3))  # match wav2letter init
            self.conv_layers.append(nn.utils.weight_norm(layer))
            self.dropouts.append(
                FairseqDropout(dropout, module_name=self.__class__.__name__)
            )
            if out_channels % 2 != 0:
                raise ValueError("odd # of out_channels is incompatible with GLU")
            cur_channels = out_channels // 2  # halved by GLU

        for out_channels in [2 * cur_channels, vocab_size]:
            layer = nn.Linear(cur_channels, out_channels)
            layer.weight.data.mul_(math.sqrt(3))
            self.linear_layers.append(nn.utils.weight_norm(layer))
            cur_channels = out_channels // 2

    def forward(self, src_tokens, src_lengths, **kwargs):
        """
        src_tokens: padded tensor (B, T, C * feat)
        src_lengths: tensor of original lengths of input utterances (B,)
        """
        B, T, _ = src_tokens.size()
        x = src_tokens.transpose(1, 2).contiguous()  # (B, feat, T) assuming C == 1

        for layer_idx in range(len(self.conv_layers)):
            x = self.conv_layers[layer_idx](x)
            x = F.glu(x, dim=1)
            x = self.dropouts[layer_idx](x)

        x = x.transpose(1, 2).contiguous()  # (B, T, 908)
        x = self.linear_layers[0](x)
        x = F.glu(x, dim=2)
        x = self.dropouts[-1](x)
        x = self.linear_layers[1](x)

        assert x.size(0) == B
        assert x.size(1) == T

        encoder_out = x.transpose(0, 1)  # (T, B, vocab_size)

        # need to debug this -- find a simpler/elegant way in pytorch APIs
        encoder_padding_mask = (
            torch.arange(T).view(1, T).expand(B, -1).to(x.device)
            >= src_lengths.view(B, 1).expand(-1, T)
        ).t()  # (B x T) -> (T x B)

        return {
            "encoder_out": encoder_out,  # (T, B, vocab_size)
            "encoder_padding_mask": encoder_padding_mask,  # (T, B)
        }

    def reorder_encoder_out(self, encoder_out, new_order):
        encoder_out["encoder_out"] = encoder_out["encoder_out"].index_select(
            1, new_order
        )
        encoder_out["encoder_padding_mask"] = encoder_out[
            "encoder_padding_mask"
        ].index_select(1, new_order)
        return encoder_out

    def max_positions(self):
        """Maximum input length supported by the encoder."""
        return (1e6, 1e6)  # an arbitrary large number


@register_model_architecture("asr_w2l_conv_glu_encoder", "w2l_conv_glu_enc")
def w2l_conv_glu_enc(args):
    args.input_feat_per_channel = getattr(args, "input_feat_per_channel", 80)
    args.in_channels = getattr(args, "in_channels", 1)
    args.conv_enc_config = getattr(args, "conv_enc_config", default_conv_enc_config)
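
A quick standalone check (not part of the file above) of why the encoder insists on an even `out_channels`: `F.glu` splits the channel dimension in half and gates one half with the other, so every conv output is halved before the next layer. The tensor shapes here are illustrative.

# Sketch only: F.glu halves the channel dimension.
import torch
import torch.nn.functional as F

x = torch.randn(2, 400, 50)      # (batch, channels, time), e.g. output of the first conv
y = F.glu(x, dim=1)              # y = x[:, :200] * sigmoid(x[:, 200:])
print(x.shape, "->", y.shape)    # torch.Size([2, 400, 50]) -> torch.Size([2, 200, 50])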
examples/speech_recognition/tasks/__init__.py
0 → 100644
View file @
7df61696
import importlib
import os


for file in os.listdir(os.path.dirname(__file__)):
    if file.endswith(".py") and not file.startswith("_"):
        task_name = file[: file.find(".py")]
        importlib.import_module("examples.speech_recognition.tasks." + task_name)
examples/speech_recognition/tasks/speech_recognition.py
0 → 100644
View file @
7df61696
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import json
import os
import re
import sys

import torch
from examples.speech_recognition.data import AsrDataset
from examples.speech_recognition.data.replabels import replabel_symbol
from fairseq.data import Dictionary
from fairseq.tasks import LegacyFairseqTask, register_task


def get_asr_dataset_from_json(data_json_path, tgt_dict):
    """
    Parse data json and create dataset.
    See scripts/asr_prep_json.py, which packs json from raw files.

    Json example:
    {
    "utts": {
        "4771-29403-0025": {
            "input": {
                "length_ms": 170,
                "path": "/tmp/file1.flac"
            },
            "output": {
                "text": "HELLO \n",
                "token": "HE LLO",
                "tokenid": "4815, 861"
            }
        },
        "1564-142299-0096": {
            ...
        }
    }
    """
    if not os.path.isfile(data_json_path):
        raise FileNotFoundError("Dataset not found: {}".format(data_json_path))
    with open(data_json_path, "rb") as f:
        data_samples = json.load(f)["utts"]
        assert len(data_samples) != 0
        sorted_samples = sorted(
            data_samples.items(),
            key=lambda sample: int(sample[1]["input"]["length_ms"]),
            reverse=True,
        )
        aud_paths = [s[1]["input"]["path"] for s in sorted_samples]
        ids = [s[0] for s in sorted_samples]
        speakers = []
        for s in sorted_samples:
            m = re.search("(.+?)-(.+?)-(.+?)", s[0])
            speakers.append(m.group(1) + "_" + m.group(2))
        frame_sizes = [s[1]["input"]["length_ms"] for s in sorted_samples]
        tgt = [
            [int(i) for i in s[1]["output"]["tokenid"].split(", ")]
            for s in sorted_samples
        ]
        # append eos
        tgt = [[*t, tgt_dict.eos()] for t in tgt]
        return AsrDataset(aud_paths, frame_sizes, tgt, tgt_dict, ids, speakers)
@register_task("speech_recognition")
class SpeechRecognitionTask(LegacyFairseqTask):
    """
    Task for training speech recognition model.
    """

    @staticmethod
    def add_args(parser):
        """Add task-specific arguments to the parser."""
        parser.add_argument("data", help="path to data directory")
        parser.add_argument(
            "--silence-token", default="\u2581", help="token for silence (used by w2l)"
        )
        parser.add_argument(
            "--max-source-positions",
            default=sys.maxsize,
            type=int,
            metavar="N",
            help="max number of frames in the source sequence",
        )
        parser.add_argument(
            "--max-target-positions",
            default=1024,
            type=int,
            metavar="N",
            help="max number of tokens in the target sequence",
        )

    def __init__(self, args, tgt_dict):
        super().__init__(args)
        self.tgt_dict = tgt_dict

    @classmethod
    def setup_task(cls, args, **kwargs):
        """Setup the task (e.g., load dictionaries)."""
        dict_path = os.path.join(args.data, "dict.txt")
        if not os.path.isfile(dict_path):
            raise FileNotFoundError("Dict not found: {}".format(dict_path))
        tgt_dict = Dictionary.load(dict_path)

        if args.criterion == "ctc_loss":
            tgt_dict.add_symbol("<ctc_blank>")
        elif args.criterion == "asg_loss":
            for i in range(1, args.max_replabel + 1):
                tgt_dict.add_symbol(replabel_symbol(i))

        print("| dictionary: {} types".format(len(tgt_dict)))
        return cls(args, tgt_dict)

    def load_dataset(self, split, combine=False, **kwargs):
        """Load a given dataset split.

        Args:
            split (str): name of the split (e.g., train, valid, test)
        """
        data_json_path = os.path.join(self.args.data, "{}.json".format(split))
        self.datasets[split] = get_asr_dataset_from_json(data_json_path, self.tgt_dict)

    def build_generator(self, models, args, **unused):
        w2l_decoder = getattr(args, "w2l_decoder", None)
        if w2l_decoder == "viterbi":
            from examples.speech_recognition.w2l_decoder import W2lViterbiDecoder

            return W2lViterbiDecoder(args, self.target_dictionary)
        elif w2l_decoder == "kenlm":
            from examples.speech_recognition.w2l_decoder import W2lKenLMDecoder

            return W2lKenLMDecoder(args, self.target_dictionary)
        elif w2l_decoder == "fairseqlm":
            from examples.speech_recognition.w2l_decoder import W2lFairseqLMDecoder

            return W2lFairseqLMDecoder(args, self.target_dictionary)
        else:
            return super().build_generator(models, args)

    @property
    def target_dictionary(self):
        """Return the :class:`~fairseq.data.Dictionary` for the language
        model."""
        return self.tgt_dict

    @property
    def source_dictionary(self):
        """Return the source :class:`~fairseq.data.Dictionary` (if applicable
        for this task)."""
        return None

    def max_positions(self):
        """Return the max speech and sentence length allowed by the task."""
        return (self.args.max_source_positions, self.args.max_target_positions)
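
As a rough, hedged sketch of the input this task loads: `load_dataset` expects a `{split}.json` in the layout shown in the `get_asr_dataset_from_json` docstring (the real files are produced by scripts/asr_prep_json.py). The utterance id, paths and token ids below are made up; note the id must contain two dashes so the speaker regex above can parse it, and `tokenid` is split on ", ".

# Hypothetical example of writing a train.json in the expected "utts" layout.
import json

manifest = {
    "utts": {
        "spk1-chapter1-0001": {
            "input": {"length_ms": 1700, "path": "/data/audio/file1.flac"},
            "output": {
                "text": "HELLO WORLD",
                "token": "HE LLO WOR LD",
                "tokenid": "4815, 861, 920, 77",
            },
        }
    }
}
with open("train.json", "w") as f:
    json.dump(manifest, f, indent=2)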
examples/speech_recognition/utils/wer_utils.py
0 → 100644
View file @
7df61696
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from __future__ import absolute_import, division, print_function, unicode_literals

import re
from collections import deque
from enum import Enum

import numpy as np


"""
    Utility modules for computation of Word Error Rate,
    Alignments, as well as more granular metrics like
    deletion, insertion and substitutions.
"""


class Code(Enum):
    match = 1
    substitution = 2
    insertion = 3
    deletion = 4


class Token(object):
    def __init__(self, lbl="", st=np.nan, en=np.nan):
        if np.isnan(st):
            self.label, self.start, self.end = "", 0.0, 0.0
        else:
            self.label, self.start, self.end = lbl, st, en


class AlignmentResult(object):
    def __init__(self, refs, hyps, codes, score):
        self.refs = refs  # std::deque<int>
        self.hyps = hyps  # std::deque<int>
        self.codes = codes  # std::deque<Code>
        self.score = score  # float


def coordinate_to_offset(row, col, ncols):
    return int(row * ncols + col)


def offset_to_row(offset, ncols):
    return int(offset / ncols)


def offset_to_col(offset, ncols):
    return int(offset % ncols)


def trimWhitespace(str):
    return re.sub(" +", " ", re.sub(" *$", "", re.sub("^ *", "", str)))


def str2toks(str):
    pieces = trimWhitespace(str).split(" ")
    toks = []
    for p in pieces:
        toks.append(Token(p, 0.0, 0.0))
    return toks
class EditDistance(object):
    def __init__(self, time_mediated):
        self.time_mediated_ = time_mediated
        self.scores_ = np.nan  # Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>
        self.backtraces_ = (
            np.nan
        )  # Eigen::Matrix<size_t, Eigen::Dynamic, Eigen::Dynamic> backtraces_;
        self.confusion_pairs_ = {}

    def cost(self, ref, hyp, code):
        if self.time_mediated_:
            if code == Code.match:
                return abs(ref.start - hyp.start) + abs(ref.end - hyp.end)
            elif code == Code.insertion:
                return hyp.end - hyp.start
            elif code == Code.deletion:
                return ref.end - ref.start
            else:  # substitution
                return abs(ref.start - hyp.start) + abs(ref.end - hyp.end) + 0.1
        else:
            if code == Code.match:
                return 0
            elif code == Code.insertion or code == Code.deletion:
                return 3
            else:  # substitution
                return 4

    def get_result(self, refs, hyps):
        res = AlignmentResult(refs=deque(), hyps=deque(), codes=deque(), score=np.nan)

        num_rows, num_cols = self.scores_.shape
        res.score = self.scores_[num_rows - 1, num_cols - 1]

        curr_offset = coordinate_to_offset(num_rows - 1, num_cols - 1, num_cols)

        while curr_offset != 0:
            curr_row = offset_to_row(curr_offset, num_cols)
            curr_col = offset_to_col(curr_offset, num_cols)

            prev_offset = self.backtraces_[curr_row, curr_col]

            prev_row = offset_to_row(prev_offset, num_cols)
            prev_col = offset_to_col(prev_offset, num_cols)

            res.refs.appendleft(curr_row - 1)  # Note: this was .push_front() in C++
            res.hyps.appendleft(curr_col - 1)
            if curr_row - 1 == prev_row and curr_col == prev_col:
                res.codes.appendleft(Code.deletion)
            elif curr_row == prev_row and curr_col - 1 == prev_col:
                res.codes.appendleft(Code.insertion)
            else:
                # assert(curr_row - 1 == prev_row and curr_col - 1 == prev_col)
                ref_str = refs[res.refs[0]].label
                hyp_str = hyps[res.hyps[0]].label

                if ref_str == hyp_str:
                    res.codes.appendleft(Code.match)
                else:
                    res.codes.appendleft(Code.substitution)

                    confusion_pair = "%s -> %s" % (ref_str, hyp_str)
                    if confusion_pair not in self.confusion_pairs_:
                        self.confusion_pairs_[confusion_pair] = 1
                    else:
                        self.confusion_pairs_[confusion_pair] += 1

            curr_offset = prev_offset

        return res

    def align(self, refs, hyps):
        if len(refs) == 0 and len(hyps) == 0:
            return np.nan

        # NOTE: we're not resetting the values in these matrices because every value
        # will be overridden in the loop below. If this assumption doesn't hold,
        # be sure to set all entries in self.scores_ and self.backtraces_ to 0.
        self.scores_ = np.zeros((len(refs) + 1, len(hyps) + 1))
        self.backtraces_ = np.zeros((len(refs) + 1, len(hyps) + 1))

        num_rows, num_cols = self.scores_.shape

        for i in range(num_rows):
            for j in range(num_cols):
                if i == 0 and j == 0:
                    self.scores_[i, j] = 0.0
                    self.backtraces_[i, j] = 0
                    continue

                if i == 0:
                    self.scores_[i, j] = self.scores_[i, j - 1] + self.cost(
                        None, hyps[j - 1], Code.insertion
                    )
                    self.backtraces_[i, j] = coordinate_to_offset(i, j - 1, num_cols)
                    continue

                if j == 0:
                    self.scores_[i, j] = self.scores_[i - 1, j] + self.cost(
                        refs[i - 1], None, Code.deletion
                    )
                    self.backtraces_[i, j] = coordinate_to_offset(i - 1, j, num_cols)
                    continue

                # Below here both i and j are greater than 0
                ref = refs[i - 1]
                hyp = hyps[j - 1]
                best_score = self.scores_[i - 1, j - 1] + (
                    self.cost(ref, hyp, Code.match)
                    if (ref.label == hyp.label)
                    else self.cost(ref, hyp, Code.substitution)
                )

                prev_row = i - 1
                prev_col = j - 1
                ins = self.scores_[i, j - 1] + self.cost(None, hyp, Code.insertion)
                if ins < best_score:
                    best_score = ins
                    prev_row = i
                    prev_col = j - 1

                delt = self.scores_[i - 1, j] + self.cost(ref, None, Code.deletion)
                if delt < best_score:
                    best_score = delt
                    prev_row = i - 1
                    prev_col = j

                self.scores_[i, j] = best_score
                self.backtraces_[i, j] = coordinate_to_offset(
                    prev_row, prev_col, num_cols
                )

        return self.get_result(refs, hyps)
class WERTransformer(object):
    def __init__(self, hyp_str, ref_str, verbose=True):
        self.ed_ = EditDistance(False)
        self.id2oracle_errs_ = {}
        self.utts_ = 0
        self.words_ = 0
        self.insertions_ = 0
        self.deletions_ = 0
        self.substitutions_ = 0

        self.process(["dummy_str", hyp_str, ref_str])

        if verbose:
            print("'%s' vs '%s'" % (hyp_str, ref_str))
            self.report_result()

    def process(self, input):  # std::vector<std::string>&& input
        if len(input) < 3:
            print(
                "Input must be of the form <id> ... <hypo> <ref> , got ",
                len(input),
                " inputs:",
            )
            return None

        # Align
        # std::vector<Token> hyps;
        # std::vector<Token> refs;
        hyps = str2toks(input[-2])
        refs = str2toks(input[-1])

        alignment = self.ed_.align(refs, hyps)
        if alignment is None:
            print("Alignment is null")
            return np.nan

        # Tally errors
        ins = 0
        dels = 0
        subs = 0
        for code in alignment.codes:
            if code == Code.substitution:
                subs += 1
            elif code == Code.insertion:
                ins += 1
            elif code == Code.deletion:
                dels += 1

        # Output
        row = input
        row.append(str(len(refs)))
        row.append(str(ins))
        row.append(str(dels))
        row.append(str(subs))
        # print(row)

        # Accumulate
        kIdIndex = 0
        kNBestSep = "/"

        pieces = input[kIdIndex].split(kNBestSep)

        if len(pieces) == 0:
            print(
                "Error splitting ",
                input[kIdIndex],
                " on '",
                kNBestSep,
                "', got empty list",
            )
            return np.nan

        id = pieces[0]
        if id not in self.id2oracle_errs_:
            self.utts_ += 1
            self.words_ += len(refs)
            self.insertions_ += ins
            self.deletions_ += dels
            self.substitutions_ += subs
            self.id2oracle_errs_[id] = [ins, dels, subs]
        else:
            curr_err = ins + dels + subs
            prev_err = np.sum(self.id2oracle_errs_[id])
            if curr_err < prev_err:
                self.id2oracle_errs_[id] = [ins, dels, subs]

        return 0

    def report_result(self):
        # print("---------- Summary ---------------")
        if self.words_ == 0:
            print("No words counted")
            return

        # 1-best
        best_wer = (
            100.0
            * (self.insertions_ + self.deletions_ + self.substitutions_)
            / self.words_
        )

        print(
            "\tWER = %0.2f%% (%i utts, %i words, %0.2f%% ins, "
            "%0.2f%% dels, %0.2f%% subs)"
            % (
                best_wer,
                self.utts_,
                self.words_,
                100.0 * self.insertions_ / self.words_,
                100.0 * self.deletions_ / self.words_,
                100.0 * self.substitutions_ / self.words_,
            )
        )

    def wer(self):
        if self.words_ == 0:
            wer = np.nan
        else:
            wer = (
                100.0
                * (self.insertions_ + self.deletions_ + self.substitutions_)
                / self.words_
            )
        return wer

    def stats(self):
        if self.words_ == 0:
            stats = {}
        else:
            wer = (
                100.0
                * (self.insertions_ + self.deletions_ + self.substitutions_)
                / self.words_
            )
            stats = dict(
                {
                    "wer": wer,
                    "utts": self.utts_,
                    "numwords": self.words_,
                    "ins": self.insertions_,
                    "dels": self.deletions_,
                    "subs": self.substitutions_,
                    "confusion_pairs": self.ed_.confusion_pairs_,
                }
            )
        return stats


def calc_wer(hyp_str, ref_str):
    t = WERTransformer(hyp_str, ref_str, verbose=0)
    return t.wer()


def calc_wer_stats(hyp_str, ref_str):
    t = WERTransformer(hyp_str, ref_str, verbose=0)
    return t.stats()


def get_wer_alignment_codes(hyp_str, ref_str):
    """
    INPUT: hypothesis string, reference string
    OUTPUT: List of alignment codes (intermediate results from WER computation)
    """
    t = WERTransformer(hyp_str, ref_str, verbose=0)
    return t.ed_.align(str2toks(ref_str), str2toks(hyp_str)).codes


def merge_counts(x, y):
    # Merge two hashes which have 'counts' as their values
    # This can be used for example to merge confusion pair counts
    # conf_pairs = merge_counts(conf_pairs, stats['confusion_pairs'])
    for k, v in y.items():
        if k not in x:
            x[k] = 0
        x[k] += v
    return x
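
A short usage sketch of the entry points above, assuming the fairseq repository root is on PYTHONPATH; the expected numbers are worked out by hand for this toy pair (3 deletions over 6 reference words).

# Usage sketch for calc_wer / calc_wer_stats.
from examples.speech_recognition.utils.wer_utils import calc_wer, calc_wer_stats

hyp = "the cat sat"
ref = "the cat sat on the mat"
print(calc_wer(hyp, ref))                 # 50.0
stats = calc_wer_stats(hyp, ref)
print(stats["dels"], stats["numwords"])   # 3 6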
examples/speech_recognition/w2l_decoder.py
0 → 100644
View file @
7df61696
#!/usr/bin/env python3

# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

"""
Wav2letter decoders.
"""

import gc
import itertools as it
import os.path as osp
import warnings
from collections import deque, namedtuple

import numpy as np
import torch
from examples.speech_recognition.data.replabels import unpack_replabels
from fairseq import tasks
from fairseq.utils import apply_to_sample


try:
    from wav2letter.common import create_word_dict, load_words
    from wav2letter.criterion import CpuViterbiPath, get_data_ptr_as_bytes
    from wav2letter.decoder import (
        CriterionType,
        DecoderOptions,
        KenLM,
        LM,
        LMState,
        SmearingMode,
        Trie,
        LexiconDecoder,
        LexiconFreeDecoder,
    )
except:
    warnings.warn(
        "wav2letter python bindings are required to use this functionality. Please install from https://github.com/facebookresearch/wav2letter/wiki/Python-bindings"
    )
    LM = object
    LMState = object


class W2lDecoder(object):
    def __init__(self, args, tgt_dict):
        self.tgt_dict = tgt_dict
        self.vocab_size = len(tgt_dict)
        self.nbest = args.nbest

        # criterion-specific init
        if args.criterion == "ctc":
            self.criterion_type = CriterionType.CTC
            self.blank = (
                tgt_dict.index("<ctc_blank>")
                if "<ctc_blank>" in tgt_dict.indices
                else tgt_dict.bos()
            )
            self.asg_transitions = None
        elif args.criterion == "asg_loss":
            self.criterion_type = CriterionType.ASG
            self.blank = -1
            self.asg_transitions = args.asg_transitions
            self.max_replabel = args.max_replabel
            assert len(self.asg_transitions) == self.vocab_size ** 2
        else:
            raise RuntimeError(f"unknown criterion: {args.criterion}")

    def generate(self, models, sample, **unused):
        """Generate a batch of inferences."""
        # model.forward normally channels prev_output_tokens into the decoder
        # separately, but SequenceGenerator directly calls model.encoder
        encoder_input = {
            k: v for k, v in sample["net_input"].items() if k != "prev_output_tokens"
        }
        emissions = self.get_emissions(models, encoder_input)
        return self.decode(emissions)

    def get_emissions(self, models, encoder_input):
        """Run encoder and normalize emissions"""
        # encoder_out = models[0].encoder(**encoder_input)
        encoder_out = models[0](**encoder_input)
        if self.criterion_type == CriterionType.CTC:
            emissions = models[0].get_normalized_probs(encoder_out, log_probs=True)
        elif self.criterion_type == CriterionType.ASG:
            emissions = encoder_out["encoder_out"]
        return emissions.transpose(0, 1).float().cpu().contiguous()

    def get_tokens(self, idxs):
        """Normalize tokens by handling CTC blank, ASG replabels, etc."""
        idxs = (g[0] for g in it.groupby(idxs))
        if self.criterion_type == CriterionType.CTC:
            idxs = filter(lambda x: x != self.blank, idxs)
        elif self.criterion_type == CriterionType.ASG:
            idxs = filter(lambda x: x >= 0, idxs)
            idxs = unpack_replabels(list(idxs), self.tgt_dict, self.max_replabel)
        return torch.LongTensor(list(idxs))
class W2lViterbiDecoder(W2lDecoder):
    def __init__(self, args, tgt_dict):
        super().__init__(args, tgt_dict)

    def decode(self, emissions):
        B, T, N = emissions.size()
        hypos = []
        if self.asg_transitions is None:
            transitions = torch.FloatTensor(N, N).zero_()
        else:
            transitions = torch.FloatTensor(self.asg_transitions).view(N, N)
        viterbi_path = torch.IntTensor(B, T)
        workspace = torch.ByteTensor(CpuViterbiPath.get_workspace_size(B, T, N))
        CpuViterbiPath.compute(
            B,
            T,
            N,
            get_data_ptr_as_bytes(emissions),
            get_data_ptr_as_bytes(transitions),
            get_data_ptr_as_bytes(viterbi_path),
            get_data_ptr_as_bytes(workspace),
        )
        return [
            [{"tokens": self.get_tokens(viterbi_path[b].tolist()), "score": 0}]
            for b in range(B)
        ]
class W2lKenLMDecoder(W2lDecoder):
    def __init__(self, args, tgt_dict):
        super().__init__(args, tgt_dict)

        self.silence = (
            tgt_dict.index("<ctc_blank>")
            if "<ctc_blank>" in tgt_dict.indices
            else tgt_dict.bos()
        )
        self.lexicon = load_words(args.lexicon)
        self.word_dict = create_word_dict(self.lexicon)
        self.unk_word = self.word_dict.get_index("<unk>")

        self.lm = KenLM(args.kenlm_model, self.word_dict)
        self.trie = Trie(self.vocab_size, self.silence)

        start_state = self.lm.start(False)
        for i, (word, spellings) in enumerate(self.lexicon.items()):
            word_idx = self.word_dict.get_index(word)
            _, score = self.lm.score(start_state, word_idx)
            for spelling in spellings:
                spelling_idxs = [tgt_dict.index(token) for token in spelling]
                assert (
                    tgt_dict.unk() not in spelling_idxs
                ), f"{spelling} {spelling_idxs}"
                self.trie.insert(spelling_idxs, word_idx, score)
        self.trie.smear(SmearingMode.MAX)

        self.decoder_opts = DecoderOptions(
            args.beam,
            int(getattr(args, "beam_size_token", len(tgt_dict))),
            args.beam_threshold,
            args.lm_weight,
            args.word_score,
            args.unk_weight,
            args.sil_weight,
            0,
            False,
            self.criterion_type,
        )

        if self.asg_transitions is None:
            N = 768
            # self.asg_transitions = torch.FloatTensor(N, N).zero_()
            self.asg_transitions = []

        self.decoder = LexiconDecoder(
            self.decoder_opts,
            self.trie,
            self.lm,
            self.silence,
            self.blank,
            self.unk_word,
            self.asg_transitions,
            False,
        )

    def decode(self, emissions):
        B, T, N = emissions.size()
        hypos = []
        for b in range(B):
            emissions_ptr = emissions.data_ptr() + 4 * b * emissions.stride(0)
            results = self.decoder.decode(emissions_ptr, T, N)

            nbest_results = results[: self.nbest]
            hypos.append(
                [
                    {
                        "tokens": self.get_tokens(result.tokens),
                        "score": result.score,
                        "words": [
                            self.word_dict.get_entry(x) for x in result.words if x >= 0
                        ],
                    }
                    for result in nbest_results
                ]
            )
        return hypos
FairseqLMState = namedtuple("FairseqLMState", ["prefix", "incremental_state", "probs"])


class FairseqLM(LM):
    def __init__(self, dictionary, model):
        LM.__init__(self)
        self.dictionary = dictionary
        self.model = model
        self.unk = self.dictionary.unk()

        self.save_incremental = False  # this currently does not work properly
        self.max_cache = 20_000

        model.cuda()
        model.eval()
        model.make_generation_fast_()

        self.states = {}
        self.stateq = deque()

    def start(self, start_with_nothing):
        state = LMState()
        prefix = torch.LongTensor([[self.dictionary.eos()]])
        incremental_state = {} if self.save_incremental else None
        with torch.no_grad():
            res = self.model(prefix.cuda(), incremental_state=incremental_state)
            probs = self.model.get_normalized_probs(res, log_probs=True, sample=None)

        if incremental_state is not None:
            incremental_state = apply_to_sample(lambda x: x.cpu(), incremental_state)
        self.states[state] = FairseqLMState(
            prefix.numpy(), incremental_state, probs[0, -1].cpu().numpy()
        )
        self.stateq.append(state)

        return state

    def score(self, state: LMState, token_index: int, no_cache: bool = False):
        """
        Evaluate language model based on the current lm state and new word
        Parameters:
        -----------
        state: current lm state
        token_index: index of the word
                     (can be lexicon index then you should store inside LM the
                     mapping between indices of lexicon and lm, or lm index of a word)

        Returns:
        --------
        (LMState, float): pair of (new state, score for the current word)
        """
        curr_state = self.states[state]

        def trim_cache(targ_size):
            while len(self.stateq) > targ_size:
                rem_k = self.stateq.popleft()
                rem_st = self.states[rem_k]
                rem_st = FairseqLMState(rem_st.prefix, None, None)
                self.states[rem_k] = rem_st

        if curr_state.probs is None:
            new_incremental_state = (
                curr_state.incremental_state.copy()
                if curr_state.incremental_state is not None
                else None
            )
            with torch.no_grad():
                if new_incremental_state is not None:
                    new_incremental_state = apply_to_sample(
                        lambda x: x.cuda(), new_incremental_state
                    )
                elif self.save_incremental:
                    new_incremental_state = {}

                res = self.model(
                    torch.from_numpy(curr_state.prefix).cuda(),
                    incremental_state=new_incremental_state,
                )
                probs = self.model.get_normalized_probs(
                    res, log_probs=True, sample=None
                )

                if new_incremental_state is not None:
                    new_incremental_state = apply_to_sample(
                        lambda x: x.cpu(), new_incremental_state
                    )

                curr_state = FairseqLMState(
                    curr_state.prefix, new_incremental_state, probs[0, -1].cpu().numpy()
                )

            if not no_cache:
                self.states[state] = curr_state
                self.stateq.append(state)

        score = curr_state.probs[token_index].item()

        trim_cache(self.max_cache)

        outstate = state.child(token_index)
        if outstate not in self.states and not no_cache:
            prefix = np.concatenate(
                [curr_state.prefix, torch.LongTensor([[token_index]])], -1
            )
            incr_state = curr_state.incremental_state

            self.states[outstate] = FairseqLMState(prefix, incr_state, None)

        if token_index == self.unk:
            score = float("-inf")

        return outstate, score

    def finish(self, state: LMState):
        """
        Evaluate eos for language model based on the current lm state

        Returns:
        --------
        (LMState, float): pair of (new state, score for the current word)
        """
        return self.score(state, self.dictionary.eos())

    def empty_cache(self):
        self.states = {}
        self.stateq = deque()
        gc.collect()
class W2lFairseqLMDecoder(W2lDecoder):
    def __init__(self, args, tgt_dict):
        super().__init__(args, tgt_dict)

        self.silence = tgt_dict.bos()

        self.unit_lm = getattr(args, "unit_lm", False)

        self.lexicon = load_words(args.lexicon) if args.lexicon else None
        self.idx_to_wrd = {}

        checkpoint = torch.load(args.kenlm_model, map_location="cpu")
        lm_args = checkpoint["args"]
        lm_args.data = osp.dirname(args.kenlm_model)
        print(lm_args)
        task = tasks.setup_task(lm_args)
        model = task.build_model(lm_args)
        model.load_state_dict(checkpoint["model"], strict=False)

        self.trie = Trie(self.vocab_size, self.silence)

        self.word_dict = task.dictionary
        self.unk_word = self.word_dict.unk()
        self.lm = FairseqLM(self.word_dict, model)

        self.decoder_opts = DecoderOptions(
            args.beam,
            int(getattr(args, "beam_size_token", len(tgt_dict))),
            args.beam_threshold,
            args.lm_weight,
            args.word_score,
            args.unk_weight,
            args.sil_weight,
            0,
            False,
            self.criterion_type,
        )

        if self.lexicon:
            start_state = self.lm.start(False)
            for i, (word, spellings) in enumerate(self.lexicon.items()):
                if self.unit_lm:
                    word_idx = i
                    self.idx_to_wrd[i] = word
                    score = 0
                else:
                    word_idx = self.word_dict.index(word)
                    _, score = self.lm.score(start_state, word_idx, no_cache=True)

                for spelling in spellings:
                    spelling_idxs = [tgt_dict.index(token) for token in spelling]
                    assert (
                        tgt_dict.unk() not in spelling_idxs
                    ), f"{spelling} {spelling_idxs}"
                    self.trie.insert(spelling_idxs, word_idx, score)
            self.trie.smear(SmearingMode.MAX)

            self.decoder = LexiconDecoder(
                self.decoder_opts,
                self.trie,
                self.lm,
                self.silence,
                self.blank,
                self.unk_word,
                [],
                self.unit_lm,
            )
        else:
            self.decoder = LexiconFreeDecoder(
                self.decoder_opts, self.lm, self.silence, self.blank, []
            )

    def decode(self, emissions):
        B, T, N = emissions.size()
        hypos = []

        def idx_to_word(idx):
            if self.unit_lm:
                return self.idx_to_wrd[idx]
            else:
                return self.word_dict[idx]

        def make_hypo(result):
            hypo = {"tokens": self.get_tokens(result.tokens), "score": result.score}
            if self.lexicon:
                hypo["words"] = [idx_to_word(x) for x in result.words if x >= 0]
            return hypo

        for b in range(B):
            emissions_ptr = emissions.data_ptr() + 4 * b * emissions.stride(0)
            results = self.decoder.decode(emissions_ptr, T, N)

            nbest_results = results[: self.nbest]
            hypos.append([make_hypo(result) for result in nbest_results])
            self.lm.empty_cache()

        return hypos
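
A standalone illustration (not part of the file above) of the CTC post-processing performed in `W2lDecoder.get_tokens`: consecutive repeats are collapsed and the blank index is dropped. The frame indices and blank id here are made up.

# Sketch of CTC output cleanup: collapse repeats, then drop blanks.
import itertools as it

blank = 0
frame_idxs = [0, 7, 7, 0, 0, 3, 3, 3, 0, 7]
collapsed = [g[0] for g in it.groupby(frame_idxs)]   # [0, 7, 0, 3, 0, 7]
tokens = [x for x in collapsed if x != blank]        # [7, 3, 7]
print(tokens)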
examples/speech_to_text/README.md
0 → 100644
View file @
7df61696
# Speech-to-Text (S2T) Modeling

## Data Preparation
S2T modeling data consists of source speech features, target text and other optional information
(source text, speaker id, etc.). Fairseq S2T uses per-dataset-split TSV manifest files
to store this information. Each data field is represented by a column in the TSV file.

Unlike text token embeddings, speech features (e.g. log mel-filter banks) are usually fixed
during model training and can be pre-computed. The manifest file contains the path to
either the feature file in NumPy format or the WAV/FLAC audio file. For the latter,
features will be extracted on-the-fly by fairseq S2T. Optionally, feature/audio files can be packed
into uncompressed ZIP files (then accessed via byte offset and length) to improve I/O performance.

Fairseq S2T also employs a YAML file for data-related configurations: tokenizer type and dictionary path
for the target text, feature transforms such as CMVN (cepstral mean and variance normalization) and SpecAugment,
temperature-based resampling, etc.
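
As a rough illustration of this layout (the real manifests are produced by the prep scripts below, and the column names here should be treated as placeholders), one way to write a per-split TSV row from Python:

```python
# Illustrative only: writing a hypothetical per-split TSV manifest row.
import csv

row = {"id": "utt_0001", "audio": "feats/utt_0001.npy", "n_frames": 823,
       "tgt_text": "hello world", "speaker": "spk_01"}
with open("train.tsv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=list(row.keys()), delimiter="\t")
    writer.writeheader()
    writer.writerow(row)
```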
## Model Training & Evaluation
Fairseq S2T uses the unified `fairseq-train`/`fairseq-generate` interface for model training and evaluation.
It requires arguments `--task speech_to_text` and `--arch <arch in fairseq.models.speech_to_text.*>`.
## Example 1: Speech Recognition (ASR) on LibriSpeech
#### Data preparation
Download and preprocess LibriSpeech data with
```
bash
python examples/speech_to_text/prep_librispeech_data.py
\
--output-root
${
LS_ROOT
}
--vocab-type
unigram
--vocab-size
10000
```
where
`LS_ROOT`
is the root path for downloaded data as well as generated manifest and feature files.
#### Training
```
bash
fairseq-train
${
LS_ROOT
}
--train-subset
train
--valid-subset
dev
--save-dir
${
SAVE_DIR
}
--num-workers
4
\
--max-tokens
40000
--task
speech_to_text
--criterion
label_smoothed_cross_entropy
--max-update
300000
\
--arch
s2t_transformer_s
--optimizer
adam
--lr
2e-3
--lr-scheduler
inverse_sqrt
--warmup-updates
10000
\
--clip-norm
10.0
--seed
1
--update-freq
8
```
where
`SAVE_DIR`
is the checkpoint root path. Here we use
`--arch s2t_transformer_s`
(31M parameters) as example.
You may switch to
`s2t_transformer_m`
(71M) or
`s2t_transformer_l`
(268M) for better performance. We set
`--update-freq 8`
to simulate 8 GPUs with 1 GPU. You may want to update it accordingly when using more than 1 GPU.
#### Inference & Evaluation
Average the last 10 checkpoints and evaluate on the 4 splits (`dev-clean`, `dev-other`, `test-clean` and `test-other`):
```bash
CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt
python scripts/average_checkpoints.py \
  --inputs ${SAVE_DIR} --num-epoch-checkpoints 10 \
  --output "${SAVE_DIR}/${CHECKPOINT_FILENAME}"
for SUBSET in dev-clean dev-other test-clean test-other; do
  fairseq-generate ${LS_ROOT} --gen-subset ${SUBSET} --task speech_to_text \
    --path ${SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 --scoring wer
done
```
#### Result
| --arch | Params | dev-clean | dev-other | test-clean | test-other |
|---|---|---|---|---|---|
| s2t_transformer_s | 30M | 4.1 | 9.3 | 4.4 | 9.2 |
| s2t_transformer_sp | 35M | 3.9 | 9.3 | 4.3 | 8.8 |
| s2t_transformer_m | 71M | 3.5 | 8.1 | 3.7 | 8.1 |
| s2t_transformer_mp | 84M | 3.3 | 7.8 | 3.7 | 8.2 |
| s2t_transformer_l | 268M | 3.3 | 7.7 | 3.5 | 7.8 |
| s2t_transformer_lp | 318M | 3.1 | 7.5 | 3.4 | 7.6 |
## Example 2: Speech Translation (ST) on MuST-C
#### Data Preparation
[Download](https://ict.fbk.eu/must-c) and unpack MuST-C data to a path `MUSTC_ROOT`, then preprocess it with
```bash
python examples/speech_to_text/prep_mustc_data.py --data-root ${MUSTC_ROOT} \
  --asr-vocab-type unigram --asr-vocab-size 5000 \
  --st-vocab-type unigram --st-vocab-size 8000
```
The generated manifest and feature files will be available under `MUSTC_ROOT`.
#### ASR
###### Training
```bash
fairseq-train ${MUSTC_ROOT} --train-subset train_asr --valid-subset dev_asr --save-dir ${ASR_SAVE_DIR} \
  --num-workers 4 --max-tokens 40000 --task speech_to_text --criterion label_smoothed_cross_entropy \
  --report-accuracy --max-update 100000 --arch s2t_transformer_s --optimizer adam --lr 1e-3 \
  --lr-scheduler inverse_sqrt --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8
```
where `ASR_SAVE_DIR` is the checkpoint root path. We set `--update-freq 8` to simulate 8 GPUs with 1 GPU.
You may want to adjust it accordingly when using more than 1 GPU.
###### Inference & Evaluation
```bash
CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt
python scripts/average_checkpoints.py \
  --inputs ${ASR_SAVE_DIR} --num-epoch-checkpoints 10 \
  --output "${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}"
fairseq-generate ${MUSTC_ROOT} --gen-subset tst-COMMON_asr --task speech_to_text \
  --path ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 \
  --scoring wer --wer-tokenizer 13a --wer-lowercase --wer-remove-punct
```
###### Result
| --arch | Params | En-De | En-Nl | En-Es | En-Fr | En-It | En-Pt | En-Ro | En-Ru |
|---|---|---|---|---|---|---|---|---|---|
| s2t_transformer_s | 31M | 18.2 | 17.6 | 17.7 | 17.2 | 17.9 | 19.1 | 18.1 | 17.7 |
#### ST
###### Training
```bash
fairseq-train ${MUSTC_ROOT} --train-subset train_st --valid-subset dev_st --save-dir ${ST_SAVE_DIR} \
  --num-workers 4 --max-tokens 40000 --task speech_to_text --criterion label_smoothed_cross_entropy \
  --report-accuracy --max-update 100000 --arch s2t_transformer_s --optimizer adam --lr 2e-3 \
  --lr-scheduler inverse_sqrt --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 \
  --load-pretrained-encoder-from ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}
```
where `ST_SAVE_DIR` is the checkpoint root path. The ST encoder is pre-trained by ASR for faster training and better
performance: `--load-pretrained-encoder-from <ASR checkpoint path>`. We set `--update-freq 8` to simulate 8 GPUs with 1 GPU.
You may want to adjust it accordingly when using more than 1 GPU.
###### Inference & Evaluation
Average the last 10 checkpoints and evaluate on the `tst-COMMON` split:
```bash
CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt
python scripts/average_checkpoints.py \
  --inputs ${ST_SAVE_DIR} --num-epoch-checkpoints 10 \
  --output "${ST_SAVE_DIR}/${CHECKPOINT_FILENAME}"
fairseq-generate ${MUSTC_ROOT} --gen-subset tst-COMMON_st --task speech_to_text \
  --path ${ST_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 --scoring sacrebleu
```
###### Result
| --arch | Params | En-De | En-Nl | En-Es | En-Fr | En-It | En-Pt | En-Ro | En-Ru |
|---|---|---|---|---|---|---|---|---|---|
| s2t_transformer_s | 31M | 22.7 | 27.3 | 27.2 | 32.9 | 22.7 | 28.1 | 21.9 | 15.3 |
## Example 3: ST on CoVoST
#### Data Preparation
Download and preprocess CoVoST data with
```bash
# En ASR
python examples/speech_to_text/prep_covost_data.py --data-root ${COVOST_ROOT} \
  --vocab-type char --src-lang en
# ST
python examples/speech_to_text/prep_covost_data.py --data-root ${COVOST_ROOT} \
  --vocab-type char --src-lang fr --tgt-lang en
```
where `COVOST_ROOT` is the root path for the downloaded data as well as the generated manifest and feature files.
#### ASR
###### Training
```bash
fairseq-train ${COVOST_ROOT} --train-subset train_asr --valid-subset dev_asr --save-dir ${ASR_SAVE_DIR} \
  --num-workers 4 --max-tokens 40000 --task speech_to_text --criterion label_smoothed_cross_entropy \
  --report-accuracy --max-update 100000 --arch s2t_transformer_s --optimizer adam --lr 1e-3 \
  --lr-scheduler inverse_sqrt --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8
```
where `ASR_SAVE_DIR` is the checkpoint root path. We set `--update-freq 8` to simulate 8 GPUs with 1 GPU.
You may want to adjust it accordingly when using more than 1 GPU.
###### Inference & Evaluation
```bash
CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt
python scripts/average_checkpoints.py \
  --inputs ${ASR_SAVE_DIR} --num-epoch-checkpoints 10 \
  --output "${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}"
fairseq-generate ${COVOST_ROOT} --gen-subset test_asr_en --task speech_to_text \
  --path ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 \
  --scoring wer --wer-tokenizer 13a --wer-lowercase --wer-remove-punct
```
###### Result
| --arch | Params | En |
|---|---|---|
| s2t_transformer_s | 31M | 25.6 |
#### ST
###### Training
```bash
fairseq-train ${COVOST_ROOT} --train-subset train_st_fr_en --valid-subset dev_st_fr_en --save-dir ${ST_SAVE_DIR} \
  --num-workers 4 --max-tokens 40000 --task speech_to_text --criterion label_smoothed_cross_entropy \
  --report-accuracy --max-update 100000 --arch s2t_transformer_s --optimizer adam --lr 2e-3 \
  --lr-scheduler inverse_sqrt --warmup-updates 10000 --clip-norm 10.0 --seed 1 --update-freq 8 \
  --load-pretrained-encoder-from ${ASR_SAVE_DIR}/${CHECKPOINT_FILENAME}
```
where `ST_SAVE_DIR` is the checkpoint root path. The ST encoder is pre-trained by En ASR for faster training and better
performance: `--load-pretrained-encoder-from <ASR checkpoint path>`. We set `--update-freq 8` to simulate 8 GPUs with 1 GPU.
You may want to adjust it accordingly when using more than 1 GPU.
###### Inference & Evaluation
Average the last 10 checkpoints and evaluate on the test split:
```bash
CHECKPOINT_FILENAME=avg_last_10_checkpoint.pt
python scripts/average_checkpoints.py \
  --inputs ${ST_SAVE_DIR} --num-epoch-checkpoints 10 \
  --output "${ST_SAVE_DIR}/${CHECKPOINT_FILENAME}"
fairseq-generate ${COVOST_ROOT} --gen-subset test_st_fr_en --task speech_to_text \
  --path ${ST_SAVE_DIR}/${CHECKPOINT_FILENAME} --max-tokens 50000 --beam 5 --scoring sacrebleu
```
###### Result
| --arch | Params | Fr-En | De-En | Es-En | Ca-En | En-De | En-Ca | En-Fa | En-Et |
|---|---|---|---|---|---|---|---|---|---|
| s2t_transformer_s | 31M | 26.3 | 17.1 | 23.0 | 18.8 | 16.3 | 21.8 | 13.1 | 13.2 |
## Citation
Please cite as:
```
@inproceedings{wang2020fairseqs2t,
title = {fairseq S2T: Fast Speech-to-Text Modeling with fairseq},
author = {Changhan Wang and Yun Tang and Xutai Ma and Anne Wu and Dmytro Okhonko and Juan Pino},
booktitle = {Proceedings of the 2020 Conference of the Asian Chapter of the Association for Computational Linguistics (AACL): System Demonstrations},
year = {2020},
}
@inproceedings{ott2019fairseq,
title = {fairseq: A Fast, Extensible Toolkit for Sequence Modeling},
author = {Myle Ott and Sergey Edunov and Alexei Baevski and Angela Fan and Sam Gross and Nathan Ng and David Grangier and Michael Auli},
booktitle = {Proceedings of NAACL-HLT 2019: Demonstrations},
year = {2019},
}
```