Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
sunzhq2
yidong-infer
Commits
60a2c57a
Commit
60a2c57a
authored
Jan 27, 2026
by
sunzhq2
Committed by
xuxo
Jan 27, 2026
Browse files
update conformer
parent
4a699441
Changes
216
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
3019 additions
and
0 deletions
+3019
-0
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/e2e_asr.py
...20240621/build/lib/espnet/nets/chainer_backend/e2e_asr.py
+226
-0
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/e2e_asr_transformer.py
...ld/lib/espnet/nets/chainer_backend/e2e_asr_transformer.py
+622
-0
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/nets_utils.py
...40621/build/lib/espnet/nets/chainer_backend/nets_utils.py
+7
-0
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/rnn/__init__.py
...621/build/lib/espnet/nets/chainer_backend/rnn/__init__.py
+1
-0
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/rnn/attentions.py
...1/build/lib/espnet/nets/chainer_backend/rnn/attentions.py
+279
-0
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/rnn/decoders.py
...621/build/lib/espnet/nets/chainer_backend/rnn/decoders.py
+525
-0
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/rnn/encoders.py
...621/build/lib/espnet/nets/chainer_backend/rnn/encoders.py
+327
-0
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/rnn/training.py
...621/build/lib/espnet/nets/chainer_backend/rnn/training.py
+259
-0
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/__init__.py
...d/lib/espnet/nets/chainer_backend/transformer/__init__.py
+1
-0
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/attention.py
.../lib/espnet/nets/chainer_backend/transformer/attention.py
+96
-0
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/ctc.py
.../build/lib/espnet/nets/chainer_backend/transformer/ctc.py
+87
-0
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/decoder.py
...ld/lib/espnet/nets/chainer_backend/transformer/decoder.py
+113
-0
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/decoder_layer.py
.../espnet/nets/chainer_backend/transformer/decoder_layer.py
+79
-0
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/embedding.py
.../lib/espnet/nets/chainer_backend/transformer/embedding.py
+36
-0
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/encoder.py
...ld/lib/espnet/nets/chainer_backend/transformer/encoder.py
+135
-0
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/encoder_layer.py
.../espnet/nets/chainer_backend/transformer/encoder_layer.py
+59
-0
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/label_smoothing_loss.py
.../nets/chainer_backend/transformer/label_smoothing_loss.py
+70
-0
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/layer_norm.py
...lib/espnet/nets/chainer_backend/transformer/layer_norm.py
+16
-0
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/mask.py
...build/lib/espnet/nets/chainer_backend/transformer/mask.py
+17
-0
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/positionwise_feed_forward.py
.../chainer_backend/transformer/positionwise_feed_forward.py
+64
-0
No files found.
Too many changes to show.
To preserve performance only
216 of 216+
files are displayed.
Plain diff
Email patch
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/e2e_asr.py
0 → 100644
View file @
60a2c57a
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""RNN sequence-to-sequence speech recognition model (chainer)."""
import
logging
import
math
import
chainer
import
numpy
as
np
from
chainer
import
reporter
from
espnet.nets.chainer_backend.asr_interface
import
ChainerASRInterface
from
espnet.nets.chainer_backend.ctc
import
ctc_for
from
espnet.nets.chainer_backend.rnn.attentions
import
att_for
from
espnet.nets.chainer_backend.rnn.decoders
import
decoder_for
from
espnet.nets.chainer_backend.rnn.encoders
import
encoder_for
from
espnet.nets.e2e_asr_common
import
label_smoothing_dist
from
espnet.nets.pytorch_backend.e2e_asr
import
E2E
as
E2E_pytorch
from
espnet.nets.pytorch_backend.nets_utils
import
get_subsample
# Loss values at or above this are treated as divergent and are not reported.
CTC_LOSS_THRESHOLD = 10000
class E2E(ChainerASRInterface):
    """E2E module for the chainer backend (RNN attention/CTC hybrid).

    Args:
        idim (int): Dimension of the inputs.
        odim (int): Dimension of the outputs.
        args (parser.args): Training config.
        flag_return (bool): If True, forward() returns additional metrics
            (ctc loss, attention loss, accuracy) in addition to the
            training loss.
    """

    @staticmethod
    def add_arguments(parser):
        """Add model-specific command-line arguments (delegates to pytorch E2E)."""
        return E2E_pytorch.add_arguments(parser)

    def get_total_subsampling_factor(self):
        """Get total subsampling factor."""
        return self.enc.conv_subsampling_factor * int(np.prod(self.subsample))

    def __init__(self, idim, odim, args, flag_return=True):
        """Construct an E2E object.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs
        :param Namespace args: argument Namespace containing options
        """
        chainer.Chain.__init__(self)
        self.mtlalpha = args.mtlalpha
        assert 0 <= self.mtlalpha <= 1, "mtlalpha must be [0,1]"
        self.etype = args.etype
        self.verbose = args.verbose
        self.char_list = args.char_list
        self.outdir = args.outdir

        # The last output index doubles as both sos and eos ID.
        self.sos = odim - 1
        self.eos = odim - 1

        # subsample info
        self.subsample = get_subsample(args, mode="asr", arch="rnn")

        # label smoothing info
        if args.lsm_type:
            logging.info("Use label smoothing with " + args.lsm_type)
            labeldist = label_smoothing_dist(
                odim, args.lsm_type, transcript=args.train_json
            )
        else:
            labeldist = None

        with self.init_scope():
            # encoder
            self.enc = encoder_for(args, idim, self.subsample)
            # ctc
            self.ctc = ctc_for(args, odim)
            # attention
            self.att = att_for(args)
            # decoder
            self.dec = decoder_for(args, odim, self.sos, self.eos, self.att, labeldist)

        self.acc = None
        self.loss = None
        self.flag_return = flag_return

    def forward(self, xs, ilens, ys):
        """E2E forward propagation.

        Args:
            xs (chainer.Variable): Batch of padded character ids. (B, Tmax)
            ilens (chainer.Variable): Batch of length of each input batch. (B,)
            ys (chainer.Variable): Batch of padded target features. (B, Lmax, odim)

        Returns:
            float: Loss calculated from attention and ctc losses.
            float (optional): Ctc loss.
            float (optional): Attention loss.
            float (optional): Accuracy.
        """
        # 1. encoder
        hs, ilens = self.enc(xs, ilens)

        # 3. CTC loss
        if self.mtlalpha == 0:
            loss_ctc = None
        else:
            loss_ctc = self.ctc(hs, ys)

        # 4. attention loss
        if self.mtlalpha == 1:
            loss_att = None
            acc = None
        else:
            loss_att, acc = self.dec(hs, ys)
        self.acc = acc

        # Blend the two losses with the multitask weight alpha.
        alpha = self.mtlalpha
        if alpha == 0:
            self.loss = loss_att
        elif alpha == 1:
            self.loss = loss_ctc
        else:
            self.loss = alpha * loss_ctc + (1 - alpha) * loss_att

        # Only report finite, non-diverged losses to the chainer reporter.
        if self.loss.data < CTC_LOSS_THRESHOLD and not math.isnan(self.loss.data):
            reporter.report({"loss_ctc": loss_ctc}, self)
            reporter.report({"loss_att": loss_att}, self)
            reporter.report({"acc": acc}, self)
            logging.info("mtl loss:" + str(self.loss.data))
            reporter.report({"loss": self.loss}, self)
        else:
            logging.warning("loss (=%f) is not correct", self.loss.data)

        if self.flag_return:
            return self.loss, loss_ctc, loss_att, acc
        else:
            return self.loss

    def recognize(self, x, recog_args, char_list, rnnlm=None):
        """E2E greedy/beam search.

        Args:
            x (chainer.Variable): Input tensor for recognition.
            recog_args (parser.args): Arguments of config file.
            char_list (List[str]): List of characters.
            rnnlm (Module): RNNLM module defined at `espnet.lm.chainer_backend.lm`.

        Returns:
            List[Dict[str, Any]]: Result of recognition.
        """
        # subsample frame
        x = x[:: self.subsample[0], :]
        ilen = self.xp.array(x.shape[0], dtype=np.int32)
        h = chainer.Variable(self.xp.array(x, dtype=np.float32))

        with chainer.no_backprop_mode(), chainer.using_config("train", False):
            # 1. encoder
            # make a utt list (1) to use the same interface for encoder
            h, _ = self.enc([h], [ilen])

            # calculate log P(z_t|X) for CTC scores
            if recog_args.ctc_weight > 0.0:
                lpz = self.ctc.log_softmax(h).data[0]
            else:
                lpz = None

            # 2. decoder: decode the first utterance
            y = self.dec.recognize_beam(h[0], lpz, recog_args, char_list, rnnlm)

        return y

    def calculate_all_attentions(self, xs, ilens, ys):
        """E2E attention calculation.

        Args:
            xs (List): List of padded input sequences. [(T1, idim), (T2, idim), ...]
            ilens (np.ndarray): Batch of lengths of input sequences. (B)
            ys (List): List of character id sequence tensor. [(L1), (L2), (L3), ...]

        Returns:
            float np.ndarray: Attention weights. (B, Lmax, Tmax)
        """
        hs, ilens = self.enc(xs, ilens)
        return self.dec.calculate_all_attentions(hs, ys)

    @staticmethod
    def custom_converter(subsampling_factor=0):
        """Get customconverter of the model."""
        from espnet.nets.chainer_backend.rnn.training import CustomConverter

        return CustomConverter(subsampling_factor=subsampling_factor)

    @staticmethod
    def custom_updater(iters, optimizer, converter, device=-1, accum_grad=1):
        """Get custom_updater of the model."""
        from espnet.nets.chainer_backend.rnn.training import CustomUpdater

        return CustomUpdater(
            iters, optimizer, converter=converter, device=device, accum_grad=accum_grad
        )

    @staticmethod
    def custom_parallel_updater(iters, optimizer, converter, devices, accum_grad=1):
        """Get custom_parallel_updater of the model."""
        from espnet.nets.chainer_backend.rnn.training import CustomParallelUpdater

        return CustomParallelUpdater(
            iters,
            optimizer,
            converter=converter,
            devices=devices,
            accum_grad=accum_grad,
        )
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/e2e_asr_transformer.py
0 → 100644
View file @
60a2c57a
# encoding: utf-8
"""Transformer-based model for End-to-end ASR."""
import
logging
import
math
from
argparse
import
Namespace
from
distutils.util
import
strtobool
import
chainer
import
chainer.functions
as
F
import
numpy
as
np
from
chainer
import
reporter
from
espnet.nets.chainer_backend.asr_interface
import
ChainerASRInterface
from
espnet.nets.chainer_backend.transformer
import
ctc
from
espnet.nets.chainer_backend.transformer.attention
import
MultiHeadAttention
from
espnet.nets.chainer_backend.transformer.decoder
import
Decoder
from
espnet.nets.chainer_backend.transformer.encoder
import
Encoder
from
espnet.nets.chainer_backend.transformer.label_smoothing_loss
import
(
# noqa: H301
LabelSmoothingLoss
,
)
from
espnet.nets.chainer_backend.transformer.training
import
(
# noqa: H301
CustomConverter
,
CustomParallelUpdater
,
CustomUpdater
,
)
from
espnet.nets.ctc_prefix_score
import
CTCPrefixScore
from
espnet.nets.e2e_asr_common
import
ErrorCalculator
,
end_detect
from
espnet.nets.pytorch_backend.nets_utils
import
get_subsample
from
espnet.nets.pytorch_backend.transformer.plot
import
PlotAttentionReport
# Beam-width multiplier used when pre-pruning CTC prefix-score candidates.
CTC_SCORING_RATIO = 1.5
# Maximum number of decoder outputs (upstream constant, kept for compatibility).
MAX_DECODER_OUTPUT = 5
class E2E(ChainerASRInterface):
    """Transformer-based E2E module (chainer backend).

    Args:
        idim (int): Input dimensions.
        odim (int): Output dimensions.
        args (Namespace): Training config.
        ignore_id (int, optional): Id for ignoring a character.
        flag_return (bool, optional): If true, return a tuple (loss,
            loss_ctc, loss_att, acc) from forward. Otherwise, return loss.
    """

    @staticmethod
    def add_arguments(parser):
        """Customize flags for transformer setup.

        Args:
            parser (Namespace): Training config.
        """
        group = parser.add_argument_group("transformer model setting")
        group.add_argument(
            "--transformer-init",
            type=str,
            default="pytorch",
            help="how to initialize transformer parameters",
        )
        group.add_argument(
            "--transformer-input-layer",
            type=str,
            default="conv2d",
            choices=["conv2d", "linear", "embed"],
            help="transformer input layer type",
        )
        group.add_argument(
            "--transformer-attn-dropout-rate",
            default=None,
            type=float,
            help="dropout in transformer attention. use --dropout-rate if None is set",
        )
        group.add_argument(
            "--transformer-lr",
            default=10.0,
            type=float,
            help="Initial value of learning rate",
        )
        group.add_argument(
            "--transformer-warmup-steps",
            default=25000,
            type=int,
            help="optimizer warmup steps",
        )
        group.add_argument(
            "--transformer-length-normalized-loss",
            default=True,
            type=strtobool,
            help="normalize loss by length",
        )
        group.add_argument(
            "--dropout-rate",
            default=0.0,
            type=float,
            help="Dropout rate for the encoder",
        )
        # Encoder
        group.add_argument(
            "--elayers",
            default=4,
            type=int,
            help="Number of encoder layers (for shared recognition part "
            "in multi-speaker asr mode)",
        )
        group.add_argument(
            "--eunits",
            "-u",
            default=300,
            type=int,
            help="Number of encoder hidden units",
        )
        # Attention
        group.add_argument(
            "--adim",
            default=320,
            type=int,
            help="Number of attention transformation dimensions",
        )
        group.add_argument(
            "--aheads",
            default=4,
            type=int,
            help="Number of heads for multi head attention",
        )
        # Decoder
        group.add_argument(
            "--dlayers", default=1, type=int, help="Number of decoder layers"
        )
        group.add_argument(
            "--dunits", default=320, type=int, help="Number of decoder hidden units"
        )
        return parser

    def get_total_subsampling_factor(self):
        """Get total subsampling factor."""
        return self.encoder.conv_subsampling_factor * int(np.prod(self.subsample))

    def __init__(self, idim, odim, args, ignore_id=-1, flag_return=True):
        """Initialize the transformer."""
        chainer.Chain.__init__(self)
        self.mtlalpha = args.mtlalpha
        assert 0 <= self.mtlalpha <= 1, "mtlalpha must be [0,1]"
        if args.transformer_attn_dropout_rate is None:
            # Fall back to the generic dropout rate when not set explicitly.
            args.transformer_attn_dropout_rate = args.dropout_rate
        self.use_label_smoothing = False
        self.char_list = args.char_list
        self.space = args.sym_space
        self.blank = args.sym_blank
        # Embedding scale: sqrt(model dimension), as in the transformer paper.
        self.scale_emb = args.adim**0.5
        # The last output index doubles as both sos and eos ID.
        self.sos = odim - 1
        self.eos = odim - 1
        self.subsample = get_subsample(args, mode="asr", arch="transformer")
        self.ignore_id = ignore_id
        self.reset_parameters(args)
        with self.init_scope():
            self.encoder = Encoder(
                idim=idim,
                attention_dim=args.adim,
                attention_heads=args.aheads,
                linear_units=args.eunits,
                input_layer=args.transformer_input_layer,
                dropout_rate=args.dropout_rate,
                positional_dropout_rate=args.dropout_rate,
                attention_dropout_rate=args.transformer_attn_dropout_rate,
                initialW=self.initialW,
                initial_bias=self.initialB,
            )
            self.decoder = Decoder(
                odim, args, initialW=self.initialW, initial_bias=self.initialB
            )
            self.criterion = LabelSmoothingLoss(
                args.lsm_weight,
                len(args.char_list),
                args.transformer_length_normalized_loss,
            )
            if args.mtlalpha > 0.0:
                if args.ctc_type == "builtin":
                    logging.info("Using chainer CTC implementation")
                    self.ctc = ctc.CTC(odim, args.adim, args.dropout_rate)
                else:
                    raise ValueError(
                        'ctc_type must be "builtin": {}'.format(args.ctc_type)
                    )
            else:
                self.ctc = None
        self.dims = args.adim
        self.odim = odim
        self.flag_return = flag_return
        if args.report_cer or args.report_wer:
            self.error_calculator = ErrorCalculator(
                args.char_list,
                args.sym_space,
                args.sym_blank,
                args.report_cer,
                args.report_wer,
            )
        else:
            self.error_calculator = None
        # A plain Namespace may lack "verbose"; guard the attribute access.
        if "Namespace" in str(type(args)):
            self.verbose = 0 if "verbose" not in args else args.verbose
        else:
            self.verbose = 0 if args.verbose is None else args.verbose

    def reset_parameters(self, args):
        """Initialize the weights according to the given initialize-type.

        Args:
            args (Namespace): Transformer config.
        """
        type_init = args.transformer_init
        if type_init == "lecun_uniform":
            logging.info("Using LeCunUniform as Parameter initializer")
            self.initialW = chainer.initializers.LeCunUniform
        elif type_init == "lecun_normal":
            logging.info("Using LeCunNormal as Parameter initializer")
            self.initialW = chainer.initializers.LeCunNormal
        elif type_init == "gorot_uniform":
            logging.info("Using GlorotUniform as Parameter initializer")
            self.initialW = chainer.initializers.GlorotUniform
        elif type_init == "gorot_normal":
            logging.info("Using GlorotNormal as Parameter initializer")
            self.initialW = chainer.initializers.GlorotNormal
        elif type_init == "he_uniform":
            logging.info("Using HeUniform as Parameter initializer")
            self.initialW = chainer.initializers.HeUniform
        elif type_init == "he_normal":
            logging.info("Using HeNormal as Parameter initializer")
            self.initialW = chainer.initializers.HeNormal
        elif type_init == "pytorch":
            logging.info("Using Pytorch initializer")
            self.initialW = chainer.initializers.Uniform
        else:
            logging.info("Using Chainer default as Parameter initializer")
            self.initialW = chainer.initializers.Uniform
        self.initialB = chainer.initializers.Uniform

    def forward(self, xs, ilens, ys_pad, calculate_attentions=False):
        """E2E forward propagation.

        Args:
            xs (chainer.Variable): Batch of padded character ids. (B, Tmax)
            ilens (chainer.Variable): Batch of length of each input batch. (B,)
            ys_pad (chainer.Variable): Batch of padded target features. (B, Lmax, odim)
            calculate_attentions (bool): If true, also run the decoder once so
                attention weights are populated for later collection.

        Returns:
            float: Training loss.
            float (optional): Training loss for ctc.
            float (optional): Training loss for attention.
            float (optional): Accuracy.
        """
        alpha = self.mtlalpha

        # 1. Encoder
        xs, x_mask, ilens = self.encoder(xs, ilens)

        # 2. CTC loss
        cer_ctc = None
        if alpha == 0.0:
            loss_ctc = None
        else:
            _ys = [y.astype(np.int32) for y in ys_pad]
            loss_ctc = self.ctc(xs, _ys)
            if self.error_calculator is not None:
                with chainer.no_backprop_mode():
                    ys_hat = chainer.backends.cuda.to_cpu(self.ctc.argmax(xs).data)
                cer_ctc = self.error_calculator(ys_hat, ys_pad, is_ctc=True)

        # 3. Decoder
        if calculate_attentions:
            self.calculate_attentions(xs, x_mask, ys_pad)
        ys = self.decoder(ys_pad, xs, x_mask)

        # 4. Attention Loss
        cer, wer = None, None
        if alpha == 1:
            loss_att = None
            acc = None
        else:
            # Build targets: append <eos> to each reference, pad with -1.
            eos = np.array([self.eos], "i")
            with chainer.no_backprop_mode():
                ys_pad_out = [np.concatenate([y, eos], axis=0) for y in ys_pad]
                ys_pad_out = F.pad_sequence(ys_pad_out, padding=-1).data
                ys_pad_out = self.xp.array(ys_pad_out)
            loss_att = self.criterion(ys, ys_pad_out)
            acc = F.accuracy(
                ys.reshape(-1, self.odim), ys_pad_out.reshape(-1), ignore_label=-1
            )
            if (not chainer.config.train) and (self.error_calculator is not None):
                cer, wer = self.error_calculator(ys, ys_pad)

        # Blend the losses with the multitask weight alpha.
        if alpha == 0.0:
            self.loss = loss_att
            loss_att_data = loss_att.data
            loss_ctc_data = None
        elif alpha == 1.0:
            self.loss = loss_ctc
            loss_att_data = None
            loss_ctc_data = loss_ctc.data
        else:
            self.loss = alpha * loss_ctc + (1 - alpha) * loss_att
            loss_att_data = loss_att.data
            loss_ctc_data = loss_ctc.data

        loss_data = self.loss.data
        if not math.isnan(loss_data):
            reporter.report({"loss_ctc": loss_ctc_data}, self)
            reporter.report({"loss_att": loss_att_data}, self)
            reporter.report({"acc": acc}, self)
            reporter.report({"cer_ctc": cer_ctc}, self)
            reporter.report({"cer": cer}, self)
            reporter.report({"wer": wer}, self)
            logging.info("mtl loss:" + str(loss_data))
            reporter.report({"loss": loss_data}, self)
        else:
            logging.warning("loss (=%f) is not correct", loss_data)
        if self.flag_return:
            # NOTE(review): upstream sets loss_ctc to None just before returning,
            # so the second tuple element is always None — looks suspicious but is
            # preserved for behavioral compatibility; confirm against callers.
            loss_ctc = None
            return self.loss, loss_ctc, loss_att, acc
        else:
            return self.loss

    def calculate_attentions(self, xs, x_mask, ys_pad):
        """Calculate Attentions."""
        # Run the decoder only for its side effect of populating attention
        # weights on the MultiHeadAttention links.
        self.decoder(ys_pad, xs, x_mask)

    def recognize(self, x_block, recog_args, char_list=None, rnnlm=None):
        """E2E recognition function.

        Args:
            x_block (ndarray): Input acoustic feature (B, T, D) or (T, D).
            recog_args (Namespace): Argument namespace containing options.
            char_list (List[str]): List of characters.
            rnnlm (chainer.Chain): Language model module defined at
                `espnet.lm.chainer_backend.lm`.

        Returns:
            List: N-best decoding results.
        """
        with chainer.no_backprop_mode(), chainer.using_config("train", False):
            # 1. encoder
            ilens = [x_block.shape[0]]
            batch = len(ilens)
            xs, _, _ = self.encoder(x_block[None, :, :], ilens)

            # calculate log P(z_t|X) for CTC scores
            if recog_args.ctc_weight > 0.0:
                lpz = self.ctc.log_softmax(xs.reshape(batch, -1, self.dims)).data[0]
            else:
                lpz = None

            # 2. decoder
            if recog_args.lm_weight == 0.0:
                rnnlm = None
            y = self.recognize_beam(xs, lpz, recog_args, char_list, rnnlm)
        return y

    def recognize_beam(self, h, lpz, recog_args, char_list=None, rnnlm=None):
        """E2E beam search.

        Args:
            h (ndarray): Encoder output features (B, T, D) or (T, D).
            lpz (ndarray): Log probabilities from CTC.
            recog_args (Namespace): Argument namespace containing options.
            char_list (List[str]): List of characters.
            rnnlm (chainer.Chain): Language model module defined at
                `espnet.lm.chainer_backend.lm`.

        Returns:
            List: N-best decoding results.
        """
        logging.info("input lengths: " + str(h.shape[1]))
        # initialization
        n_len = h.shape[1]
        xp = self.xp
        h_mask = xp.ones((1, n_len))

        # search parms
        beam = recog_args.beam_size
        penalty = recog_args.penalty
        ctc_weight = recog_args.ctc_weight

        # prepare sos
        y = self.sos
        if recog_args.maxlenratio == 0:
            maxlen = n_len
        else:
            # maxlen >= 1
            maxlen = max(1, int(recog_args.maxlenratio * n_len))
        minlen = int(recog_args.minlenratio * n_len)
        logging.info("max output length: " + str(maxlen))
        logging.info("min output length: " + str(minlen))

        # initialize hypothesis
        if rnnlm:
            hyp = {"score": 0.0, "yseq": [y], "rnnlm_prev": None}
        else:
            hyp = {"score": 0.0, "yseq": [y]}
        if lpz is not None:
            ctc_prefix_score = CTCPrefixScore(lpz, 0, self.eos, self.xp)
            hyp["ctc_state_prev"] = ctc_prefix_score.initial_state()
            hyp["ctc_score_prev"] = 0.0
            if ctc_weight != 1.0:
                # pre-pruning based on attention scores
                ctc_beam = min(lpz.shape[-1], int(beam * CTC_SCORING_RATIO))
            else:
                ctc_beam = lpz.shape[-1]
        hyps = [hyp]
        ended_hyps = []

        for i in range(maxlen):
            logging.debug("position " + str(i))
            hyps_best_kept = []
            for hyp in hyps:
                ys = F.expand_dims(xp.array(hyp["yseq"]), axis=0).data
                out = self.decoder(ys, h, h_mask)

                # get nbest local scores and their ids
                local_att_scores = F.log_softmax(out[:, -1], axis=-1).data
                if rnnlm:
                    rnnlm_state, local_lm_scores = rnnlm.predict(
                        hyp["rnnlm_prev"], hyp["yseq"][i]
                    )
                    local_scores = (
                        local_att_scores + recog_args.lm_weight * local_lm_scores
                    )
                else:
                    local_scores = local_att_scores

                if lpz is not None:
                    local_best_ids = xp.argsort(local_scores, axis=1)[0, ::-1][
                        :ctc_beam
                    ]
                    ctc_scores, ctc_states = ctc_prefix_score(
                        hyp["yseq"], local_best_ids, hyp["ctc_state_prev"]
                    )
                    local_scores = (1.0 - ctc_weight) * local_att_scores[
                        :, local_best_ids
                    ] + ctc_weight * (ctc_scores - hyp["ctc_score_prev"])
                    if rnnlm:
                        local_scores += (
                            recog_args.lm_weight * local_lm_scores[:, local_best_ids]
                        )
                    joint_best_ids = xp.argsort(local_scores, axis=1)[0, ::-1][:beam]
                    local_best_scores = local_scores[:, joint_best_ids]
                    local_best_ids = local_best_ids[joint_best_ids]
                else:
                    local_best_ids = self.xp.argsort(local_scores, axis=1)[0, ::-1][
                        :beam
                    ]
                    local_best_scores = local_scores[:, local_best_ids]

                for j in range(beam):
                    new_hyp = {}
                    new_hyp["score"] = hyp["score"] + float(local_best_scores[0, j])
                    new_hyp["yseq"] = [0] * (1 + len(hyp["yseq"]))
                    new_hyp["yseq"][: len(hyp["yseq"])] = hyp["yseq"]
                    new_hyp["yseq"][len(hyp["yseq"])] = int(local_best_ids[j])
                    if rnnlm:
                        new_hyp["rnnlm_prev"] = rnnlm_state
                    if lpz is not None:
                        new_hyp["ctc_state_prev"] = ctc_states[joint_best_ids[j]]
                        new_hyp["ctc_score_prev"] = ctc_scores[joint_best_ids[j]]
                    hyps_best_kept.append(new_hyp)

                hyps_best_kept = sorted(
                    hyps_best_kept, key=lambda x: x["score"], reverse=True
                )[:beam]

            # sort and get nbest
            hyps = hyps_best_kept
            logging.debug("number of pruned hypothesis: " + str(len(hyps)))
            if char_list is not None:
                logging.debug(
                    "best hypo: "
                    + "".join([char_list[int(x)] for x in hyps[0]["yseq"][1:]])
                    + " score: "
                    + str(hyps[0]["score"])
                )

            # add eos in the final loop to avoid that there are no ended hyps
            if i == maxlen - 1:
                logging.info("adding <eos> in the last position in the loop")
                for hyp in hyps:
                    hyp["yseq"].append(self.eos)

            # add ended hypotheses to a final list, and remove them from the
            # current hypotheses (this can be a problem: number of hyps < beam)
            remained_hyps = []
            for hyp in hyps:
                if hyp["yseq"][-1] == self.eos:
                    # only store the sequence that has more than minlen outputs
                    # also add penalty
                    if len(hyp["yseq"]) > minlen:
                        hyp["score"] += (i + 1) * penalty
                        if rnnlm:
                            # Word LM needs to add final <eos> score
                            hyp["score"] += recog_args.lm_weight * rnnlm.final(
                                hyp["rnnlm_prev"]
                            )
                        ended_hyps.append(hyp)
                else:
                    remained_hyps.append(hyp)

            # end detection
            if end_detect(ended_hyps, i) and recog_args.maxlenratio == 0.0:
                logging.info("end detected at %d", i)
                break

            hyps = remained_hyps
            if len(hyps) > 0:
                logging.debug("remained hypothes: " + str(len(hyps)))
            else:
                logging.info("no hypothesis. Finish decoding.")
                break

            if char_list is not None:
                for hyp in hyps:
                    logging.debug(
                        "hypo: " + "".join([char_list[int(x)] for x in hyp["yseq"][1:]])
                    )

        logging.debug("number of ended hypothes: " + str(len(ended_hyps)))
        nbest_hyps = sorted(ended_hyps, key=lambda x: x["score"], reverse=True)
        # [:min(len(ended_hyps), recog_args.nbest)]
        logging.debug(nbest_hyps)

        # check number of hypotheses
        if len(nbest_hyps) == 0:
            logging.warn(
                "there is no N-best results, perform recognition "
                "again with smaller minlenratio."
            )
            # should copy because Namespace will be overwritten globally
            recog_args = Namespace(**vars(recog_args))
            recog_args.minlenratio = max(0.0, recog_args.minlenratio - 0.1)
            return self.recognize_beam(h, lpz, recog_args, char_list, rnnlm)

        logging.info("total log probability: " + str(nbest_hyps[0]["score"]))
        logging.info(
            "normalized log probability: "
            + str(nbest_hyps[0]["score"] / len(nbest_hyps[0]["yseq"]))
        )
        # remove sos
        return nbest_hyps

    def calculate_all_attentions(self, xs, ilens, ys):
        """E2E attention calculation.

        Args:
            xs (List[tuple()]): List of padded input sequences.
                [(T1, idim), (T2, idim), ...]
            ilens (ndarray): Batch of lengths of input sequences. (B)
            ys (List): List of character id sequence tensor. [(L1), (L2), (L3), ...]

        Returns:
            float ndarray: Attention weights. (B, Lmax, Tmax)
        """
        with chainer.no_backprop_mode():
            self(xs, ilens, ys, calculate_attentions=True)
        ret = dict()
        for name, m in self.namedlinks():
            if isinstance(m, MultiHeadAttention):
                var = m.attn
                var.to_cpu()
                _name = name[1:].replace("/", "_")
                ret[_name] = var.data
        return ret

    @property
    def attention_plot_class(self):
        """Attention plot function.

        Redirects to PlotAttentionReport.

        Returns:
            PlotAttentionReport
        """
        return PlotAttentionReport

    @staticmethod
    def custom_converter(subsampling_factor=0):
        """Get customconverter of the model."""
        return CustomConverter()

    @staticmethod
    def custom_updater(iters, optimizer, converter, device=-1, accum_grad=1):
        """Get custom_updater of the model."""
        return CustomUpdater(
            iters, optimizer, converter=converter, device=device, accum_grad=accum_grad
        )

    @staticmethod
    def custom_parallel_updater(iters, optimizer, converter, devices, accum_grad=1):
        """Get custom_parallel_updater of the model."""
        return CustomParallelUpdater(
            iters,
            optimizer,
            converter=converter,
            devices=devices,
            accum_grad=accum_grad,
        )
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/nets_utils.py
0 → 100644
View file @
60a2c57a
import
chainer.functions
as
F
def _subsamplex(x, n):
    """Keep every ``n``-th frame of each sequence and report the new lengths.

    Args:
        x: List of per-utterance variables shaped (frames, dim).
        n (int): Subsampling factor along the frame axis.

    Returns:
        tuple: (subsampled list, list of resulting frame counts).
    """
    frame_step = (slice(None, None, n), slice(None))
    subsampled = [F.get_item(seq, frame_step) for seq in x]
    lengths = [seq.shape[0] for seq in subsampled]
    return subsampled, lengths
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/rnn/__init__.py
0 → 100644
View file @
60a2c57a
"""Initialize sub package."""
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/rnn/attentions.py
0 → 100644
View file @
60a2c57a
import
chainer
import
chainer.functions
as
F
import
chainer.links
as
L
import
numpy
as
np
# dot product based attention
class AttDot(chainer.Chain):
    """Compute attention based on dot product.

    Args:
        eprojs (int | None): Dimension of input vectors from encoder.
        dunits (int | None): Dimension of input vectors for decoder.
        att_dim (int): Dimension of input vectors for attention.

    """

    def __init__(self, eprojs, dunits, att_dim):
        super(AttDot, self).__init__()
        with self.init_scope():
            self.mlp_enc = L.Linear(eprojs, att_dim)
            self.mlp_dec = L.Linear(dunits, att_dim)

        self.dunits = dunits
        self.eprojs = eprojs
        self.att_dim = att_dim
        # Cached encoder state; filled lazily on the first __call__ and
        # cleared by reset() between utterances.
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None

    def reset(self):
        """Reset states."""
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None

    def __call__(self, enc_hs, dec_z, att_prev, scaling=2.0):
        """Compute AttDot forward layer.

        Args:
            enc_hs (chainer.Variable | N-dimensional array):
                Input variable from encoder.
            dec_z (chainer.Variable | N-dimensional array): Input variable of decoder.
            att_prev: Previous attention weight (unused by this variant).
            scaling (float): Scaling weight to make attention sharp.

        Returns:
            chainer.Variable: Weighted sum over flames.
            chainer.Variable: Attention weight.

        """
        batch = len(enc_hs)
        # pre-compute all h outside the decoder loop
        if self.pre_compute_enc_h is None:
            self.enc_h = F.pad_sequence(enc_hs)  # utt x frame x hdim
            self.h_length = self.enc_h.shape[1]
            # utt x frame x att_dim
            self.pre_compute_enc_h = F.tanh(self.mlp_enc(self.enc_h, n_batch_axes=2))

        if dec_z is None:
            # First decoder step: no hidden state yet, use zeros.
            dec_z = chainer.Variable(
                self.xp.zeros((batch, self.dunits), dtype=np.float32)
            )
        else:
            dec_z = dec_z.reshape(batch, self.dunits)

        # <phi (h_t), psi (s)> for all t
        u = F.broadcast_to(
            F.expand_dims(F.tanh(self.mlp_dec(dec_z)), 1),
            self.pre_compute_enc_h.shape,
        )
        e = F.sum(self.pre_compute_enc_h * u, axis=2)  # utt x frame
        # Applying a minus-large-number filter
        # to make a probability value zero for a padded area
        # simply degrades the performance, and I gave up this implementation
        # Apply a scaling to make an attention sharp
        w = F.softmax(scaling * e)
        # weighted sum over flames
        # utt x hdim
        c = F.sum(
            self.enc_h * F.broadcast_to(F.expand_dims(w, 2), self.enc_h.shape), axis=1
        )
        return c, w
# location based attention
class AttLoc(chainer.Chain):
    """Compute location-based attention.

    Args:
        eprojs (int | None): Dimension of input vectors from encoder.
        dunits (int | None): Dimension of input vectors for decoder.
        att_dim (int): Dimension of input vectors for attention.
        aconv_chans (int): Number of channels of output arrays from convolutional layer.
        aconv_filts (int): Size of filters of convolutional layer.

    """

    def __init__(self, eprojs, dunits, att_dim, aconv_chans, aconv_filts):
        super(AttLoc, self).__init__()
        with self.init_scope():
            self.mlp_enc = L.Linear(eprojs, att_dim)
            self.mlp_dec = L.Linear(dunits, att_dim, nobias=True)
            self.mlp_att = L.Linear(aconv_chans, att_dim, nobias=True)
            # 1-D convolution over the previous attention distribution
            # (the "location" feature of Chorowski et al.-style attention).
            self.loc_conv = L.Convolution2D(
                1, aconv_chans, ksize=(1, 2 * aconv_filts + 1), pad=(0, aconv_filts)
            )
            self.gvec = L.Linear(att_dim, 1)

        self.dunits = dunits
        self.eprojs = eprojs
        self.att_dim = att_dim
        # Cached encoder state; filled lazily on the first __call__ and
        # cleared by reset() between utterances.
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None
        self.aconv_chans = aconv_chans

    def reset(self):
        """Reset states."""
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None

    def __call__(self, enc_hs, dec_z, att_prev, scaling=2.0):
        """Compute AttLoc forward layer.

        Args:
            enc_hs (chainer.Variable | N-dimensional array):
                Input variable from encoders.
            dec_z (chainer.Variable | N-dimensional array): Input variable of decoder.
            att_prev (chainer.Variable | None): Attention weight.
            scaling (float): Scaling weight to make attention sharp.

        Returns:
            chainer.Variable: Weighted sum over flames.
            chainer.Variable: Attention weight.

        """
        batch = len(enc_hs)
        # pre-compute all h outside the decoder loop
        if self.pre_compute_enc_h is None:
            self.enc_h = F.pad_sequence(enc_hs)  # utt x frame x hdim
            self.h_length = self.enc_h.shape[1]
            # utt x frame x att_dim
            self.pre_compute_enc_h = self.mlp_enc(self.enc_h, n_batch_axes=2)

        if dec_z is None:
            # First decoder step: no hidden state yet, use zeros.
            dec_z = chainer.Variable(
                self.xp.zeros((batch, self.dunits), dtype=np.float32)
            )
        else:
            dec_z = dec_z.reshape(batch, self.dunits)

        # initialize attention weight with uniform dist.
        if att_prev is None:
            att_prev = [
                self.xp.full(hh.shape[0], 1.0 / hh.shape[0], dtype=np.float32)
                for hh in enc_hs
            ]
            att_prev = [chainer.Variable(att) for att in att_prev]
            att_prev = F.pad_sequence(att_prev)

        # att_prev: utt x frame -> utt x 1 x 1 x frame
        # -> utt x att_conv_chans x 1 x frame
        att_conv = self.loc_conv(att_prev.reshape(batch, 1, 1, self.h_length))
        # att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans
        att_conv = F.swapaxes(F.squeeze(att_conv, axis=2), 1, 2)
        # att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim
        att_conv = self.mlp_att(att_conv, n_batch_axes=2)

        # dec_z_tiled: utt x frame x att_dim
        dec_z_tiled = F.broadcast_to(
            F.expand_dims(self.mlp_dec(dec_z), 1), self.pre_compute_enc_h.shape
        )

        # dot with gvec
        # utt x frame x att_dim -> utt x frame
        # TODO(watanabe) use batch_matmul
        e = F.squeeze(
            self.gvec(
                F.tanh(att_conv + self.pre_compute_enc_h + dec_z_tiled), n_batch_axes=2
            ),
            axis=2,
        )
        # Applying a minus-large-number filter
        # to make a probability value zero for a padded area
        # simply degrades the performance, and I gave up this implementation
        # Apply a scaling to make an attention sharp
        w = F.softmax(scaling * e)

        # weighted sum over flames
        # utt x hdim
        c = F.sum(
            self.enc_h * F.broadcast_to(F.expand_dims(w, 2), self.enc_h.shape), axis=1
        )

        return c, w
class NoAtt(chainer.Chain):
    """Compute non-attention layer.

    This layer is a dummy attention layer to be compatible with other
    attention-based models.

    """

    def __init__(self):
        super(NoAtt, self).__init__()
        # Initialize the cached state via reset() so both paths share one
        # definition of the empty state.
        self.reset()

    def reset(self):
        """Reset states."""
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None
        self.c = None

    def __call__(self, enc_hs, dec_z, att_prev):
        """Compute NoAtt forward layer.

        Args:
            enc_hs (chainer.Variable | N-dimensional array):
                Input variable from encoders.
            dec_z: Dummy.
            att_prev (chainer.Variable | None): Attention weight.

        Returns:
            chainer.Variable: Sum over flames.
            chainer.Variable: Attention weight.

        """
        # pre-compute all h outside the decoder loop
        if self.pre_compute_enc_h is None:
            # utt x frame x hdim
            self.enc_h = F.pad_sequence(enc_hs)
            self.h_length = self.enc_h.shape[1]

        # First call only: build a uniform attention distribution and the
        # corresponding (fixed) context vector; later calls reuse self.c.
        if att_prev is None:
            uniform = [
                chainer.Variable(
                    self.xp.full(hh.shape[0], 1.0 / hh.shape[0], dtype=np.float32)
                )
                for hh in enc_hs
            ]
            att_prev = F.pad_sequence(uniform)
            tiled_w = F.broadcast_to(F.expand_dims(att_prev, 2), self.enc_h.shape)
            self.c = F.sum(self.enc_h * tiled_w, axis=1)
        return self.c, att_prev
def att_for(args):
    """Return an attention layer matching the program arguments.

    Args:
        args (Namespace): The arguments.

    Returns:
        chainer.Chain: The corresponding attention module.

    """
    atype = args.atype
    if atype == "dot":
        return AttDot(args.eprojs, args.dunits, args.adim)
    if atype == "location":
        return AttLoc(
            args.eprojs, args.dunits, args.adim, args.aconv_chans, args.aconv_filts
        )
    if atype == "noatt":
        return NoAtt()
    raise NotImplementedError(
        "chainer supports only noatt, dot, and location attention."
    )
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/rnn/decoders.py
0 → 100644
View file @
60a2c57a
import
logging
import
random
from
argparse
import
Namespace
import
chainer
import
chainer.functions
as
F
import
chainer.links
as
L
import
numpy
as
np
import
espnet.nets.chainer_backend.deterministic_embed_id
as
DL
from
espnet.nets.ctc_prefix_score
import
CTCPrefixScore
from
espnet.nets.e2e_asr_common
import
end_detect
# Width of the CTC pre-pruning candidate list, as a multiple of the beam size.
CTC_SCORING_RATIO = 1.5
# Maximum number of utterances dumped to the log when verbose debugging is on.
MAX_DECODER_OUTPUT = 5
class Decoder(chainer.Chain):
    """Decoder layer.

    Args:
        eprojs (int): Dimension of input variables from encoder.
        odim (int): The output dimension.
        dtype (str): Decoder type.
        dlayers (int): Number of layers for decoder.
        dunits (int): Dimension of input vector of decoder.
        sos (int): Number to indicate the start of sequences.
        eos (int): Number to indicate the end of sequences.
        att (Module): Attention module defined at
            `espnet.espnet.nets.chainer_backend.attentions`.
        verbose (int): Verbosity level.
        char_list (List[str]): List of all characters.
        labeldist (numpy.array): Distributed array of counted transcript length.
        lsm_weight (float): Weight to use when calculating the training loss.
        sampling_probability (float): Threshold for scheduled sampling.

    """

    def __init__(
        self,
        eprojs,
        odim,
        dtype,
        dlayers,
        sos_dummy_never_used=None,  # NOTE(review): see signature below
    ):
        pass
def decoder_for(args, odim, sos, eos, att, labeldist):
    """Return the decoding layer corresponding to the args.

    Args:
        args (Namespace): The program arguments.
        odim (int): The output dimension.
        sos (int): Number to indicate the start of sequences.
        eos (int): Number to indicate the end of sequences.
        att (Module):
            Attention module defined at `espnet.nets.chainer_backend.attentions`.
        labeldist (numpy.array): Distributed array of length od transcript.

    Returns:
        chainer.Chain: The decoder module.

    """
    return Decoder(
        args.eprojs,
        odim,
        args.dtype,
        args.dlayers,
        args.dunits,
        sos,
        eos,
        att,
        verbose=args.verbose,
        char_list=args.char_list,
        labeldist=labeldist,
        lsm_weight=args.lsm_weight,
        sampling_probability=args.sampling_probability,
    )
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/rnn/encoders.py
0 → 100644
View file @
60a2c57a
import
logging
import
chainer
import
chainer.functions
as
F
import
chainer.links
as
L
import
numpy
as
np
from
chainer
import
cuda
from
espnet.nets.chainer_backend.nets_utils
import
_subsamplex
from
espnet.nets.e2e_asr_common
import
get_vgg2l_odim
# TODO(watanabe) explanation of BLSTMP
class RNNP(chainer.Chain):
    """RNN with projection layer module.

    Args:
        idim (int): Dimension of inputs.
        elayers (int): Number of encoder layers.
        cdim (int): Number of rnn units. (resulted in cdim * 2 if bidirectional)
        hdim (int): Number of projection units.
        subsample (np.ndarray): List to use sabsample the input array.
        dropout (float): Dropout rate.
        typ (str): The RNN type.

    """

    def __init__(self, idim, elayers, cdim, hdim, subsample, dropout, typ="blstm"):
        super(RNNP, self).__init__()
        # Leading "b" in the type name selects a bidirectional RNN.
        bidir = typ[0] == "b"
        if bidir:
            rnn = L.NStepBiLSTM if "lstm" in typ else L.NStepBiGRU
        else:
            rnn = L.NStepLSTM if "lstm" in typ else L.NStepGRU
        rnn_label = "birnn" if bidir else "rnn"
        with self.init_scope():
            # One single-layer RNN plus one projection ("bottleneck") Linear
            # per encoder layer; layer 0 consumes idim, later layers hdim.
            for i in range(elayers):
                if i == 0:
                    inputdim = idim
                else:
                    inputdim = hdim
                _cdim = 2 * cdim if bidir else cdim
                # bottleneck layer to merge
                setattr(
                    self, "{}{:d}".format(rnn_label, i), rnn(1, inputdim, cdim, dropout)
                )
                setattr(self, "bt%d" % i, L.Linear(_cdim, hdim))

        self.elayers = elayers
        self.rnn_label = rnn_label
        self.cdim = cdim
        self.subsample = subsample
        self.typ = typ
        self.bidir = bidir

    def __call__(self, xs, ilens):
        """RNNP forward.

        Args:
            xs (chainer.Variable): Batch of padded character ids. (B, Tmax)
            ilens (chainer.Variable): Batch of length of each input batch. (B,)

        Returns:
            xs (chainer.Variable):subsampled vector of xs.
            chainer.Variable: Subsampled vector of ilens.

        """
        logging.info(self.__class__.__name__ + " input lengths: " + str(ilens))

        for layer in range(self.elayers):
            # NStep(Bi)LSTM returns (hy, cy, ys); the GRU variant (hy, ys).
            if "lstm" in self.typ:
                _, _, ys = self[self.rnn_label + str(layer)](None, None, xs)
            else:
                _, ys = self[self.rnn_label + str(layer)](None, xs)
            # ys: utt list of frame x cdim x 2 (2: means bidirectional)
            # TODO(watanabe) replace subsample and FC layer with CNN
            ys, ilens = _subsamplex(ys, self.subsample[layer + 1])
            # (sum _utt frame_utt) x dim
            ys = self["bt" + str(layer)](F.vstack(ys))
            xs = F.split_axis(ys, np.cumsum(ilens[:-1]), axis=0)

        # final tanh operation
        xs = F.split_axis(F.tanh(F.vstack(xs)), np.cumsum(ilens[:-1]), axis=0)

        # 1 utterance case, it becomes an array, so need to make a utt tuple
        if not isinstance(xs, tuple):
            xs = [xs]

        return xs, ilens  # x: utt list of frame x dim
class RNN(chainer.Chain):
    """RNN Module.

    Args:
        idim (int): Dimension of the imput.
        elayers (int): Number of encoder layers.
        cdim (int): Number of rnn units.
        hdim (int): Number of projection units.
        dropout (float): Dropout rate.
        typ (str): Rnn type.

    """

    def __init__(self, idim, elayers, cdim, hdim, dropout, typ="lstm"):
        super(RNN, self).__init__()
        # Leading "b" in the type name selects a bidirectional RNN.
        bidir = typ[0] == "b"
        if bidir:
            rnn = L.NStepBiLSTM if "lstm" in typ else L.NStepBiGRU
        else:
            rnn = L.NStepLSTM if "lstm" in typ else L.NStepGRU
        # Bidirectional RNNs emit twice the unit count per frame.
        _cdim = 2 * cdim if bidir else cdim
        with self.init_scope():
            self.nbrnn = rnn(elayers, idim, cdim, dropout)
            self.l_last = L.Linear(_cdim, hdim)
        self.typ = typ
        self.bidir = bidir

    def __call__(self, xs, ilens):
        """BRNN forward propagation.

        Args:
            xs (chainer.Variable): Batch of padded character ids. (B, Tmax)
            ilens (chainer.Variable): Batch of length of each input batch. (B,)

        Returns:
            tuple(chainer.Variable): Tuple of `chainer.Variable` objects.
            chainer.Variable: `ilens` .

        """
        logging.info(self.__class__.__name__ + " input lengths: " + str(ilens))
        # need to move ilens to cpu
        ilens = cuda.to_cpu(ilens)

        # NStep(Bi)LSTM returns (hy, cy, ys); the GRU variant (hy, ys).
        if "lstm" in self.typ:
            _, _, ys = self.nbrnn(None, None, xs)
        else:
            _, ys = self.nbrnn(None, xs)
        ys = self.l_last(F.vstack(ys))  # (sum _utt frame_utt) x dim
        xs = F.split_axis(ys, np.cumsum(ilens[:-1]), axis=0)

        # final tanh operation
        xs = F.split_axis(F.tanh(F.vstack(xs)), np.cumsum(ilens[:-1]), axis=0)

        # 1 utterance case, it becomes an array, so need to make a utt tuple
        if not isinstance(xs, tuple):
            xs = [xs]

        return xs, ilens  # x: utt list of frame x dim
# TODO(watanabe) explanation of VGG2L, VGG2B (Block) might be better
class VGG2L(chainer.Chain):
    """VGG motivated cnn layers.

    Args:
        in_channel (int): Number of channels.

    """

    def __init__(self, in_channel=1):
        super(VGG2L, self).__init__()
        with self.init_scope():
            # CNN layer (VGG motivated)
            self.conv1_1 = L.Convolution2D(in_channel, 64, 3, stride=1, pad=1)
            self.conv1_2 = L.Convolution2D(64, 64, 3, stride=1, pad=1)
            self.conv2_1 = L.Convolution2D(64, 128, 3, stride=1, pad=1)
            self.conv2_2 = L.Convolution2D(128, 128, 3, stride=1, pad=1)

        self.in_channel = in_channel

    def __call__(self, xs, ilens):
        """VGG2L forward propagation.

        Args:
            xs (chainer.Variable): Batch of padded character ids. (B, Tmax)
            ilens (chainer.Variable): Batch of length of each features. (B,)

        Returns:
            chainer.Variable: Subsampled vector of xs.
            chainer.Variable: Subsampled vector of ilens.

        """
        logging.info(self.__class__.__name__ + " input lengths: " + str(ilens))

        # x: utt x frame x dim
        xs = F.pad_sequence(xs)

        # x: utt x 1 (input channel num) x frame x dim
        xs = F.swapaxes(
            xs.reshape(
                xs.shape[0],
                xs.shape[1],
                self.in_channel,
                xs.shape[2] // self.in_channel,
            ),
            1,
            2,
        )

        xs = F.relu(self.conv1_1(xs))
        xs = F.relu(self.conv1_2(xs))
        xs = F.max_pooling_2d(xs, 2, stride=2)

        xs = F.relu(self.conv2_1(xs))
        xs = F.relu(self.conv2_2(xs))
        xs = F.max_pooling_2d(xs, 2, stride=2)

        # change ilens accordingly
        # Two 2x max-poolings -> lengths are halved (ceil) twice.
        ilens = self.xp.array(
            self.xp.ceil(self.xp.array(ilens, dtype=np.float32) / 2), dtype=np.int32
        )
        ilens = self.xp.array(
            self.xp.ceil(self.xp.array(ilens, dtype=np.float32) / 2), dtype=np.int32
        )

        # x: utt_list of frame (remove zeropaded frames) x (input channel num x dim)
        xs = F.swapaxes(xs, 1, 2)
        xs = xs.reshape(xs.shape[0], xs.shape[1], xs.shape[2] * xs.shape[3])
        xs = [xs[i, : ilens[i], :] for i in range(len(ilens))]

        return xs, ilens
class Encoder(chainer.Chain):
    """Encoder network class.

    Args:
        etype (str): Type of encoder network, e.g. ``blstmp``, ``vgglstm``.
        idim (int): Number of dimensions of encoder network.
        elayers (int): Number of layers of encoder network.
        eunits (int): Number of lstm units of encoder network.
        eprojs (int): Number of projection units of encoder network.
        subsample (np.array): Subsampling number. e.g. 1_2_2_2_1
        dropout (float): Dropout rate.
        in_channel (int): Number of input channels of the VGG front-end.
    """

    def __init__(
        self, etype, idim, elayers, eunits, eprojs, subsample, dropout, in_channel=1
    ):
        super(Encoder, self).__init__()
        # Strip the optional "vgg" prefix and "p" (projection) suffix to get
        # the core RNN type.  NOTE: str.lstrip/rstrip strip *character sets*,
        # not prefixes/suffixes, so the previous
        # ``etype.lstrip("vgg").rstrip("p")`` turned "gru" into "ru";
        # explicit prefix/suffix removal avoids that bug.
        typ = etype[len("vgg"):] if etype.startswith("vgg") else etype
        if typ.endswith("p"):
            typ = typ[:-1]
        if typ not in ["lstm", "gru", "blstm", "bgru"]:
            logging.error("Error: need to specify an appropriate encoder architecture")
        with self.init_scope():
            if etype.startswith("vgg"):
                if etype[-1] == "p":
                    self.enc = chainer.Sequential(
                        VGG2L(in_channel),
                        RNNP(
                            get_vgg2l_odim(idim, in_channel=in_channel),
                            elayers,
                            eunits,
                            eprojs,
                            subsample,
                            dropout,
                            typ=typ,
                        ),
                    )
                    logging.info("Use CNN-VGG + " + typ.upper() + "P for encoder")
                else:
                    self.enc = chainer.Sequential(
                        VGG2L(in_channel),
                        RNN(
                            get_vgg2l_odim(idim, in_channel=in_channel),
                            elayers,
                            eunits,
                            eprojs,
                            dropout,
                            typ=typ,
                        ),
                    )
                    logging.info("Use CNN-VGG + " + typ.upper() + " for encoder")
                # VGG front-end subsamples time by 4 (two stride-2 poolings)
                self.conv_subsampling_factor = 4
            else:
                if etype[-1] == "p":
                    self.enc = chainer.Sequential(
                        RNNP(idim, elayers, eunits, eprojs, subsample, dropout, typ=typ)
                    )
                    logging.info(
                        typ.upper() + " with every-layer projection for encoder"
                    )
                else:
                    self.enc = chainer.Sequential(
                        RNN(idim, elayers, eunits, eprojs, dropout, typ=typ)
                    )
                    logging.info(typ.upper() + " without projection for encoder")
                self.conv_subsampling_factor = 1

    def __call__(self, xs, ilens):
        """Encoder forward.

        Args:
            xs (chainer.Variable): Batch of padded character ids. (B, Tmax)
            ilens (chainer.variable): Batch of length of each features. (B,)

        Returns:
            chainer.Variable: Output of the encoder.
            chainer.Variable: (Subsampled) vector of ilens.
        """
        xs, ilens = self.enc(xs, ilens)
        return xs, ilens
def encoder_for(args, idim, subsample):
    """Instantiate the Encoder module from a training config.

    Args:
        args (Namespace): Training config with ``etype``, ``elayers``,
            ``eunits``, ``eprojs`` and ``dropout_rate``.
        idim (int): Dimension of input array.
        subsample (numpy.array): Subsample number. egs).1_2_2_2_1

    Return
        chainer.nn.Module: Encoder module.
    """
    return Encoder(
        etype=args.etype,
        idim=idim,
        elayers=args.elayers,
        eunits=args.eunits,
        eprojs=args.eprojs,
        subsample=subsample,
        dropout=args.dropout_rate,
    )
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/rnn/training.py
0 → 100644
View file @
60a2c57a
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
import
collections
import
logging
import
math
import
numpy
as
np
# chainer related
from
chainer
import
Variable
,
cuda
,
training
from
chainer.training.updaters.multiprocess_parallel_updater
import
(
gather_grads
,
gather_params
,
scatter_grads
,
)
# copied from https://github.com/chainer/chainer/blob/master/chainer/optimizer.py
def sum_sqnorm(arr):
    """Return the total squared L2 norm of the given arrays.

    Args:
        arr: Iterable of numpy/cupy arrays (entries may be None and are
            skipped).

    Returns:
        Float: Sum of squared norms, accumulated per device.
    """
    per_device = collections.defaultdict(float)
    for grad in arr:
        with cuda.get_device_from_array(grad) as dev:
            if grad is None:
                continue
            flat = grad.ravel()
            per_device[int(dev)] += flat.dot(flat)
    return sum(float(v) for v in per_device.values())
class CustomUpdater(training.StandardUpdater):
    """Custom updater for chainer.

    Accumulates gradients over ``accum_grad`` forward/backward passes and
    only then applies the optimizer, skipping the step when the gradient
    norm is NaN.

    Args:
        train_iter (iterator | dict[str, iterator]): Dataset iterator for the
            training dataset (registered as ``'main'`` when a bare iterator).
        optimizer (optimizer | dict[str, optimizer]): Optimizer to update
            parameters (registered as ``'main'`` when a bare optimizer).
        converter (espnet.asr.chainer_backend.asr.CustomConverter): Converter
            function to build input arrays from each extracted batch.
        device (int or dict): The destination device info to send variables.
            In the case of cpu or single gpu, `device=-1 or 0`, respectively.
            In the case of multi-gpu, `device={"main":0, "sub_1": 1, ...}`.
        accum_grad (int): The number of gradient accumulation. if set to 2,
            the network parameters will be updated once in twice,
            i.e. actual batchsize will be doubled.
    """

    def __init__(self, train_iter, optimizer, converter, device, accum_grad=1):
        super(CustomUpdater, self).__init__(
            train_iter, optimizer, converter=converter, device=device
        )
        self.forward_count = 0
        self.accum_grad = accum_grad
        self.start = True
        # To solve #1091, it is required to set the variable inside this class.
        self.device = device

    # The core part of the update routine can be customized by overriding.
    def update_core(self):
        """Main update routine for Custom Updater."""
        iterator = self.get_iterator("main")
        optimizer = self.get_optimizer("main")
        model = optimizer.target

        # Fetch a batch and convert it into variables on the target device.
        inputs = self.converter(iterator.next(), self.device)
        if self.start:
            model.cleargrads()
            self.start = False

        # Forward + backprop, then truncate the graph to free memory.
        loss = model(*inputs) / self.accum_grad
        loss.backward()
        loss.unchain_backward()

        self.forward_count += 1
        if self.forward_count != self.accum_grad:
            return  # keep accumulating gradients
        self.forward_count = 0

        # Check the accumulated gradient norm before updating.
        grad_norm = np.sqrt(sum_sqnorm([p.grad for p in model.params(False)]))
        logging.info("grad norm={}".format(grad_norm))
        if math.isnan(grad_norm):
            logging.warning("grad norm is nan. Do not update model.")
        else:
            optimizer.update()
        # Clear the parameter gradients
        model.cleargrads()

    def update(self):
        """Run one update; count an iteration only per optimizer step."""
        self.update_core()
        if self.forward_count == 0:
            self.iteration += 1
class CustomParallelUpdater(training.updaters.MultiprocessParallelUpdater):
    """Custom Parallel Updater for chainer.

    Defines the main update routine.

    Args:
        train_iter (iterator | dict[str, iterator]): Dataset iterator for the
            training dataset. It can also be a dictionary that maps strings to
            iterators. If this is just an iterator, then the iterator is
            registered by the name ``'main'``.
        optimizer (optimizer | dict[str, optimizer]): Optimizer to update
            parameters. It can also be a dictionary that maps strings to
            optimizers. If this is just an optimizer, then the optimizer is
            registered by the name ``'main'``.
        converter (espnet.asr.chainer_backend.asr.CustomConverter): Converter
            function to build input arrays. Each batch extracted by the main
            iterator and the ``device`` option are passed to this function.
            :func:`chainer.dataset.concat_examples` is used by default.
        device (torch.device): Device to which the training data is sent.
            Negative value
            indicates the host memory (CPU).
        accum_grad (int):The number of gradient accumulation. if set to 2,
            the network parameters will be updated once in twice,
            i.e. actual batchsize will be doubled.
    """

    def __init__(self, train_iters, optimizer, converter, devices, accum_grad=1):
        """Initialize the parallel updater."""
        super(CustomParallelUpdater, self).__init__(
            train_iters, optimizer, converter=converter, devices=devices
        )
        # imported lazily so CPU-only installs do not require cupy at import time
        from cupy.cuda import nccl

        self.accum_grad = accum_grad  # backward passes per optimizer step
        self.forward_count = 0  # backward passes done since last step
        self.nccl = nccl

    # The core part of the update routine can be customized by overriding.
    def update_core(self):
        """Main Update routine of the custom parallel updater."""
        self.setup_workers()

        self._send_message(("update", None))
        with cuda.Device(self._devices[0]):
            # For reducing memory
            optimizer = self.get_optimizer("main")
            batch = self.get_iterator("main").next()
            x = self.converter(batch, self._devices[0])

            # loss is scaled so the accumulated gradients form an average
            loss = self._master(*x) / self.accum_grad
            loss.backward()
            loss.unchain_backward()

            # NCCL: reduce grads from all workers onto the master device
            null_stream = cuda.Stream.null
            if self.comm is not None:
                gg = gather_grads(self._master)
                self.comm.reduce(
                    gg.data.ptr,
                    gg.data.ptr,
                    gg.size,
                    self.nccl.NCCL_FLOAT,
                    self.nccl.NCCL_SUM,
                    0,
                    null_stream.ptr,
                )
                scatter_grads(self._master, gg)
                del gg

            # update parameters only once every `accum_grad` forward passes
            self.forward_count += 1
            if self.forward_count != self.accum_grad:
                return
            self.forward_count = 0

            # check gradient value
            grad_norm = np.sqrt(
                sum_sqnorm([p.grad for p in optimizer.target.params(False)])
            )
            logging.info("grad norm={}".format(grad_norm))

            # update; skip the step entirely when the gradient norm is NaN
            if math.isnan(grad_norm):
                logging.warning("grad norm is nan. Do not update model.")
            else:
                optimizer.update()
            self._master.cleargrads()

            # broadcast the updated master parameters back to all workers
            if self.comm is not None:
                gp = gather_params(self._master)
                self.comm.bcast(
                    gp.data.ptr, gp.size, self.nccl.NCCL_FLOAT, 0, null_stream.ptr
                )

    def update(self):
        # one trainer iteration corresponds to one actual optimizer step
        self.update_core()
        if self.forward_count == 0:
            self.iteration += 1
class CustomConverter(object):
    """Custom Converter.

    Args:
        subsampling_factor (int): The subsampling factor.
    """

    def __init__(self, subsampling_factor=1):
        self.subsampling_factor = subsampling_factor

    def __call__(self, batch, device):
        """Perform subsampling and device transfer.

        Args:
            batch (list): Batch that will be subsampled.
            device (device): GPU device (-1 means CPU).

        Returns:
            list of chainer.Variable: Subsampled input features.
            xp.array: Lengths of the mini-batch inputs.
            list of chainer.Variable: Target label sequences.
        """
        # pick the array module matching the target device
        xp = cuda.cupy if device != -1 else np

        # batch should be located in list
        assert len(batch) == 1
        inputs, targets = batch[0]

        # optionally drop frames for subsampling
        if self.subsampling_factor > 1:
            inputs = [feat[:: self.subsampling_factor, :] for feat in inputs]

        # lengths of the (possibly subsampled) input sequences
        lengths = [feat.shape[0] for feat in inputs]

        # wrap everything into chainer Variables on the chosen device
        xs = [Variable(xp.array(feat, dtype=xp.float32)) for feat in inputs]
        ilens = xp.array(lengths, dtype=xp.int32)
        ys = [Variable(xp.array(t, dtype=xp.int32)) for t in targets]

        return xs, ilens, ys
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/__init__.py
0 → 100644
View file @
60a2c57a
"""Initialize sub package."""
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/attention.py
0 → 100644
View file @
60a2c57a
# encoding: utf-8
"""Class Declaration of Transformer's Attention."""
import
chainer
import
chainer.functions
as
F
import
chainer.links
as
L
import
numpy
as
np
MIN_VALUE
=
float
(
np
.
finfo
(
np
.
float32
).
min
)
class MultiHeadAttention(chainer.Chain):
    """Multi Head Attention Layer.

    Args:
        n_units (int): Number of input units (model dimension).
        h (int): Number of attention heads.
        dropout (float): Dropout rate applied to the attention weights.
        initialW: Initializer to initialize the weight.
        initial_bias: Initializer to initialize the bias.
    """

    def __init__(self, n_units, h=8, dropout=0.1, initialW=None, initial_bias=None):
        """Initialize MultiHeadAttention."""
        super(MultiHeadAttention, self).__init__()
        assert n_units % h == 0
        stvd = 1.0 / np.sqrt(n_units)

        def projection():
            # square linear map; all four projections share the same init scale
            return L.Linear(
                n_units,
                n_units,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )

        with self.init_scope():
            self.linear_q = projection()
            self.linear_k = projection()
            self.linear_v = projection()
            self.linear_out = projection()
        self.d_k = n_units // h
        self.h = h
        self.dropout = dropout
        self.attn = None  # attention weights of the last forward pass

    def forward(self, e_var, s_var=None, mask=None, batch=1):
        """Core function of the Multi-head attention layer.

        Args:
            e_var (chainer.Variable): Variable of input array.
            s_var (chainer.Variable): Variable of source array from encoder;
                when None, self-attention over ``e_var`` is computed.
            mask (chainer.Variable): Attention mask.
            batch (int): Batch size.

        Returns:
            chainer.Variable: Output of multi-head attention layer.
        """
        xp = self.xp
        # keys/values come from e_var (self-attention) or s_var (src-attention)
        kv_input = e_var if s_var is None else s_var

        def split_heads(linear, x):
            # (batch*time, n_units) -> (batch, time, h, d_k)
            return linear(x).reshape(batch, -1, self.h, self.d_k)

        q = split_heads(self.linear_q, e_var)
        k = split_heads(self.linear_k, kv_input)
        v = split_heads(self.linear_v, kv_input)

        # scaled dot-product scores: (batch, h, time_q, time_k)
        scores = F.matmul(
            F.swapaxes(q, 1, 2), k.transpose(0, 2, 3, 1)
        ) / np.sqrt(self.d_k)
        if mask is not None:
            # replicate the mask across heads and blank out masked positions
            mask = xp.stack([mask] * self.h, axis=1)
            scores = F.where(mask, scores, xp.full(scores.shape, MIN_VALUE, "f"))
        self.attn = F.softmax(scores, axis=-1)
        weights = F.dropout(self.attn, self.dropout)
        context = F.matmul(weights, F.swapaxes(v, 1, 2))
        # merge heads back: (batch*time_q, h*d_k)
        context = F.swapaxes(context, 1, 2).reshape(-1, self.h * self.d_k)
        return self.linear_out(context)
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/ctc.py
0 → 100644
View file @
60a2c57a
# encoding: utf-8
"""Class Declaration of Transformer's CTC."""
import
logging
import
chainer
import
chainer.functions
as
F
import
chainer.links
as
L
import
numpy
as
np
# TODO(nelson): Merge chainer_backend/transformer/ctc.py in chainer_backend/ctc.py
class CTC(chainer.Chain):
    """Chainer implementation of a CTC output layer.

    Args:
        odim (int): The output dimension.
        eprojs (int | None): Dimension of input vectors from encoder.
        dropout_rate (float): Dropout rate.
    """

    def __init__(self, odim, eprojs, dropout_rate):
        """Initialize CTC."""
        super(CTC, self).__init__()
        self.dropout_rate = dropout_rate
        self.loss = None
        with self.init_scope():
            self.ctc_lo = L.Linear(eprojs, odim)

    def __call__(self, hs, ys):
        """CTC forward.

        Args:
            hs (list of chainer.Variable | N-dimension array):
                Input variable from encoder.
            ys (list of chainer.Variable | N-dimension array):
                Input variable of decoder.

        Returns:
            chainer.Variable: A variable holding a scalar value of the CTC loss.
        """
        self.loss = None
        hlens = [h.shape[0] for h in hs]
        ylens = [y.shape[0] for y in ys]

        # project padded encoder states (with dropout) to label logits
        logits = self.ctc_lo(
            F.dropout(F.pad_sequence(hs), ratio=self.dropout_rate), n_batch_axes=2
        )
        # time-major list: one (batch, odim) array per time step
        logits = F.separate(logits, axis=1)

        # pad targets with -1 so padded positions are ignored
        targets = F.pad_sequence(ys, padding=-1)  # batch x olen

        # sequence-length info for the CTC function
        input_length = chainer.Variable(self.xp.array(hlens, dtype=np.int32))
        label_length = chainer.Variable(self.xp.array(ylens, dtype=np.int32))
        logging.info(
            self.__class__.__name__ + " input lengths: " + str(input_length.data)
        )
        logging.info(
            self.__class__.__name__ + " output lengths: " + str(label_length.data)
        )

        # blank symbol id is 0
        self.loss = F.connectionist_temporal_classification(
            logits, targets, 0, input_length, label_length
        )
        logging.info("ctc loss:" + str(self.loss.data))

        return self.loss

    def log_softmax(self, hs):
        """Log_softmax of frame activations.

        Args:
            hs (list of chainer.Variable | N-dimension array):
                Input variable from encoder.

        Returns:
            chainer.Variable: A n-dimension float array.
        """
        logits = self.ctc_lo(F.pad_sequence(hs), n_batch_axes=2)
        flat = logits.reshape(-1, logits.shape[-1])
        return F.log_softmax(flat).reshape(logits.shape)
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/decoder.py
0 → 100644
View file @
60a2c57a
# encoding: utf-8
"""Class Declaration of Transformer's Decoder."""
import
chainer
import
chainer.functions
as
F
import
chainer.links
as
L
import
numpy
as
np
from
espnet.nets.chainer_backend.transformer.decoder_layer
import
DecoderLayer
from
espnet.nets.chainer_backend.transformer.embedding
import
PositionalEncoding
from
espnet.nets.chainer_backend.transformer.layer_norm
import
LayerNorm
from
espnet.nets.chainer_backend.transformer.mask
import
make_history_mask
class Decoder(chainer.Chain):
    """Decoder layer.

    Args:
        odim (int): The output dimension.
        args (Namespace): Training config providing ``adim``, ``dunits``,
            ``aheads``, ``dlayers`` and ``dropout_rate``.
        initialW (Initializer): Initializer to initialize the weight.
        initial_bias (Initializer): Initializer to initialize the bias.
    """

    def __init__(self, odim, args, initialW=None, initial_bias=None):
        """Initialize Decoder."""
        super(Decoder, self).__init__()
        # <sos> and <eos> share the last vocabulary id
        self.sos = odim - 1
        self.eos = odim - 1
        initialW = chainer.initializers.Uniform if initialW is None else initialW
        initial_bias = (
            chainer.initializers.Uniform if initial_bias is None else initial_bias
        )
        with self.init_scope():
            self.output_norm = LayerNorm(args.adim)
            self.pe = PositionalEncoding(args.adim, args.dropout_rate)
            stvd = 1.0 / np.sqrt(args.adim)
            self.output_layer = L.Linear(
                args.adim,
                odim,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )
            self.embed = L.EmbedID(
                odim,
                args.adim,
                ignore_label=-1,
                initialW=chainer.initializers.Normal(scale=1.0),
            )
            # decoder blocks are registered as links "decoders.0", "decoders.1", ...
            for i in range(args.dlayers):
                name = "decoders." + str(i)
                layer = DecoderLayer(
                    args.adim,
                    d_units=args.dunits,
                    h=args.aheads,
                    dropout=args.dropout_rate,
                    initialW=initialW,
                    initial_bias=initial_bias,
                )
                self.add_link(name, layer)
        self.n_layers = args.dlayers

    def make_attention_mask(self, source_block, target_block):
        """Prepare the attention mask.

        Args:
            source_block (ndarray): Source block with dimensions: (B x S).
            target_block (ndarray): Target block with dimensions: (B x T).

        Returns:
            ndarray: Mask with dimensions (B, S, T).
        """
        # a position pair is valid iff neither side is padding (label < 0)
        mask = (target_block[:, None, :] >= 0) * (source_block[:, :, None] >= 0)
        # (batch, source_length, target_length)
        return mask

    def forward(self, ys_pad, source, x_mask):
        """Forward decoder.

        :param xp.array ys_pad: input token ids, int64 (batch, maxlen_out)
        :param xp.array source: encoded memory, float32 (batch, maxlen_in, feat)
        :param xp.array x_mask: encoded memory mask, uint8 (batch, maxlen_in)
        :return e: decoded token score before softmax (batch, maxlen_out, token)
        :rtype: chainer.Variable
        """
        xp = self.xp
        # prepend <sos> to every target sequence and pad with <eos>
        sos = np.array([self.sos], np.int32)
        ys = [np.concatenate([sos, y], axis=0) for y in ys_pad]
        e = F.pad_sequence(ys, padding=self.eos).data
        e = xp.array(e)
        # mask preparation
        xy_mask = self.make_attention_mask(e, xp.array(x_mask))
        yy_mask = self.make_attention_mask(e, e)
        # forbid attending to future target positions (causal mask)
        yy_mask *= make_history_mask(xp, e)

        # embed tokens and add positional encodings
        e = self.pe(self.embed(e))
        batch, length, dims = e.shape
        # decoder layers operate on flattened (batch*time, dims) arrays;
        # NOTE(review): this assumes source's last dim equals `dims` — the
        # encoder output dim must match args.adim
        e = e.reshape(-1, dims)
        source = source.reshape(-1, dims)
        for i in range(self.n_layers):
            e = self["decoders." + str(i)](e, source, xy_mask, yy_mask, batch)
        return self.output_layer(self.output_norm(e)).reshape(batch, length, -1)

    def recognize(self, e, yy_mask, source):
        """Process recognition function."""
        e = self.forward(e, source, yy_mask)
        return F.log_softmax(e, axis=-1)
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/decoder_layer.py
0 → 100644
View file @
60a2c57a
# encoding: utf-8
"""Class Declaration of Transformer's Decoder Block."""
import
chainer
import
chainer.functions
as
F
from
espnet.nets.chainer_backend.transformer.attention
import
MultiHeadAttention
from
espnet.nets.chainer_backend.transformer.layer_norm
import
LayerNorm
from
espnet.nets.chainer_backend.transformer.positionwise_feed_forward
import
(
PositionwiseFeedForward
,
)
class DecoderLayer(chainer.Chain):
    """Single decoder layer module.

    Pre-norm Transformer block: self-attention, source-attention and a
    position-wise feed-forward network, each wrapped in a residual
    connection with dropout.

    Args:
        n_units (int): Number of input/output dimension of a FeedForward layer.
        d_units (int): Number of units of hidden layer in a FeedForward layer.
        h (int): Number of attention heads.
        dropout (float): Dropout rate
    """

    def __init__(
        self, n_units, d_units=0, h=8, dropout=0.1, initialW=None, initial_bias=None
    ):
        """Initialize DecoderLayer."""
        super(DecoderLayer, self).__init__()
        shared = dict(dropout=dropout, initialW=initialW, initial_bias=initial_bias)
        with self.init_scope():
            self.self_attn = MultiHeadAttention(n_units, h, **shared)
            self.src_attn = MultiHeadAttention(n_units, h, **shared)
            self.feed_forward = PositionwiseFeedForward(
                n_units, d_units=d_units, **shared
            )
            self.norm1 = LayerNorm(n_units)
            self.norm2 = LayerNorm(n_units)
            self.norm3 = LayerNorm(n_units)
        self.dropout = dropout

    def forward(self, e, s, xy_mask, yy_mask, batch):
        """Compute decoder layer.

        Args:
            e (chainer.Variable): Batch of padded features. (B, Lmax)
            s (chainer.Variable): Batch of padded character. (B, Tmax)

        Returns:
            chainer.Variable: Computed variable of decoder.
        """
        def residual(x, sublayer_out):
            # residual connection with dropout on the sub-layer output
            return x + F.dropout(sublayer_out, self.dropout)

        e = residual(e, self.self_attn(self.norm1(e), mask=yy_mask, batch=batch))
        e = residual(
            e, self.src_attn(self.norm2(e), s_var=s, mask=xy_mask, batch=batch)
        )
        e = residual(e, self.feed_forward(self.norm3(e)))
        return e
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/embedding.py
0 → 100644
View file @
60a2c57a
# encoding: utf-8
"""Class Declaration of Transformer's Positional Encoding."""
import
chainer
import
chainer.functions
as
F
import
numpy
as
np
class PositionalEncoding(chainer.Chain):
    """Positional encoding module.

    :param int n_units: embedding dim
    :param float dropout: dropout rate
    :param int length: maximum input length
    """

    def __init__(self, n_units, dropout=0.1, length=5000):
        """Initialize Positional Encoding."""
        # Implementation described in the paper
        super(PositionalEncoding, self).__init__()
        self.dropout = dropout
        # positions (length, 1) times inverse frequencies (n_units/2,)
        positions = np.arange(0, length, dtype=np.float32)[:, None]
        inv_freq = np.exp(
            np.arange(0, n_units, 2, dtype=np.float32)
            * -(np.log(10000.0) / n_units)
        )
        table = np.zeros((length, n_units), dtype=np.float32)
        table[:, 0::2] = np.sin(positions * inv_freq)  # even dims: sine
        table[:, 1::2] = np.cos(positions * inv_freq)  # odd dims: cosine
        self.pe = table
        self.scale = np.sqrt(n_units)

    def forward(self, e):
        """Add scaled positional encodings, then apply dropout."""
        steps = e.shape[1]
        e = e * self.scale + self.xp.array(self.pe[:steps])
        return F.dropout(e, self.dropout)
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/encoder.py
0 → 100644
View file @
60a2c57a
# encoding: utf-8
"""Class Declaration of Transformer's Encoder."""
import
logging
import
chainer
import
numpy
as
np
from
chainer
import
links
as
L
from
espnet.nets.chainer_backend.transformer.embedding
import
PositionalEncoding
from
espnet.nets.chainer_backend.transformer.encoder_layer
import
EncoderLayer
from
espnet.nets.chainer_backend.transformer.layer_norm
import
LayerNorm
from
espnet.nets.chainer_backend.transformer.mask
import
make_history_mask
from
espnet.nets.chainer_backend.transformer.subsampling
import
(
Conv2dSubsampling
,
LinearSampling
,
)
class Encoder(chainer.Chain):
    """Encoder.

    Args:
        idim (int): Dimension of inputs.
        attention_dim (int): Dimension of the attention blocks.
        attention_heads (int): Number of attention heads.
        linear_units (int): Number of units of hidden layer in each
            FeedForward layer.
        num_blocks (int): Number of encoder layers.
        dropout_rate (float): Dropout rate of the input layer.
        positional_dropout_rate (float): Dropout rate of the positional
            encoding (used by the ``embed`` input layer).
        attention_dropout_rate (float): Dropout rate inside the encoder layers.
        input_layer (str):
            Sampling type. `input_layer` must be `conv2d`, `linear` or
            `embed` currently.
        pos_enc_class: Positional encoding class for the ``embed`` input layer.
        initialW: Initializer to initialize the weight.
        initial_bias: Initializer to initialize the bias.
    """

    def __init__(
        self,
        idim,
        attention_dim=256,
        attention_heads=4,
        linear_units=2048,
        num_blocks=6,
        dropout_rate=0.1,
        positional_dropout_rate=0.1,
        attention_dropout_rate=0.0,
        input_layer="conv2d",
        pos_enc_class=PositionalEncoding,
        initialW=None,
        initial_bias=None,
    ):
        """Initialize Encoder.

        Args:
            idim (int): Input dimension.
            args (Namespace): Training config.
            initialW (int, optional): Initializer to initialize the weight.
            initial_bias (bool, optional): Initializer to initialize the bias.
        """
        super(Encoder, self).__init__()
        initialW = chainer.initializers.Uniform if initialW is None else initialW
        initial_bias = (
            chainer.initializers.Uniform if initial_bias is None else initial_bias
        )
        self.do_history_mask = False
        with self.init_scope():
            self.conv_subsampling_factor = 1
            channels = 64  # Based in paper
            if input_layer == "conv2d":
                # flattened dim after two stride-2 convolutions, times channels
                idim = int(np.ceil(np.ceil(idim / 2) / 2)) * channels
                self.input_layer = Conv2dSubsampling(
                    channels,
                    idim,
                    attention_dim,
                    dropout=dropout_rate,
                    initialW=initialW,
                    initial_bias=initial_bias,
                )
                self.conv_subsampling_factor = 4
            elif input_layer == "linear":
                self.input_layer = LinearSampling(
                    idim, attention_dim, initialW=initialW, initial_bias=initial_bias
                )
            elif input_layer == "embed":
                self.input_layer = chainer.Sequential(
                    L.EmbedID(idim, attention_dim, ignore_label=-1),
                    pos_enc_class(attention_dim, positional_dropout_rate),
                )
                # embedded (token) inputs additionally get a causal mask
                self.do_history_mask = True
            else:
                raise ValueError("unknown input_layer: " + input_layer)
            self.norm = LayerNorm(attention_dim)
            # encoder blocks are registered as links "encoders.0", "encoders.1", ...
            for i in range(num_blocks):
                name = "encoders." + str(i)
                layer = EncoderLayer(
                    attention_dim,
                    d_units=linear_units,
                    h=attention_heads,
                    dropout=attention_dropout_rate,
                    initialW=initialW,
                    initial_bias=initial_bias,
                )
                self.add_link(name, layer)
        self.n_layers = num_blocks

    def forward(self, e, ilens):
        """Compute Encoder layer.

        Args:
            e (chainer.Variable): Batch of padded character. (B, Tmax)
            ilens (chainer.Variable): Batch of length of each input batch. (B,)

        Returns:
            chainer.Variable: Computed variable of encoder.
            numpy.array: Mask.
            chainer.Variable: Batch of lengths of each encoder outputs.
        """
        # conv2d subsampling also shortens ilens; other input layers do not
        if isinstance(self.input_layer, Conv2dSubsampling):
            e, ilens = self.input_layer(e, ilens)
        else:
            e = self.input_layer(e)
        batch, length, dims = e.shape
        # mark padded frames with -1 so they can be masked out below
        x_mask = np.ones([batch, length])
        for j in range(batch):
            x_mask[j, ilens[j] :] = -1
        # pairwise validity mask: both positions must be real frames
        xx_mask = (x_mask[:, None, :] >= 0) * (x_mask[:, :, None] >= 0)
        xx_mask = self.xp.array(xx_mask)
        if self.do_history_mask:
            # restrict attention to past/current positions (causal)
            history_mask = make_history_mask(self.xp, x_mask)
            xx_mask *= history_mask
        logging.debug("encoders size: " + str(e.shape))
        # encoder layers operate on flattened (batch*time, dims) arrays
        e = e.reshape(-1, dims)
        for i in range(self.n_layers):
            e = self["encoders." + str(i)](e, xx_mask, batch)
        return self.norm(e).reshape(batch, length, -1), x_mask, ilens
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/encoder_layer.py
0 → 100644
View file @
60a2c57a
# encoding: utf-8
"""Class Declaration of Transformer's Encoder Block."""
import
chainer
import
chainer.functions
as
F
from
espnet.nets.chainer_backend.transformer.attention
import
MultiHeadAttention
from
espnet.nets.chainer_backend.transformer.layer_norm
import
LayerNorm
from
espnet.nets.chainer_backend.transformer.positionwise_feed_forward
import
(
PositionwiseFeedForward
,
)
class EncoderLayer(chainer.Chain):
    """Single encoder layer module.

    Pre-norm Transformer block: self-attention followed by a position-wise
    feed-forward network, each wrapped in a residual connection with dropout.

    Args:
        n_units (int): Number of input/output dimension of a FeedForward layer.
        d_units (int): Number of units of hidden layer in a FeedForward layer.
        h (int): Number of attention heads.
        dropout (float): Dropout rate
    """

    def __init__(
        self, n_units, d_units=0, h=8, dropout=0.1, initialW=None, initial_bias=None
    ):
        """Initialize EncoderLayer."""
        super(EncoderLayer, self).__init__()
        shared = dict(dropout=dropout, initialW=initialW, initial_bias=initial_bias)
        with self.init_scope():
            self.self_attn = MultiHeadAttention(n_units, h, **shared)
            self.feed_forward = PositionwiseFeedForward(
                n_units, d_units=d_units, **shared
            )
            self.norm1 = LayerNorm(n_units)
            self.norm2 = LayerNorm(n_units)
        self.dropout = dropout
        self.n_units = n_units

    def forward(self, e, xx_mask, batch):
        """Compute the encoder layer on a flattened (batch*time, dims) input."""
        def residual(x, sublayer_out):
            # residual connection with dropout on the sub-layer output
            return x + F.dropout(sublayer_out, self.dropout)

        e = residual(e, self.self_attn(self.norm1(e), mask=xx_mask, batch=batch))
        e = residual(e, self.feed_forward(self.norm2(e)))
        return e
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/label_smoothing_loss.py
0 → 100644
View file @
60a2c57a
# encoding: utf-8
"""Class Declaration of Transformer's Label Smootion loss."""
import
logging
import
chainer
import
chainer.functions
as
F
class LabelSmoothingLoss(chainer.Chain):
    """Label Smoothing Loss.

    Args:
        smoothing (float): smoothing rate (0.0 means the conventional CE).
        n_target_vocab (int): number of classes.
        normalize_length (bool): normalize loss by sequence length if True.
        ignore_id (int): label id treated as padding and excluded from the loss.
    """

    def __init__(self, smoothing, n_target_vocab, normalize_length=False, ignore_id=-1):
        """Initialize Loss."""
        super(LabelSmoothingLoss, self).__init__()
        self.use_label_smoothing = False
        if smoothing > 0.0:
            logging.info("Use label smoothing")
            self.smoothing = smoothing
            # weight of the true-label (cross-entropy) term
            self.confidence = 1.0 - smoothing
            self.use_label_smoothing = True
        self.n_target_vocab = n_target_vocab
        self.normalize_length = normalize_length
        self.ignore_id = ignore_id
        self.acc = None

    def forward(self, ys_block, ys_pad):
        """Forward Loss.

        Args:
            ys_block (chainer.Variable): Predicted labels. (B, Lmax, odim)
            ys_pad (chainer.Variable): Target (true) labels. (B, Lmax)

        Returns:
            chainer.Variable: Training loss (scalar).
        """
        # Output (all together at once for efficiency)
        batch, length, dims = ys_block.shape
        concat_logit_block = ys_block.reshape(-1, dims)
        # Target reshape
        concat_t_block = ys_pad.reshape((batch * length))
        # positions with negative labels are padding and carry no loss
        ignore_mask = concat_t_block >= 0
        n_token = ignore_mask.sum()
        # normalize by token count or by batch size
        normalizer = n_token if self.normalize_length else batch
        if not self.use_label_smoothing:
            loss = F.softmax_cross_entropy(concat_logit_block, concat_t_block)
            # rescale from chainer's default per-token mean to `normalizer`
            loss = loss * n_token / normalizer
        else:
            log_prob = F.log_softmax(concat_logit_block)
            broad_ignore_mask = self.xp.broadcast_to(
                ignore_mask[:, None], concat_logit_block.shape
            )
            # log-likelihood of the true labels, with padding masked out
            pre_loss = (
                ignore_mask
                * log_prob[self.xp.arange(batch * length), concat_t_block]
            )
            loss = -F.sum(pre_loss) / normalizer
            # uniform-distribution term of the smoothed target
            label_smoothing = broad_ignore_mask * -1.0 / self.n_target_vocab * log_prob
            label_smoothing = F.sum(label_smoothing) / normalizer
            # convex combination of CE term and uniform term
            loss = self.confidence * loss + self.smoothing * label_smoothing
        return loss
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/layer_norm.py
0 → 100644
View file @
60a2c57a
# encoding: utf-8
"""Class Declaration of Transformer's Label Smootion loss."""
import
chainer.links
as
L
class LayerNorm(L.LayerNormalization):
    """Thin wrapper that redirects to ``L.LayerNormalization``."""

    def __init__(self, dims, eps=1e-12):
        """Initialize LayerNorm.

        Args:
            dims (int): Size of the normalized (last) axis.
            eps (float, optional): Small constant for numerical stability.
        """
        super().__init__(size=dims, eps=eps)

    def __call__(self, e):
        """Apply layer normalization to ``e`` and return the result."""
        return super().__call__(e)
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/mask.py
0 → 100644
View file @
60a2c57a
"""Create mask for subsequent steps."""
def make_history_mask(xp, block):
    """Build the causal (lower-triangular) self-attention mask.

    Args:
        xp: Array module to use (``numpy`` or ``cupy``).
        block (ndarray): Token-id batch with dimensions (B x S).

    Returns:
        ndarray: Boolean mask with dimensions (B, S, S), where entry
        (b, i, j) is True iff position j is not after position i.
    """
    n_batch, n_steps = block.shape
    steps = xp.arange(n_steps)
    # (S, S) lower-triangular pattern: row i may attend to columns <= i.
    causal = steps[None, :] <= steps[:, None]
    # Share the same pattern across the batch without copying.
    return xp.broadcast_to(causal[None, :, :], (n_batch, n_steps, n_steps))
conformer/espnet-v.202304_20240621/build/lib/espnet/nets/chainer_backend/transformer/positionwise_feed_forward.py
0 → 100644
View file @
60a2c57a
# encoding: utf-8
"""Class Declaration of Transformer's Positionwise Feedforward."""
import
chainer
import
chainer.functions
as
F
import
chainer.links
as
L
import
numpy
as
np
class PositionwiseFeedForward(chainer.Chain):
    """Position-wise two-layer feed-forward network.

    Applies the same ``Linear -> ReLU -> dropout -> Linear`` transform to
    every position independently.

    Args:
        n_units (int): Input (and output) dimension.
        d_units (int, optional): Hidden-layer dimension; ``n_units * 4``
            when 0.
        dropout (float, optional): Dropout ratio applied after the
            activation.
        initialW (callable, optional): Factory returning a weight
            initializer when called with ``scale=``.
        initial_bias (callable, optional): Factory returning a bias
            initializer when called with ``scale=``.
    """

    def __init__(self, n_units, d_units=0, dropout=0.1, initialW=None, initial_bias=None):
        """Initialize PositionwiseFeedForward."""
        super(PositionwiseFeedForward, self).__init__()
        hidden_units = n_units * 4 if d_units <= 0 else d_units
        with self.init_scope():
            # Scale each layer's initializers by 1/sqrt(fan-in).
            scale_in = 1.0 / np.sqrt(n_units)
            self.w_1 = L.Linear(
                n_units,
                hidden_units,
                initialW=initialW(scale=scale_in),
                initial_bias=initial_bias(scale=scale_in),
            )
            scale_hidden = 1.0 / np.sqrt(hidden_units)
            self.w_2 = L.Linear(
                hidden_units,
                n_units,
                initialW=initialW(scale=scale_hidden),
                initial_bias=initial_bias(scale=scale_hidden),
            )
        self.act = F.relu
        self.dropout = dropout

    def __call__(self, e):
        """Apply the feed-forward transform.

        Args:
            e (chainer.Variable): Input variable.

        Returns:
            chainer.Variable: Output variable with the same last dimension
            as the input.
        """
        hidden = F.dropout(self.act(self.w_1(e)), self.dropout)
        return self.w_2(hidden)
Prev
1
2
3
4
5
6
7
8
9
…
11
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment