Commit 60a2c57a authored by sunzhq2, committed by xuxo
Browse files

update conformer

parent 4a699441
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""RNN sequence-to-sequence speech recognition model (chainer)."""
import logging
import math
import chainer
import numpy as np
from chainer import reporter
from espnet.nets.chainer_backend.asr_interface import ChainerASRInterface
from espnet.nets.chainer_backend.ctc import ctc_for
from espnet.nets.chainer_backend.rnn.attentions import att_for
from espnet.nets.chainer_backend.rnn.decoders import decoder_for
from espnet.nets.chainer_backend.rnn.encoders import encoder_for
from espnet.nets.e2e_asr_common import label_smoothing_dist
from espnet.nets.pytorch_backend.e2e_asr import E2E as E2E_pytorch
from espnet.nets.pytorch_backend.nets_utils import get_subsample
CTC_LOSS_THRESHOLD = 10000
class E2E(ChainerASRInterface):
    """E2E module for chainer backend.

    Hybrid CTC/attention RNN speech-recognition model; the two losses are
    interpolated with weight ``mtlalpha``.

    Args:
        idim (int): Dimension of the inputs.
        odim (int): Dimension of the outputs.
        args (parser.args): Training config.
        flag_return (bool): If True, train() would return
            additional metrics in addition to the training
            loss.
    """

    @staticmethod
    def add_arguments(parser):
        """Add arguments."""
        # Reuse the pytorch-backend definitions so both backends accept
        # identical command-line flags.
        return E2E_pytorch.add_arguments(parser)

    def get_total_subsampling_factor(self):
        """Get total subsampling factor."""
        # Convolutional subsampling inside the encoder multiplied by the
        # frame-skipping factors configured in self.subsample.
        return self.enc.conv_subsampling_factor * int(np.prod(self.subsample))

    def __init__(self, idim, odim, args, flag_return=True):
        """Construct an E2E object.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs
        :param Namespace args: argument Namespace containing options
        """
        chainer.Chain.__init__(self)
        # weight of the CTC loss in the CTC/attention multi-task objective
        self.mtlalpha = args.mtlalpha
        assert 0 <= self.mtlalpha <= 1, "mtlalpha must be [0,1]"
        self.etype = args.etype
        self.verbose = args.verbose
        self.char_list = args.char_list
        self.outdir = args.outdir
        # below means the last number becomes eos/sos ID
        # note that sos/eos IDs are identical
        self.sos = odim - 1
        self.eos = odim - 1
        # subsample info
        self.subsample = get_subsample(args, mode="asr", arch="rnn")
        # label smoothing info
        if args.lsm_type:
            logging.info("Use label smoothing with " + args.lsm_type)
            labeldist = label_smoothing_dist(
                odim, args.lsm_type, transcript=args.train_json
            )
        else:
            labeldist = None
        # register sub-links so their parameters are tracked by chainer
        with self.init_scope():
            # encoder
            self.enc = encoder_for(args, idim, self.subsample)
            # ctc
            self.ctc = ctc_for(args, odim)
            # attention
            self.att = att_for(args)
            # decoder
            self.dec = decoder_for(args, odim, self.sos, self.eos, self.att, labeldist)
        self.acc = None
        self.loss = None
        self.flag_return = flag_return

    def forward(self, xs, ilens, ys):
        """E2E forward propagation.

        Args:
            xs (chainer.Variable): Batch of padded character ids. (B, Tmax)
            ilens (chainer.Variable): Batch of length of each input batch. (B,)
            ys (chainer.Variable): Batch of padded target features. (B, Lmax, odim)

        Returns:
            float: Loss that calculated by attention and ctc loss.
            float (optional): Ctc loss.
            float (optional): Attention loss.
            float (optional): Accuracy.
        """
        # 1. encoder
        hs, ilens = self.enc(xs, ilens)
        # 2. CTC loss (skipped entirely when the CTC weight is zero)
        if self.mtlalpha == 0:
            loss_ctc = None
        else:
            loss_ctc = self.ctc(hs, ys)
        # 3. attention loss (skipped when training is pure CTC)
        if self.mtlalpha == 1:
            loss_att = None
            acc = None
        else:
            loss_att, acc = self.dec(hs, ys)
        self.acc = acc
        # interpolate the two losses with weight alpha
        alpha = self.mtlalpha
        if alpha == 0:
            self.loss = loss_att
        elif alpha == 1:
            self.loss = loss_ctc
        else:
            self.loss = alpha * loss_ctc + (1 - alpha) * loss_att
        # report metrics only while the loss is finite and below the
        # divergence threshold
        if self.loss.data < CTC_LOSS_THRESHOLD and not math.isnan(self.loss.data):
            reporter.report({"loss_ctc": loss_ctc}, self)
            reporter.report({"loss_att": loss_att}, self)
            reporter.report({"acc": acc}, self)
            logging.info("mtl loss:" + str(self.loss.data))
            reporter.report({"loss": self.loss}, self)
        else:
            logging.warning("loss (=%f) is not correct", self.loss.data)
        if self.flag_return:
            return self.loss, loss_ctc, loss_att, acc
        else:
            return self.loss

    def recognize(self, x, recog_args, char_list, rnnlm=None):
        """E2E greedy/beam search.

        Args:
            x (chainer.Variable): Input tensor for recognition.
            recog_args (parser.args): Arguments of config file.
            char_list (List[str]): List of Characters.
            rnnlm (Module): RNNLM module defined at `espnet.lm.chainer_backend.lm`.

        Returns:
            List[Dict[str, Any]]: Result of recognition.
        """
        # subsample frame
        x = x[:: self.subsample[0], :]
        ilen = self.xp.array(x.shape[0], dtype=np.int32)
        h = chainer.Variable(self.xp.array(x, dtype=np.float32))
        with chainer.no_backprop_mode(), chainer.using_config("train", False):
            # 1. encoder
            # make a utt list (1) to use the same interface for encoder
            h, _ = self.enc([h], [ilen])
            # calculate log P(z_t|X) for CTC scores
            if recog_args.ctc_weight > 0.0:
                lpz = self.ctc.log_softmax(h).data[0]
            else:
                lpz = None
            # 2. decoder
            # decode the first utterance
            y = self.dec.recognize_beam(h[0], lpz, recog_args, char_list, rnnlm)
            return y

    def calculate_all_attentions(self, xs, ilens, ys):
        """E2E attention calculation.

        Args:
            xs (List): List of padded input sequences. [(T1, idim), (T2, idim), ...]
            ilens (np.ndarray): Batch of lengths of input sequences. (B)
            ys (List): List of character id sequence tensor. [(L1), (L2), (L3), ...]

        Returns:
            float np.ndarray: Attention weights. (B, Lmax, Tmax)
        """
        hs, ilens = self.enc(xs, ilens)
        att_ws = self.dec.calculate_all_attentions(hs, ys)
        return att_ws

    @staticmethod
    def custom_converter(subsampling_factor=0):
        """Get customconverter of the model."""
        # imported lazily to avoid a training-only dependency at model load
        from espnet.nets.chainer_backend.rnn.training import CustomConverter

        return CustomConverter(subsampling_factor=subsampling_factor)

    @staticmethod
    def custom_updater(iters, optimizer, converter, device=-1, accum_grad=1):
        """Get custom_updater of the model."""
        from espnet.nets.chainer_backend.rnn.training import CustomUpdater

        return CustomUpdater(
            iters, optimizer, converter=converter, device=device, accum_grad=accum_grad
        )

    @staticmethod
    def custom_parallel_updater(iters, optimizer, converter, devices, accum_grad=1):
        """Get custom_parallel_updater of the model."""
        from espnet.nets.chainer_backend.rnn.training import CustomParallelUpdater

        return CustomParallelUpdater(
            iters,
            optimizer,
            converter=converter,
            devices=devices,
            accum_grad=accum_grad,
        )
# encoding: utf-8
"""Transformer-based model for End-to-end ASR."""
import logging
import math
from argparse import Namespace
from distutils.util import strtobool
import chainer
import chainer.functions as F
import numpy as np
from chainer import reporter
from espnet.nets.chainer_backend.asr_interface import ChainerASRInterface
from espnet.nets.chainer_backend.transformer import ctc
from espnet.nets.chainer_backend.transformer.attention import MultiHeadAttention
from espnet.nets.chainer_backend.transformer.decoder import Decoder
from espnet.nets.chainer_backend.transformer.encoder import Encoder
from espnet.nets.chainer_backend.transformer.label_smoothing_loss import ( # noqa: H301
LabelSmoothingLoss,
)
from espnet.nets.chainer_backend.transformer.training import ( # noqa: H301
CustomConverter,
CustomParallelUpdater,
CustomUpdater,
)
from espnet.nets.ctc_prefix_score import CTCPrefixScore
from espnet.nets.e2e_asr_common import ErrorCalculator, end_detect
from espnet.nets.pytorch_backend.nets_utils import get_subsample
from espnet.nets.pytorch_backend.transformer.plot import PlotAttentionReport
CTC_SCORING_RATIO = 1.5
MAX_DECODER_OUTPUT = 5
class E2E(ChainerASRInterface):
    """E2E module.

    Transformer-based hybrid CTC/attention ASR model for the chainer backend.

    Args:
        idim (int): Input dimmensions.
        odim (int): Output dimmensions.
        args (Namespace): Training config.
        ignore_id (int, optional): Id for ignoring a character.
        flag_return (bool, optional): If true, return a list with (loss,
            loss_ctc, loss_att, acc) in forward. Otherwise, return loss.
    """

    @staticmethod
    def add_arguments(parser):
        """Customize flags for transformer setup.

        Args:
            parser (Namespace): Training config.
        """
        group = parser.add_argument_group("transformer model setting")
        group.add_argument(
            "--transformer-init",
            type=str,
            default="pytorch",
            help="how to initialize transformer parameters",
        )
        group.add_argument(
            "--transformer-input-layer",
            type=str,
            default="conv2d",
            choices=["conv2d", "linear", "embed"],
            help="transformer input layer type",
        )
        group.add_argument(
            "--transformer-attn-dropout-rate",
            default=None,
            type=float,
            help="dropout in transformer attention. use --dropout-rate if None is set",
        )
        group.add_argument(
            "--transformer-lr",
            default=10.0,
            type=float,
            help="Initial value of learning rate",
        )
        group.add_argument(
            "--transformer-warmup-steps",
            default=25000,
            type=int,
            help="optimizer warmup steps",
        )
        group.add_argument(
            "--transformer-length-normalized-loss",
            default=True,
            type=strtobool,
            help="normalize loss by length",
        )
        group.add_argument(
            "--dropout-rate",
            default=0.0,
            type=float,
            help="Dropout rate for the encoder",
        )
        # Encoder
        group.add_argument(
            "--elayers",
            default=4,
            type=int,
            help="Number of encoder layers (for shared recognition part "
            "in multi-speaker asr mode)",
        )
        group.add_argument(
            "--eunits",
            "-u",
            default=300,
            type=int,
            help="Number of encoder hidden units",
        )
        # Attention
        group.add_argument(
            "--adim",
            default=320,
            type=int,
            help="Number of attention transformation dimensions",
        )
        group.add_argument(
            "--aheads",
            default=4,
            type=int,
            help="Number of heads for multi head attention",
        )
        # Decoder
        group.add_argument(
            "--dlayers", default=1, type=int, help="Number of decoder layers"
        )
        group.add_argument(
            "--dunits", default=320, type=int, help="Number of decoder hidden units"
        )
        return parser

    def get_total_subsampling_factor(self):
        """Get total subsampling factor."""
        return self.encoder.conv_subsampling_factor * int(np.prod(self.subsample))

    def __init__(self, idim, odim, args, ignore_id=-1, flag_return=True):
        """Initialize the transformer."""
        chainer.Chain.__init__(self)
        # weight of the CTC loss in the multi-task objective
        self.mtlalpha = args.mtlalpha
        assert 0 <= self.mtlalpha <= 1, "mtlalpha must be [0,1]"
        if args.transformer_attn_dropout_rate is None:
            args.transformer_attn_dropout_rate = args.dropout_rate
        self.use_label_smoothing = False
        self.char_list = args.char_list
        self.space = args.sym_space
        self.blank = args.sym_blank
        # embedding scale sqrt(d_model), standard transformer scaling
        self.scale_emb = args.adim**0.5
        # the last output id is used for both sos and eos
        self.sos = odim - 1
        self.eos = odim - 1
        self.subsample = get_subsample(args, mode="asr", arch="transformer")
        self.ignore_id = ignore_id
        # choose initialW/initialB before building the sub-links
        self.reset_parameters(args)
        with self.init_scope():
            self.encoder = Encoder(
                idim=idim,
                attention_dim=args.adim,
                attention_heads=args.aheads,
                linear_units=args.eunits,
                input_layer=args.transformer_input_layer,
                dropout_rate=args.dropout_rate,
                positional_dropout_rate=args.dropout_rate,
                attention_dropout_rate=args.transformer_attn_dropout_rate,
                initialW=self.initialW,
                initial_bias=self.initialB,
            )
            self.decoder = Decoder(
                odim, args, initialW=self.initialW, initial_bias=self.initialB
            )
            self.criterion = LabelSmoothingLoss(
                args.lsm_weight,
                len(args.char_list),
                args.transformer_length_normalized_loss,
            )
            if args.mtlalpha > 0.0:
                if args.ctc_type == "builtin":
                    logging.info("Using chainer CTC implementation")
                    self.ctc = ctc.CTC(odim, args.adim, args.dropout_rate)
                else:
                    raise ValueError(
                        'ctc_type must be "builtin": {}'.format(args.ctc_type)
                    )
            else:
                self.ctc = None
        self.dims = args.adim
        self.odim = odim
        self.flag_return = flag_return
        if args.report_cer or args.report_wer:
            self.error_calculator = ErrorCalculator(
                args.char_list,
                args.sym_space,
                args.sym_blank,
                args.report_cer,
                args.report_wer,
            )
        else:
            self.error_calculator = None
        # args may be an argparse Namespace (no __contains__) or a dict-like
        # config object; handle both when looking up "verbose"
        if "Namespace" in str(type(args)):
            self.verbose = 0 if "verbose" not in args else args.verbose
        else:
            self.verbose = 0 if args.verbose is None else args.verbose

    def reset_parameters(self, args):
        """Initialize the Weight according to the give initialize-type.

        Args:
            args (Namespace): Transformer config.
        """
        # NOTE: self.initialW/self.initialB store the initializer *class*
        # (not an instance); sub-links instantiate them as needed.
        type_init = args.transformer_init
        if type_init == "lecun_uniform":
            logging.info("Using LeCunUniform as Parameter initializer")
            self.initialW = chainer.initializers.LeCunUniform
        elif type_init == "lecun_normal":
            logging.info("Using LeCunNormal as Parameter initializer")
            self.initialW = chainer.initializers.LeCunNormal
        elif type_init == "gorot_uniform":
            logging.info("Using GlorotUniform as Parameter initializer")
            self.initialW = chainer.initializers.GlorotUniform
        elif type_init == "gorot_normal":
            logging.info("Using GlorotNormal as Parameter initializer")
            self.initialW = chainer.initializers.GlorotNormal
        elif type_init == "he_uniform":
            logging.info("Using HeUniform as Parameter initializer")
            self.initialW = chainer.initializers.HeUniform
        elif type_init == "he_normal":
            logging.info("Using HeNormal as Parameter initializer")
            self.initialW = chainer.initializers.HeNormal
        elif type_init == "pytorch":
            logging.info("Using Pytorch initializer")
            self.initialW = chainer.initializers.Uniform
        else:
            logging.info("Using Chainer default as Parameter initializer")
            self.initialW = chainer.initializers.Uniform
        self.initialB = chainer.initializers.Uniform

    def forward(self, xs, ilens, ys_pad, calculate_attentions=False):
        """E2E forward propagation.

        Args:
            xs (chainer.Variable): Batch of padded character ids. (B, Tmax)
            ilens (chainer.Variable): Batch of length of each input batch. (B,)
            ys (chainer.Variable): Batch of padded target features. (B, Lmax, odim)
            calculate_attentions (bool): If true, return value is the output of encoder.

        Returns:
            float: Training loss.
            float (optional): Training loss for ctc.
            float (optional): Training loss for attention.
            float (optional): Accuracy.
            chainer.Variable (Optional): Output of the encoder.
        """
        alpha = self.mtlalpha
        # 1. Encoder
        xs, x_mask, ilens = self.encoder(xs, ilens)
        # 2. CTC loss
        cer_ctc = None
        if alpha == 0.0:
            loss_ctc = None
        else:
            _ys = [y.astype(np.int32) for y in ys_pad]
            loss_ctc = self.ctc(xs, _ys)
            if self.error_calculator is not None:
                with chainer.no_backprop_mode():
                    ys_hat = chainer.backends.cuda.to_cpu(self.ctc.argmax(xs).data)
                cer_ctc = self.error_calculator(ys_hat, ys_pad, is_ctc=True)
        # 3. Decoder
        if calculate_attentions:
            self.calculate_attentions(xs, x_mask, ys_pad)
        ys = self.decoder(ys_pad, xs, x_mask)
        # 4. Attention Loss
        cer, wer = None, None
        if alpha == 1:
            loss_att = None
            acc = None
        else:
            # Make target: append eos and pad with -1 (ignored by accuracy)
            eos = np.array([self.eos], "i")
            with chainer.no_backprop_mode():
                ys_pad_out = [np.concatenate([y, eos], axis=0) for y in ys_pad]
                ys_pad_out = F.pad_sequence(ys_pad_out, padding=-1).data
            ys_pad_out = self.xp.array(ys_pad_out)
            loss_att = self.criterion(ys, ys_pad_out)
            acc = F.accuracy(
                ys.reshape(-1, self.odim), ys_pad_out.reshape(-1), ignore_label=-1
            )
            if (not chainer.config.train) and (self.error_calculator is not None):
                cer, wer = self.error_calculator(ys, ys_pad)
        # interpolate the losses with weight alpha
        if alpha == 0.0:
            self.loss = loss_att
            loss_att_data = loss_att.data
            loss_ctc_data = None
        elif alpha == 1.0:
            self.loss = loss_ctc
            loss_att_data = None
            loss_ctc_data = loss_ctc.data
        else:
            self.loss = alpha * loss_ctc + (1 - alpha) * loss_att
            loss_att_data = loss_att.data
            loss_ctc_data = loss_ctc.data
        loss_data = self.loss.data
        if not math.isnan(loss_data):
            reporter.report({"loss_ctc": loss_ctc_data}, self)
            reporter.report({"loss_att": loss_att_data}, self)
            reporter.report({"acc": acc}, self)
            reporter.report({"cer_ctc": cer_ctc}, self)
            reporter.report({"cer": cer}, self)
            reporter.report({"wer": wer}, self)
            logging.info("mtl loss:" + str(loss_data))
            reporter.report({"loss": loss_data}, self)
        else:
            logging.warning("loss (=%f) is not correct", loss_data)
        if self.flag_return:
            # NOTE(review): loss_ctc is overwritten with None here, so the
            # second element of the returned tuple is always None even when a
            # CTC loss was computed above — confirm whether this is intended
            # (the docstring says the CTC loss is returned).
            loss_ctc = None
            return self.loss, loss_ctc, loss_att, acc
        else:
            return self.loss

    def calculate_attentions(self, xs, x_mask, ys_pad):
        """Calculate Attentions."""
        # run the decoder only for its side effect of populating the
        # attention matrices of the MultiHeadAttention sub-links
        self.decoder(ys_pad, xs, x_mask)

    def recognize(self, x_block, recog_args, char_list=None, rnnlm=None):
        """E2E recognition function.

        Args:
            x (ndarray): Input acouctic feature (B, T, D) or (T, D).
            recog_args (Namespace): Argment namespace contraining options.
            char_list (List[str]): List of characters.
            rnnlm (chainer.Chain): Language model module defined at
                `espnet.lm.chainer_backend.lm`.

        Returns:
            List: N-best decoding results.
        """
        with chainer.no_backprop_mode(), chainer.using_config("train", False):
            # 1. encoder
            ilens = [x_block.shape[0]]
            batch = len(ilens)
            xs, _, _ = self.encoder(x_block[None, :, :], ilens)
            # calculate log P(z_t|X) for CTC scores
            if recog_args.ctc_weight > 0.0:
                lpz = self.ctc.log_softmax(xs.reshape(batch, -1, self.dims)).data[0]
            else:
                lpz = None
            # 2. decoder
            if recog_args.lm_weight == 0.0:
                rnnlm = None
            y = self.recognize_beam(xs, lpz, recog_args, char_list, rnnlm)
        return y

    def recognize_beam(self, h, lpz, recog_args, char_list=None, rnnlm=None):
        """E2E beam search.

        Args:
            h (ndarray): Encoder output features (B, T, D) or (T, D).
            lpz (ndarray): Log probabilities from CTC.
            recog_args (Namespace): Argment namespace contraining options.
            char_list (List[str]): List of characters.
            rnnlm (chainer.Chain): Language model module defined at
                `espnet.lm.chainer_backend.lm`.

        Returns:
            List: N-best decoding results.
        """
        logging.info("input lengths: " + str(h.shape[1]))
        # initialization
        n_len = h.shape[1]
        xp = self.xp
        h_mask = xp.ones((1, n_len))
        # search parms
        beam = recog_args.beam_size
        penalty = recog_args.penalty
        ctc_weight = recog_args.ctc_weight
        # prepare sos
        y = self.sos
        if recog_args.maxlenratio == 0:
            maxlen = n_len
        else:
            # maxlen >= 1
            maxlen = max(1, int(recog_args.maxlenratio * n_len))
        minlen = int(recog_args.minlenratio * n_len)
        logging.info("max output length: " + str(maxlen))
        logging.info("min output length: " + str(minlen))
        # initialize hypothesis
        if rnnlm:
            hyp = {"score": 0.0, "yseq": [y], "rnnlm_prev": None}
        else:
            hyp = {"score": 0.0, "yseq": [y]}
        if lpz is not None:
            ctc_prefix_score = CTCPrefixScore(lpz, 0, self.eos, self.xp)
            hyp["ctc_state_prev"] = ctc_prefix_score.initial_state()
            hyp["ctc_score_prev"] = 0.0
            if ctc_weight != 1.0:
                # pre-pruning based on attention scores
                ctc_beam = min(lpz.shape[-1], int(beam * CTC_SCORING_RATIO))
            else:
                ctc_beam = lpz.shape[-1]
        hyps = [hyp]
        ended_hyps = []
        for i in range(maxlen):
            logging.debug("position " + str(i))
            hyps_best_kept = []
            for hyp in hyps:
                # score the next token given the current prefix
                ys = F.expand_dims(xp.array(hyp["yseq"]), axis=0).data
                out = self.decoder(ys, h, h_mask)
                # get nbest local scores and their ids
                local_att_scores = F.log_softmax(out[:, -1], axis=-1).data
                if rnnlm:
                    rnnlm_state, local_lm_scores = rnnlm.predict(
                        hyp["rnnlm_prev"], hyp["yseq"][i]
                    )
                    local_scores = (
                        local_att_scores + recog_args.lm_weight * local_lm_scores
                    )
                else:
                    local_scores = local_att_scores
                if lpz is not None:
                    # joint attention/CTC scoring over the ctc_beam candidates
                    local_best_ids = xp.argsort(local_scores, axis=1)[0, ::-1][
                        :ctc_beam
                    ]
                    ctc_scores, ctc_states = ctc_prefix_score(
                        hyp["yseq"], local_best_ids, hyp["ctc_state_prev"]
                    )
                    local_scores = (1.0 - ctc_weight) * local_att_scores[
                        :, local_best_ids
                    ] + ctc_weight * (ctc_scores - hyp["ctc_score_prev"])
                    if rnnlm:
                        local_scores += (
                            recog_args.lm_weight * local_lm_scores[:, local_best_ids]
                        )
                    joint_best_ids = xp.argsort(local_scores, axis=1)[0, ::-1][:beam]
                    local_best_scores = local_scores[:, joint_best_ids]
                    local_best_ids = local_best_ids[joint_best_ids]
                else:
                    local_best_ids = self.xp.argsort(local_scores, axis=1)[0, ::-1][
                        :beam
                    ]
                    local_best_scores = local_scores[:, local_best_ids]
                for j in range(beam):
                    new_hyp = {}
                    new_hyp["score"] = hyp["score"] + float(local_best_scores[0, j])
                    new_hyp["yseq"] = [0] * (1 + len(hyp["yseq"]))
                    new_hyp["yseq"][: len(hyp["yseq"])] = hyp["yseq"]
                    new_hyp["yseq"][len(hyp["yseq"])] = int(local_best_ids[j])
                    if rnnlm:
                        new_hyp["rnnlm_prev"] = rnnlm_state
                    if lpz is not None:
                        new_hyp["ctc_state_prev"] = ctc_states[joint_best_ids[j]]
                        new_hyp["ctc_score_prev"] = ctc_scores[joint_best_ids[j]]
                    hyps_best_kept.append(new_hyp)
                hyps_best_kept = sorted(
                    hyps_best_kept, key=lambda x: x["score"], reverse=True
                )[:beam]
            # sort and get nbest
            hyps = hyps_best_kept
            logging.debug("number of pruned hypothesis: " + str(len(hyps)))
            if char_list is not None:
                logging.debug(
                    "best hypo: "
                    + "".join([char_list[int(x)] for x in hyps[0]["yseq"][1:]])
                    + " score: "
                    + str(hyps[0]["score"])
                )
            # add eos in the final loop to avoid that there are no ended hyps
            if i == maxlen - 1:
                logging.info("adding <eos> in the last position in the loop")
                for hyp in hyps:
                    hyp["yseq"].append(self.eos)
            # add ended hypothes to a final list, and removed them from current hypothes
            # (this will be a probmlem, number of hyps < beam)
            remained_hyps = []
            for hyp in hyps:
                if hyp["yseq"][-1] == self.eos:
                    # only store the sequence that has more than minlen outputs
                    # also add penalty
                    if len(hyp["yseq"]) > minlen:
                        hyp["score"] += (i + 1) * penalty
                        if rnnlm:  # Word LM needs to add final <eos> score
                            hyp["score"] += recog_args.lm_weight * rnnlm.final(
                                hyp["rnnlm_prev"]
                            )
                        ended_hyps.append(hyp)
                else:
                    remained_hyps.append(hyp)
            # end detection
            if end_detect(ended_hyps, i) and recog_args.maxlenratio == 0.0:
                logging.info("end detected at %d", i)
                break
            hyps = remained_hyps
            if len(hyps) > 0:
                logging.debug("remained hypothes: " + str(len(hyps)))
            else:
                logging.info("no hypothesis. Finish decoding.")
                break
            if char_list is not None:
                for hyp in hyps:
                    logging.debug(
                        "hypo: " + "".join([char_list[int(x)] for x in hyp["yseq"][1:]])
                    )
            logging.debug("number of ended hypothes: " + str(len(ended_hyps)))
        nbest_hyps = sorted(
            ended_hyps, key=lambda x: x["score"], reverse=True
        )  # [:min(len(ended_hyps), recog_args.nbest)]
        logging.debug(nbest_hyps)
        # check number of hypotheis
        if len(nbest_hyps) == 0:
            # NOTE(review): logging.warn is a deprecated alias of
            # logging.warning
            logging.warn(
                "there is no N-best results, perform recognition "
                "again with smaller minlenratio."
            )
            # should copy becasuse Namespace will be overwritten globally
            recog_args = Namespace(**vars(recog_args))
            recog_args.minlenratio = max(0.0, recog_args.minlenratio - 0.1)
            return self.recognize_beam(h, lpz, recog_args, char_list, rnnlm)
        logging.info("total log probability: " + str(nbest_hyps[0]["score"]))
        logging.info(
            "normalized log probability: "
            + str(nbest_hyps[0]["score"] / len(nbest_hyps[0]["yseq"]))
        )
        # remove sos
        return nbest_hyps

    def calculate_all_attentions(self, xs, ilens, ys):
        """E2E attention calculation.

        Args:
            xs (List[tuple()]): List of padded input sequences.
                [(T1, idim), (T2, idim), ...]
            ilens (ndarray): Batch of lengths of input sequences. (B)
            ys (List): List of character id sequence tensor. [(L1), (L2), (L3), ...]

        Returns:
            float ndarray: Attention weights. (B, Lmax, Tmax)
        """
        with chainer.no_backprop_mode():
            # forward pass populates m.attn on every MultiHeadAttention link
            self(xs, ilens, ys, calculate_attentions=True)
        ret = dict()
        for name, m in self.namedlinks():
            if isinstance(m, MultiHeadAttention):
                var = m.attn
                var.to_cpu()
                _name = name[1:].replace("/", "_")
                ret[_name] = var.data
        return ret

    @property
    def attention_plot_class(self):
        """Attention plot function.

        Redirects to PlotAttentionReport

        Returns:
            PlotAttentionReport
        """
        return PlotAttentionReport

    @staticmethod
    def custom_converter(subsampling_factor=0):
        """Get customconverter of the model."""
        # NOTE(review): subsampling_factor is accepted for interface
        # compatibility but not forwarded to CustomConverter here.
        return CustomConverter()

    @staticmethod
    def custom_updater(iters, optimizer, converter, device=-1, accum_grad=1):
        """Get custom_updater of the model."""
        return CustomUpdater(
            iters, optimizer, converter=converter, device=device, accum_grad=accum_grad
        )

    @staticmethod
    def custom_parallel_updater(iters, optimizer, converter, devices, accum_grad=1):
        """Get custom_parallel_updater of the model."""
        return CustomParallelUpdater(
            iters,
            optimizer,
            converter=converter,
            devices=devices,
            accum_grad=accum_grad,
        )
import chainer.functions as F
def _subsamplex(x, n):
    """Keep every *n*-th frame of each sequence and return the new lengths."""
    subsampled = [
        F.get_item(seq, (slice(None, None, n), slice(None))) for seq in x
    ]
    lengths = [seq.shape[0] for seq in subsampled]
    return subsampled, lengths
import chainer
import chainer.functions as F
import chainer.links as L
import numpy as np
# dot product based attention
class AttDot(chainer.Chain):
    """Compute attention based on dot product.

    Args:
        eprojs (int | None): Dimension of input vectors from encoder.
        dunits (int | None): Dimension of input vectors for decoder.
        att_dim (int): Dimension of input vectors for attention.
    """

    def __init__(self, eprojs, dunits, att_dim):
        super(AttDot, self).__init__()
        with self.init_scope():
            self.mlp_enc = L.Linear(eprojs, att_dim)
            self.mlp_dec = L.Linear(dunits, att_dim)
        self.dunits = dunits
        self.eprojs = eprojs
        self.att_dim = att_dim
        # encoder-side tensors are cached lazily on the first call
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None

    def reset(self):
        """Reset states."""
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None

    def __call__(self, enc_hs, dec_z, att_prev, scaling=2.0):
        """Compute AttDot forward layer.

        Args:
            enc_hs (chainer.Variable | N-dimensional array):
                Input variable from encoder.
            dec_z (chainer.Variable | N-dimensional array): Input variable of decoder.
            att_prev: Unused; kept for interface compatibility with other
                attention modules.
            scaling (float): Scaling weight to make attention sharp.

        Returns:
            chainer.Variable: Weighted sum over flames.
            chainer.Variable: Attention weight.
        """
        n_utt = len(enc_hs)
        # project the encoder outputs only once per utterance batch
        if self.pre_compute_enc_h is None:
            self.enc_h = F.pad_sequence(enc_hs)  # utt x frame x hdim
            self.h_length = self.enc_h.shape[1]
            # utt x frame x att_dim
            self.pre_compute_enc_h = F.tanh(self.mlp_enc(self.enc_h, n_batch_axes=2))
        if dec_z is None:
            dec_z = chainer.Variable(
                self.xp.zeros((n_utt, self.dunits), dtype=np.float32)
            )
        else:
            dec_z = dec_z.reshape(n_utt, self.dunits)
        # <phi (h_t), psi (s)> for all t
        query = F.expand_dims(F.tanh(self.mlp_dec(dec_z)), 1)
        query = F.broadcast_to(query, self.pre_compute_enc_h.shape)
        scores = F.sum(self.pre_compute_enc_h * query, axis=2)  # utt x frame
        # A minus-large-number mask for the padded area was tried upstream and
        # abandoned (it degraded performance); only sharpening is applied.
        weights = F.softmax(scaling * scores)
        # weighted sum over frames -> utt x hdim
        expanded = F.broadcast_to(F.expand_dims(weights, 2), self.enc_h.shape)
        context = F.sum(self.enc_h * expanded, axis=1)
        return context, weights
# location based attention
# location based attention
class AttLoc(chainer.Chain):
    """Compute location-based attention.

    Args:
        eprojs (int | None): Dimension of input vectors from encoder.
        dunits (int | None): Dimension of input vectors for decoder.
        att_dim (int): Dimension of input vectors for attention.
        aconv_chans (int): Number of channels of output arrays from convolutional layer.
        aconv_filts (int): Size of filters of convolutional layer.
    """

    def __init__(self, eprojs, dunits, att_dim, aconv_chans, aconv_filts):
        super(AttLoc, self).__init__()
        with self.init_scope():
            self.mlp_enc = L.Linear(eprojs, att_dim)
            self.mlp_dec = L.Linear(dunits, att_dim, nobias=True)
            self.mlp_att = L.Linear(aconv_chans, att_dim, nobias=True)
            # 1-D convolution over the previous attention weights
            # (kernel width 2*aconv_filts+1, "same" padding)
            self.loc_conv = L.Convolution2D(
                1, aconv_chans, ksize=(1, 2 * aconv_filts + 1), pad=(0, aconv_filts)
            )
            self.gvec = L.Linear(att_dim, 1)
        self.dunits = dunits
        self.eprojs = eprojs
        self.att_dim = att_dim
        # encoder-side tensors are cached lazily on the first call
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None
        self.aconv_chans = aconv_chans

    def reset(self):
        """Reset states."""
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None

    def __call__(self, enc_hs, dec_z, att_prev, scaling=2.0):
        """Compute AttLoc forward layer.

        Args:
            enc_hs (chainer.Variable | N-dimensional array):
                Input variable from encoders.
            dec_z (chainer.Variable | N-dimensional array): Input variable of decoder.
            att_prev (chainer.Variable | None): Attention weight.
            scaling (float): Scaling weight to make attention sharp.

        Returns:
            chainer.Variable: Weighted sum over flames.
            chainer.Variable: Attention weight.
        """
        batch = len(enc_hs)
        # pre-compute all h outside the decoder loop
        if self.pre_compute_enc_h is None:
            self.enc_h = F.pad_sequence(enc_hs)  # utt x frame x hdim
            self.h_length = self.enc_h.shape[1]
            # utt x frame x att_dim
            self.pre_compute_enc_h = self.mlp_enc(self.enc_h, n_batch_axes=2)
        if dec_z is None:
            dec_z = chainer.Variable(
                self.xp.zeros((batch, self.dunits), dtype=np.float32)
            )
        else:
            dec_z = dec_z.reshape(batch, self.dunits)
        # initialize attention weight with uniform dist.
        if att_prev is None:
            att_prev = [
                self.xp.full(hh.shape[0], 1.0 / hh.shape[0], dtype=np.float32)
                for hh in enc_hs
            ]
            att_prev = [chainer.Variable(att) for att in att_prev]
            att_prev = F.pad_sequence(att_prev)
        # att_prev: utt x frame -> utt x 1 x 1 x frame
        # -> utt x att_conv_chans x 1 x frame
        att_conv = self.loc_conv(att_prev.reshape(batch, 1, 1, self.h_length))
        # att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans
        att_conv = F.swapaxes(F.squeeze(att_conv, axis=2), 1, 2)
        # att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim
        att_conv = self.mlp_att(att_conv, n_batch_axes=2)
        # dec_z_tiled: utt x frame x att_dim
        dec_z_tiled = F.broadcast_to(
            F.expand_dims(self.mlp_dec(dec_z), 1), self.pre_compute_enc_h.shape
        )
        # dot with gvec
        # utt x frame x att_dim -> utt x frame
        # TODO(watanabe) use batch_matmul
        e = F.squeeze(
            self.gvec(
                F.tanh(att_conv + self.pre_compute_enc_h + dec_z_tiled), n_batch_axes=2
            ),
            axis=2,
        )
        # Applying a minus-large-number filter
        # to make a probability value zero for a padded area
        # simply degrades the performance, and I gave up this implementation
        # Apply a scaling to make an attention sharp
        w = F.softmax(scaling * e)
        # weighted sum over flames
        # utt x hdim
        c = F.sum(
            self.enc_h * F.broadcast_to(F.expand_dims(w, 2), self.enc_h.shape), axis=1
        )
        return c, w
class NoAtt(chainer.Chain):
    """Compute non-attention layer.

    This layer is a dummy attention layer to be compatible with other
    attention-based models: it always returns a uniform weighting over the
    encoder frames.
    """

    def __init__(self):
        super(NoAtt, self).__init__()
        # cached encoder tensor and context; filled on the first call
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None
        self.c = None

    def reset(self):
        """Reset states."""
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None
        self.c = None

    def __call__(self, enc_hs, dec_z, att_prev):
        """Compute NoAtt forward layer.

        Args:
            enc_hs (chainer.Variable | N-dimensional array):
                Input variable from encoders.
            dec_z: Dummy.
            att_prev (chainer.Variable | None): Attention weight.

        Returns:
            chainer.Variable: Sum over flames.
            chainer.Variable: Attention weight.
        """
        # pad the encoder outputs only once per utterance batch
        if self.pre_compute_enc_h is None:
            self.enc_h = F.pad_sequence(enc_hs)  # utt x frame x hdim
            self.h_length = self.enc_h.shape[1]
        # on the first step, build uniform weights and cache the context
        if att_prev is None:
            uniform = [
                self.xp.full(hh.shape[0], 1.0 / hh.shape[0], dtype=np.float32)
                for hh in enc_hs
            ]
            att_prev = F.pad_sequence([chainer.Variable(w) for w in uniform])
            expanded = F.broadcast_to(
                F.expand_dims(att_prev, 2), self.enc_h.shape
            )
            self.c = F.sum(self.enc_h * expanded, axis=1)
        return self.c, att_prev
def att_for(args):
    """Return the attention layer selected by ``args.atype``.

    Args:
        args (Namespace): The arguments.

    Returns:
        chainer.Chain: The corresponding attention module.

    Raises:
        NotImplementedError: If ``args.atype`` is not one of ``dot``,
            ``location``, or ``noatt``.
    """
    atype = args.atype
    if atype == "dot":
        return AttDot(args.eprojs, args.dunits, args.adim)
    if atype == "location":
        return AttLoc(
            args.eprojs, args.dunits, args.adim, args.aconv_chans, args.aconv_filts
        )
    if atype == "noatt":
        return NoAtt()
    raise NotImplementedError(
        "chainer supports only noatt, dot, and location attention."
    )
import logging
import random
from argparse import Namespace
import chainer
import chainer.functions as F
import chainer.links as L
import numpy as np
import espnet.nets.chainer_backend.deterministic_embed_id as DL
from espnet.nets.ctc_prefix_score import CTCPrefixScore
from espnet.nets.e2e_asr_common import end_detect
CTC_SCORING_RATIO = 1.5
MAX_DECODER_OUTPUT = 5
class Decoder(chainer.Chain):
    """Decoder layer.
    Args:
        eprojs (int): Dimension of input variables from encoder.
        odim (int): The output dimension.
        dtype (str): Decoder type.
        dlayers (int): Number of layers for decoder.
        dunits (int): Dimension of input vector of decoder.
        sos (int): Number to indicate the start of sequences.
        eos (int): Number to indicate the end of sequences.
        att (Module): Attention module defined at
            `espnet.nets.chainer_backend.attentions`.
        verbose (int): Verbosity level.
        char_list (List[str]): List of all characters.
        labeldist (numpy.array): Distributed array of counted transcript length.
        lsm_weight (float): Weight to use when calculating the training loss.
        sampling_probability (float): Threshold for scheduled sampling.
    """
    def __init__(
        self,
        eprojs,
        odim,
        dtype,
        dlayers,
        dunits,
        sos,
        eos,
        att,
        verbose=0,
        char_list=None,
        labeldist=None,
        lsm_weight=0.0,
        sampling_probability=0.0,
    ):
        """Initialize the decoder: embedding, stacked RNN cells, output layer."""
        super(Decoder, self).__init__()
        with self.init_scope():
            self.embed = DL.EmbedID(odim, dunits)
            # the first cell consumes the embedded token concatenated with the
            # attention context, hence the (dunits + eprojs) input size
            self.rnn0 = (
                L.StatelessLSTM(dunits + eprojs, dunits)
                if dtype == "lstm"
                else L.StatelessGRU(dunits + eprojs, dunits)
            )
            # remaining layers are registered as rnn1, rnn2, ...
            for i in range(1, dlayers):
                setattr(
                    self,
                    "rnn%d" % i,
                    L.StatelessLSTM(dunits, dunits)
                    if dtype == "lstm"
                    else L.StatelessGRU(dunits, dunits),
                )
            self.output = L.Linear(dunits, odim)
        self.dtype = dtype
        self.loss = None
        self.att = att
        self.dlayers = dlayers
        self.dunits = dunits
        self.sos = sos
        self.eos = eos
        self.verbose = verbose
        self.char_list = char_list
        # for label smoothing
        self.labeldist = labeldist
        self.vlabeldist = None  # device copy of labeldist, created lazily
        self.lsm_weight = lsm_weight
        self.sampling_probability = sampling_probability
    def rnn_forward(self, ey, z_list, c_list, z_prev, c_prev):
        """Advance the stacked RNN cells by one time step.

        Args:
            ey (chainer.Variable): Input of the first layer (embedding
                concatenated with the attention context).
            z_list (list): Hidden states to be updated, one entry per layer.
            c_list (list): Cell states to be updated (LSTM only).
            z_prev (list): Hidden states of the previous step.
            c_prev (list): Cell states of the previous step (LSTM only).

        Returns:
            tuple(list, list): Updated hidden and cell state lists.
        """
        if self.dtype == "lstm":
            c_list[0], z_list[0] = self.rnn0(c_prev[0], z_prev[0], ey)
            for i in range(1, self.dlayers):
                c_list[i], z_list[i] = self["rnn%d" % i](
                    c_prev[i], z_prev[i], z_list[i - 1]
                )
        else:
            # GRU cells carry no cell state; lazily create zero-filled hidden
            # states on the correct device at the first step
            if z_prev[0] is None:
                xp = self.xp
                with chainer.backends.cuda.get_device_from_id(self._device_id):
                    z_prev[0] = chainer.Variable(
                        xp.zeros((ey.shape[0], self.dunits), dtype=ey.dtype)
                    )
            z_list[0] = self.rnn0(z_prev[0], ey)
            for i in range(1, self.dlayers):
                if z_prev[i] is None:
                    xp = self.xp
                    with chainer.backends.cuda.get_device_from_id(self._device_id):
                        z_prev[i] = chainer.Variable(
                            xp.zeros(
                                (z_list[i - 1].shape[0], self.dunits),
                                dtype=z_list[i - 1].dtype,
                            )
                        )
                z_list[i] = self["rnn%d" % i](z_prev[i], z_list[i - 1])
        return z_list, c_list
    def __call__(self, hs, ys):
        """Core function of Decoder layer.
        Args:
            hs (list of chainer.Variable | N-dimension array):
                Input variable from encoder.
            ys (list of chainer.Variable | N-dimension array):
                Input variable of decoder.
        Returns:
            chainer.Variable: A variable holding a scalar array of the training loss.
            chainer.Variable: A variable holding a scalar array of the accuracy.
        """
        self.loss = None
        # prepare input and output word sequences with sos/eos IDs
        eos = self.xp.array([self.eos], "i")
        sos = self.xp.array([self.sos], "i")
        ys_in = [F.concat([sos, y], axis=0) for y in ys]
        ys_out = [F.concat([y, eos], axis=0) for y in ys]
        # padding for ys with -1
        # pys: utt x olen
        pad_ys_in = F.pad_sequence(ys_in, padding=self.eos)
        pad_ys_out = F.pad_sequence(ys_out, padding=-1)
        # get dim, length info
        batch = pad_ys_out.shape[0]
        olength = pad_ys_out.shape[1]
        logging.info(
            self.__class__.__name__
            + " input lengths: "
            + str(self.xp.array([h.shape[0] for h in hs]))
        )
        logging.info(
            self.__class__.__name__
            + " output lengths: "
            + str(self.xp.array([y.shape[0] for y in ys_out]))
        )
        # initialization
        c_list = [None]  # list of cell state of each layer
        z_list = [None]  # list of hidden state of each layer
        for _ in range(1, self.dlayers):
            c_list.append(None)
            z_list.append(None)
        att_w = None
        z_all = []
        self.att.reset()  # reset pre-computation of h
        # pre-computation of embedding
        eys = self.embed(pad_ys_in)  # utt x olen x zdim
        eys = F.separate(eys, axis=1)
        # loop for an output sequence
        for i in range(olength):
            att_c, att_w = self.att(hs, z_list[0], att_w)
            if i > 0 and random.random() < self.sampling_probability:
                logging.info(" scheduled sampling ")
                # feed back the model's own previous prediction instead of
                # the ground-truth token
                z_out = self.output(z_all[-1])
                z_out = F.argmax(F.log_softmax(z_out), axis=1)
                z_out = self.embed(z_out)
                ey = F.hstack((z_out, att_c))  # utt x (zdim + hdim)
            else:
                ey = F.hstack((eys[i], att_c))  # utt x (zdim + hdim)
            z_list, c_list = self.rnn_forward(ey, z_list, c_list, z_list, c_list)
            z_all.append(z_list[-1])
        z_all = F.stack(z_all, axis=1).reshape(batch * olength, self.dunits)
        # compute loss
        y_all = self.output(z_all)
        self.loss = F.softmax_cross_entropy(y_all, F.flatten(pad_ys_out))
        # -1: eos, which is removed in the loss computation
        # (len(ys_in[k]) - 1 equals the original target length, since sos
        # was prepended above)
        self.loss *= np.mean([len(x) for x in ys_in]) - 1
        acc = F.accuracy(y_all, F.flatten(pad_ys_out), ignore_label=-1)
        logging.info("att loss:" + str(self.loss.data))
        # show predicted character sequence for debug
        if self.verbose > 0 and self.char_list is not None:
            y_hat = y_all.reshape(batch, olength, -1)
            y_true = pad_ys_out
            for (i, y_hat_), y_true_ in zip(enumerate(y_hat.data), y_true.data):
                if i == MAX_DECODER_OUTPUT:
                    break
                idx_hat = self.xp.argmax(y_hat_[y_true_ != -1], axis=1)
                idx_true = y_true_[y_true_ != -1]
                seq_hat = [self.char_list[int(idx)] for idx in idx_hat]
                seq_true = [self.char_list[int(idx)] for idx in idx_true]
                seq_hat = "".join(seq_hat).replace("<space>", " ")
                seq_true = "".join(seq_true).replace("<space>", " ")
                logging.info("groundtruth[%d]: " % i + seq_true)
                logging.info("prediction [%d]: " % i + seq_hat)
        if self.labeldist is not None:
            if self.vlabeldist is None:
                # move the label distribution to the device once
                self.vlabeldist = chainer.Variable(self.xp.asarray(self.labeldist))
            # label-smoothing regularizer, mixed in with weight lsm_weight
            loss_reg = -F.sum(
                F.scale(F.log_softmax(y_all), self.vlabeldist, axis=1)
            ) / len(ys_in)
            self.loss = (1.0 - self.lsm_weight) * self.loss + self.lsm_weight * loss_reg
        return self.loss, acc
    def recognize_beam(self, h, lpz, recog_args, char_list, rnnlm=None):
        """Beam search implementation.
        Args:
            h (chainer.Variable): One of the output from the encoder.
            lpz (chainer.Variable | None): Result of net propagation.
            recog_args (Namespace): The argument.
            char_list (List[str]): List of all characters.
            rnnlm (Module): RNNLM module. Defined at `espnet.lm.chainer_backend.lm`
        Returns:
            List[Dict[str,Any]]: Result of recognition.
        """
        logging.info("input lengths: " + str(h.shape[0]))
        # initialization
        c_list = [None]  # list of cell state of each layer
        z_list = [None]  # list of hidden state of each layer
        for _ in range(1, self.dlayers):
            c_list.append(None)
            z_list.append(None)
        a = None
        self.att.reset()  # reset pre-computation of h
        # search params
        beam = recog_args.beam_size
        penalty = recog_args.penalty
        ctc_weight = recog_args.ctc_weight
        # prepare sos
        y = self.xp.full(1, self.sos, "i")
        if recog_args.maxlenratio == 0:
            maxlen = h.shape[0]
        else:
            # maxlen >= 1
            maxlen = max(1, int(recog_args.maxlenratio * h.shape[0]))
        minlen = int(recog_args.minlenratio * h.shape[0])
        logging.info("max output length: " + str(maxlen))
        logging.info("min output length: " + str(minlen))
        # initialize hypothesis
        if rnnlm:
            hyp = {
                "score": 0.0,
                "yseq": [y],
                "c_prev": c_list,
                "z_prev": z_list,
                "a_prev": a,
                "rnnlm_prev": None,
            }
        else:
            hyp = {
                "score": 0.0,
                "yseq": [y],
                "c_prev": c_list,
                "z_prev": z_list,
                "a_prev": a,
            }
        if lpz is not None:
            # joint CTC/attention decoding: keep a CTC prefix score per hypothesis
            ctc_prefix_score = CTCPrefixScore(lpz, 0, self.eos, self.xp)
            hyp["ctc_state_prev"] = ctc_prefix_score.initial_state()
            hyp["ctc_score_prev"] = 0.0
            if ctc_weight != 1.0:
                # pre-pruning based on attention scores
                ctc_beam = min(lpz.shape[-1], int(beam * CTC_SCORING_RATIO))
            else:
                ctc_beam = lpz.shape[-1]
        hyps = [hyp]
        ended_hyps = []
        for i in range(maxlen):
            logging.debug("position " + str(i))
            hyps_best_kept = []
            for hyp in hyps:
                ey = self.embed(hyp["yseq"][i])  # utt list (1) x zdim
                att_c, att_w = self.att([h], hyp["z_prev"][0], hyp["a_prev"])
                ey = F.hstack((ey, att_c))  # utt(1) x (zdim + hdim)
                z_list, c_list = self.rnn_forward(
                    ey, z_list, c_list, hyp["z_prev"], hyp["c_prev"]
                )
                # get nbest local scores and their ids
                local_att_scores = F.log_softmax(self.output(z_list[-1])).data
                if rnnlm:
                    rnnlm_state, local_lm_scores = rnnlm.predict(
                        hyp["rnnlm_prev"], hyp["yseq"][i]
                    )
                    local_scores = (
                        local_att_scores + recog_args.lm_weight * local_lm_scores
                    )
                else:
                    local_scores = local_att_scores
                if lpz is not None:
                    # rescore the top-ctc_beam attention candidates with CTC
                    local_best_ids = self.xp.argsort(local_scores, axis=1)[0, ::-1][
                        :ctc_beam
                    ]
                    ctc_scores, ctc_states = ctc_prefix_score(
                        hyp["yseq"], local_best_ids, hyp["ctc_state_prev"]
                    )
                    local_scores = (1.0 - ctc_weight) * local_att_scores[
                        :, local_best_ids
                    ] + ctc_weight * (ctc_scores - hyp["ctc_score_prev"])
                    if rnnlm:
                        local_scores += (
                            recog_args.lm_weight * local_lm_scores[:, local_best_ids]
                        )
                    joint_best_ids = self.xp.argsort(local_scores, axis=1)[0, ::-1][
                        :beam
                    ]
                    local_best_scores = local_scores[:, joint_best_ids]
                    local_best_ids = local_best_ids[joint_best_ids]
                else:
                    local_best_ids = self.xp.argsort(local_scores, axis=1)[0, ::-1][
                        :beam
                    ]
                    local_best_scores = local_scores[:, local_best_ids]
                for j in range(beam):
                    new_hyp = {}
                    # do not copy {z,c}_list directly
                    new_hyp["z_prev"] = z_list[:]
                    new_hyp["c_prev"] = c_list[:]
                    new_hyp["a_prev"] = att_w
                    new_hyp["score"] = hyp["score"] + local_best_scores[0, j]
                    new_hyp["yseq"] = [0] * (1 + len(hyp["yseq"]))
                    new_hyp["yseq"][: len(hyp["yseq"])] = hyp["yseq"]
                    new_hyp["yseq"][len(hyp["yseq"])] = self.xp.full(
                        1, local_best_ids[j], "i"
                    )
                    if rnnlm:
                        new_hyp["rnnlm_prev"] = rnnlm_state
                    if lpz is not None:
                        new_hyp["ctc_state_prev"] = ctc_states[joint_best_ids[j]]
                        new_hyp["ctc_score_prev"] = ctc_scores[joint_best_ids[j]]
                    # will be (2 x beam) hyps at most
                    hyps_best_kept.append(new_hyp)
                hyps_best_kept = sorted(
                    hyps_best_kept, key=lambda x: x["score"], reverse=True
                )[:beam]
            # sort and get nbest
            hyps = hyps_best_kept
            logging.debug("number of pruned hypotheses: " + str(len(hyps)))
            logging.debug(
                "best hypo: "
                + "".join([char_list[int(x)] for x in hyps[0]["yseq"][1:]]).replace(
                    "<space>", " "
                )
            )
            # add eos in the final loop to avoid that there are no ended hyps
            if i == maxlen - 1:
                logging.info("adding <eos> in the last position in the loop")
                for hyp in hyps:
                    hyp["yseq"].append(self.xp.full(1, self.eos, "i"))
            # add ended hypotheses to a final list,
            # and removed them from current hypotheses
            # (this will be a problem, number of hyps < beam)
            remained_hyps = []
            for hyp in hyps:
                if hyp["yseq"][-1] == self.eos:
                    # only store the sequence that has more than minlen outputs
                    # also add penalty
                    if len(hyp["yseq"]) > minlen:
                        hyp["score"] += (i + 1) * penalty
                        if rnnlm:  # Word LM needs to add final <eos> score
                            hyp["score"] += recog_args.lm_weight * rnnlm.final(
                                hyp["rnnlm_prev"]
                            )
                        ended_hyps.append(hyp)
                else:
                    remained_hyps.append(hyp)
            # end detection
            if end_detect(ended_hyps, i) and recog_args.maxlenratio == 0.0:
                logging.info("end detected at %d", i)
                break
            hyps = remained_hyps
            if len(hyps) > 0:
                logging.debug("remaining hypotheses: " + str(len(hyps)))
            else:
                logging.info("no hypothesis. Finish decoding.")
                break
            for hyp in hyps:
                logging.debug(
                    "hypo: "
                    + "".join([char_list[int(x)] for x in hyp["yseq"][1:]]).replace(
                        "<space>", " "
                    )
                )
        logging.debug("number of ended hypotheses: " + str(len(ended_hyps)))
        nbest_hyps = sorted(ended_hyps, key=lambda x: x["score"], reverse=True)[
            : min(len(ended_hyps), recog_args.nbest)
        ]
        # check number of hypotheses
        if len(nbest_hyps) == 0:
            logging.warning(
                "there is no N-best results, "
                "perform recognition again with smaller minlenratio."
            )
            # should copy because Namespace will be overwritten globally
            recog_args = Namespace(**vars(recog_args))
            recog_args.minlenratio = max(0.0, recog_args.minlenratio - 0.1)
            # retry recursively with a relaxed minimum-length constraint
            return self.recognize_beam(h, lpz, recog_args, char_list, rnnlm)
        logging.info("total log probability: " + str(nbest_hyps[0]["score"]))
        logging.info(
            "normalized log probability: "
            + str(nbest_hyps[0]["score"] / len(nbest_hyps[0]["yseq"]))
        )
        return nbest_hyps
    def calculate_all_attentions(self, hs, ys):
        """Calculate all of attentions.
        Args:
            hs (list of chainer.Variable | N-dimensional array):
                Input variable from encoder.
            ys (list of chainer.Variable | N-dimensional array):
                Input variable of decoder.
        Returns:
            chainer.Variable: List of attention weights.
        """
        # prepare input and output word sequences with sos/eos IDs
        eos = self.xp.array([self.eos], "i")
        sos = self.xp.array([self.sos], "i")
        ys_in = [F.concat([sos, y], axis=0) for y in ys]
        ys_out = [F.concat([y, eos], axis=0) for y in ys]
        # padding for ys with -1
        # pys: utt x olen
        pad_ys_in = F.pad_sequence(ys_in, padding=self.eos)
        pad_ys_out = F.pad_sequence(ys_out, padding=-1)
        # get length info
        olength = pad_ys_out.shape[1]
        # initialization
        c_list = [None]  # list of cell state of each layer
        z_list = [None]  # list of hidden state of each layer
        for _ in range(1, self.dlayers):
            c_list.append(None)
            z_list.append(None)
        att_w = None
        att_ws = []
        self.att.reset()  # reset pre-computation of h
        # pre-computation of embedding
        eys = self.embed(pad_ys_in)  # utt x olen x zdim
        eys = F.separate(eys, axis=1)
        # loop for an output sequence
        for i in range(olength):
            att_c, att_w = self.att(hs, z_list[0], att_w)
            ey = F.hstack((eys[i], att_c))  # utt x (zdim + hdim)
            z_list, c_list = self.rnn_forward(ey, z_list, c_list, z_list, c_list)
            att_ws.append(att_w)  # for debugging
        att_ws = F.stack(att_ws, axis=1)
        att_ws.to_cpu()
        return att_ws.data
def decoder_for(args, odim, sos, eos, att, labeldist):
    """Build the decoder network specified by the program arguments.

    Args:
        args (Namespace): The program arguments.
        odim (int): The output dimension.
        sos (int): Number to indicate the start of sequences.
        eos (int): Number to indicate the end of sequences.
        att (Module):
            Attention module defined at `espnet.nets.chainer_backend.attentions`.
        labeldist (numpy.array): Distributed array of transcript lengths.

    Returns:
        chainer.Chain: The decoder module.
    """
    return Decoder(
        args.eprojs,
        odim,
        args.dtype,
        args.dlayers,
        args.dunits,
        sos,
        eos,
        att,
        verbose=args.verbose,
        char_list=args.char_list,
        labeldist=labeldist,
        lsm_weight=args.lsm_weight,
        sampling_probability=args.sampling_probability,
    )
import logging
import chainer
import chainer.functions as F
import chainer.links as L
import numpy as np
from chainer import cuda
from espnet.nets.chainer_backend.nets_utils import _subsamplex
from espnet.nets.e2e_asr_common import get_vgg2l_odim
# TODO(watanabe) explanation of BLSTMP
class RNNP(chainer.Chain):
    """RNN with projection layer module.
    Args:
        idim (int): Dimension of inputs.
        elayers (int): Number of encoder layers.
        cdim (int): Number of rnn units. (resulted in cdim * 2 if bidirectional)
        hdim (int): Number of projection units.
        subsample (np.ndarray): List to use subsample the input array.
        dropout (float): Dropout rate.
        typ (str): The RNN type.
    """
    def __init__(self, idim, elayers, cdim, hdim, subsample, dropout, typ="blstm"):
        """Build an elayers-deep stack of 1-layer RNNs, each with a projection."""
        super(RNNP, self).__init__()
        bidir = typ[0] == "b"
        if bidir:
            rnn = L.NStepBiLSTM if "lstm" in typ else L.NStepBiGRU
        else:
            rnn = L.NStepLSTM if "lstm" in typ else L.NStepGRU
        rnn_label = "birnn" if bidir else "rnn"
        with self.init_scope():
            # registered as {rnn_label}0, bt0, {rnn_label}1, bt1, ...
            for i in range(elayers):
                if i == 0:
                    inputdim = idim
                else:
                    inputdim = hdim
                _cdim = 2 * cdim if bidir else cdim
                # bottleneck layer to merge
                setattr(
                    self, "{}{:d}".format(rnn_label, i), rnn(1, inputdim, cdim, dropout)
                )
                setattr(self, "bt%d" % i, L.Linear(_cdim, hdim))
        self.elayers = elayers
        self.rnn_label = rnn_label
        self.cdim = cdim
        self.subsample = subsample
        self.typ = typ
        self.bidir = bidir
    def __call__(self, xs, ilens):
        """RNNP forward.
        Args:
            xs (chainer.Variable): Batch of padded character ids. (B, Tmax)
            ilens (chainer.Variable): Batch of length of each input batch. (B,)
        Returns:
            xs (chainer.Variable): Subsampled vector of xs.
            chainer.Variable: Subsampled vector of ilens.
        """
        logging.info(self.__class__.__name__ + " input lengths: " + str(ilens))
        for layer in range(self.elayers):
            if "lstm" in self.typ:
                _, _, ys = self[self.rnn_label + str(layer)](None, None, xs)
            else:
                _, ys = self[self.rnn_label + str(layer)](None, xs)
            # ys: utt list of frame x cdim x 2 (2: means bidirectional)
            # TODO(watanabe) replace subsample and FC layer with CNN
            ys, ilens = _subsamplex(ys, self.subsample[layer + 1])
            # (sum _utt frame_utt) x dim
            ys = self["bt" + str(layer)](F.vstack(ys))
            # split the concatenated frames back into per-utterance chunks
            xs = F.split_axis(ys, np.cumsum(ilens[:-1]), axis=0)
        # final tanh operation
        xs = F.split_axis(F.tanh(F.vstack(xs)), np.cumsum(ilens[:-1]), axis=0)
        # 1 utterance case, it becomes an array, so need to make a utt tuple
        if not isinstance(xs, tuple):
            xs = [xs]
        return xs, ilens  # x: utt list of frame x dim
class RNN(chainer.Chain):
    """Multi-layer (bi-)RNN followed by a linear projection and tanh.

    Args:
        idim (int): Dimension of the input.
        elayers (int): Number of encoder layers.
        cdim (int): Number of rnn units.
        hdim (int): Number of projection units.
        dropout (float): Dropout rate.
        typ (str): Rnn type.
    """

    def __init__(self, idim, elayers, cdim, hdim, dropout, typ="lstm"):
        super(RNN, self).__init__()
        is_bidir = typ[0] == "b"
        is_lstm = "lstm" in typ
        if is_bidir:
            rnn_cls = L.NStepBiLSTM if is_lstm else L.NStepBiGRU
        else:
            rnn_cls = L.NStepLSTM if is_lstm else L.NStepGRU
        # bidirectional RNNs emit twice the number of units per frame
        out_units = 2 * cdim if is_bidir else cdim
        with self.init_scope():
            self.nbrnn = rnn_cls(elayers, idim, cdim, dropout)
            self.l_last = L.Linear(out_units, hdim)
        self.typ = typ
        self.bidir = is_bidir

    def __call__(self, xs, ilens):
        """Forward the batch through the RNN and the final projection.

        Args:
            xs (chainer.Variable): Batch of padded character ids. (B, Tmax)
            ilens (chainer.Variable): Batch of length of each input batch. (B,)

        Returns:
            tuple(chainer.Variable): Tuple of `chainer.Variable` objects.
            chainer.Variable: `ilens` .
        """
        logging.info(self.__class__.__name__ + " input lengths: " + str(ilens))
        # lengths must live on the host for np.cumsum below
        ilens = cuda.to_cpu(ilens)
        if "lstm" in self.typ:
            _, _, ys = self.nbrnn(None, None, xs)
        else:
            _, ys = self.nbrnn(None, xs)
        # project the concatenated frames: (sum _utt frame_utt) x dim
        projected = self.l_last(F.vstack(ys))
        boundaries = np.cumsum(ilens[:-1])
        xs = F.split_axis(projected, boundaries, axis=0)
        # final tanh, applied over the re-stacked frames
        xs = F.split_axis(F.tanh(F.vstack(xs)), boundaries, axis=0)
        # a single utterance yields a bare array; wrap it into a list
        if not isinstance(xs, tuple):
            xs = [xs]
        return xs, ilens  # x: utt list of frame x dim
# TODO(watanabe) explanation of VGG2L, VGG2B (Block) might be better
class VGG2L(chainer.Chain):
    """VGG-motivated two-block CNN front-end.

    Args:
        in_channel (int): Number of channels.
    """

    def __init__(self, in_channel=1):
        super(VGG2L, self).__init__()
        with self.init_scope():
            # two VGG-style convolution blocks (each later followed by
            # a 2x2 max pooling in __call__)
            self.conv1_1 = L.Convolution2D(in_channel, 64, 3, stride=1, pad=1)
            self.conv1_2 = L.Convolution2D(64, 64, 3, stride=1, pad=1)
            self.conv2_1 = L.Convolution2D(64, 128, 3, stride=1, pad=1)
            self.conv2_2 = L.Convolution2D(128, 128, 3, stride=1, pad=1)
        self.in_channel = in_channel

    def __call__(self, xs, ilens):
        """VGG2L forward propagation.

        Args:
            xs (chainer.Variable): Batch of padded character ids. (B, Tmax)
            ilens (chainer.Variable): Batch of length of each features. (B,)

        Returns:
            chainer.Variable: Subsampled vector of xs.
            chainer.Variable: Subsampled vector of ilens.
        """
        logging.info(self.__class__.__name__ + " input lengths: " + str(ilens))
        xp = self.xp

        def halve(lengths):
            # one 2x2 max pooling halves the time axis (ceil-rounded)
            return xp.array(
                xp.ceil(xp.array(lengths, dtype=np.float32) / 2), dtype=np.int32
            )

        # utt x frame x dim -> utt x in_channel x frame x (dim // in_channel)
        h = F.pad_sequence(xs)
        h = h.reshape(
            h.shape[0], h.shape[1], self.in_channel, h.shape[2] // self.in_channel
        )
        h = F.swapaxes(h, 1, 2)
        h = F.relu(self.conv1_1(h))
        h = F.relu(self.conv1_2(h))
        h = F.max_pooling_2d(h, 2, stride=2)
        h = F.relu(self.conv2_1(h))
        h = F.relu(self.conv2_2(h))
        h = F.max_pooling_2d(h, 2, stride=2)
        # account for the two poolings in the length information
        ilens = halve(halve(ilens))
        # utt x channel x frame x dim -> utt list of frame x (channel * dim),
        # dropping the zero-padded frames of each utterance
        h = F.swapaxes(h, 1, 2)
        h = h.reshape(h.shape[0], h.shape[1], h.shape[2] * h.shape[3])
        xs = [h[i, : ilens[i], :] for i in range(len(ilens))]
        return xs, ilens
class Encoder(chainer.Chain):
    """Encoder network class.
    Args:
        etype (str): Type of encoder network.
        idim (int): Number of dimensions of encoder network.
        elayers (int): Number of layers of encoder network.
        eunits (int): Number of lstm units of encoder network.
        eprojs (int): Number of projection units of encoder network.
        subsample (np.array): Subsampling number. e.g. 1_2_2_2_1
        dropout (float): Dropout rate.
    """
    def __init__(
        self, etype, idim, elayers, eunits, eprojs, subsample, dropout, in_channel=1
    ):
        """Select and build the encoder: optional VGG front-end + (B)LSTM/(B)GRU."""
        super(Encoder, self).__init__()
        # e.g. "vggblstmp" -> "blstm"
        # NOTE(review): lstrip/rstrip strip characters, not prefixes/suffixes;
        # this is only safe for the supported etype spellings — confirm
        typ = etype.lstrip("vgg").rstrip("p")
        if typ not in ["lstm", "gru", "blstm", "bgru"]:
            logging.error("Error: need to specify an appropriate encoder architecture")
        with self.init_scope():
            if etype.startswith("vgg"):
                # trailing "p" selects the per-layer-projection variant (RNNP)
                if etype[-1] == "p":
                    self.enc = chainer.Sequential(
                        VGG2L(in_channel),
                        RNNP(
                            get_vgg2l_odim(idim, in_channel=in_channel),
                            elayers,
                            eunits,
                            eprojs,
                            subsample,
                            dropout,
                            typ=typ,
                        ),
                    )
                    logging.info("Use CNN-VGG + " + typ.upper() + "P for encoder")
                else:
                    self.enc = chainer.Sequential(
                        VGG2L(in_channel),
                        RNN(
                            get_vgg2l_odim(idim, in_channel=in_channel),
                            elayers,
                            eunits,
                            eprojs,
                            dropout,
                            typ=typ,
                        ),
                    )
                    logging.info("Use CNN-VGG + " + typ.upper() + " for encoder")
                # the VGG front-end downsamples time by 4 (two 2x2 poolings)
                self.conv_subsampling_factor = 4
            else:
                if etype[-1] == "p":
                    self.enc = chainer.Sequential(
                        RNNP(idim, elayers, eunits, eprojs, subsample, dropout, typ=typ)
                    )
                    logging.info(
                        typ.upper() + " with every-layer projection for encoder"
                    )
                else:
                    self.enc = chainer.Sequential(
                        RNN(idim, elayers, eunits, eprojs, dropout, typ=typ)
                    )
                    logging.info(typ.upper() + " without projection for encoder")
                self.conv_subsampling_factor = 1
    def __call__(self, xs, ilens):
        """Encoder forward.
        Args:
            xs (chainer.Variable): Batch of padded character ids. (B, Tmax)
            ilens (chainer.variable): Batch of length of each features. (B,)
        Returns:
            chainer.Variable: Output of the encoder.
            chainer.Variable: (Subsampled) vector of ilens.
        """
        xs, ilens = self.enc(xs, ilens)
        return xs, ilens
def encoder_for(args, idim, subsample):
    """Build the encoder module specified by the program arguments.

    Args:
        args (Namespace): The program arguments.
        idim (int): Dimension of input array.
        subsample (numpy.array): Subsample number. egs).1_2_2_2_1

    Returns:
        chainer.nn.Module: Encoder module.
    """
    return Encoder(
        etype=args.etype,
        idim=idim,
        elayers=args.elayers,
        eunits=args.eunits,
        eprojs=args.eprojs,
        subsample=subsample,
        dropout=args.dropout_rate,
    )
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
import collections
import logging
import math
import numpy as np
# chainer related
from chainer import Variable, cuda, training
from chainer.training.updaters.multiprocess_parallel_updater import (
gather_grads,
gather_params,
scatter_grads,
)
# copied from https://github.com/chainer/chainer/blob/master/chainer/optimizer.py
def sum_sqnorm(arr):
    """Calculate the norm of the array.

    Args:
        arr (numpy.ndarray)

    Returns:
        Float: Sum of the norm calculated from the given array.
    """
    per_device = collections.defaultdict(float)
    for grad in arr:
        # select the device owning this array before touching its data
        with cuda.get_device_from_array(grad) as dev:
            if grad is not None:
                flat = grad.ravel()
                per_device[int(dev)] += flat.dot(flat)
    return sum(float(v) for v in per_device.values())
class CustomUpdater(training.StandardUpdater):
    """Custom updater for chainer.
    Args:
        train_iter (iterator | dict[str, iterator]): Dataset iterator for the
            training dataset. It can also be a dictionary that maps strings to
            iterators. If this is just an iterator, then the iterator is
            registered by the name ``'main'``.
        optimizer (optimizer | dict[str, optimizer]): Optimizer to update
            parameters. It can also be a dictionary that maps strings to
            optimizers. If this is just an optimizer, then the optimizer is
            registered by the name ``'main'``.
        converter (espnet.asr.chainer_backend.asr.CustomConverter): Converter
            function to build input arrays. Each batch extracted by the main
            iterator and the ``device`` option are passed to this function.
            :func:`chainer.dataset.concat_examples` is used by default.
        device (int or dict): The destination device info to send variables. In the
            case of cpu or single gpu, `device=-1 or 0`, respectively.
            In the case of multi-gpu, `device={"main":0, "sub_1": 1, ...}`.
        accum_grad (int):The number of gradient accumulation. if set to 2, the network
            parameters will be updated once in twice,
            i.e. actual batchsize will be doubled.
    """
    def __init__(self, train_iter, optimizer, converter, device, accum_grad=1):
        """Initialize custom updater."""
        super(CustomUpdater, self).__init__(
            train_iter, optimizer, converter=converter, device=device
        )
        self.forward_count = 0
        self.accum_grad = accum_grad
        self.start = True
        # To solve #1091, it is required to set the variable inside this class.
        self.device = device
    # The core part of the update routine can be customized by overriding.
    def update_core(self):
        """Main update routine for Custom Updater."""
        train_iter = self.get_iterator("main")
        optimizer = self.get_optimizer("main")
        # Get batch and convert into variables
        batch = train_iter.next()
        x = self.converter(batch, self.device)
        if self.start:
            # clear stale gradients once before the very first accumulation
            optimizer.target.cleargrads()
            self.start = False
        # Compute the loss at this time step and accumulate it
        loss = optimizer.target(*x) / self.accum_grad
        loss.backward()  # Backprop
        loss.unchain_backward()  # Truncate the graph
        # update parameters
        self.forward_count += 1
        if self.forward_count != self.accum_grad:
            # keep accumulating gradients until accum_grad forwards are done
            return
        self.forward_count = 0
        # compute the gradient norm to check if it is normal or not
        grad_norm = np.sqrt(
            sum_sqnorm([p.grad for p in optimizer.target.params(False)])
        )
        logging.info("grad norm={}".format(grad_norm))
        if math.isnan(grad_norm):
            logging.warning("grad norm is nan. Do not update model.")
        else:
            optimizer.update()
        optimizer.target.cleargrads()  # Clear the parameter gradients
    def update(self):
        """Run one update step and advance the iteration counter."""
        self.update_core()
        # the counter moves only when a real optimizer step happened
        if self.forward_count == 0:
            self.iteration += 1
class CustomParallelUpdater(training.updaters.MultiprocessParallelUpdater):
    """Custom Parallel Updater for chainer.
    Defines the main update routine.
    Args:
        train_iter (iterator | dict[str, iterator]): Dataset iterator for the
            training dataset. It can also be a dictionary that maps strings to
            iterators. If this is just an iterator, then the iterator is
            registered by the name ``'main'``.
        optimizer (optimizer | dict[str, optimizer]): Optimizer to update
            parameters. It can also be a dictionary that maps strings to
            optimizers. If this is just an optimizer, then the optimizer is
            registered by the name ``'main'``.
        converter (espnet.asr.chainer_backend.asr.CustomConverter): Converter
            function to build input arrays. Each batch extracted by the main
            iterator and the ``device`` option are passed to this function.
            :func:`chainer.dataset.concat_examples` is used by default.
        device (torch.device): Device to which the training data is sent.
            Negative value
            indicates the host memory (CPU).
        accum_grad (int):The number of gradient accumulation. if set to 2,
            the network parameters will be updated once in twice,
            i.e. actual batchsize will be doubled.
    """
    def __init__(self, train_iters, optimizer, converter, devices, accum_grad=1):
        """Initialize custom parallel updater."""
        super(CustomParallelUpdater, self).__init__(
            train_iters, optimizer, converter=converter, devices=devices
        )
        # imported here so that the module works without cupy installed
        from cupy.cuda import nccl
        self.accum_grad = accum_grad
        self.forward_count = 0
        self.nccl = nccl
    # The core part of the update routine can be customized by overriding.
    def update_core(self):
        """Main Update routine of the custom parallel updater."""
        self.setup_workers()
        self._send_message(("update", None))
        with cuda.Device(self._devices[0]):
            # For reducing memory
            optimizer = self.get_optimizer("main")
            batch = self.get_iterator("main").next()
            x = self.converter(batch, self._devices[0])
            loss = self._master(*x) / self.accum_grad
            loss.backward()
            loss.unchain_backward()
            # NCCL: reduce grads
            null_stream = cuda.Stream.null
            if self.comm is not None:
                # sum worker gradients into the master copy
                gg = gather_grads(self._master)
                self.comm.reduce(
                    gg.data.ptr,
                    gg.data.ptr,
                    gg.size,
                    self.nccl.NCCL_FLOAT,
                    self.nccl.NCCL_SUM,
                    0,
                    null_stream.ptr,
                )
                scatter_grads(self._master, gg)
                del gg
            # update parameters
            self.forward_count += 1
            if self.forward_count != self.accum_grad:
                # keep accumulating gradients until accum_grad forwards are done
                return
            self.forward_count = 0
            # check gradient value
            grad_norm = np.sqrt(
                sum_sqnorm([p.grad for p in optimizer.target.params(False)])
            )
            logging.info("grad norm={}".format(grad_norm))
            # update
            if math.isnan(grad_norm):
                logging.warning("grad norm is nan. Do not update model.")
            else:
                optimizer.update()
            self._master.cleargrads()
            if self.comm is not None:
                # broadcast the updated parameters back to the workers
                gp = gather_params(self._master)
                self.comm.bcast(
                    gp.data.ptr, gp.size, self.nccl.NCCL_FLOAT, 0, null_stream.ptr
                )
    def update(self):
        """Run one update step and advance the iteration counter."""
        self.update_core()
        # the counter moves only when a real optimizer step happened
        if self.forward_count == 0:
            self.iteration += 1
class CustomConverter(object):
    """Batch converter that subsamples inputs and moves them to a device.

    Args:
        subsampling_factor (int): The subsampling factor.
    """

    def __init__(self, subsampling_factor=1):
        self.subsampling_factor = subsampling_factor

    def __call__(self, batch, device):
        """Convert a mini-batch into device variables.

        Args:
            batch (list): Batch that will be subsampled.
            device (device): GPU device (-1 selects the CPU).

        Returns:
            chainer.Variable: xp.array that subsampled from batch.
            xp.array: xp.array of the length of the mini-batches.
            chainer.Variable: xp.array that subsampled from batch.
        """
        # pick the array module matching the target device
        xp = np if device == -1 else cuda.cupy
        # the batch is expected to be wrapped in a one-element list
        assert len(batch) == 1
        xs, ys = batch[0]
        factor = self.subsampling_factor
        if factor > 1:
            # drop frames along the time axis
            xs = [x[::factor, :] for x in xs]
        # lengths of the (possibly subsampled) input sequences
        ilens = xp.array([x.shape[0] for x in xs], dtype=xp.int32)
        # convert to Variable
        xs = [Variable(xp.array(x, dtype=xp.float32)) for x in xs]
        ys = [Variable(xp.array(y, dtype=xp.int32)) for y in ys]
        return xs, ilens, ys
# encoding: utf-8
"""Class Declaration of Transformer's Attention."""
import chainer
import chainer.functions as F
import chainer.links as L
import numpy as np
# most negative float32; used to suppress masked attention scores before softmax
MIN_VALUE = float(np.finfo(np.float32).min)
class MultiHeadAttention(chainer.Chain):
    """Multi Head Attention Layer.
    Args:
        n_units (int): Number of input units.
        h (int): Number of attention heads.
        dropout (float): Dropout rate.
        initialW: Initializer to initialize the weight.
        initial_bias: Initializer to initialize the bias.
    :param int h: the number of heads
    :param int n_units: the number of features
    :param float dropout_rate: dropout rate
    """
    def __init__(self, n_units, h=8, dropout=0.1, initialW=None, initial_bias=None):
        """Initialize MultiHeadAttention."""
        # NOTE(review): initialW/initial_bias are called below as initializer
        # factories (initialW(scale=stvd)), so the None defaults would raise
        # TypeError — callers must always pass factories. Confirm intent.
        super(MultiHeadAttention, self).__init__()
        assert n_units % h == 0
        stvd = 1.0 / np.sqrt(n_units)
        with self.init_scope():
            self.linear_q = L.Linear(
                n_units,
                n_units,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )
            self.linear_k = L.Linear(
                n_units,
                n_units,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )
            self.linear_v = L.Linear(
                n_units,
                n_units,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )
            self.linear_out = L.Linear(
                n_units,
                n_units,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )
        self.d_k = n_units // h  # per-head feature dimension
        self.h = h
        self.dropout = dropout
        self.attn = None  # attention weights of the last forward call
    def forward(self, e_var, s_var=None, mask=None, batch=1):
        """Core function of the Multi-head attention layer.
        Args:
            e_var (chainer.Variable): Variable of input array.
            s_var (chainer.Variable): Variable of source array from encoder.
            mask (chainer.Variable): Attention mask.
            batch (int): Batch size.
        Returns:
            chainer.Variable: Outout of multi-head attention layer.
        """
        xp = self.xp
        if s_var is None:
            # self-attention: queries, keys and values all come from e_var
            # batch, head, time1/2, d_k)
            Q = self.linear_q(e_var).reshape(batch, -1, self.h, self.d_k)
            K = self.linear_k(e_var).reshape(batch, -1, self.h, self.d_k)
            V = self.linear_v(e_var).reshape(batch, -1, self.h, self.d_k)
        else:
            # cross-attention: keys and values come from the source s_var
            Q = self.linear_q(e_var).reshape(batch, -1, self.h, self.d_k)
            K = self.linear_k(s_var).reshape(batch, -1, self.h, self.d_k)
            V = self.linear_v(s_var).reshape(batch, -1, self.h, self.d_k)
        # scaled dot product: (b, h, t1, d_k) x (b, h, d_k, t2) -> (b, h, t1, t2)
        scores = F.matmul(F.swapaxes(Q, 1, 2), K.transpose(0, 2, 3, 1)) / np.sqrt(
            self.d_k
        )
        if mask is not None:
            # replicate the mask over heads and suppress masked positions
            mask = xp.stack([mask] * self.h, axis=1)
            scores = F.where(mask, scores, xp.full(scores.shape, MIN_VALUE, "f"))
        self.attn = F.softmax(scores, axis=-1)
        p_attn = F.dropout(self.attn, self.dropout)
        x = F.matmul(p_attn, F.swapaxes(V, 1, 2))
        # merge the heads back: (batch * time1) x (h * d_k)
        x = F.swapaxes(x, 1, 2).reshape(-1, self.h * self.d_k)
        return self.linear_out(x)
# encoding: utf-8
"""Class Declaration of Transformer's CTC."""
import logging
import chainer
import chainer.functions as F
import chainer.links as L
import numpy as np
# TODO(nelson): Merge chainer_backend/transformer/ctc.py in chainer_backend/ctc.py
class CTC(chainer.Chain):
    """Chainer implementation of a CTC output layer.

    Args:
        odim (int): The output dimension.
        eprojs (int | None): Dimension of input vectors from encoder.
        dropout_rate (float): Dropout rate.
    """

    def __init__(self, odim, eprojs, dropout_rate):
        """Initialize CTC."""
        super(CTC, self).__init__()
        self.dropout_rate = dropout_rate
        self.loss = None
        with self.init_scope():
            # single linear projection from encoder space to the label space
            self.ctc_lo = L.Linear(eprojs, odim)

    def __call__(self, hs, ys):
        """Compute the CTC loss.

        Args:
            hs (list of chainer.Variable | N-dimension array):
                Input variable from encoder.
            ys (list of chainer.Variable | N-dimension array):
                Input variable of decoder.

        Returns:
            chainer.Variable: A variable holding a scalar value of the CTC loss.
        """
        self.loss = None
        input_lengths = [h.shape[0] for h in hs]
        output_lengths = [y.shape[0] for y in ys]
        # zero-pad, apply dropout, then project every frame to label logits
        logits = self.ctc_lo(
            F.dropout(F.pad_sequence(hs), ratio=self.dropout_rate), n_batch_axes=2
        )
        logits = F.separate(logits, axis=1)  # ilen list of batch x hdim
        # pad the targets with -1: batch x olen
        y_true = F.pad_sequence(ys, padding=-1)
        input_length = chainer.Variable(self.xp.array(input_lengths, dtype=np.int32))
        label_length = chainer.Variable(self.xp.array(output_lengths, dtype=np.int32))
        logging.info(
            self.__class__.__name__ + " input lengths: " + str(input_length.data)
        )
        logging.info(
            self.__class__.__name__ + " output lengths: " + str(label_length.data)
        )
        # blank symbol index 0 is passed as the third argument
        self.loss = F.connectionist_temporal_classification(
            logits, y_true, 0, input_length, label_length
        )
        logging.info("ctc loss:" + str(self.loss.data))
        return self.loss

    def log_softmax(self, hs):
        """Log_softmax of frame activations.

        Args:
            hs (list of chainer.Variable | N-dimension array):
                Input variable from encoder.

        Returns:
            chainer.Variable: A n-dimension float array.
        """
        logits = self.ctc_lo(F.pad_sequence(hs), n_batch_axes=2)
        return F.log_softmax(logits.reshape(-1, logits.shape[-1])).reshape(logits.shape)
# encoding: utf-8
"""Class Declaration of Transformer's Decoder."""
import chainer
import chainer.functions as F
import chainer.links as L
import numpy as np
from espnet.nets.chainer_backend.transformer.decoder_layer import DecoderLayer
from espnet.nets.chainer_backend.transformer.embedding import PositionalEncoding
from espnet.nets.chainer_backend.transformer.layer_norm import LayerNorm
from espnet.nets.chainer_backend.transformer.mask import make_history_mask
class Decoder(chainer.Chain):
    """Transformer decoder: embedding + positional encoding + decoder blocks.

    Args:
        odim (int): The output dimension.
        args (Namespace): Training config providing ``adim``, ``dunits``,
            ``aheads``, ``dlayers`` and ``dropout_rate``.
        initialW (Initializer): Initializer to initialize the weight.
        initial_bias (Initializer): Initializer to initialize the bias.
    """

    def __init__(self, odim, args, initialW=None, initial_bias=None):
        """Initialize Decoder."""
        super(Decoder, self).__init__()
        # the last vocabulary id doubles as both sos and eos
        self.sos = odim - 1
        self.eos = odim - 1
        initialW = chainer.initializers.Uniform if initialW is None else initialW
        initial_bias = (
            chainer.initializers.Uniform if initial_bias is None else initial_bias
        )
        with self.init_scope():
            self.output_norm = LayerNorm(args.adim)
            self.pe = PositionalEncoding(args.adim, args.dropout_rate)
            # scale initializers by 1/sqrt(fan-in)
            stvd = 1.0 / np.sqrt(args.adim)
            self.output_layer = L.Linear(
                args.adim,
                odim,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )
            self.embed = L.EmbedID(
                odim,
                args.adim,
                ignore_label=-1,
                initialW=chainer.initializers.Normal(scale=1.0),
            )
            for i in range(args.dlayers):
                name = "decoders." + str(i)
                layer = DecoderLayer(
                    args.adim,
                    d_units=args.dunits,
                    h=args.aheads,
                    dropout=args.dropout_rate,
                    initialW=initialW,
                    initial_bias=initial_bias,
                )
                self.add_link(name, layer)
        self.n_layers = args.dlayers

    def make_attention_mask(self, source_block, target_block):
        """Prepare the attention mask.

        Args:
            source_block (ndarray): Source block with dimensions: (B x S).
            target_block (ndarray): Target block with dimensions: (B x T).

        Returns:
            ndarray: Mask with dimensions (B, S, T); nonzero/True where both
            source and target positions hold valid (>= 0) ids.
        """
        mask = (target_block[:, None, :] >= 0) * (source_block[:, :, None] >= 0)
        # (batch, source_length, target_length)
        return mask

    def forward(self, ys_pad, source, x_mask):
        """Forward decoder.

        Args:
            ys_pad: list/batch of target token id arrays (each 1-D, int).
            source (chainer.Variable): Encoded memory,
                float32 (batch, maxlen_in, feat).
            x_mask (ndarray): Encoded memory mask (batch, maxlen_in),
                with padded positions marked negative.

        Returns:
            chainer.Variable: Decoded token scores before softmax
            (batch, maxlen_out, token).
        """
        xp = self.xp
        # prepend <sos> to every target sequence (teacher-forcing input)
        sos = np.array([self.sos], np.int32)
        ys = [np.concatenate([sos, y], axis=0) for y in ys_pad]
        e = F.pad_sequence(ys, padding=self.eos).data
        e = xp.array(e)
        # mask preparation
        xy_mask = self.make_attention_mask(e, xp.array(x_mask))
        yy_mask = self.make_attention_mask(e, e)
        # forbid attending to future target positions
        yy_mask *= make_history_mask(xp, e)
        e = self.pe(self.embed(e))
        batch, length, dims = e.shape
        # decoder blocks operate on flattened (batch * time, dims) arrays
        e = e.reshape(-1, dims)
        source = source.reshape(-1, dims)
        for i in range(self.n_layers):
            e = self["decoders." + str(i)](e, source, xy_mask, yy_mask, batch)
        return self.output_layer(self.output_norm(e)).reshape(batch, length, -1)

    def recognize(self, e, yy_mask, source):
        """Process recognition function."""
        # NOTE(review): positionally this forwards (ys_pad=e, source=source,
        # x_mask=yy_mask) — confirm callers pass arguments in this order
        e = self.forward(e, source, yy_mask)
        return F.log_softmax(e, axis=-1)
# encoding: utf-8
"""Class Declaration of Transformer's Decoder Block."""
import chainer
import chainer.functions as F
from espnet.nets.chainer_backend.transformer.attention import MultiHeadAttention
from espnet.nets.chainer_backend.transformer.layer_norm import LayerNorm
from espnet.nets.chainer_backend.transformer.positionwise_feed_forward import (
PositionwiseFeedForward,
)
class DecoderLayer(chainer.Chain):
    """Single transformer decoder block.

    Pre-norm masked self-attention, encoder-decoder attention, and a
    position-wise feed-forward network, each with a residual connection.

    Args:
        n_units (int): Number of input/output dimension of a FeedForward layer.
        d_units (int): Number of units of hidden layer in a FeedForward layer.
        h (int): Number of attention heads.
        dropout (float): Dropout rate.
    """

    def __init__(
        self, n_units, d_units=0, h=8, dropout=0.1, initialW=None, initial_bias=None
    ):
        """Initialize DecoderLayer."""
        super(DecoderLayer, self).__init__()
        # shared keyword arguments for the sub-layer constructors
        common = dict(dropout=dropout, initialW=initialW, initial_bias=initial_bias)
        with self.init_scope():
            self.self_attn = MultiHeadAttention(n_units, h, **common)
            self.src_attn = MultiHeadAttention(n_units, h, **common)
            self.feed_forward = PositionwiseFeedForward(
                n_units, d_units=d_units, **common
            )
            self.norm1 = LayerNorm(n_units)
            self.norm2 = LayerNorm(n_units)
            self.norm3 = LayerNorm(n_units)
        self.dropout = dropout

    def forward(self, e, s, xy_mask, yy_mask, batch):
        """Compute one decoder block.

        Args:
            e (chainer.Variable): Batch of padded features. (B, Lmax)
            s (chainer.Variable): Batch of padded character. (B, Tmax)

        Returns:
            chainer.Variable: Computed variable of decoder.
        """
        # masked self-attention sub-layer (pre-norm + residual)
        branch = self.self_attn(self.norm1(e), mask=yy_mask, batch=batch)
        e = e + F.dropout(branch, self.dropout)
        # encoder-decoder attention sub-layer
        branch = self.src_attn(self.norm2(e), s_var=s, mask=xy_mask, batch=batch)
        e = e + F.dropout(branch, self.dropout)
        # position-wise feed-forward sub-layer
        branch = self.feed_forward(self.norm3(e))
        return e + F.dropout(branch, self.dropout)
# encoding: utf-8
"""Class Declaration of Transformer's Positional Encoding."""
import chainer
import chainer.functions as F
import numpy as np
class PositionalEncoding(chainer.Chain):
    """Sinusoidal positional encoding with input scaling and dropout.

    :param int n_units: embedding dim
    :param float dropout: dropout rate
    :param int length: maximum input length
    """

    def __init__(self, n_units, dropout=0.1, length=5000):
        """Initialize Positional Encoding."""
        # Implementation described in "Attention Is All You Need"
        super(PositionalEncoding, self).__init__()
        self.dropout = dropout
        # position index column vector: (length, 1)
        positions = np.arange(0, length, dtype=np.float32)[:, None]
        # per-dimension inverse frequencies: (n_units / 2,)
        inv_freq = np.exp(
            np.arange(0, n_units, 2, dtype=np.float32) * -(np.log(10000.0) / n_units)
        )
        table = np.zeros((length, n_units), dtype=np.float32)
        table[:, 0::2] = np.sin(positions * inv_freq)  # even dims: sine
        table[:, 1::2] = np.cos(positions * inv_freq)  # odd dims: cosine
        self.pe = table
        self.scale = np.sqrt(n_units)

    def forward(self, e):
        """Scale the input, add positional encodings and apply dropout."""
        n_frames = e.shape[1]
        pos = self.xp.array(self.pe[:n_frames])
        return F.dropout(e * self.scale + pos, self.dropout)
# encoding: utf-8
"""Class Declaration of Transformer's Encoder."""
import logging
import chainer
import numpy as np
from chainer import links as L
from espnet.nets.chainer_backend.transformer.embedding import PositionalEncoding
from espnet.nets.chainer_backend.transformer.encoder_layer import EncoderLayer
from espnet.nets.chainer_backend.transformer.layer_norm import LayerNorm
from espnet.nets.chainer_backend.transformer.mask import make_history_mask
from espnet.nets.chainer_backend.transformer.subsampling import (
Conv2dSubsampling,
LinearSampling,
)
class Encoder(chainer.Chain):
    """Transformer encoder: input sampling layer + stacked encoder blocks.

    Args:
        idim (int): Dimension of inputs.
        attention_dim (int): Input/output dimension of the encoder blocks.
        attention_heads (int): Number of attention heads.
        linear_units (int): Hidden units of each position-wise feed forward.
        num_blocks (int): Number of encoder layers.
        dropout_rate (float): Dropout rate of the input layer.
        positional_dropout_rate (float): Dropout rate of the positional
            encoding (used only with the ``embed`` input layer).
        attention_dropout_rate (float): Dropout rate inside encoder blocks.
        input_layer (str): ``conv2d``, ``linear`` or ``embed``.
        pos_enc_class: Positional encoding class for the ``embed`` input layer.
        initialW (Initializer): Initializer to initialize the weight.
        initial_bias (Initializer): Initializer to initialize the bias.
    """

    def __init__(
        self,
        idim,
        attention_dim=256,
        attention_heads=4,
        linear_units=2048,
        num_blocks=6,
        dropout_rate=0.1,
        positional_dropout_rate=0.1,
        attention_dropout_rate=0.0,
        input_layer="conv2d",
        pos_enc_class=PositionalEncoding,
        initialW=None,
        initial_bias=None,
    ):
        """Initialize Encoder.

        Args:
            idim (int): Input dimension.
            initialW (Initializer, optional): Initializer to initialize
                the weight.
            initial_bias (Initializer, optional): Initializer to initialize
                the bias.
        """
        super(Encoder, self).__init__()
        initialW = chainer.initializers.Uniform if initialW is None else initialW
        initial_bias = (
            chainer.initializers.Uniform if initial_bias is None else initial_bias
        )
        self.do_history_mask = False
        with self.init_scope():
            self.conv_subsampling_factor = 1
            channels = 64  # Based in paper
            if input_layer == "conv2d":
                # after two stride-2 convolutions the frequency axis is
                # reduced by ~4 and then expanded by `channels` feature maps
                idim = int(np.ceil(np.ceil(idim / 2) / 2)) * channels
                self.input_layer = Conv2dSubsampling(
                    channels,
                    idim,
                    attention_dim,
                    dropout=dropout_rate,
                    initialW=initialW,
                    initial_bias=initial_bias,
                )
                self.conv_subsampling_factor = 4
            elif input_layer == "linear":
                self.input_layer = LinearSampling(
                    idim, attention_dim, initialW=initialW, initial_bias=initial_bias
                )
            elif input_layer == "embed":
                # token-id inputs (LM-style); requires causal masking
                self.input_layer = chainer.Sequential(
                    L.EmbedID(idim, attention_dim, ignore_label=-1),
                    pos_enc_class(attention_dim, positional_dropout_rate),
                )
                self.do_history_mask = True
            else:
                raise ValueError("unknown input_layer: " + input_layer)
            self.norm = LayerNorm(attention_dim)
            for i in range(num_blocks):
                name = "encoders." + str(i)
                layer = EncoderLayer(
                    attention_dim,
                    d_units=linear_units,
                    h=attention_heads,
                    dropout=attention_dropout_rate,
                    initialW=initialW,
                    initial_bias=initial_bias,
                )
                self.add_link(name, layer)
            self.n_layers = num_blocks

    def forward(self, e, ilens):
        """Compute Encoder layer.

        Args:
            e (chainer.Variable): Batch of padded character. (B, Tmax)
            ilens (chainer.Variable): Batch of length of each input batch. (B,)

        Returns:
            chainer.Variable: Computed variable of encoder.
            numpy.array: Mask.
            chainer.Variable: Batch of lengths of each encoder outputs.
        """
        if isinstance(self.input_layer, Conv2dSubsampling):
            # conv subsampling also shortens ilens accordingly
            e, ilens = self.input_layer(e, ilens)
        else:
            e = self.input_layer(e)
        batch, length, dims = e.shape
        # mark padded frames with -1 so masks can be derived via `>= 0`
        x_mask = np.ones([batch, length])
        for j in range(batch):
            x_mask[j, ilens[j] :] = -1
        xx_mask = (x_mask[:, None, :] >= 0) * (x_mask[:, :, None] >= 0)
        xx_mask = self.xp.array(xx_mask)
        if self.do_history_mask:
            # causal masking for the `embed` (LM-style) input layer
            history_mask = make_history_mask(self.xp, x_mask)
            xx_mask *= history_mask
        logging.debug("encoders size: " + str(e.shape))
        # encoder blocks operate on flattened (batch * time, dims) arrays
        e = e.reshape(-1, dims)
        for i in range(self.n_layers):
            e = self["encoders." + str(i)](e, xx_mask, batch)
        return self.norm(e).reshape(batch, length, -1), x_mask, ilens
# encoding: utf-8
"""Class Declaration of Transformer's Encoder Block."""
import chainer
import chainer.functions as F
from espnet.nets.chainer_backend.transformer.attention import MultiHeadAttention
from espnet.nets.chainer_backend.transformer.layer_norm import LayerNorm
from espnet.nets.chainer_backend.transformer.positionwise_feed_forward import (
PositionwiseFeedForward,
)
class EncoderLayer(chainer.Chain):
    """Single transformer encoder block.

    Pre-norm self-attention and a position-wise feed-forward network,
    each with a residual connection.

    Args:
        n_units (int): Number of input/output dimension of a FeedForward layer.
        d_units (int): Number of units of hidden layer in a FeedForward layer.
        h (int): Number of attention heads.
        dropout (float): Dropout rate.
    """

    def __init__(
        self, n_units, d_units=0, h=8, dropout=0.1, initialW=None, initial_bias=None
    ):
        """Initialize EncoderLayer."""
        super(EncoderLayer, self).__init__()
        # shared keyword arguments for the sub-layer constructors
        common = dict(dropout=dropout, initialW=initialW, initial_bias=initial_bias)
        with self.init_scope():
            self.self_attn = MultiHeadAttention(n_units, h, **common)
            self.feed_forward = PositionwiseFeedForward(
                n_units, d_units=d_units, **common
            )
            self.norm1 = LayerNorm(n_units)
            self.norm2 = LayerNorm(n_units)
        self.dropout = dropout
        self.n_units = n_units

    def forward(self, e, xx_mask, batch):
        """Apply the self-attention and feed-forward sub-layers to ``e``."""
        # self-attention sub-layer (pre-norm + residual)
        branch = self.self_attn(self.norm1(e), mask=xx_mask, batch=batch)
        e = e + F.dropout(branch, self.dropout)
        # position-wise feed-forward sub-layer
        branch = self.feed_forward(self.norm2(e))
        return e + F.dropout(branch, self.dropout)
# encoding: utf-8
"""Class Declaration of Transformer's Label Smootion loss."""
import logging
import chainer
import chainer.functions as F
class LabelSmoothingLoss(chainer.Chain):
    """Label Smoothing Loss.

    Args:
        smoothing (float): smoothing rate (0.0 means the conventional CE).
        n_target_vocab (int): number of classes.
        normalize_length (bool): normalize loss by sequence length if True.
        ignore_id (int): target id treated as padding and excluded from
            the loss.
    """

    def __init__(self, smoothing, n_target_vocab, normalize_length=False, ignore_id=-1):
        """Initialize Loss."""
        super(LabelSmoothingLoss, self).__init__()
        self.use_label_smoothing = False
        if smoothing > 0.0:
            logging.info("Use label smoothing")
            # self.smoothing / self.confidence only exist (and are only
            # read) when label smoothing is enabled
            self.smoothing = smoothing
            self.confidence = 1.0 - smoothing
            self.use_label_smoothing = True
        self.n_target_vocab = n_target_vocab
        self.normalize_length = normalize_length
        self.ignore_id = ignore_id
        # accuracy placeholder; never updated inside this class
        self.acc = None

    def forward(self, ys_block, ys_pad):
        """Forward Loss.

        Args:
            ys_block (chainer.Variable): Predicted logits
                (batch, length, dims).
            ys_pad (chainer.Variable): Target (true) labels (batch, length).

        Returns:
            chainer.Variable: Training loss (scalar).
        """
        # Output (all together at once for efficiency)
        batch, length, dims = ys_block.shape
        concat_logit_block = ys_block.reshape(-1, dims)
        # Target reshape
        concat_t_block = ys_pad.reshape((batch * length))
        # positions with negative targets (padding) do not contribute
        ignore_mask = concat_t_block >= 0
        n_token = ignore_mask.sum()
        normalizer = n_token if self.normalize_length else batch
        if not self.use_label_smoothing:
            loss = F.softmax_cross_entropy(concat_logit_block, concat_t_block)
            # rescale from per-token mean to the chosen normalizer
            loss = loss * n_token / normalizer
        else:
            log_prob = F.log_softmax(concat_logit_block)
            broad_ignore_mask = self.xp.broadcast_to(
                ignore_mask[:, None], concat_logit_block.shape
            )
            # masked log-likelihood of the true labels
            pre_loss = (
                ignore_mask * log_prob[self.xp.arange(batch * length), concat_t_block]
            )
            loss = -F.sum(pre_loss) / normalizer
            # uniform-distribution term of the smoothed target
            label_smoothing = broad_ignore_mask * -1.0 / self.n_target_vocab * log_prob
            label_smoothing = F.sum(label_smoothing) / normalizer
            # interpolate between the hard-target and uniform terms
            loss = self.confidence * loss + self.smoothing * label_smoothing
        return loss
# encoding: utf-8
"""Class Declaration of Transformer's Label Smootion loss."""
import chainer.links as L
class LayerNorm(L.LayerNormalization):
    """Thin wrapper redirecting to ``L.LayerNormalization``."""

    def __init__(self, dims, eps=1e-12):
        """Initialize LayerNorm with feature size ``dims`` and ``eps``."""
        super().__init__(size=dims, eps=eps)

    def __call__(self, e):
        """Apply layer normalization to ``e``."""
        return super().__call__(e)
"""Create mask for subsequent steps."""
def make_history_mask(xp, block):
    """Build a lower-triangular (causal) mask for decoder self-attention.

    Args:
        xp: Array module (e.g. numpy or cupy).
        block (ndarray): Block with dimensions: (B x S).

    Returns:
        ndarray: History mask with dimensions (B, S, S); entry (b, i, j)
        is True iff position j is not later than position i.
    """
    batch, length = block.shape
    steps = xp.arange(length)
    # (1, S, S) causal pattern, broadcast over the batch axis
    causal = (steps[None] <= steps[:, None])[None]
    return xp.broadcast_to(causal, (batch, length, length))
# encoding: utf-8
"""Class Declaration of Transformer's Positionwise Feedforward."""
import chainer
import chainer.functions as F
import chainer.links as L
import numpy as np
class PositionwiseFeedForward(chainer.Chain):
    """Position-wise two-layer feed-forward network (linear-ReLU-linear).

    Args:
        :param int idim: input dimenstion
        :param int hidden_units: number of hidden units
        :param float dropout_rate: dropout rate
    """

    def __init__(
        self, n_units, d_units=0, dropout=0.1, initialW=None, initial_bias=None
    ):
        """Initialize PositionwiseFeedForward.

        Args:
            n_units (int): Input dimension.
            d_units (int, optional): Output dimension of hidden layer;
                defaults to ``4 * n_units`` when not positive.
            dropout (float, optional): Dropout ratio.
            initialW (Initializer, optional): Initializer to initialize
                the weight.
            initial_bias (Initializer, optional): Initializer to initialize
                the bias.
        """
        super(PositionwiseFeedForward, self).__init__()
        inner = d_units if d_units > 0 else n_units * 4
        # scale each layer's initializer by 1/sqrt(fan-in)
        scale_in = 1.0 / np.sqrt(n_units)
        scale_hidden = 1.0 / np.sqrt(inner)
        with self.init_scope():
            self.w_1 = L.Linear(
                n_units,
                inner,
                initialW=initialW(scale=scale_in),
                initial_bias=initial_bias(scale=scale_in),
            )
            self.w_2 = L.Linear(
                inner,
                n_units,
                initialW=initialW(scale=scale_hidden),
                initial_bias=initial_bias(scale=scale_hidden),
            )
        self.act = F.relu
        self.dropout = dropout

    def __call__(self, e):
        """Apply the feed-forward network.

        Args:
            e (chainer.Variable): Input variable.

        Return:
            chainer.Variable: Output variable.
        """
        hidden = F.dropout(self.act(self.w_1(e)), self.dropout)
        return self.w_2(hidden)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment