Commit 60a2c57a authored by sunzhq2, committed by xuxo
Browse files

update conformer

parent 4a699441
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""RNN sequence-to-sequence speech recognition model (chainer)."""
import logging
import math
import chainer
import numpy as np
from chainer import reporter
from espnet.nets.chainer_backend.asr_interface import ChainerASRInterface
from espnet.nets.chainer_backend.ctc import ctc_for
from espnet.nets.chainer_backend.rnn.attentions import att_for
from espnet.nets.chainer_backend.rnn.decoders import decoder_for
from espnet.nets.chainer_backend.rnn.encoders import encoder_for
from espnet.nets.e2e_asr_common import label_smoothing_dist
from espnet.nets.pytorch_backend.e2e_asr import E2E as E2E_pytorch
from espnet.nets.pytorch_backend.nets_utils import get_subsample
CTC_LOSS_THRESHOLD = 10000
class E2E(ChainerASRInterface):
    """E2E module for chainer backend.

    Hybrid CTC/attention RNN speech-recognition model; the two losses are
    interpolated with weight ``mtlalpha``.

    Args:
        idim (int): Dimension of the inputs.
        odim (int): Dimension of the outputs.
        args (parser.args): Training config.
        flag_return (bool): If True, train() would return
            additional metrics in addition to the training
            loss.
    """

    @staticmethod
    def add_arguments(parser):
        """Add arguments."""
        # Reuse the pytorch-backend definitions so both backends accept
        # identical command-line flags.
        return E2E_pytorch.add_arguments(parser)

    def get_total_subsampling_factor(self):
        """Get total subsampling factor."""
        # Convolutional subsampling inside the encoder multiplied by the
        # frame-skipping factors configured in self.subsample.
        return self.enc.conv_subsampling_factor * int(np.prod(self.subsample))

    def __init__(self, idim, odim, args, flag_return=True):
        """Construct an E2E object.

        :param int idim: dimension of inputs
        :param int odim: dimension of outputs
        :param Namespace args: argument Namespace containing options
        """
        chainer.Chain.__init__(self)
        # weight of the CTC loss in the CTC/attention multi-task objective
        self.mtlalpha = args.mtlalpha
        assert 0 <= self.mtlalpha <= 1, "mtlalpha must be [0,1]"
        self.etype = args.etype
        self.verbose = args.verbose
        self.char_list = args.char_list
        self.outdir = args.outdir
        # below means the last number becomes eos/sos ID
        # note that sos/eos IDs are identical
        self.sos = odim - 1
        self.eos = odim - 1
        # subsample info
        self.subsample = get_subsample(args, mode="asr", arch="rnn")
        # label smoothing info
        if args.lsm_type:
            logging.info("Use label smoothing with " + args.lsm_type)
            labeldist = label_smoothing_dist(
                odim, args.lsm_type, transcript=args.train_json
            )
        else:
            labeldist = None
        # register sub-links so their parameters are tracked by chainer
        with self.init_scope():
            # encoder
            self.enc = encoder_for(args, idim, self.subsample)
            # ctc
            self.ctc = ctc_for(args, odim)
            # attention
            self.att = att_for(args)
            # decoder
            self.dec = decoder_for(args, odim, self.sos, self.eos, self.att, labeldist)
        self.acc = None
        self.loss = None
        self.flag_return = flag_return

    def forward(self, xs, ilens, ys):
        """E2E forward propagation.

        Args:
            xs (chainer.Variable): Batch of padded character ids. (B, Tmax)
            ilens (chainer.Variable): Batch of length of each input batch. (B,)
            ys (chainer.Variable): Batch of padded target features. (B, Lmax, odim)

        Returns:
            float: Loss that calculated by attention and ctc loss.
            float (optional): Ctc loss.
            float (optional): Attention loss.
            float (optional): Accuracy.
        """
        # 1. encoder
        hs, ilens = self.enc(xs, ilens)
        # 2. CTC loss (skipped entirely when the CTC weight is zero)
        if self.mtlalpha == 0:
            loss_ctc = None
        else:
            loss_ctc = self.ctc(hs, ys)
        # 3. attention loss (skipped when training is pure CTC)
        if self.mtlalpha == 1:
            loss_att = None
            acc = None
        else:
            loss_att, acc = self.dec(hs, ys)
        self.acc = acc
        # interpolate the two losses with weight alpha
        alpha = self.mtlalpha
        if alpha == 0:
            self.loss = loss_att
        elif alpha == 1:
            self.loss = loss_ctc
        else:
            self.loss = alpha * loss_ctc + (1 - alpha) * loss_att
        # report metrics only while the loss is finite and below the
        # divergence threshold
        if self.loss.data < CTC_LOSS_THRESHOLD and not math.isnan(self.loss.data):
            reporter.report({"loss_ctc": loss_ctc}, self)
            reporter.report({"loss_att": loss_att}, self)
            reporter.report({"acc": acc}, self)
            logging.info("mtl loss:" + str(self.loss.data))
            reporter.report({"loss": self.loss}, self)
        else:
            logging.warning("loss (=%f) is not correct", self.loss.data)
        if self.flag_return:
            return self.loss, loss_ctc, loss_att, acc
        else:
            return self.loss

    def recognize(self, x, recog_args, char_list, rnnlm=None):
        """E2E greedy/beam search.

        Args:
            x (chainer.Variable): Input tensor for recognition.
            recog_args (parser.args): Arguments of config file.
            char_list (List[str]): List of Characters.
            rnnlm (Module): RNNLM module defined at `espnet.lm.chainer_backend.lm`.

        Returns:
            List[Dict[str, Any]]: Result of recognition.
        """
        # subsample frame
        x = x[:: self.subsample[0], :]
        ilen = self.xp.array(x.shape[0], dtype=np.int32)
        h = chainer.Variable(self.xp.array(x, dtype=np.float32))
        with chainer.no_backprop_mode(), chainer.using_config("train", False):
            # 1. encoder
            # make a utt list (1) to use the same interface for encoder
            h, _ = self.enc([h], [ilen])
            # calculate log P(z_t|X) for CTC scores
            if recog_args.ctc_weight > 0.0:
                lpz = self.ctc.log_softmax(h).data[0]
            else:
                lpz = None
            # 2. decoder
            # decode the first utterance
            y = self.dec.recognize_beam(h[0], lpz, recog_args, char_list, rnnlm)
            return y

    def calculate_all_attentions(self, xs, ilens, ys):
        """E2E attention calculation.

        Args:
            xs (List): List of padded input sequences. [(T1, idim), (T2, idim), ...]
            ilens (np.ndarray): Batch of lengths of input sequences. (B)
            ys (List): List of character id sequence tensor. [(L1), (L2), (L3), ...]

        Returns:
            float np.ndarray: Attention weights. (B, Lmax, Tmax)
        """
        hs, ilens = self.enc(xs, ilens)
        att_ws = self.dec.calculate_all_attentions(hs, ys)
        return att_ws

    @staticmethod
    def custom_converter(subsampling_factor=0):
        """Get customconverter of the model."""
        # imported lazily to avoid a training-only dependency at model load
        from espnet.nets.chainer_backend.rnn.training import CustomConverter

        return CustomConverter(subsampling_factor=subsampling_factor)

    @staticmethod
    def custom_updater(iters, optimizer, converter, device=-1, accum_grad=1):
        """Get custom_updater of the model."""
        from espnet.nets.chainer_backend.rnn.training import CustomUpdater

        return CustomUpdater(
            iters, optimizer, converter=converter, device=device, accum_grad=accum_grad
        )

    @staticmethod
    def custom_parallel_updater(iters, optimizer, converter, devices, accum_grad=1):
        """Get custom_parallel_updater of the model."""
        from espnet.nets.chainer_backend.rnn.training import CustomParallelUpdater

        return CustomParallelUpdater(
            iters,
            optimizer,
            converter=converter,
            devices=devices,
            accum_grad=accum_grad,
        )
# encoding: utf-8
"""Transformer-based model for End-to-end ASR."""
import logging
import math
from argparse import Namespace
from distutils.util import strtobool
import chainer
import chainer.functions as F
import numpy as np
from chainer import reporter
from espnet.nets.chainer_backend.asr_interface import ChainerASRInterface
from espnet.nets.chainer_backend.transformer import ctc
from espnet.nets.chainer_backend.transformer.attention import MultiHeadAttention
from espnet.nets.chainer_backend.transformer.decoder import Decoder
from espnet.nets.chainer_backend.transformer.encoder import Encoder
from espnet.nets.chainer_backend.transformer.label_smoothing_loss import ( # noqa: H301
LabelSmoothingLoss,
)
from espnet.nets.chainer_backend.transformer.training import ( # noqa: H301
CustomConverter,
CustomParallelUpdater,
CustomUpdater,
)
from espnet.nets.ctc_prefix_score import CTCPrefixScore
from espnet.nets.e2e_asr_common import ErrorCalculator, end_detect
from espnet.nets.pytorch_backend.nets_utils import get_subsample
from espnet.nets.pytorch_backend.transformer.plot import PlotAttentionReport
CTC_SCORING_RATIO = 1.5
MAX_DECODER_OUTPUT = 5
class E2E(ChainerASRInterface):
    """E2E module.

    Transformer-based hybrid CTC/attention ASR model for the chainer backend.

    Args:
        idim (int): Input dimmensions.
        odim (int): Output dimmensions.
        args (Namespace): Training config.
        ignore_id (int, optional): Id for ignoring a character.
        flag_return (bool, optional): If true, return a list with (loss,
            loss_ctc, loss_att, acc) in forward. Otherwise, return loss.
    """

    @staticmethod
    def add_arguments(parser):
        """Customize flags for transformer setup.

        Args:
            parser (Namespace): Training config.
        """
        group = parser.add_argument_group("transformer model setting")
        group.add_argument(
            "--transformer-init",
            type=str,
            default="pytorch",
            help="how to initialize transformer parameters",
        )
        group.add_argument(
            "--transformer-input-layer",
            type=str,
            default="conv2d",
            choices=["conv2d", "linear", "embed"],
            help="transformer input layer type",
        )
        group.add_argument(
            "--transformer-attn-dropout-rate",
            default=None,
            type=float,
            help="dropout in transformer attention. use --dropout-rate if None is set",
        )
        group.add_argument(
            "--transformer-lr",
            default=10.0,
            type=float,
            help="Initial value of learning rate",
        )
        group.add_argument(
            "--transformer-warmup-steps",
            default=25000,
            type=int,
            help="optimizer warmup steps",
        )
        group.add_argument(
            "--transformer-length-normalized-loss",
            default=True,
            type=strtobool,
            help="normalize loss by length",
        )
        group.add_argument(
            "--dropout-rate",
            default=0.0,
            type=float,
            help="Dropout rate for the encoder",
        )
        # Encoder
        group.add_argument(
            "--elayers",
            default=4,
            type=int,
            help="Number of encoder layers (for shared recognition part "
            "in multi-speaker asr mode)",
        )
        group.add_argument(
            "--eunits",
            "-u",
            default=300,
            type=int,
            help="Number of encoder hidden units",
        )
        # Attention
        group.add_argument(
            "--adim",
            default=320,
            type=int,
            help="Number of attention transformation dimensions",
        )
        group.add_argument(
            "--aheads",
            default=4,
            type=int,
            help="Number of heads for multi head attention",
        )
        # Decoder
        group.add_argument(
            "--dlayers", default=1, type=int, help="Number of decoder layers"
        )
        group.add_argument(
            "--dunits", default=320, type=int, help="Number of decoder hidden units"
        )
        return parser

    def get_total_subsampling_factor(self):
        """Get total subsampling factor."""
        return self.encoder.conv_subsampling_factor * int(np.prod(self.subsample))

    def __init__(self, idim, odim, args, ignore_id=-1, flag_return=True):
        """Initialize the transformer."""
        chainer.Chain.__init__(self)
        # weight of the CTC loss in the multi-task objective
        self.mtlalpha = args.mtlalpha
        assert 0 <= self.mtlalpha <= 1, "mtlalpha must be [0,1]"
        if args.transformer_attn_dropout_rate is None:
            args.transformer_attn_dropout_rate = args.dropout_rate
        self.use_label_smoothing = False
        self.char_list = args.char_list
        self.space = args.sym_space
        self.blank = args.sym_blank
        # embedding scale sqrt(d_model), standard transformer scaling
        self.scale_emb = args.adim**0.5
        # the last output id is used for both sos and eos
        self.sos = odim - 1
        self.eos = odim - 1
        self.subsample = get_subsample(args, mode="asr", arch="transformer")
        self.ignore_id = ignore_id
        # choose initialW/initialB before building the sub-links
        self.reset_parameters(args)
        with self.init_scope():
            self.encoder = Encoder(
                idim=idim,
                attention_dim=args.adim,
                attention_heads=args.aheads,
                linear_units=args.eunits,
                input_layer=args.transformer_input_layer,
                dropout_rate=args.dropout_rate,
                positional_dropout_rate=args.dropout_rate,
                attention_dropout_rate=args.transformer_attn_dropout_rate,
                initialW=self.initialW,
                initial_bias=self.initialB,
            )
            self.decoder = Decoder(
                odim, args, initialW=self.initialW, initial_bias=self.initialB
            )
            self.criterion = LabelSmoothingLoss(
                args.lsm_weight,
                len(args.char_list),
                args.transformer_length_normalized_loss,
            )
            if args.mtlalpha > 0.0:
                if args.ctc_type == "builtin":
                    logging.info("Using chainer CTC implementation")
                    self.ctc = ctc.CTC(odim, args.adim, args.dropout_rate)
                else:
                    raise ValueError(
                        'ctc_type must be "builtin": {}'.format(args.ctc_type)
                    )
            else:
                self.ctc = None
        self.dims = args.adim
        self.odim = odim
        self.flag_return = flag_return
        if args.report_cer or args.report_wer:
            self.error_calculator = ErrorCalculator(
                args.char_list,
                args.sym_space,
                args.sym_blank,
                args.report_cer,
                args.report_wer,
            )
        else:
            self.error_calculator = None
        # args may be an argparse Namespace (no __contains__) or a dict-like
        # config object; handle both when looking up "verbose"
        if "Namespace" in str(type(args)):
            self.verbose = 0 if "verbose" not in args else args.verbose
        else:
            self.verbose = 0 if args.verbose is None else args.verbose

    def reset_parameters(self, args):
        """Initialize the Weight according to the give initialize-type.

        Args:
            args (Namespace): Transformer config.
        """
        # NOTE: self.initialW/self.initialB store the initializer *class*
        # (not an instance); sub-links instantiate them as needed.
        type_init = args.transformer_init
        if type_init == "lecun_uniform":
            logging.info("Using LeCunUniform as Parameter initializer")
            self.initialW = chainer.initializers.LeCunUniform
        elif type_init == "lecun_normal":
            logging.info("Using LeCunNormal as Parameter initializer")
            self.initialW = chainer.initializers.LeCunNormal
        elif type_init == "gorot_uniform":
            logging.info("Using GlorotUniform as Parameter initializer")
            self.initialW = chainer.initializers.GlorotUniform
        elif type_init == "gorot_normal":
            logging.info("Using GlorotNormal as Parameter initializer")
            self.initialW = chainer.initializers.GlorotNormal
        elif type_init == "he_uniform":
            logging.info("Using HeUniform as Parameter initializer")
            self.initialW = chainer.initializers.HeUniform
        elif type_init == "he_normal":
            logging.info("Using HeNormal as Parameter initializer")
            self.initialW = chainer.initializers.HeNormal
        elif type_init == "pytorch":
            logging.info("Using Pytorch initializer")
            self.initialW = chainer.initializers.Uniform
        else:
            logging.info("Using Chainer default as Parameter initializer")
            self.initialW = chainer.initializers.Uniform
        self.initialB = chainer.initializers.Uniform

    def forward(self, xs, ilens, ys_pad, calculate_attentions=False):
        """E2E forward propagation.

        Args:
            xs (chainer.Variable): Batch of padded character ids. (B, Tmax)
            ilens (chainer.Variable): Batch of length of each input batch. (B,)
            ys (chainer.Variable): Batch of padded target features. (B, Lmax, odim)
            calculate_attentions (bool): If true, return value is the output of encoder.

        Returns:
            float: Training loss.
            float (optional): Training loss for ctc.
            float (optional): Training loss for attention.
            float (optional): Accuracy.
            chainer.Variable (Optional): Output of the encoder.
        """
        alpha = self.mtlalpha
        # 1. Encoder
        xs, x_mask, ilens = self.encoder(xs, ilens)
        # 2. CTC loss
        cer_ctc = None
        if alpha == 0.0:
            loss_ctc = None
        else:
            _ys = [y.astype(np.int32) for y in ys_pad]
            loss_ctc = self.ctc(xs, _ys)
            if self.error_calculator is not None:
                with chainer.no_backprop_mode():
                    ys_hat = chainer.backends.cuda.to_cpu(self.ctc.argmax(xs).data)
                cer_ctc = self.error_calculator(ys_hat, ys_pad, is_ctc=True)
        # 3. Decoder
        if calculate_attentions:
            self.calculate_attentions(xs, x_mask, ys_pad)
        ys = self.decoder(ys_pad, xs, x_mask)
        # 4. Attention Loss
        cer, wer = None, None
        if alpha == 1:
            loss_att = None
            acc = None
        else:
            # Make target: append eos and pad with -1 (ignored by accuracy)
            eos = np.array([self.eos], "i")
            with chainer.no_backprop_mode():
                ys_pad_out = [np.concatenate([y, eos], axis=0) for y in ys_pad]
                ys_pad_out = F.pad_sequence(ys_pad_out, padding=-1).data
            ys_pad_out = self.xp.array(ys_pad_out)
            loss_att = self.criterion(ys, ys_pad_out)
            acc = F.accuracy(
                ys.reshape(-1, self.odim), ys_pad_out.reshape(-1), ignore_label=-1
            )
            if (not chainer.config.train) and (self.error_calculator is not None):
                cer, wer = self.error_calculator(ys, ys_pad)
        # interpolate the losses with weight alpha
        if alpha == 0.0:
            self.loss = loss_att
            loss_att_data = loss_att.data
            loss_ctc_data = None
        elif alpha == 1.0:
            self.loss = loss_ctc
            loss_att_data = None
            loss_ctc_data = loss_ctc.data
        else:
            self.loss = alpha * loss_ctc + (1 - alpha) * loss_att
            loss_att_data = loss_att.data
            loss_ctc_data = loss_ctc.data
        loss_data = self.loss.data
        if not math.isnan(loss_data):
            reporter.report({"loss_ctc": loss_ctc_data}, self)
            reporter.report({"loss_att": loss_att_data}, self)
            reporter.report({"acc": acc}, self)
            reporter.report({"cer_ctc": cer_ctc}, self)
            reporter.report({"cer": cer}, self)
            reporter.report({"wer": wer}, self)
            logging.info("mtl loss:" + str(loss_data))
            reporter.report({"loss": loss_data}, self)
        else:
            logging.warning("loss (=%f) is not correct", loss_data)
        if self.flag_return:
            # NOTE(review): loss_ctc is overwritten with None here, so the
            # second element of the returned tuple is always None even when a
            # CTC loss was computed above — confirm whether this is intended
            # (the docstring says the CTC loss is returned).
            loss_ctc = None
            return self.loss, loss_ctc, loss_att, acc
        else:
            return self.loss

    def calculate_attentions(self, xs, x_mask, ys_pad):
        """Calculate Attentions."""
        # run the decoder only for its side effect of populating the
        # attention matrices of the MultiHeadAttention sub-links
        self.decoder(ys_pad, xs, x_mask)

    def recognize(self, x_block, recog_args, char_list=None, rnnlm=None):
        """E2E recognition function.

        Args:
            x (ndarray): Input acouctic feature (B, T, D) or (T, D).
            recog_args (Namespace): Argment namespace contraining options.
            char_list (List[str]): List of characters.
            rnnlm (chainer.Chain): Language model module defined at
                `espnet.lm.chainer_backend.lm`.

        Returns:
            List: N-best decoding results.
        """
        with chainer.no_backprop_mode(), chainer.using_config("train", False):
            # 1. encoder
            ilens = [x_block.shape[0]]
            batch = len(ilens)
            xs, _, _ = self.encoder(x_block[None, :, :], ilens)
            # calculate log P(z_t|X) for CTC scores
            if recog_args.ctc_weight > 0.0:
                lpz = self.ctc.log_softmax(xs.reshape(batch, -1, self.dims)).data[0]
            else:
                lpz = None
            # 2. decoder
            if recog_args.lm_weight == 0.0:
                rnnlm = None
            y = self.recognize_beam(xs, lpz, recog_args, char_list, rnnlm)
        return y

    def recognize_beam(self, h, lpz, recog_args, char_list=None, rnnlm=None):
        """E2E beam search.

        Args:
            h (ndarray): Encoder output features (B, T, D) or (T, D).
            lpz (ndarray): Log probabilities from CTC.
            recog_args (Namespace): Argment namespace contraining options.
            char_list (List[str]): List of characters.
            rnnlm (chainer.Chain): Language model module defined at
                `espnet.lm.chainer_backend.lm`.

        Returns:
            List: N-best decoding results.
        """
        logging.info("input lengths: " + str(h.shape[1]))
        # initialization
        n_len = h.shape[1]
        xp = self.xp
        h_mask = xp.ones((1, n_len))
        # search parms
        beam = recog_args.beam_size
        penalty = recog_args.penalty
        ctc_weight = recog_args.ctc_weight
        # prepare sos
        y = self.sos
        if recog_args.maxlenratio == 0:
            maxlen = n_len
        else:
            # maxlen >= 1
            maxlen = max(1, int(recog_args.maxlenratio * n_len))
        minlen = int(recog_args.minlenratio * n_len)
        logging.info("max output length: " + str(maxlen))
        logging.info("min output length: " + str(minlen))
        # initialize hypothesis
        if rnnlm:
            hyp = {"score": 0.0, "yseq": [y], "rnnlm_prev": None}
        else:
            hyp = {"score": 0.0, "yseq": [y]}
        if lpz is not None:
            ctc_prefix_score = CTCPrefixScore(lpz, 0, self.eos, self.xp)
            hyp["ctc_state_prev"] = ctc_prefix_score.initial_state()
            hyp["ctc_score_prev"] = 0.0
            if ctc_weight != 1.0:
                # pre-pruning based on attention scores
                ctc_beam = min(lpz.shape[-1], int(beam * CTC_SCORING_RATIO))
            else:
                ctc_beam = lpz.shape[-1]
        hyps = [hyp]
        ended_hyps = []
        for i in range(maxlen):
            logging.debug("position " + str(i))
            hyps_best_kept = []
            for hyp in hyps:
                # score the next token given the current prefix
                ys = F.expand_dims(xp.array(hyp["yseq"]), axis=0).data
                out = self.decoder(ys, h, h_mask)
                # get nbest local scores and their ids
                local_att_scores = F.log_softmax(out[:, -1], axis=-1).data
                if rnnlm:
                    rnnlm_state, local_lm_scores = rnnlm.predict(
                        hyp["rnnlm_prev"], hyp["yseq"][i]
                    )
                    local_scores = (
                        local_att_scores + recog_args.lm_weight * local_lm_scores
                    )
                else:
                    local_scores = local_att_scores
                if lpz is not None:
                    # joint attention/CTC scoring over the ctc_beam candidates
                    local_best_ids = xp.argsort(local_scores, axis=1)[0, ::-1][
                        :ctc_beam
                    ]
                    ctc_scores, ctc_states = ctc_prefix_score(
                        hyp["yseq"], local_best_ids, hyp["ctc_state_prev"]
                    )
                    local_scores = (1.0 - ctc_weight) * local_att_scores[
                        :, local_best_ids
                    ] + ctc_weight * (ctc_scores - hyp["ctc_score_prev"])
                    if rnnlm:
                        local_scores += (
                            recog_args.lm_weight * local_lm_scores[:, local_best_ids]
                        )
                    joint_best_ids = xp.argsort(local_scores, axis=1)[0, ::-1][:beam]
                    local_best_scores = local_scores[:, joint_best_ids]
                    local_best_ids = local_best_ids[joint_best_ids]
                else:
                    local_best_ids = self.xp.argsort(local_scores, axis=1)[0, ::-1][
                        :beam
                    ]
                    local_best_scores = local_scores[:, local_best_ids]
                for j in range(beam):
                    new_hyp = {}
                    new_hyp["score"] = hyp["score"] + float(local_best_scores[0, j])
                    new_hyp["yseq"] = [0] * (1 + len(hyp["yseq"]))
                    new_hyp["yseq"][: len(hyp["yseq"])] = hyp["yseq"]
                    new_hyp["yseq"][len(hyp["yseq"])] = int(local_best_ids[j])
                    if rnnlm:
                        new_hyp["rnnlm_prev"] = rnnlm_state
                    if lpz is not None:
                        new_hyp["ctc_state_prev"] = ctc_states[joint_best_ids[j]]
                        new_hyp["ctc_score_prev"] = ctc_scores[joint_best_ids[j]]
                    hyps_best_kept.append(new_hyp)
                hyps_best_kept = sorted(
                    hyps_best_kept, key=lambda x: x["score"], reverse=True
                )[:beam]
            # sort and get nbest
            hyps = hyps_best_kept
            logging.debug("number of pruned hypothesis: " + str(len(hyps)))
            if char_list is not None:
                logging.debug(
                    "best hypo: "
                    + "".join([char_list[int(x)] for x in hyps[0]["yseq"][1:]])
                    + " score: "
                    + str(hyps[0]["score"])
                )
            # add eos in the final loop to avoid that there are no ended hyps
            if i == maxlen - 1:
                logging.info("adding <eos> in the last position in the loop")
                for hyp in hyps:
                    hyp["yseq"].append(self.eos)
            # add ended hypothes to a final list, and removed them from current hypothes
            # (this will be a probmlem, number of hyps < beam)
            remained_hyps = []
            for hyp in hyps:
                if hyp["yseq"][-1] == self.eos:
                    # only store the sequence that has more than minlen outputs
                    # also add penalty
                    if len(hyp["yseq"]) > minlen:
                        hyp["score"] += (i + 1) * penalty
                        if rnnlm:  # Word LM needs to add final <eos> score
                            hyp["score"] += recog_args.lm_weight * rnnlm.final(
                                hyp["rnnlm_prev"]
                            )
                        ended_hyps.append(hyp)
                else:
                    remained_hyps.append(hyp)
            # end detection
            if end_detect(ended_hyps, i) and recog_args.maxlenratio == 0.0:
                logging.info("end detected at %d", i)
                break
            hyps = remained_hyps
            if len(hyps) > 0:
                logging.debug("remained hypothes: " + str(len(hyps)))
            else:
                logging.info("no hypothesis. Finish decoding.")
                break
            if char_list is not None:
                for hyp in hyps:
                    logging.debug(
                        "hypo: " + "".join([char_list[int(x)] for x in hyp["yseq"][1:]])
                    )
            logging.debug("number of ended hypothes: " + str(len(ended_hyps)))
        nbest_hyps = sorted(
            ended_hyps, key=lambda x: x["score"], reverse=True
        )  # [:min(len(ended_hyps), recog_args.nbest)]
        logging.debug(nbest_hyps)
        # check number of hypotheis
        if len(nbest_hyps) == 0:
            # NOTE(review): logging.warn is a deprecated alias of
            # logging.warning
            logging.warn(
                "there is no N-best results, perform recognition "
                "again with smaller minlenratio."
            )
            # should copy becasuse Namespace will be overwritten globally
            recog_args = Namespace(**vars(recog_args))
            recog_args.minlenratio = max(0.0, recog_args.minlenratio - 0.1)
            return self.recognize_beam(h, lpz, recog_args, char_list, rnnlm)
        logging.info("total log probability: " + str(nbest_hyps[0]["score"]))
        logging.info(
            "normalized log probability: "
            + str(nbest_hyps[0]["score"] / len(nbest_hyps[0]["yseq"]))
        )
        # remove sos
        return nbest_hyps

    def calculate_all_attentions(self, xs, ilens, ys):
        """E2E attention calculation.

        Args:
            xs (List[tuple()]): List of padded input sequences.
                [(T1, idim), (T2, idim), ...]
            ilens (ndarray): Batch of lengths of input sequences. (B)
            ys (List): List of character id sequence tensor. [(L1), (L2), (L3), ...]

        Returns:
            float ndarray: Attention weights. (B, Lmax, Tmax)
        """
        with chainer.no_backprop_mode():
            # forward pass populates m.attn on every MultiHeadAttention link
            self(xs, ilens, ys, calculate_attentions=True)
        ret = dict()
        for name, m in self.namedlinks():
            if isinstance(m, MultiHeadAttention):
                var = m.attn
                var.to_cpu()
                _name = name[1:].replace("/", "_")
                ret[_name] = var.data
        return ret

    @property
    def attention_plot_class(self):
        """Attention plot function.

        Redirects to PlotAttentionReport

        Returns:
            PlotAttentionReport
        """
        return PlotAttentionReport

    @staticmethod
    def custom_converter(subsampling_factor=0):
        """Get customconverter of the model."""
        # NOTE(review): subsampling_factor is accepted for interface
        # compatibility but not forwarded to CustomConverter here.
        return CustomConverter()

    @staticmethod
    def custom_updater(iters, optimizer, converter, device=-1, accum_grad=1):
        """Get custom_updater of the model."""
        return CustomUpdater(
            iters, optimizer, converter=converter, device=device, accum_grad=accum_grad
        )

    @staticmethod
    def custom_parallel_updater(iters, optimizer, converter, devices, accum_grad=1):
        """Get custom_parallel_updater of the model."""
        return CustomParallelUpdater(
            iters,
            optimizer,
            converter=converter,
            devices=devices,
            accum_grad=accum_grad,
        )
import chainer.functions as F
def _subsamplex(x, n):
    """Keep every *n*-th frame of each sequence and return the new lengths."""
    subsampled = [
        F.get_item(seq, (slice(None, None, n), slice(None))) for seq in x
    ]
    lengths = [seq.shape[0] for seq in subsampled]
    return subsampled, lengths
import chainer
import chainer.functions as F
import chainer.links as L
import numpy as np
# dot product based attention
class AttDot(chainer.Chain):
    """Compute attention based on dot product.

    Args:
        eprojs (int | None): Dimension of input vectors from encoder.
        dunits (int | None): Dimension of input vectors for decoder.
        att_dim (int): Dimension of input vectors for attention.
    """

    def __init__(self, eprojs, dunits, att_dim):
        super(AttDot, self).__init__()
        with self.init_scope():
            self.mlp_enc = L.Linear(eprojs, att_dim)
            self.mlp_dec = L.Linear(dunits, att_dim)
        self.dunits = dunits
        self.eprojs = eprojs
        self.att_dim = att_dim
        # encoder-side tensors are cached lazily on the first call
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None

    def reset(self):
        """Reset states."""
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None

    def __call__(self, enc_hs, dec_z, att_prev, scaling=2.0):
        """Compute AttDot forward layer.

        Args:
            enc_hs (chainer.Variable | N-dimensional array):
                Input variable from encoder.
            dec_z (chainer.Variable | N-dimensional array): Input variable of decoder.
            att_prev: Unused; kept for interface compatibility with other
                attention modules.
            scaling (float): Scaling weight to make attention sharp.

        Returns:
            chainer.Variable: Weighted sum over flames.
            chainer.Variable: Attention weight.
        """
        n_utt = len(enc_hs)
        # project the encoder outputs only once per utterance batch
        if self.pre_compute_enc_h is None:
            self.enc_h = F.pad_sequence(enc_hs)  # utt x frame x hdim
            self.h_length = self.enc_h.shape[1]
            # utt x frame x att_dim
            self.pre_compute_enc_h = F.tanh(self.mlp_enc(self.enc_h, n_batch_axes=2))
        if dec_z is None:
            dec_z = chainer.Variable(
                self.xp.zeros((n_utt, self.dunits), dtype=np.float32)
            )
        else:
            dec_z = dec_z.reshape(n_utt, self.dunits)
        # <phi (h_t), psi (s)> for all t
        query = F.expand_dims(F.tanh(self.mlp_dec(dec_z)), 1)
        query = F.broadcast_to(query, self.pre_compute_enc_h.shape)
        scores = F.sum(self.pre_compute_enc_h * query, axis=2)  # utt x frame
        # A minus-large-number mask for the padded area was tried upstream and
        # abandoned (it degraded performance); only sharpening is applied.
        weights = F.softmax(scaling * scores)
        # weighted sum over frames -> utt x hdim
        expanded = F.broadcast_to(F.expand_dims(weights, 2), self.enc_h.shape)
        context = F.sum(self.enc_h * expanded, axis=1)
        return context, weights
# location based attention
# location based attention
class AttLoc(chainer.Chain):
    """Compute location-based attention.

    Args:
        eprojs (int | None): Dimension of input vectors from encoder.
        dunits (int | None): Dimension of input vectors for decoder.
        att_dim (int): Dimension of input vectors for attention.
        aconv_chans (int): Number of channels of output arrays from convolutional layer.
        aconv_filts (int): Size of filters of convolutional layer.
    """

    def __init__(self, eprojs, dunits, att_dim, aconv_chans, aconv_filts):
        super(AttLoc, self).__init__()
        with self.init_scope():
            self.mlp_enc = L.Linear(eprojs, att_dim)
            self.mlp_dec = L.Linear(dunits, att_dim, nobias=True)
            self.mlp_att = L.Linear(aconv_chans, att_dim, nobias=True)
            # 1-D convolution over the previous attention weights
            # (kernel width 2*aconv_filts+1, "same" padding)
            self.loc_conv = L.Convolution2D(
                1, aconv_chans, ksize=(1, 2 * aconv_filts + 1), pad=(0, aconv_filts)
            )
            self.gvec = L.Linear(att_dim, 1)
        self.dunits = dunits
        self.eprojs = eprojs
        self.att_dim = att_dim
        # encoder-side tensors are cached lazily on the first call
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None
        self.aconv_chans = aconv_chans

    def reset(self):
        """Reset states."""
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None

    def __call__(self, enc_hs, dec_z, att_prev, scaling=2.0):
        """Compute AttLoc forward layer.

        Args:
            enc_hs (chainer.Variable | N-dimensional array):
                Input variable from encoders.
            dec_z (chainer.Variable | N-dimensional array): Input variable of decoder.
            att_prev (chainer.Variable | None): Attention weight.
            scaling (float): Scaling weight to make attention sharp.

        Returns:
            chainer.Variable: Weighted sum over flames.
            chainer.Variable: Attention weight.
        """
        batch = len(enc_hs)
        # pre-compute all h outside the decoder loop
        if self.pre_compute_enc_h is None:
            self.enc_h = F.pad_sequence(enc_hs)  # utt x frame x hdim
            self.h_length = self.enc_h.shape[1]
            # utt x frame x att_dim
            self.pre_compute_enc_h = self.mlp_enc(self.enc_h, n_batch_axes=2)
        if dec_z is None:
            dec_z = chainer.Variable(
                self.xp.zeros((batch, self.dunits), dtype=np.float32)
            )
        else:
            dec_z = dec_z.reshape(batch, self.dunits)
        # initialize attention weight with uniform dist.
        if att_prev is None:
            att_prev = [
                self.xp.full(hh.shape[0], 1.0 / hh.shape[0], dtype=np.float32)
                for hh in enc_hs
            ]
            att_prev = [chainer.Variable(att) for att in att_prev]
            att_prev = F.pad_sequence(att_prev)
        # att_prev: utt x frame -> utt x 1 x 1 x frame
        # -> utt x att_conv_chans x 1 x frame
        att_conv = self.loc_conv(att_prev.reshape(batch, 1, 1, self.h_length))
        # att_conv: utt x att_conv_chans x 1 x frame -> utt x frame x att_conv_chans
        att_conv = F.swapaxes(F.squeeze(att_conv, axis=2), 1, 2)
        # att_conv: utt x frame x att_conv_chans -> utt x frame x att_dim
        att_conv = self.mlp_att(att_conv, n_batch_axes=2)
        # dec_z_tiled: utt x frame x att_dim
        dec_z_tiled = F.broadcast_to(
            F.expand_dims(self.mlp_dec(dec_z), 1), self.pre_compute_enc_h.shape
        )
        # dot with gvec
        # utt x frame x att_dim -> utt x frame
        # TODO(watanabe) use batch_matmul
        e = F.squeeze(
            self.gvec(
                F.tanh(att_conv + self.pre_compute_enc_h + dec_z_tiled), n_batch_axes=2
            ),
            axis=2,
        )
        # Applying a minus-large-number filter
        # to make a probability value zero for a padded area
        # simply degrades the performance, and I gave up this implementation
        # Apply a scaling to make an attention sharp
        w = F.softmax(scaling * e)
        # weighted sum over flames
        # utt x hdim
        c = F.sum(
            self.enc_h * F.broadcast_to(F.expand_dims(w, 2), self.enc_h.shape), axis=1
        )
        return c, w
class NoAtt(chainer.Chain):
    """Compute non-attention layer.

    This layer is a dummy attention layer to be compatible with other
    attention-based models: it always returns a uniform weighting over the
    encoder frames.
    """

    def __init__(self):
        super(NoAtt, self).__init__()
        # cached encoder tensor and context; filled on the first call
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None
        self.c = None

    def reset(self):
        """Reset states."""
        self.h_length = None
        self.enc_h = None
        self.pre_compute_enc_h = None
        self.c = None

    def __call__(self, enc_hs, dec_z, att_prev):
        """Compute NoAtt forward layer.

        Args:
            enc_hs (chainer.Variable | N-dimensional array):
                Input variable from encoders.
            dec_z: Dummy.
            att_prev (chainer.Variable | None): Attention weight.

        Returns:
            chainer.Variable: Sum over flames.
            chainer.Variable: Attention weight.
        """
        # pad the encoder outputs only once per utterance batch
        if self.pre_compute_enc_h is None:
            self.enc_h = F.pad_sequence(enc_hs)  # utt x frame x hdim
            self.h_length = self.enc_h.shape[1]
        # on the first step, build uniform weights and cache the context
        if att_prev is None:
            uniform = [
                self.xp.full(hh.shape[0], 1.0 / hh.shape[0], dtype=np.float32)
                for hh in enc_hs
            ]
            att_prev = F.pad_sequence([chainer.Variable(w) for w in uniform])
            expanded = F.broadcast_to(
                F.expand_dims(att_prev, 2), self.enc_h.shape
            )
            self.c = F.sum(self.enc_h * expanded, axis=1)
        return self.c, att_prev
def att_for(args):
    """Return the attention layer selected by ``args.atype``.

    Args:
        args (Namespace): The arguments.

    Returns:
        chainer.Chain: The corresponding attention module.

    Raises:
        NotImplementedError: If ``args.atype`` is not one of ``dot``,
            ``location``, or ``noatt``.
    """
    atype = args.atype
    if atype == "dot":
        return AttDot(args.eprojs, args.dunits, args.adim)
    if atype == "location":
        return AttLoc(
            args.eprojs, args.dunits, args.adim, args.aconv_chans, args.aconv_filts
        )
    if atype == "noatt":
        return NoAtt()
    raise NotImplementedError(
        "chainer supports only noatt, dot, and location attention."
    )
import logging
import random
from argparse import Namespace
import chainer
import chainer.functions as F
import chainer.links as L
import numpy as np
import espnet.nets.chainer_backend.deterministic_embed_id as DL
from espnet.nets.ctc_prefix_score import CTCPrefixScore
from espnet.nets.e2e_asr_common import end_detect
CTC_SCORING_RATIO = 1.5
MAX_DECODER_OUTPUT = 5
class Decoder(chainer.Chain):
    """Decoder layer.
    Args:
        eprojs (int): Dimension of input variables from encoder.
        odim (int): The output dimension.
        dtype (str): Decoder type.
        dlayers (int): Number of layers for decoder.
        dunits (int): Dimension of input vector of decoder.
        sos (int): Number to indicate the start of sequences.
        eos (int): Number to indicate the end of sequences.
        att (Module): Attention module defined at
            `espnet.nets.chainer_backend.attentions`.
        verbose (int): Verbosity level.
        char_list (List[str]): List of all characters.
        labeldist (numpy.array): Distributed array of counted transcript length.
        lsm_weight (float): Weight to use when calculating the training loss.
        sampling_probability (float): Threshold for scheduled sampling.
    """
    def __init__(
        self,
        eprojs,
        odim,
        dtype,
        dlayers,
        dunits,
        sos,
        eos,
        att,
        verbose=0,
        char_list=None,
        labeldist=None,
        lsm_weight=0.0,
        sampling_probability=0.0,
    ):
        """Initialize the decoder: embedding, stacked RNN cells, output layer."""
        super(Decoder, self).__init__()
        with self.init_scope():
            self.embed = DL.EmbedID(odim, dunits)
            # the first cell consumes the embedded token concatenated with the
            # attention context, hence the (dunits + eprojs) input size
            self.rnn0 = (
                L.StatelessLSTM(dunits + eprojs, dunits)
                if dtype == "lstm"
                else L.StatelessGRU(dunits + eprojs, dunits)
            )
            # remaining layers are registered as rnn1, rnn2, ...
            for i in range(1, dlayers):
                setattr(
                    self,
                    "rnn%d" % i,
                    L.StatelessLSTM(dunits, dunits)
                    if dtype == "lstm"
                    else L.StatelessGRU(dunits, dunits),
                )
            self.output = L.Linear(dunits, odim)
        self.dtype = dtype
        self.loss = None
        self.att = att
        self.dlayers = dlayers
        self.dunits = dunits
        self.sos = sos
        self.eos = eos
        self.verbose = verbose
        self.char_list = char_list
        # for label smoothing
        self.labeldist = labeldist
        self.vlabeldist = None  # device copy of labeldist, created lazily
        self.lsm_weight = lsm_weight
        self.sampling_probability = sampling_probability
    def rnn_forward(self, ey, z_list, c_list, z_prev, c_prev):
        """Advance the stacked RNN cells by one time step.

        Args:
            ey (chainer.Variable): Input of the first layer (embedding
                concatenated with the attention context).
            z_list (list): Hidden states to be updated, one entry per layer.
            c_list (list): Cell states to be updated (LSTM only).
            z_prev (list): Hidden states of the previous step.
            c_prev (list): Cell states of the previous step (LSTM only).

        Returns:
            tuple(list, list): Updated hidden and cell state lists.
        """
        if self.dtype == "lstm":
            c_list[0], z_list[0] = self.rnn0(c_prev[0], z_prev[0], ey)
            for i in range(1, self.dlayers):
                c_list[i], z_list[i] = self["rnn%d" % i](
                    c_prev[i], z_prev[i], z_list[i - 1]
                )
        else:
            # GRU cells carry no cell state; lazily create zero-filled hidden
            # states on the correct device at the first step
            if z_prev[0] is None:
                xp = self.xp
                with chainer.backends.cuda.get_device_from_id(self._device_id):
                    z_prev[0] = chainer.Variable(
                        xp.zeros((ey.shape[0], self.dunits), dtype=ey.dtype)
                    )
            z_list[0] = self.rnn0(z_prev[0], ey)
            for i in range(1, self.dlayers):
                if z_prev[i] is None:
                    xp = self.xp
                    with chainer.backends.cuda.get_device_from_id(self._device_id):
                        z_prev[i] = chainer.Variable(
                            xp.zeros(
                                (z_list[i - 1].shape[0], self.dunits),
                                dtype=z_list[i - 1].dtype,
                            )
                        )
                z_list[i] = self["rnn%d" % i](z_prev[i], z_list[i - 1])
        return z_list, c_list
    def __call__(self, hs, ys):
        """Core function of Decoder layer.
        Args:
            hs (list of chainer.Variable | N-dimension array):
                Input variable from encoder.
            ys (list of chainer.Variable | N-dimension array):
                Input variable of decoder.
        Returns:
            chainer.Variable: A variable holding a scalar array of the training loss.
            chainer.Variable: A variable holding a scalar array of the accuracy.
        """
        self.loss = None
        # prepare input and output word sequences with sos/eos IDs
        eos = self.xp.array([self.eos], "i")
        sos = self.xp.array([self.sos], "i")
        ys_in = [F.concat([sos, y], axis=0) for y in ys]
        ys_out = [F.concat([y, eos], axis=0) for y in ys]
        # padding for ys with -1
        # pys: utt x olen
        pad_ys_in = F.pad_sequence(ys_in, padding=self.eos)
        pad_ys_out = F.pad_sequence(ys_out, padding=-1)
        # get dim, length info
        batch = pad_ys_out.shape[0]
        olength = pad_ys_out.shape[1]
        logging.info(
            self.__class__.__name__
            + " input lengths: "
            + str(self.xp.array([h.shape[0] for h in hs]))
        )
        logging.info(
            self.__class__.__name__
            + " output lengths: "
            + str(self.xp.array([y.shape[0] for y in ys_out]))
        )
        # initialization
        c_list = [None]  # list of cell state of each layer
        z_list = [None]  # list of hidden state of each layer
        for _ in range(1, self.dlayers):
            c_list.append(None)
            z_list.append(None)
        att_w = None
        z_all = []
        self.att.reset()  # reset pre-computation of h
        # pre-computation of embedding
        eys = self.embed(pad_ys_in)  # utt x olen x zdim
        eys = F.separate(eys, axis=1)
        # loop for an output sequence
        for i in range(olength):
            att_c, att_w = self.att(hs, z_list[0], att_w)
            if i > 0 and random.random() < self.sampling_probability:
                logging.info(" scheduled sampling ")
                # feed back the model's own previous prediction instead of
                # the ground-truth token
                z_out = self.output(z_all[-1])
                z_out = F.argmax(F.log_softmax(z_out), axis=1)
                z_out = self.embed(z_out)
                ey = F.hstack((z_out, att_c))  # utt x (zdim + hdim)
            else:
                ey = F.hstack((eys[i], att_c))  # utt x (zdim + hdim)
            z_list, c_list = self.rnn_forward(ey, z_list, c_list, z_list, c_list)
            z_all.append(z_list[-1])
        z_all = F.stack(z_all, axis=1).reshape(batch * olength, self.dunits)
        # compute loss
        y_all = self.output(z_all)
        self.loss = F.softmax_cross_entropy(y_all, F.flatten(pad_ys_out))
        # -1: eos, which is removed in the loss computation
        # (len(ys_in[k]) - 1 equals the original target length, since sos
        # was prepended above)
        self.loss *= np.mean([len(x) for x in ys_in]) - 1
        acc = F.accuracy(y_all, F.flatten(pad_ys_out), ignore_label=-1)
        logging.info("att loss:" + str(self.loss.data))
        # show predicted character sequence for debug
        if self.verbose > 0 and self.char_list is not None:
            y_hat = y_all.reshape(batch, olength, -1)
            y_true = pad_ys_out
            for (i, y_hat_), y_true_ in zip(enumerate(y_hat.data), y_true.data):
                if i == MAX_DECODER_OUTPUT:
                    break
                idx_hat = self.xp.argmax(y_hat_[y_true_ != -1], axis=1)
                idx_true = y_true_[y_true_ != -1]
                seq_hat = [self.char_list[int(idx)] for idx in idx_hat]
                seq_true = [self.char_list[int(idx)] for idx in idx_true]
                seq_hat = "".join(seq_hat).replace("<space>", " ")
                seq_true = "".join(seq_true).replace("<space>", " ")
                logging.info("groundtruth[%d]: " % i + seq_true)
                logging.info("prediction [%d]: " % i + seq_hat)
        if self.labeldist is not None:
            if self.vlabeldist is None:
                # move the label distribution to the device once
                self.vlabeldist = chainer.Variable(self.xp.asarray(self.labeldist))
            # label-smoothing regularizer, mixed in with weight lsm_weight
            loss_reg = -F.sum(
                F.scale(F.log_softmax(y_all), self.vlabeldist, axis=1)
            ) / len(ys_in)
            self.loss = (1.0 - self.lsm_weight) * self.loss + self.lsm_weight * loss_reg
        return self.loss, acc
    def recognize_beam(self, h, lpz, recog_args, char_list, rnnlm=None):
        """Beam search implementation.
        Args:
            h (chainer.Variable): One of the output from the encoder.
            lpz (chainer.Variable | None): Result of net propagation.
            recog_args (Namespace): The argument.
            char_list (List[str]): List of all characters.
            rnnlm (Module): RNNLM module. Defined at `espnet.lm.chainer_backend.lm`
        Returns:
            List[Dict[str,Any]]: Result of recognition.
        """
        logging.info("input lengths: " + str(h.shape[0]))
        # initialization
        c_list = [None]  # list of cell state of each layer
        z_list = [None]  # list of hidden state of each layer
        for _ in range(1, self.dlayers):
            c_list.append(None)
            z_list.append(None)
        a = None
        self.att.reset()  # reset pre-computation of h
        # search params
        beam = recog_args.beam_size
        penalty = recog_args.penalty
        ctc_weight = recog_args.ctc_weight
        # prepare sos
        y = self.xp.full(1, self.sos, "i")
        if recog_args.maxlenratio == 0:
            maxlen = h.shape[0]
        else:
            # maxlen >= 1
            maxlen = max(1, int(recog_args.maxlenratio * h.shape[0]))
        minlen = int(recog_args.minlenratio * h.shape[0])
        logging.info("max output length: " + str(maxlen))
        logging.info("min output length: " + str(minlen))
        # initialize hypothesis
        if rnnlm:
            hyp = {
                "score": 0.0,
                "yseq": [y],
                "c_prev": c_list,
                "z_prev": z_list,
                "a_prev": a,
                "rnnlm_prev": None,
            }
        else:
            hyp = {
                "score": 0.0,
                "yseq": [y],
                "c_prev": c_list,
                "z_prev": z_list,
                "a_prev": a,
            }
        if lpz is not None:
            # joint CTC/attention decoding: keep a CTC prefix score per hypothesis
            ctc_prefix_score = CTCPrefixScore(lpz, 0, self.eos, self.xp)
            hyp["ctc_state_prev"] = ctc_prefix_score.initial_state()
            hyp["ctc_score_prev"] = 0.0
            if ctc_weight != 1.0:
                # pre-pruning based on attention scores
                ctc_beam = min(lpz.shape[-1], int(beam * CTC_SCORING_RATIO))
            else:
                ctc_beam = lpz.shape[-1]
        hyps = [hyp]
        ended_hyps = []
        for i in range(maxlen):
            logging.debug("position " + str(i))
            hyps_best_kept = []
            for hyp in hyps:
                ey = self.embed(hyp["yseq"][i])  # utt list (1) x zdim
                att_c, att_w = self.att([h], hyp["z_prev"][0], hyp["a_prev"])
                ey = F.hstack((ey, att_c))  # utt(1) x (zdim + hdim)
                z_list, c_list = self.rnn_forward(
                    ey, z_list, c_list, hyp["z_prev"], hyp["c_prev"]
                )
                # get nbest local scores and their ids
                local_att_scores = F.log_softmax(self.output(z_list[-1])).data
                if rnnlm:
                    rnnlm_state, local_lm_scores = rnnlm.predict(
                        hyp["rnnlm_prev"], hyp["yseq"][i]
                    )
                    local_scores = (
                        local_att_scores + recog_args.lm_weight * local_lm_scores
                    )
                else:
                    local_scores = local_att_scores
                if lpz is not None:
                    # rescore the top-ctc_beam attention candidates with CTC
                    local_best_ids = self.xp.argsort(local_scores, axis=1)[0, ::-1][
                        :ctc_beam
                    ]
                    ctc_scores, ctc_states = ctc_prefix_score(
                        hyp["yseq"], local_best_ids, hyp["ctc_state_prev"]
                    )
                    local_scores = (1.0 - ctc_weight) * local_att_scores[
                        :, local_best_ids
                    ] + ctc_weight * (ctc_scores - hyp["ctc_score_prev"])
                    if rnnlm:
                        local_scores += (
                            recog_args.lm_weight * local_lm_scores[:, local_best_ids]
                        )
                    joint_best_ids = self.xp.argsort(local_scores, axis=1)[0, ::-1][
                        :beam
                    ]
                    local_best_scores = local_scores[:, joint_best_ids]
                    local_best_ids = local_best_ids[joint_best_ids]
                else:
                    local_best_ids = self.xp.argsort(local_scores, axis=1)[0, ::-1][
                        :beam
                    ]
                    local_best_scores = local_scores[:, local_best_ids]
                for j in range(beam):
                    new_hyp = {}
                    # do not copy {z,c}_list directly
                    new_hyp["z_prev"] = z_list[:]
                    new_hyp["c_prev"] = c_list[:]
                    new_hyp["a_prev"] = att_w
                    new_hyp["score"] = hyp["score"] + local_best_scores[0, j]
                    new_hyp["yseq"] = [0] * (1 + len(hyp["yseq"]))
                    new_hyp["yseq"][: len(hyp["yseq"])] = hyp["yseq"]
                    new_hyp["yseq"][len(hyp["yseq"])] = self.xp.full(
                        1, local_best_ids[j], "i"
                    )
                    if rnnlm:
                        new_hyp["rnnlm_prev"] = rnnlm_state
                    if lpz is not None:
                        new_hyp["ctc_state_prev"] = ctc_states[joint_best_ids[j]]
                        new_hyp["ctc_score_prev"] = ctc_scores[joint_best_ids[j]]
                    # will be (2 x beam) hyps at most
                    hyps_best_kept.append(new_hyp)
                hyps_best_kept = sorted(
                    hyps_best_kept, key=lambda x: x["score"], reverse=True
                )[:beam]
            # sort and get nbest
            hyps = hyps_best_kept
            logging.debug("number of pruned hypotheses: " + str(len(hyps)))
            logging.debug(
                "best hypo: "
                + "".join([char_list[int(x)] for x in hyps[0]["yseq"][1:]]).replace(
                    "<space>", " "
                )
            )
            # add eos in the final loop to avoid that there are no ended hyps
            if i == maxlen - 1:
                logging.info("adding <eos> in the last position in the loop")
                for hyp in hyps:
                    hyp["yseq"].append(self.xp.full(1, self.eos, "i"))
            # add ended hypotheses to a final list,
            # and removed them from current hypotheses
            # (this will be a problem, number of hyps < beam)
            remained_hyps = []
            for hyp in hyps:
                if hyp["yseq"][-1] == self.eos:
                    # only store the sequence that has more than minlen outputs
                    # also add penalty
                    if len(hyp["yseq"]) > minlen:
                        hyp["score"] += (i + 1) * penalty
                        if rnnlm:  # Word LM needs to add final <eos> score
                            hyp["score"] += recog_args.lm_weight * rnnlm.final(
                                hyp["rnnlm_prev"]
                            )
                        ended_hyps.append(hyp)
                else:
                    remained_hyps.append(hyp)
            # end detection
            if end_detect(ended_hyps, i) and recog_args.maxlenratio == 0.0:
                logging.info("end detected at %d", i)
                break
            hyps = remained_hyps
            if len(hyps) > 0:
                logging.debug("remaining hypotheses: " + str(len(hyps)))
            else:
                logging.info("no hypothesis. Finish decoding.")
                break
            for hyp in hyps:
                logging.debug(
                    "hypo: "
                    + "".join([char_list[int(x)] for x in hyp["yseq"][1:]]).replace(
                        "<space>", " "
                    )
                )
        logging.debug("number of ended hypotheses: " + str(len(ended_hyps)))
        nbest_hyps = sorted(ended_hyps, key=lambda x: x["score"], reverse=True)[
            : min(len(ended_hyps), recog_args.nbest)
        ]
        # check number of hypotheses
        if len(nbest_hyps) == 0:
            logging.warning(
                "there is no N-best results, "
                "perform recognition again with smaller minlenratio."
            )
            # should copy because Namespace will be overwritten globally
            recog_args = Namespace(**vars(recog_args))
            recog_args.minlenratio = max(0.0, recog_args.minlenratio - 0.1)
            # retry recursively with a relaxed minimum-length constraint
            return self.recognize_beam(h, lpz, recog_args, char_list, rnnlm)
        logging.info("total log probability: " + str(nbest_hyps[0]["score"]))
        logging.info(
            "normalized log probability: "
            + str(nbest_hyps[0]["score"] / len(nbest_hyps[0]["yseq"]))
        )
        return nbest_hyps
    def calculate_all_attentions(self, hs, ys):
        """Calculate all of attentions.
        Args:
            hs (list of chainer.Variable | N-dimensional array):
                Input variable from encoder.
            ys (list of chainer.Variable | N-dimensional array):
                Input variable of decoder.
        Returns:
            chainer.Variable: List of attention weights.
        """
        # prepare input and output word sequences with sos/eos IDs
        eos = self.xp.array([self.eos], "i")
        sos = self.xp.array([self.sos], "i")
        ys_in = [F.concat([sos, y], axis=0) for y in ys]
        ys_out = [F.concat([y, eos], axis=0) for y in ys]
        # padding for ys with -1
        # pys: utt x olen
        pad_ys_in = F.pad_sequence(ys_in, padding=self.eos)
        pad_ys_out = F.pad_sequence(ys_out, padding=-1)
        # get length info
        olength = pad_ys_out.shape[1]
        # initialization
        c_list = [None]  # list of cell state of each layer
        z_list = [None]  # list of hidden state of each layer
        for _ in range(1, self.dlayers):
            c_list.append(None)
            z_list.append(None)
        att_w = None
        att_ws = []
        self.att.reset()  # reset pre-computation of h
        # pre-computation of embedding
        eys = self.embed(pad_ys_in)  # utt x olen x zdim
        eys = F.separate(eys, axis=1)
        # loop for an output sequence
        for i in range(olength):
            att_c, att_w = self.att(hs, z_list[0], att_w)
            ey = F.hstack((eys[i], att_c))  # utt x (zdim + hdim)
            z_list, c_list = self.rnn_forward(ey, z_list, c_list, z_list, c_list)
            att_ws.append(att_w)  # for debugging
        att_ws = F.stack(att_ws, axis=1)
        att_ws.to_cpu()
        return att_ws.data
def decoder_for(args, odim, sos, eos, att, labeldist):
    """Build the decoder network specified by the program arguments.

    Args:
        args (Namespace): The program arguments.
        odim (int): The output dimension.
        sos (int): Number to indicate the start of sequences.
        eos (int): Number to indicate the end of sequences.
        att (Module):
            Attention module defined at `espnet.nets.chainer_backend.attentions`.
        labeldist (numpy.array): Distributed array of transcript lengths.

    Returns:
        chainer.Chain: The decoder module.
    """
    return Decoder(
        args.eprojs,
        odim,
        args.dtype,
        args.dlayers,
        args.dunits,
        sos,
        eos,
        att,
        verbose=args.verbose,
        char_list=args.char_list,
        labeldist=labeldist,
        lsm_weight=args.lsm_weight,
        sampling_probability=args.sampling_probability,
    )
import logging
import chainer
import chainer.functions as F
import chainer.links as L
import numpy as np
from chainer import cuda
from espnet.nets.chainer_backend.nets_utils import _subsamplex
from espnet.nets.e2e_asr_common import get_vgg2l_odim
# TODO(watanabe) explanation of BLSTMP
class RNNP(chainer.Chain):
    """RNN with projection layer module.
    Args:
        idim (int): Dimension of inputs.
        elayers (int): Number of encoder layers.
        cdim (int): Number of rnn units. (resulted in cdim * 2 if bidirectional)
        hdim (int): Number of projection units.
        subsample (np.ndarray): List to use subsample the input array.
        dropout (float): Dropout rate.
        typ (str): The RNN type.
    """
    def __init__(self, idim, elayers, cdim, hdim, subsample, dropout, typ="blstm"):
        """Build an elayers-deep stack of 1-layer RNNs, each with a projection."""
        super(RNNP, self).__init__()
        bidir = typ[0] == "b"
        if bidir:
            rnn = L.NStepBiLSTM if "lstm" in typ else L.NStepBiGRU
        else:
            rnn = L.NStepLSTM if "lstm" in typ else L.NStepGRU
        rnn_label = "birnn" if bidir else "rnn"
        with self.init_scope():
            # registered as {rnn_label}0, bt0, {rnn_label}1, bt1, ...
            for i in range(elayers):
                if i == 0:
                    inputdim = idim
                else:
                    inputdim = hdim
                _cdim = 2 * cdim if bidir else cdim
                # bottleneck layer to merge
                setattr(
                    self, "{}{:d}".format(rnn_label, i), rnn(1, inputdim, cdim, dropout)
                )
                setattr(self, "bt%d" % i, L.Linear(_cdim, hdim))
        self.elayers = elayers
        self.rnn_label = rnn_label
        self.cdim = cdim
        self.subsample = subsample
        self.typ = typ
        self.bidir = bidir
    def __call__(self, xs, ilens):
        """RNNP forward.
        Args:
            xs (chainer.Variable): Batch of padded character ids. (B, Tmax)
            ilens (chainer.Variable): Batch of length of each input batch. (B,)
        Returns:
            xs (chainer.Variable): Subsampled vector of xs.
            chainer.Variable: Subsampled vector of ilens.
        """
        logging.info(self.__class__.__name__ + " input lengths: " + str(ilens))
        for layer in range(self.elayers):
            if "lstm" in self.typ:
                _, _, ys = self[self.rnn_label + str(layer)](None, None, xs)
            else:
                _, ys = self[self.rnn_label + str(layer)](None, xs)
            # ys: utt list of frame x cdim x 2 (2: means bidirectional)
            # TODO(watanabe) replace subsample and FC layer with CNN
            ys, ilens = _subsamplex(ys, self.subsample[layer + 1])
            # (sum _utt frame_utt) x dim
            ys = self["bt" + str(layer)](F.vstack(ys))
            # split the concatenated frames back into per-utterance chunks
            xs = F.split_axis(ys, np.cumsum(ilens[:-1]), axis=0)
        # final tanh operation
        xs = F.split_axis(F.tanh(F.vstack(xs)), np.cumsum(ilens[:-1]), axis=0)
        # 1 utterance case, it becomes an array, so need to make a utt tuple
        if not isinstance(xs, tuple):
            xs = [xs]
        return xs, ilens  # x: utt list of frame x dim
class RNN(chainer.Chain):
    """Multi-layer (bi-)RNN followed by a linear projection and tanh.

    Args:
        idim (int): Dimension of the input.
        elayers (int): Number of encoder layers.
        cdim (int): Number of rnn units.
        hdim (int): Number of projection units.
        dropout (float): Dropout rate.
        typ (str): Rnn type.
    """

    def __init__(self, idim, elayers, cdim, hdim, dropout, typ="lstm"):
        super(RNN, self).__init__()
        is_bidir = typ[0] == "b"
        is_lstm = "lstm" in typ
        if is_bidir:
            rnn_cls = L.NStepBiLSTM if is_lstm else L.NStepBiGRU
        else:
            rnn_cls = L.NStepLSTM if is_lstm else L.NStepGRU
        # bidirectional RNNs emit twice the number of units per frame
        out_units = 2 * cdim if is_bidir else cdim
        with self.init_scope():
            self.nbrnn = rnn_cls(elayers, idim, cdim, dropout)
            self.l_last = L.Linear(out_units, hdim)
        self.typ = typ
        self.bidir = is_bidir

    def __call__(self, xs, ilens):
        """Forward the batch through the RNN and the final projection.

        Args:
            xs (chainer.Variable): Batch of padded character ids. (B, Tmax)
            ilens (chainer.Variable): Batch of length of each input batch. (B,)

        Returns:
            tuple(chainer.Variable): Tuple of `chainer.Variable` objects.
            chainer.Variable: `ilens` .
        """
        logging.info(self.__class__.__name__ + " input lengths: " + str(ilens))
        # lengths must live on the host for np.cumsum below
        ilens = cuda.to_cpu(ilens)
        if "lstm" in self.typ:
            _, _, ys = self.nbrnn(None, None, xs)
        else:
            _, ys = self.nbrnn(None, xs)
        # project the concatenated frames: (sum _utt frame_utt) x dim
        projected = self.l_last(F.vstack(ys))
        boundaries = np.cumsum(ilens[:-1])
        xs = F.split_axis(projected, boundaries, axis=0)
        # final tanh, applied over the re-stacked frames
        xs = F.split_axis(F.tanh(F.vstack(xs)), boundaries, axis=0)
        # a single utterance yields a bare array; wrap it into a list
        if not isinstance(xs, tuple):
            xs = [xs]
        return xs, ilens  # x: utt list of frame x dim
# TODO(watanabe) explanation of VGG2L, VGG2B (Block) might be better
class VGG2L(chainer.Chain):
    """VGG-motivated two-block CNN front-end.

    Args:
        in_channel (int): Number of channels.
    """

    def __init__(self, in_channel=1):
        super(VGG2L, self).__init__()
        with self.init_scope():
            # two VGG-style convolution blocks (each later followed by
            # a 2x2 max pooling in __call__)
            self.conv1_1 = L.Convolution2D(in_channel, 64, 3, stride=1, pad=1)
            self.conv1_2 = L.Convolution2D(64, 64, 3, stride=1, pad=1)
            self.conv2_1 = L.Convolution2D(64, 128, 3, stride=1, pad=1)
            self.conv2_2 = L.Convolution2D(128, 128, 3, stride=1, pad=1)
        self.in_channel = in_channel

    def __call__(self, xs, ilens):
        """VGG2L forward propagation.

        Args:
            xs (chainer.Variable): Batch of padded character ids. (B, Tmax)
            ilens (chainer.Variable): Batch of length of each features. (B,)

        Returns:
            chainer.Variable: Subsampled vector of xs.
            chainer.Variable: Subsampled vector of ilens.
        """
        logging.info(self.__class__.__name__ + " input lengths: " + str(ilens))
        xp = self.xp

        def halve(lengths):
            # one 2x2 max pooling halves the time axis (ceil-rounded)
            return xp.array(
                xp.ceil(xp.array(lengths, dtype=np.float32) / 2), dtype=np.int32
            )

        # utt x frame x dim -> utt x in_channel x frame x (dim // in_channel)
        h = F.pad_sequence(xs)
        h = h.reshape(
            h.shape[0], h.shape[1], self.in_channel, h.shape[2] // self.in_channel
        )
        h = F.swapaxes(h, 1, 2)
        h = F.relu(self.conv1_1(h))
        h = F.relu(self.conv1_2(h))
        h = F.max_pooling_2d(h, 2, stride=2)
        h = F.relu(self.conv2_1(h))
        h = F.relu(self.conv2_2(h))
        h = F.max_pooling_2d(h, 2, stride=2)
        # account for the two poolings in the length information
        ilens = halve(halve(ilens))
        # utt x channel x frame x dim -> utt list of frame x (channel * dim),
        # dropping the zero-padded frames of each utterance
        h = F.swapaxes(h, 1, 2)
        h = h.reshape(h.shape[0], h.shape[1], h.shape[2] * h.shape[3])
        xs = [h[i, : ilens[i], :] for i in range(len(ilens))]
        return xs, ilens
class Encoder(chainer.Chain):
    """Encoder network class.
    Args:
        etype (str): Type of encoder network.
        idim (int): Number of dimensions of encoder network.
        elayers (int): Number of layers of encoder network.
        eunits (int): Number of lstm units of encoder network.
        eprojs (int): Number of projection units of encoder network.
        subsample (np.array): Subsampling number. e.g. 1_2_2_2_1
        dropout (float): Dropout rate.
    """
    def __init__(
        self, etype, idim, elayers, eunits, eprojs, subsample, dropout, in_channel=1
    ):
        """Select and build the encoder: optional VGG front-end + (B)LSTM/(B)GRU."""
        super(Encoder, self).__init__()
        # e.g. "vggblstmp" -> "blstm"
        # NOTE(review): lstrip/rstrip strip characters, not prefixes/suffixes;
        # this is only safe for the supported etype spellings — confirm
        typ = etype.lstrip("vgg").rstrip("p")
        if typ not in ["lstm", "gru", "blstm", "bgru"]:
            logging.error("Error: need to specify an appropriate encoder architecture")
        with self.init_scope():
            if etype.startswith("vgg"):
                # trailing "p" selects the per-layer-projection variant (RNNP)
                if etype[-1] == "p":
                    self.enc = chainer.Sequential(
                        VGG2L(in_channel),
                        RNNP(
                            get_vgg2l_odim(idim, in_channel=in_channel),
                            elayers,
                            eunits,
                            eprojs,
                            subsample,
                            dropout,
                            typ=typ,
                        ),
                    )
                    logging.info("Use CNN-VGG + " + typ.upper() + "P for encoder")
                else:
                    self.enc = chainer.Sequential(
                        VGG2L(in_channel),
                        RNN(
                            get_vgg2l_odim(idim, in_channel=in_channel),
                            elayers,
                            eunits,
                            eprojs,
                            dropout,
                            typ=typ,
                        ),
                    )
                    logging.info("Use CNN-VGG + " + typ.upper() + " for encoder")
                # the VGG front-end downsamples time by 4 (two 2x2 poolings)
                self.conv_subsampling_factor = 4
            else:
                if etype[-1] == "p":
                    self.enc = chainer.Sequential(
                        RNNP(idim, elayers, eunits, eprojs, subsample, dropout, typ=typ)
                    )
                    logging.info(
                        typ.upper() + " with every-layer projection for encoder"
                    )
                else:
                    self.enc = chainer.Sequential(
                        RNN(idim, elayers, eunits, eprojs, dropout, typ=typ)
                    )
                    logging.info(typ.upper() + " without projection for encoder")
                self.conv_subsampling_factor = 1
    def __call__(self, xs, ilens):
        """Encoder forward.
        Args:
            xs (chainer.Variable): Batch of padded character ids. (B, Tmax)
            ilens (chainer.variable): Batch of length of each features. (B,)
        Returns:
            chainer.Variable: Output of the encoder.
            chainer.Variable: (Subsampled) vector of ilens.
        """
        xs, ilens = self.enc(xs, ilens)
        return xs, ilens
def encoder_for(args, idim, subsample):
    """Build the encoder module specified by the program arguments.

    Args:
        args (Namespace): The program arguments.
        idim (int): Dimension of input array.
        subsample (numpy.array): Subsample number. egs).1_2_2_2_1

    Returns:
        chainer.nn.Module: Encoder module.
    """
    return Encoder(
        etype=args.etype,
        idim=idim,
        elayers=args.elayers,
        eunits=args.eunits,
        eprojs=args.eprojs,
        subsample=subsample,
        dropout=args.dropout_rate,
    )
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
import collections
import logging
import math
import numpy as np
# chainer related
from chainer import Variable, cuda, training
from chainer.training.updaters.multiprocess_parallel_updater import (
gather_grads,
gather_params,
scatter_grads,
)
# copied from https://github.com/chainer/chainer/blob/master/chainer/optimizer.py
def sum_sqnorm(arr):
    """Calculate the norm of the array.

    Args:
        arr (numpy.ndarray)

    Returns:
        Float: Sum of the norm calculated from the given array.
    """
    per_device = collections.defaultdict(float)
    for grad in arr:
        # select the device owning this array before touching its data
        with cuda.get_device_from_array(grad) as dev:
            if grad is not None:
                flat = grad.ravel()
                per_device[int(dev)] += flat.dot(flat)
    return sum(float(v) for v in per_device.values())
class CustomUpdater(training.StandardUpdater):
    """Custom updater for chainer.
    Args:
        train_iter (iterator | dict[str, iterator]): Dataset iterator for the
            training dataset. It can also be a dictionary that maps strings to
            iterators. If this is just an iterator, then the iterator is
            registered by the name ``'main'``.
        optimizer (optimizer | dict[str, optimizer]): Optimizer to update
            parameters. It can also be a dictionary that maps strings to
            optimizers. If this is just an optimizer, then the optimizer is
            registered by the name ``'main'``.
        converter (espnet.asr.chainer_backend.asr.CustomConverter): Converter
            function to build input arrays. Each batch extracted by the main
            iterator and the ``device`` option are passed to this function.
            :func:`chainer.dataset.concat_examples` is used by default.
        device (int or dict): The destination device info to send variables. In the
            case of cpu or single gpu, `device=-1 or 0`, respectively.
            In the case of multi-gpu, `device={"main":0, "sub_1": 1, ...}`.
        accum_grad (int):The number of gradient accumulation. if set to 2, the network
            parameters will be updated once in twice,
            i.e. actual batchsize will be doubled.
    """
    def __init__(self, train_iter, optimizer, converter, device, accum_grad=1):
        """Initialize custom updater."""
        super(CustomUpdater, self).__init__(
            train_iter, optimizer, converter=converter, device=device
        )
        self.forward_count = 0
        self.accum_grad = accum_grad
        self.start = True
        # To solve #1091, it is required to set the variable inside this class.
        self.device = device
    # The core part of the update routine can be customized by overriding.
    def update_core(self):
        """Main update routine for Custom Updater."""
        train_iter = self.get_iterator("main")
        optimizer = self.get_optimizer("main")
        # Get batch and convert into variables
        batch = train_iter.next()
        x = self.converter(batch, self.device)
        if self.start:
            # clear stale gradients once before the very first accumulation
            optimizer.target.cleargrads()
            self.start = False
        # Compute the loss at this time step and accumulate it
        loss = optimizer.target(*x) / self.accum_grad
        loss.backward()  # Backprop
        loss.unchain_backward()  # Truncate the graph
        # update parameters
        self.forward_count += 1
        if self.forward_count != self.accum_grad:
            # keep accumulating gradients until accum_grad forwards are done
            return
        self.forward_count = 0
        # compute the gradient norm to check if it is normal or not
        grad_norm = np.sqrt(
            sum_sqnorm([p.grad for p in optimizer.target.params(False)])
        )
        logging.info("grad norm={}".format(grad_norm))
        if math.isnan(grad_norm):
            logging.warning("grad norm is nan. Do not update model.")
        else:
            optimizer.update()
        optimizer.target.cleargrads()  # Clear the parameter gradients
    def update(self):
        """Run one update step and advance the iteration counter."""
        self.update_core()
        # the counter moves only when a real optimizer step happened
        if self.forward_count == 0:
            self.iteration += 1
class CustomParallelUpdater(training.updaters.MultiprocessParallelUpdater):
    """Custom Parallel Updater for chainer.
    Defines the main update routine.
    Args:
        train_iter (iterator | dict[str, iterator]): Dataset iterator for the
            training dataset. It can also be a dictionary that maps strings to
            iterators. If this is just an iterator, then the iterator is
            registered by the name ``'main'``.
        optimizer (optimizer | dict[str, optimizer]): Optimizer to update
            parameters. It can also be a dictionary that maps strings to
            optimizers. If this is just an optimizer, then the optimizer is
            registered by the name ``'main'``.
        converter (espnet.asr.chainer_backend.asr.CustomConverter): Converter
            function to build input arrays. Each batch extracted by the main
            iterator and the ``device`` option are passed to this function.
            :func:`chainer.dataset.concat_examples` is used by default.
        device (torch.device): Device to which the training data is sent.
            Negative value
            indicates the host memory (CPU).
        accum_grad (int):The number of gradient accumulation. if set to 2,
            the network parameters will be updated once in twice,
            i.e. actual batchsize will be doubled.
    """
    def __init__(self, train_iters, optimizer, converter, devices, accum_grad=1):
        """Initialize custom parallel updater."""
        super(CustomParallelUpdater, self).__init__(
            train_iters, optimizer, converter=converter, devices=devices
        )
        # imported here so that the module works without cupy installed
        from cupy.cuda import nccl
        self.accum_grad = accum_grad
        self.forward_count = 0
        self.nccl = nccl
    # The core part of the update routine can be customized by overriding.
    def update_core(self):
        """Main Update routine of the custom parallel updater."""
        self.setup_workers()
        self._send_message(("update", None))
        with cuda.Device(self._devices[0]):
            # For reducing memory
            optimizer = self.get_optimizer("main")
            batch = self.get_iterator("main").next()
            x = self.converter(batch, self._devices[0])
            loss = self._master(*x) / self.accum_grad
            loss.backward()
            loss.unchain_backward()
            # NCCL: reduce grads
            null_stream = cuda.Stream.null
            if self.comm is not None:
                # sum worker gradients into the master copy
                gg = gather_grads(self._master)
                self.comm.reduce(
                    gg.data.ptr,
                    gg.data.ptr,
                    gg.size,
                    self.nccl.NCCL_FLOAT,
                    self.nccl.NCCL_SUM,
                    0,
                    null_stream.ptr,
                )
                scatter_grads(self._master, gg)
                del gg
            # update parameters
            self.forward_count += 1
            if self.forward_count != self.accum_grad:
                # keep accumulating gradients until accum_grad forwards are done
                return
            self.forward_count = 0
            # check gradient value
            grad_norm = np.sqrt(
                sum_sqnorm([p.grad for p in optimizer.target.params(False)])
            )
            logging.info("grad norm={}".format(grad_norm))
            # update
            if math.isnan(grad_norm):
                logging.warning("grad norm is nan. Do not update model.")
            else:
                optimizer.update()
            self._master.cleargrads()
            if self.comm is not None:
                # broadcast the updated parameters back to the workers
                gp = gather_params(self._master)
                self.comm.bcast(
                    gp.data.ptr, gp.size, self.nccl.NCCL_FLOAT, 0, null_stream.ptr
                )
    def update(self):
        """Run one update step and advance the iteration counter."""
        self.update_core()
        # the counter moves only when a real optimizer step happened
        if self.forward_count == 0:
            self.iteration += 1
class CustomConverter(object):
    """Batch converter that subsamples inputs and moves them to a device.

    Args:
        subsampling_factor (int): The subsampling factor.
    """

    def __init__(self, subsampling_factor=1):
        self.subsampling_factor = subsampling_factor

    def __call__(self, batch, device):
        """Convert a mini-batch into device variables.

        Args:
            batch (list): Batch that will be subsampled.
            device (device): GPU device (-1 selects the CPU).

        Returns:
            chainer.Variable: xp.array that subsampled from batch.
            xp.array: xp.array of the length of the mini-batches.
            chainer.Variable: xp.array that subsampled from batch.
        """
        # pick the array module matching the target device
        xp = np if device == -1 else cuda.cupy
        # the batch is expected to be wrapped in a one-element list
        assert len(batch) == 1
        xs, ys = batch[0]
        factor = self.subsampling_factor
        if factor > 1:
            # drop frames along the time axis
            xs = [x[::factor, :] for x in xs]
        # lengths of the (possibly subsampled) input sequences
        ilens = xp.array([x.shape[0] for x in xs], dtype=xp.int32)
        # convert to Variable
        xs = [Variable(xp.array(x, dtype=xp.float32)) for x in xs]
        ys = [Variable(xp.array(y, dtype=xp.int32)) for y in ys]
        return xs, ilens, ys
# encoding: utf-8
"""Class Declaration of Transformer's Attention."""
import chainer
import chainer.functions as F
import chainer.links as L
import numpy as np
# most negative float32; used to suppress masked attention scores before softmax
MIN_VALUE = float(np.finfo(np.float32).min)
class MultiHeadAttention(chainer.Chain):
    """Multi Head Attention Layer.
    Args:
        n_units (int): Number of input units.
        h (int): Number of attention heads.
        dropout (float): Dropout rate.
        initialW: Initializer to initialize the weight.
        initial_bias: Initializer to initialize the bias.
    :param int h: the number of heads
    :param int n_units: the number of features
    :param float dropout_rate: dropout rate
    """
    def __init__(self, n_units, h=8, dropout=0.1, initialW=None, initial_bias=None):
        """Initialize MultiHeadAttention."""
        # NOTE(review): initialW/initial_bias are called below as initializer
        # factories (initialW(scale=stvd)), so the None defaults would raise
        # TypeError — callers must always pass factories. Confirm intent.
        super(MultiHeadAttention, self).__init__()
        assert n_units % h == 0
        stvd = 1.0 / np.sqrt(n_units)
        with self.init_scope():
            self.linear_q = L.Linear(
                n_units,
                n_units,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )
            self.linear_k = L.Linear(
                n_units,
                n_units,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )
            self.linear_v = L.Linear(
                n_units,
                n_units,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )
            self.linear_out = L.Linear(
                n_units,
                n_units,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )
        self.d_k = n_units // h  # per-head feature dimension
        self.h = h
        self.dropout = dropout
        self.attn = None  # attention weights of the last forward call
    def forward(self, e_var, s_var=None, mask=None, batch=1):
        """Core function of the Multi-head attention layer.
        Args:
            e_var (chainer.Variable): Variable of input array.
            s_var (chainer.Variable): Variable of source array from encoder.
            mask (chainer.Variable): Attention mask.
            batch (int): Batch size.
        Returns:
            chainer.Variable: Outout of multi-head attention layer.
        """
        xp = self.xp
        if s_var is None:
            # self-attention: queries, keys and values all come from e_var
            # batch, head, time1/2, d_k)
            Q = self.linear_q(e_var).reshape(batch, -1, self.h, self.d_k)
            K = self.linear_k(e_var).reshape(batch, -1, self.h, self.d_k)
            V = self.linear_v(e_var).reshape(batch, -1, self.h, self.d_k)
        else:
            # cross-attention: keys and values come from the source s_var
            Q = self.linear_q(e_var).reshape(batch, -1, self.h, self.d_k)
            K = self.linear_k(s_var).reshape(batch, -1, self.h, self.d_k)
            V = self.linear_v(s_var).reshape(batch, -1, self.h, self.d_k)
        # scaled dot product: (b, h, t1, d_k) x (b, h, d_k, t2) -> (b, h, t1, t2)
        scores = F.matmul(F.swapaxes(Q, 1, 2), K.transpose(0, 2, 3, 1)) / np.sqrt(
            self.d_k
        )
        if mask is not None:
            # replicate the mask over heads and suppress masked positions
            mask = xp.stack([mask] * self.h, axis=1)
            scores = F.where(mask, scores, xp.full(scores.shape, MIN_VALUE, "f"))
        self.attn = F.softmax(scores, axis=-1)
        p_attn = F.dropout(self.attn, self.dropout)
        x = F.matmul(p_attn, F.swapaxes(V, 1, 2))
        # merge the heads back: (batch * time1) x (h * d_k)
        x = F.swapaxes(x, 1, 2).reshape(-1, self.h * self.d_k)
        return self.linear_out(x)
# encoding: utf-8
"""Class Declaration of Transformer's CTC."""
import logging
import chainer
import chainer.functions as F
import chainer.links as L
import numpy as np
# TODO(nelson): Merge chainer_backend/transformer/ctc.py in chainer_backend/ctc.py
class CTC(chainer.Chain):
    """Chainer implementation of a CTC output layer.

    Args:
        odim (int): The output dimension.
        eprojs (int | None): Dimension of input vectors from encoder.
        dropout_rate (float): Dropout rate.
    """

    def __init__(self, odim, eprojs, dropout_rate):
        """Initialize CTC."""
        super(CTC, self).__init__()
        self.dropout_rate = dropout_rate
        self.loss = None
        with self.init_scope():
            # single linear projection from encoder space to the label space
            self.ctc_lo = L.Linear(eprojs, odim)

    def __call__(self, hs, ys):
        """Compute the CTC loss.

        Args:
            hs (list of chainer.Variable | N-dimension array):
                Input variable from encoder.
            ys (list of chainer.Variable | N-dimension array):
                Input variable of decoder.

        Returns:
            chainer.Variable: A variable holding a scalar value of the CTC loss.
        """
        self.loss = None
        input_lengths = [h.shape[0] for h in hs]
        output_lengths = [y.shape[0] for y in ys]
        # zero-pad, apply dropout, then project every frame to label logits
        logits = self.ctc_lo(
            F.dropout(F.pad_sequence(hs), ratio=self.dropout_rate), n_batch_axes=2
        )
        logits = F.separate(logits, axis=1)  # ilen list of batch x hdim
        # pad the targets with -1: batch x olen
        y_true = F.pad_sequence(ys, padding=-1)
        input_length = chainer.Variable(self.xp.array(input_lengths, dtype=np.int32))
        label_length = chainer.Variable(self.xp.array(output_lengths, dtype=np.int32))
        logging.info(
            self.__class__.__name__ + " input lengths: " + str(input_length.data)
        )
        logging.info(
            self.__class__.__name__ + " output lengths: " + str(label_length.data)
        )
        # blank symbol index 0 is passed as the third argument
        self.loss = F.connectionist_temporal_classification(
            logits, y_true, 0, input_length, label_length
        )
        logging.info("ctc loss:" + str(self.loss.data))
        return self.loss

    def log_softmax(self, hs):
        """Log_softmax of frame activations.

        Args:
            hs (list of chainer.Variable | N-dimension array):
                Input variable from encoder.

        Returns:
            chainer.Variable: A n-dimension float array.
        """
        logits = self.ctc_lo(F.pad_sequence(hs), n_batch_axes=2)
        return F.log_softmax(logits.reshape(-1, logits.shape[-1])).reshape(logits.shape)
# encoding: utf-8
"""Class Declaration of Transformer's Decoder."""
import chainer
import chainer.functions as F
import chainer.links as L
import numpy as np
from espnet.nets.chainer_backend.transformer.decoder_layer import DecoderLayer
from espnet.nets.chainer_backend.transformer.embedding import PositionalEncoding
from espnet.nets.chainer_backend.transformer.layer_norm import LayerNorm
from espnet.nets.chainer_backend.transformer.mask import make_history_mask
class Decoder(chainer.Chain):
    """Transformer decoder: embedding + positional encoding + decoder blocks.

    Args:
        odim (int): The output dimension.
        args (Namespace): Training config providing ``adim``, ``dunits``,
            ``aheads``, ``dlayers`` and ``dropout_rate``.
        initialW (Initializer): Initializer to initialize the weight.
        initial_bias (Initializer): Initializer to initialize the bias.
    """

    def __init__(self, odim, args, initialW=None, initial_bias=None):
        """Initialize Decoder."""
        super(Decoder, self).__init__()
        # the last vocabulary id doubles as both sos and eos
        self.sos = odim - 1
        self.eos = odim - 1
        initialW = chainer.initializers.Uniform if initialW is None else initialW
        initial_bias = (
            chainer.initializers.Uniform if initial_bias is None else initial_bias
        )
        with self.init_scope():
            self.output_norm = LayerNorm(args.adim)
            self.pe = PositionalEncoding(args.adim, args.dropout_rate)
            # scale initializers by 1/sqrt(fan-in)
            stvd = 1.0 / np.sqrt(args.adim)
            self.output_layer = L.Linear(
                args.adim,
                odim,
                initialW=initialW(scale=stvd),
                initial_bias=initial_bias(scale=stvd),
            )
            self.embed = L.EmbedID(
                odim,
                args.adim,
                ignore_label=-1,
                initialW=chainer.initializers.Normal(scale=1.0),
            )
            for i in range(args.dlayers):
                name = "decoders." + str(i)
                layer = DecoderLayer(
                    args.adim,
                    d_units=args.dunits,
                    h=args.aheads,
                    dropout=args.dropout_rate,
                    initialW=initialW,
                    initial_bias=initial_bias,
                )
                self.add_link(name, layer)
        self.n_layers = args.dlayers

    def make_attention_mask(self, source_block, target_block):
        """Prepare the attention mask.

        Args:
            source_block (ndarray): Source block with dimensions: (B x S).
            target_block (ndarray): Target block with dimensions: (B x T).

        Returns:
            ndarray: Mask with dimensions (B, S, T); nonzero/True where both
            source and target positions hold valid (>= 0) ids.
        """
        mask = (target_block[:, None, :] >= 0) * (source_block[:, :, None] >= 0)
        # (batch, source_length, target_length)
        return mask

    def forward(self, ys_pad, source, x_mask):
        """Forward decoder.

        Args:
            ys_pad: list/batch of target token id arrays (each 1-D, int).
            source (chainer.Variable): Encoded memory,
                float32 (batch, maxlen_in, feat).
            x_mask (ndarray): Encoded memory mask (batch, maxlen_in),
                with padded positions marked negative.

        Returns:
            chainer.Variable: Decoded token scores before softmax
            (batch, maxlen_out, token).
        """
        xp = self.xp
        # prepend <sos> to every target sequence (teacher-forcing input)
        sos = np.array([self.sos], np.int32)
        ys = [np.concatenate([sos, y], axis=0) for y in ys_pad]
        e = F.pad_sequence(ys, padding=self.eos).data
        e = xp.array(e)
        # mask preparation
        xy_mask = self.make_attention_mask(e, xp.array(x_mask))
        yy_mask = self.make_attention_mask(e, e)
        # forbid attending to future target positions
        yy_mask *= make_history_mask(xp, e)
        e = self.pe(self.embed(e))
        batch, length, dims = e.shape
        # decoder blocks operate on flattened (batch * time, dims) arrays
        e = e.reshape(-1, dims)
        source = source.reshape(-1, dims)
        for i in range(self.n_layers):
            e = self["decoders." + str(i)](e, source, xy_mask, yy_mask, batch)
        return self.output_layer(self.output_norm(e)).reshape(batch, length, -1)

    def recognize(self, e, yy_mask, source):
        """Process recognition function."""
        # NOTE(review): positionally this forwards (ys_pad=e, source=source,
        # x_mask=yy_mask) — confirm callers pass arguments in this order
        e = self.forward(e, source, yy_mask)
        return F.log_softmax(e, axis=-1)
# encoding: utf-8
"""Class Declaration of Transformer's Decoder Block."""
import chainer
import chainer.functions as F
from espnet.nets.chainer_backend.transformer.attention import MultiHeadAttention
from espnet.nets.chainer_backend.transformer.layer_norm import LayerNorm
from espnet.nets.chainer_backend.transformer.positionwise_feed_forward import (
PositionwiseFeedForward,
)
class DecoderLayer(chainer.Chain):
    """Single transformer decoder block.

    Pre-norm masked self-attention, encoder-decoder attention, and a
    position-wise feed-forward network, each with a residual connection.

    Args:
        n_units (int): Number of input/output dimension of a FeedForward layer.
        d_units (int): Number of units of hidden layer in a FeedForward layer.
        h (int): Number of attention heads.
        dropout (float): Dropout rate.
    """

    def __init__(
        self, n_units, d_units=0, h=8, dropout=0.1, initialW=None, initial_bias=None
    ):
        """Initialize DecoderLayer."""
        super(DecoderLayer, self).__init__()
        # shared keyword arguments for the sub-layer constructors
        common = dict(dropout=dropout, initialW=initialW, initial_bias=initial_bias)
        with self.init_scope():
            self.self_attn = MultiHeadAttention(n_units, h, **common)
            self.src_attn = MultiHeadAttention(n_units, h, **common)
            self.feed_forward = PositionwiseFeedForward(
                n_units, d_units=d_units, **common
            )
            self.norm1 = LayerNorm(n_units)
            self.norm2 = LayerNorm(n_units)
            self.norm3 = LayerNorm(n_units)
        self.dropout = dropout

    def forward(self, e, s, xy_mask, yy_mask, batch):
        """Compute one decoder block.

        Args:
            e (chainer.Variable): Batch of padded features. (B, Lmax)
            s (chainer.Variable): Batch of padded character. (B, Tmax)

        Returns:
            chainer.Variable: Computed variable of decoder.
        """
        # masked self-attention sub-layer (pre-norm + residual)
        branch = self.self_attn(self.norm1(e), mask=yy_mask, batch=batch)
        e = e + F.dropout(branch, self.dropout)
        # encoder-decoder attention sub-layer
        branch = self.src_attn(self.norm2(e), s_var=s, mask=xy_mask, batch=batch)
        e = e + F.dropout(branch, self.dropout)
        # position-wise feed-forward sub-layer
        branch = self.feed_forward(self.norm3(e))
        return e + F.dropout(branch, self.dropout)
# encoding: utf-8
"""Class Declaration of Transformer's Positional Encoding."""
import chainer
import chainer.functions as F
import numpy as np
class PositionalEncoding(chainer.Chain):
    """Sinusoidal positional encoding with input scaling and dropout.

    :param int n_units: embedding dim
    :param float dropout: dropout rate
    :param int length: maximum input length
    """

    def __init__(self, n_units, dropout=0.1, length=5000):
        """Initialize Positional Encoding."""
        # Implementation described in "Attention Is All You Need"
        super(PositionalEncoding, self).__init__()
        self.dropout = dropout
        # position index column vector: (length, 1)
        positions = np.arange(0, length, dtype=np.float32)[:, None]
        # per-dimension inverse frequencies: (n_units / 2,)
        inv_freq = np.exp(
            np.arange(0, n_units, 2, dtype=np.float32) * -(np.log(10000.0) / n_units)
        )
        table = np.zeros((length, n_units), dtype=np.float32)
        table[:, 0::2] = np.sin(positions * inv_freq)  # even dims: sine
        table[:, 1::2] = np.cos(positions * inv_freq)  # odd dims: cosine
        self.pe = table
        self.scale = np.sqrt(n_units)

    def forward(self, e):
        """Scale the input, add positional encodings and apply dropout."""
        n_frames = e.shape[1]
        pos = self.xp.array(self.pe[:n_frames])
        return F.dropout(e * self.scale + pos, self.dropout)
# encoding: utf-8
"""Class Declaration of Transformer's Encoder."""
import logging
import chainer
import numpy as np
from chainer import links as L
from espnet.nets.chainer_backend.transformer.embedding import PositionalEncoding
from espnet.nets.chainer_backend.transformer.encoder_layer import EncoderLayer
from espnet.nets.chainer_backend.transformer.layer_norm import LayerNorm
from espnet.nets.chainer_backend.transformer.mask import make_history_mask
from espnet.nets.chainer_backend.transformer.subsampling import (
Conv2dSubsampling,
LinearSampling,
)
class Encoder(chainer.Chain):
    """Transformer encoder: input sampling layer + stacked encoder blocks.

    Args:
        idim (int): Dimension of inputs.
        attention_dim (int): Input/output dimension of the encoder blocks.
        attention_heads (int): Number of attention heads.
        linear_units (int): Hidden units of each position-wise feed forward.
        num_blocks (int): Number of encoder layers.
        dropout_rate (float): Dropout rate of the input layer.
        positional_dropout_rate (float): Dropout rate of the positional
            encoding (used only with the ``embed`` input layer).
        attention_dropout_rate (float): Dropout rate inside encoder blocks.
        input_layer (str): ``conv2d``, ``linear`` or ``embed``.
        pos_enc_class: Positional encoding class for the ``embed`` input layer.
        initialW (Initializer): Initializer to initialize the weight.
        initial_bias (Initializer): Initializer to initialize the bias.
    """

    def __init__(
        self,
        idim,
        attention_dim=256,
        attention_heads=4,
        linear_units=2048,
        num_blocks=6,
        dropout_rate=0.1,
        positional_dropout_rate=0.1,
        attention_dropout_rate=0.0,
        input_layer="conv2d",
        pos_enc_class=PositionalEncoding,
        initialW=None,
        initial_bias=None,
    ):
        """Initialize Encoder.

        Args:
            idim (int): Input dimension.
            initialW (Initializer, optional): Initializer to initialize
                the weight.
            initial_bias (Initializer, optional): Initializer to initialize
                the bias.
        """
        super(Encoder, self).__init__()
        initialW = chainer.initializers.Uniform if initialW is None else initialW
        initial_bias = (
            chainer.initializers.Uniform if initial_bias is None else initial_bias
        )
        self.do_history_mask = False
        with self.init_scope():
            self.conv_subsampling_factor = 1
            channels = 64  # Based in paper
            if input_layer == "conv2d":
                # after two stride-2 convolutions the frequency axis is
                # reduced by ~4 and then expanded by `channels` feature maps
                idim = int(np.ceil(np.ceil(idim / 2) / 2)) * channels
                self.input_layer = Conv2dSubsampling(
                    channels,
                    idim,
                    attention_dim,
                    dropout=dropout_rate,
                    initialW=initialW,
                    initial_bias=initial_bias,
                )
                self.conv_subsampling_factor = 4
            elif input_layer == "linear":
                self.input_layer = LinearSampling(
                    idim, attention_dim, initialW=initialW, initial_bias=initial_bias
                )
            elif input_layer == "embed":
                # token-id inputs (LM-style); requires causal masking
                self.input_layer = chainer.Sequential(
                    L.EmbedID(idim, attention_dim, ignore_label=-1),
                    pos_enc_class(attention_dim, positional_dropout_rate),
                )
                self.do_history_mask = True
            else:
                raise ValueError("unknown input_layer: " + input_layer)
            self.norm = LayerNorm(attention_dim)
            for i in range(num_blocks):
                name = "encoders." + str(i)
                layer = EncoderLayer(
                    attention_dim,
                    d_units=linear_units,
                    h=attention_heads,
                    dropout=attention_dropout_rate,
                    initialW=initialW,
                    initial_bias=initial_bias,
                )
                self.add_link(name, layer)
            self.n_layers = num_blocks

    def forward(self, e, ilens):
        """Compute Encoder layer.

        Args:
            e (chainer.Variable): Batch of padded character. (B, Tmax)
            ilens (chainer.Variable): Batch of length of each input batch. (B,)

        Returns:
            chainer.Variable: Computed variable of encoder.
            numpy.array: Mask.
            chainer.Variable: Batch of lengths of each encoder outputs.
        """
        if isinstance(self.input_layer, Conv2dSubsampling):
            # conv subsampling also shortens ilens accordingly
            e, ilens = self.input_layer(e, ilens)
        else:
            e = self.input_layer(e)
        batch, length, dims = e.shape
        # mark padded frames with -1 so masks can be derived via `>= 0`
        x_mask = np.ones([batch, length])
        for j in range(batch):
            x_mask[j, ilens[j] :] = -1
        xx_mask = (x_mask[:, None, :] >= 0) * (x_mask[:, :, None] >= 0)
        xx_mask = self.xp.array(xx_mask)
        if self.do_history_mask:
            # causal masking for the `embed` (LM-style) input layer
            history_mask = make_history_mask(self.xp, x_mask)
            xx_mask *= history_mask
        logging.debug("encoders size: " + str(e.shape))
        # encoder blocks operate on flattened (batch * time, dims) arrays
        e = e.reshape(-1, dims)
        for i in range(self.n_layers):
            e = self["encoders." + str(i)](e, xx_mask, batch)
        return self.norm(e).reshape(batch, length, -1), x_mask, ilens
# encoding: utf-8
"""Class Declaration of Transformer's Encoder Block."""
import chainer
import chainer.functions as F
from espnet.nets.chainer_backend.transformer.attention import MultiHeadAttention
from espnet.nets.chainer_backend.transformer.layer_norm import LayerNorm
from espnet.nets.chainer_backend.transformer.positionwise_feed_forward import (
PositionwiseFeedForward,
)
class EncoderLayer(chainer.Chain):
    """Single transformer encoder block.

    Pre-norm self-attention and a position-wise feed-forward network,
    each with a residual connection.

    Args:
        n_units (int): Number of input/output dimension of a FeedForward layer.
        d_units (int): Number of units of hidden layer in a FeedForward layer.
        h (int): Number of attention heads.
        dropout (float): Dropout rate.
    """

    def __init__(
        self, n_units, d_units=0, h=8, dropout=0.1, initialW=None, initial_bias=None
    ):
        """Initialize EncoderLayer."""
        super(EncoderLayer, self).__init__()
        # shared keyword arguments for the sub-layer constructors
        common = dict(dropout=dropout, initialW=initialW, initial_bias=initial_bias)
        with self.init_scope():
            self.self_attn = MultiHeadAttention(n_units, h, **common)
            self.feed_forward = PositionwiseFeedForward(
                n_units, d_units=d_units, **common
            )
            self.norm1 = LayerNorm(n_units)
            self.norm2 = LayerNorm(n_units)
        self.dropout = dropout
        self.n_units = n_units

    def forward(self, e, xx_mask, batch):
        """Apply the self-attention and feed-forward sub-layers to ``e``."""
        # self-attention sub-layer (pre-norm + residual)
        branch = self.self_attn(self.norm1(e), mask=xx_mask, batch=batch)
        e = e + F.dropout(branch, self.dropout)
        # position-wise feed-forward sub-layer
        branch = self.feed_forward(self.norm2(e))
        return e + F.dropout(branch, self.dropout)
# encoding: utf-8
"""Class Declaration of Transformer's Label Smootion loss."""
import logging
import chainer
import chainer.functions as F
class LabelSmoothingLoss(chainer.Chain):
    """Label Smoothing Loss.

    Args:
        smoothing (float): smoothing rate (0.0 means the conventional CE).
        n_target_vocab (int): number of classes.
        normalize_length (bool): normalize loss by sequence length if True.
        ignore_id (int): target id treated as padding and excluded from
            the loss.
    """

    def __init__(self, smoothing, n_target_vocab, normalize_length=False, ignore_id=-1):
        """Initialize Loss."""
        super(LabelSmoothingLoss, self).__init__()
        self.use_label_smoothing = False
        if smoothing > 0.0:
            logging.info("Use label smoothing")
            # self.smoothing / self.confidence only exist (and are only
            # read) when label smoothing is enabled
            self.smoothing = smoothing
            self.confidence = 1.0 - smoothing
            self.use_label_smoothing = True
        self.n_target_vocab = n_target_vocab
        self.normalize_length = normalize_length
        self.ignore_id = ignore_id
        # accuracy placeholder; never updated inside this class
        self.acc = None

    def forward(self, ys_block, ys_pad):
        """Forward Loss.

        Args:
            ys_block (chainer.Variable): Predicted logits
                (batch, length, dims).
            ys_pad (chainer.Variable): Target (true) labels (batch, length).

        Returns:
            chainer.Variable: Training loss (scalar).
        """
        # Output (all together at once for efficiency)
        batch, length, dims = ys_block.shape
        concat_logit_block = ys_block.reshape(-1, dims)
        # Target reshape
        concat_t_block = ys_pad.reshape((batch * length))
        # positions with negative targets (padding) do not contribute
        ignore_mask = concat_t_block >= 0
        n_token = ignore_mask.sum()
        normalizer = n_token if self.normalize_length else batch
        if not self.use_label_smoothing:
            loss = F.softmax_cross_entropy(concat_logit_block, concat_t_block)
            # rescale from per-token mean to the chosen normalizer
            loss = loss * n_token / normalizer
        else:
            log_prob = F.log_softmax(concat_logit_block)
            broad_ignore_mask = self.xp.broadcast_to(
                ignore_mask[:, None], concat_logit_block.shape
            )
            # masked log-likelihood of the true labels
            pre_loss = (
                ignore_mask * log_prob[self.xp.arange(batch * length), concat_t_block]
            )
            loss = -F.sum(pre_loss) / normalizer
            # uniform-distribution term of the smoothed target
            label_smoothing = broad_ignore_mask * -1.0 / self.n_target_vocab * log_prob
            label_smoothing = F.sum(label_smoothing) / normalizer
            # interpolate between the hard-target and uniform terms
            loss = self.confidence * loss + self.smoothing * label_smoothing
        return loss
# encoding: utf-8
"""Class Declaration of Transformer's Label Smootion loss."""
import chainer.links as L
class LayerNorm(L.LayerNormalization):
    """Thin wrapper redirecting to ``L.LayerNormalization``."""

    def __init__(self, dims, eps=1e-12):
        """Initialize LayerNorm with feature size ``dims`` and ``eps``."""
        super().__init__(size=dims, eps=eps)

    def __call__(self, e):
        """Apply layer normalization to ``e``."""
        return super().__call__(e)
"""Create mask for subsequent steps."""
def make_history_mask(xp, block):
    """Build a lower-triangular (causal) mask for decoder self-attention.

    Args:
        xp: Array module (e.g. numpy or cupy).
        block (ndarray): Block with dimensions: (B x S).

    Returns:
        ndarray: History mask with dimensions (B, S, S); entry (b, i, j)
        is True iff position j is not later than position i.
    """
    batch, length = block.shape
    steps = xp.arange(length)
    # (1, S, S) causal pattern, broadcast over the batch axis
    causal = (steps[None] <= steps[:, None])[None]
    return xp.broadcast_to(causal, (batch, length, length))
# encoding: utf-8
"""Class Declaration of Transformer's Positionwise Feedforward."""
import chainer
import chainer.functions as F
import chainer.links as L
import numpy as np
class PositionwiseFeedForward(chainer.Chain):
    """Position-wise two-layer feed-forward network (linear-ReLU-linear).

    Args:
        :param int idim: input dimenstion
        :param int hidden_units: number of hidden units
        :param float dropout_rate: dropout rate
    """

    def __init__(
        self, n_units, d_units=0, dropout=0.1, initialW=None, initial_bias=None
    ):
        """Initialize PositionwiseFeedForward.

        Args:
            n_units (int): Input dimension.
            d_units (int, optional): Output dimension of hidden layer;
                defaults to ``4 * n_units`` when not positive.
            dropout (float, optional): Dropout ratio.
            initialW (Initializer, optional): Initializer to initialize
                the weight.
            initial_bias (Initializer, optional): Initializer to initialize
                the bias.
        """
        super(PositionwiseFeedForward, self).__init__()
        inner = d_units if d_units > 0 else n_units * 4
        # scale each layer's initializer by 1/sqrt(fan-in)
        scale_in = 1.0 / np.sqrt(n_units)
        scale_hidden = 1.0 / np.sqrt(inner)
        with self.init_scope():
            self.w_1 = L.Linear(
                n_units,
                inner,
                initialW=initialW(scale=scale_in),
                initial_bias=initial_bias(scale=scale_in),
            )
            self.w_2 = L.Linear(
                inner,
                n_units,
                initialW=initialW(scale=scale_hidden),
                initial_bias=initial_bias(scale=scale_hidden),
            )
        self.act = F.relu
        self.dropout = dropout

    def __call__(self, e):
        """Apply the feed-forward network.

        Args:
            e (chainer.Variable): Input variable.

        Return:
            chainer.Variable: Output variable.
        """
        hidden = F.dropout(self.act(self.w_1(e)), self.dropout)
        return self.w_2(hidden)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment