update conformer

60a2c57a · sunzhq2 · xuxo · 4a699441 · 60a2c57a · 60a2c57a
Commit 60a2c57a authored Jan 27, 2026 by sunzhq2 Committed by xuxo Jan 27, 2026
20 changed files
--- a/conformer/espnet-v.202304_20240621/build/lib/espnet/asr/pytorch_backend/recog.py
+++ b/conformer/espnet-v.202304_20240621/build/lib/espnet/asr/pytorch_backend/recog.py
+"""V2 backend for `asr_recog.py` using py:class:`espnet.nets.beam_search.BeamSearch`."""
+
+import json
+import logging
+
+import torch
+from packaging.version import parse as V
+
+from espnet.asr.asr_utils import add_results_to_json, get_model_conf, torch_load
+from espnet.asr.pytorch_backend.asr import load_trained_model
+from espnet.nets.asr_interface import ASRInterface
+from espnet.nets.batch_beam_search import BatchBeamSearch
+from espnet.nets.beam_search import BeamSearch
+from espnet.nets.lm_interface import dynamic_import_lm
+from espnet.nets.scorer_interface import BatchScorerInterface
+from espnet.nets.scorers.length_bonus import LengthBonus
+from espnet.utils.deterministic_utils import set_deterministic_pytorch
+from espnet.utils.io_utils import LoadInputsAndTargets
+
+
+def recog_v2(args):
+    """Decode with custom models that implements ScorerInterface.
+
+    Notes:
+        The previous backend espnet.asr.pytorch_backend.asr.recog
+        only supports E2E and RNNLM
+
+    Args:
+        args (namespace): The program arguments.
+        See py:func:`espnet.bin.asr_recog.get_parser` for details
+
+    """
+    logging.warning("experimental API for custom LMs is selected by --api v2")
+    if args.batchsize > 1:
+        raise NotImplementedError("multi-utt batch decoding is not implemented")
+    if args.streaming_mode is not None:
+        raise NotImplementedError("streaming mode is not implemented")
+    if args.word_rnnlm:
+        raise NotImplementedError("word LM is not implemented")
+
+    set_deterministic_pytorch(args)
+    model, train_args = load_trained_model(args.model)
+    assert isinstance(model, ASRInterface)
+
+    if args.quantize_config is not None:
+        q_config = set([getattr(torch.nn, q) for q in args.quantize_config])
+    else:
+        q_config = {torch.nn.Linear}
+
+    if args.quantize_asr_model:
+        logging.info("Use quantized asr model for decoding")
+
+        # See https://github.com/espnet/espnet/pull/3616 for more information.
+        if (
+            V(torch.__version__) < V("1.4.0")
+            and "lstm" in train_args.etype
+            and torch.nn.LSTM in q_config
+        ):
+            raise ValueError(
+                "Quantized LSTM in ESPnet is only supported with torch 1.4+."
+            )
+
+        if args.quantize_dtype == "float16" and V(torch.__version__) < V("1.5.0"):
+            raise ValueError(
+                "float16 dtype for dynamic quantization is not supported with torch "
+                "version < 1.5.0. Switching to qint8 dtype instead."
+            )
+
+        dtype = getattr(torch, args.quantize_dtype)
+
+        model = torch.quantization.quantize_dynamic(model, q_config, dtype=dtype)
+
+    model.eval()
+    load_inputs_and_targets = LoadInputsAndTargets(
+        mode="asr",
+        load_output=False,
+        sort_in_input_length=False,
+        preprocess_conf=train_args.preprocess_conf
+        if args.preprocess_conf is None
+        else args.preprocess_conf,
+        preprocess_args={"train": False},
+    )
+
+    if args.rnnlm:
+        lm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
+        # NOTE: for a compatibility with less than 0.5.0 version models
+        lm_model_module = getattr(lm_args, "model_module", "default")
+        lm_class = dynamic_import_lm(lm_model_module, lm_args.backend)
+        lm = lm_class(len(train_args.char_list), lm_args)
+        torch_load(args.rnnlm, lm)
+        if args.quantize_lm_model:
+            logging.info("Use quantized lm model")
+            dtype = getattr(torch, args.quantize_dtype)
+            lm = torch.quantization.quantize_dynamic(lm, q_config, dtype=dtype)
+        lm.eval()
+    else:
+        lm = None
+
+    if args.ngram_model:
+        from espnet.nets.scorers.ngram import NgramFullScorer, NgramPartScorer
+
+        if args.ngram_scorer == "full":
+            ngram = NgramFullScorer(args.ngram_model, train_args.char_list)
+        else:
+            ngram = NgramPartScorer(args.ngram_model, train_args.char_list)
+    else:
+        ngram = None
+
+    scorers = model.scorers()
+    scorers["lm"] = lm
+    scorers["ngram"] = ngram
+    scorers["length_bonus"] = LengthBonus(len(train_args.char_list))
+    weights = dict(
+        decoder=1.0 - args.ctc_weight,
+        ctc=args.ctc_weight,
+        lm=args.lm_weight,
+        ngram=args.ngram_weight,
+        length_bonus=args.penalty,
+    )
+    beam_search = BeamSearch(
+        beam_size=args.beam_size,
+        vocab_size=len(train_args.char_list),
+        weights=weights,
+        scorers=scorers,
+        sos=model.sos,
+        eos=model.eos,
+        token_list=train_args.char_list,
+        pre_beam_score_key=None if args.ctc_weight == 1.0 else "full",
+    )
+    # TODO(karita): make all scorers batchfied
+    if args.batchsize == 1:
+        non_batch = [
+            k
+            for k, v in beam_search.full_scorers.items()
+            if not isinstance(v, BatchScorerInterface)
+        ]
+        if len(non_batch) == 0:
+            beam_search.__class__ = BatchBeamSearch
+            logging.info("BatchBeamSearch implementation is selected.")
+        else:
+            logging.warning(
+                f"As non-batch scorers {non_batch} are found, "
+                f"fall back to non-batch implementation."
+            )
+
+    if args.ngpu > 1:
+        raise NotImplementedError("only single GPU decoding is supported")
+    if args.ngpu == 1:
+        device = "cuda"
+    else:
+        device = "cpu"
+    dtype = getattr(torch, args.dtype)
+    logging.info(f"Decoding device={device}, dtype={dtype}")
+    model.to(device=device, dtype=dtype).eval()
+    beam_search.to(device=device, dtype=dtype).eval()
+
+    # read json data
+    with open(args.recog_json, "rb") as f:
+        js = json.load(f)["utts"]
+    new_js = {}
+    with torch.no_grad():
+        for idx, name in enumerate(js.keys(), 1):
+            logging.info("(%d/%d) decoding " + name, idx, len(js.keys()))
+            batch = [(name, js[name])]
+            feat = load_inputs_and_targets(batch)[0][0]
+            enc = model.encode(torch.as_tensor(feat).to(device=device, dtype=dtype))
+            nbest_hyps = beam_search(
+                x=enc, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio
+            )
+            nbest_hyps = [
+                h.asdict() for h in nbest_hyps[: min(len(nbest_hyps), args.nbest)]
+            ]
+            new_js[name] = add_results_to_json(
+                js[name], nbest_hyps, train_args.char_list
+            )
+
+    with open(args.result_label, "wb") as f:
+        f.write(
+            json.dumps(
+                {"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True
+            ).encode("utf_8")
+        )
--- a/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/__init__.py
+++ b/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/__init__.py
+"""Initialize sub package."""
--- a/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/asr_align.py
+++ b/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/asr_align.py
+#!/usr/bin/env python3
+# encoding: utf-8
+
+# Copyright 2020 Johns Hopkins University (Xuankai Chang)
+#           2020, Technische Universität München;  Dominik Winkelbauer, Ludwig Kürzinger
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""
+This program performs CTC segmentation to align utterances within audio files.
+
+Inputs:
+    `--data-json`:
+        A json containing list of utterances and audio files
+    `--model`:
+        An already trained ASR model
+
+Output:
+    `--output`:
+        A plain `segments` file with utterance positions in the audio files.
+
+Selected parameters:
+    `--min-window-size`:
+        Minimum window size considered for a single utterance. The current default value
+        should be OK in most cases. Larger values might give better results; too large
+        values cause IndexErrors.
+    `--subsampling-factor`:
+        If the encoder sub-samples its input, the number of frames at the CTC layer is
+        reduced by this factor.
+    `--frame-duration`:
+        This is the non-overlapping duration of a single frame in milliseconds (the
+        inverse of frames per millisecond).
+    `--set-blank`:
+        In the rare case that the blank token does not have index 0 in the character
+        dictionary, this parameter sets the index of the blank token.
+    `--gratis-blank`:
+        Sets the transition cost for blank tokens to zero. Useful if there are longer
+        unrelated segments between segments.
+    `--replace-spaces-with-blanks`:
+        Spaces are replaced with blanks. Helps to model pauses between words. May
+        increase length of ground truth. May lead to misaligned segments when combined
+        with the option `--gratis-blank`.
+"""
+
+import json
+import logging
+import os
+import sys
+
+import configargparse
+import torch
+
+# imports for CTC segmentation
+from ctc_segmentation import (
+    CtcSegmentationParameters,
+    ctc_segmentation,
+    determine_utterance_segments,
+    prepare_text,
+)
+
+# imports for inference
+from espnet.asr.pytorch_backend.asr_init import load_trained_model
+from espnet.nets.asr_interface import ASRInterface
+from espnet.utils.io_utils import LoadInputsAndTargets
+
+
+# NOTE: you need this func to generate our sphinx doc
+def get_parser():
+    """Get default arguments."""
+    parser = configargparse.ArgumentParser(
+        description="Align text to audio using CTC segmentation."
+        "using a pre-trained speech recognition model.",
+        config_file_parser_class=configargparse.YAMLConfigFileParser,
+        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
+    )
+    # general configuration
+    parser.add("--config", is_config_file=True, help="Decoding config file path.")
+    parser.add_argument(
+        "--ngpu", type=int, default=0, help="Number of GPUs (max. 1 is supported)"
+    )
+    parser.add_argument(
+        "--dtype",
+        choices=("float16", "float32", "float64"),
+        default="float32",
+        help="Float precision (only available in --api v2)",
+    )
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="pytorch",
+        choices=["pytorch"],
+        help="Backend library",
+    )
+    parser.add_argument("--debugmode", type=int, default=1, help="Debugmode")
+    parser.add_argument("--verbose", "-V", type=int, default=1, help="Verbose option")
+    parser.add_argument(
+        "--preprocess-conf",
+        type=str,
+        default=None,
+        help="The configuration file for the pre-processing",
+    )
+    # task related
+    parser.add_argument(
+        "--data-json", type=str, help="Json of recognition data for audio and text"
+    )
+    parser.add_argument("--utt-text", type=str, help="Text separated into utterances")
+    # model (parameter) related
+    parser.add_argument(
+        "--model", type=str, required=True, help="Model file parameters to read"
+    )
+    parser.add_argument(
+        "--model-conf", type=str, default=None, help="Model config file"
+    )
+    parser.add_argument(
+        "--num-encs", default=1, type=int, help="Number of encoders in the model."
+    )
+    # ctc-segmentation related
+    parser.add_argument(
+        "--subsampling-factor",
+        type=int,
+        default=None,
+        help="Subsampling factor."
+        " If the encoder sub-samples its input, the number of frames at the CTC layer"
+        " is reduced by this factor. For example, a BLSTMP with subsampling 1_2_2_1_1"
+        " has a subsampling factor of 4.",
+    )
+    parser.add_argument(
+        "--frame-duration",
+        type=int,
+        default=None,
+        help="Non-overlapping duration of a single frame in milliseconds.",
+    )
+    parser.add_argument(
+        "--min-window-size",
+        type=int,
+        default=None,
+        help="Minimum window size considered for utterance.",
+    )
+    parser.add_argument(
+        "--max-window-size",
+        type=int,
+        default=None,
+        help="Maximum window size considered for utterance.",
+    )
+    parser.add_argument(
+        "--use-dict-blank",
+        type=int,
+        default=None,
+        help="DEPRECATED.",
+    )
+    parser.add_argument(
+        "--set-blank",
+        type=int,
+        default=None,
+        help="Index of model dictionary for blank token (default: 0).",
+    )
+    parser.add_argument(
+        "--gratis-blank",
+        type=int,
+        default=None,
+        help="Set the transition cost of the blank token to zero. Audio sections"
+        " labeled with blank tokens can then be skipped without penalty. Useful"
+        " if there are unrelated audio segments between utterances.",
+    )
+    parser.add_argument(
+        "--replace-spaces-with-blanks",
+        type=int,
+        default=None,
+        help="Fill blanks in between words to better model pauses between words."
+        " Segments can be misaligned if this option is combined with --gratis-blank."
+        " May increase length of ground truth.",
+    )
+    parser.add_argument(
+        "--scoring-length",
+        type=int,
+        default=None,
+        help="Changes partitioning length L for calculation of the confidence score.",
+    )
+    parser.add_argument(
+        "--output",
+        type=configargparse.FileType("w"),
+        required=True,
+        help="Output segments file",
+    )
+    return parser
+
+
+def main(args):
+    """Run the main decoding function."""
+    parser = get_parser()
+    args, extra = parser.parse_known_args(args)
+    # logging info
+    if args.verbose == 1:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    elif args.verbose == 2:
+        logging.basicConfig(
+            level=logging.DEBUG,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    else:
+        logging.basicConfig(
+            level=logging.WARN,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+        logging.warning("Skip DEBUG/INFO messages")
+    if args.ngpu == 0 and args.dtype == "float16":
+        raise ValueError(f"--dtype {args.dtype} does not support the CPU backend.")
+    # check CUDA_VISIBLE_DEVICES
+    device = "cpu"
+    if args.ngpu == 1:
+        device = "cuda"
+        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
+        if cvd is None:
+            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
+    elif args.ngpu > 1:
+        logging.error("Decoding only supports ngpu=1.")
+        sys.exit(1)
+    # display PYTHONPATH
+    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
+    # recog
+    logging.info("backend = " + args.backend)
+    if args.backend == "pytorch":
+        ctc_align(args, device)
+    else:
+        raise ValueError("Only pytorch is supported.")
+    sys.exit(0)
+
+
+def ctc_align(args, device):
+    """ESPnet-specific interface for CTC segmentation.
+
+    Parses configuration, infers the CTC posterior probabilities,
+    and then aligns start and end of utterances using CTC segmentation.
+    Results are written to the output file given in the args.
+
+    :param args: given configuration
+    :param device: for inference; one of ['cuda', 'cpu']
+    :return:  0 on success
+    """
+    model, train_args = load_trained_model(args.model)
+    assert isinstance(model, ASRInterface)
+    load_inputs_and_targets = LoadInputsAndTargets(
+        mode="asr",
+        load_output=True,
+        sort_in_input_length=False,
+        preprocess_conf=train_args.preprocess_conf
+        if args.preprocess_conf is None
+        else args.preprocess_conf,
+        preprocess_args={"train": False},
+    )
+    logging.info(f"Decoding device={device}")
+    # Warn for nets with high memory consumption on long audio files
+    if hasattr(model, "enc"):
+        encoder_module = model.enc.__class__.__module__
+    elif hasattr(model, "encoder"):
+        encoder_module = model.encoder.__class__.__module__
+    else:
+        encoder_module = "Unknown"
+    logging.info(f"Encoder module: {encoder_module}")
+    logging.info(f"CTC module:     {model.ctc.__class__.__module__}")
+    if "rnn" not in encoder_module:
+        logging.warning("No BLSTM model detected; memory consumption may be high.")
+    model.to(device=device).eval()
+    # read audio and text json data
+    with open(args.data_json, "rb") as f:
+        js = json.load(f)["utts"]
+    with open(args.utt_text, "r", encoding="utf-8") as f:
+        lines = f.readlines()
+        i = 0
+        text = {}
+        segment_names = {}
+        for name in js.keys():
+            text_per_audio = []
+            segment_names_per_audio = []
+            while i < len(lines) and lines[i].startswith(name):
+                text_per_audio.append(lines[i][lines[i].find(" ") + 1 :])
+                segment_names_per_audio.append(lines[i][: lines[i].find(" ")])
+                i += 1
+            text[name] = text_per_audio
+            segment_names[name] = segment_names_per_audio
+    # apply configuration
+    config = CtcSegmentationParameters()
+    subsampling_factor = 1
+    frame_duration_ms = 10
+    if args.subsampling_factor is not None:
+        subsampling_factor = args.subsampling_factor
+    if args.frame_duration is not None:
+        frame_duration_ms = args.frame_duration
+    # Backwards compatibility to ctc_segmentation <= 1.5.3
+    if hasattr(config, "index_duration"):
+        config.index_duration = frame_duration_ms * subsampling_factor / 1000
+    else:
+        config.subsampling_factor = subsampling_factor
+        config.frame_duration_ms = frame_duration_ms
+    if args.min_window_size is not None:
+        config.min_window_size = args.min_window_size
+    if args.max_window_size is not None:
+        config.max_window_size = args.max_window_size
+    config.char_list = train_args.char_list
+    if args.use_dict_blank is not None:
+        logging.warning(
+            "The option --use-dict-blank is deprecated. If needed,"
+            " use --set-blank instead."
+        )
+    if args.set_blank is not None:
+        config.blank = args.set_blank
+    if args.replace_spaces_with_blanks is not None:
+        if args.replace_spaces_with_blanks:
+            config.replace_spaces_with_blanks = True
+        else:
+            config.replace_spaces_with_blanks = False
+    if args.gratis_blank:
+        config.blank_transition_cost_zero = True
+    if config.blank_transition_cost_zero and args.replace_spaces_with_blanks:
+        logging.error(
+            "Blanks are inserted between words, and also the transition cost of blank"
+            " is zero. This configuration may lead to misalignments!"
+        )
+    if args.scoring_length is not None:
+        config.score_min_mean_over_L = args.scoring_length
+    logging.info(f"Frame timings: {frame_duration_ms}ms * {subsampling_factor}")
+    # Iterate over audio files to decode and align
+    for idx, name in enumerate(js.keys(), 1):
+        logging.info("(%d/%d) Aligning " + name, idx, len(js.keys()))
+        batch = [(name, js[name])]
+        feat, label = load_inputs_and_targets(batch)
+        feat = feat[0]
+        with torch.no_grad():
+            # Encode input frames
+            enc_output = model.encode(torch.as_tensor(feat).to(device)).unsqueeze(0)
+            # Apply ctc layer to obtain log character probabilities
+            lpz = model.ctc.log_softmax(enc_output)[0].cpu().numpy()
+        # Prepare the text for aligning
+        ground_truth_mat, utt_begin_indices = prepare_text(config, text[name])
+        # Align using CTC segmentation
+        timings, char_probs, state_list = ctc_segmentation(
+            config, lpz, ground_truth_mat
+        )
+        logging.debug(f"state_list = {state_list}")
+        # Obtain list of utterances with time intervals and confidence score
+        segments = determine_utterance_segments(
+            config, utt_begin_indices, char_probs, timings, text[name]
+        )
+        # Write to "segments" file
+        for i, boundary in enumerate(segments):
+            utt_segment = (
+                f"{segment_names[name][i]} {name} {boundary[0]:.2f}"
+                f" {boundary[1]:.2f} {boundary[2]:.9f}\n"
+            )
+            args.output.write(utt_segment)
+    return 0
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
--- a/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/asr_enhance.py
+++ b/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/asr_enhance.py
+#!/usr/bin/env python3
+import logging
+import os
+import random
+import sys
+from distutils.util import strtobool
+
+import configargparse
+import numpy as np
+
+from espnet.asr.pytorch_backend.asr import enhance
+
+
+# NOTE: you need this func to generate our sphinx doc
+def get_parser():
+    parser = configargparse.ArgumentParser(
+        description="Enhance noisy speech for speech recognition",
+        config_file_parser_class=configargparse.YAMLConfigFileParser,
+        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
+    )
+    # general configuration
+    parser.add("--config", is_config_file=True, help="config file path")
+    parser.add(
+        "--config2",
+        is_config_file=True,
+        help="second config file path that overwrites the settings in `--config`.",
+    )
+    parser.add(
+        "--config3",
+        is_config_file=True,
+        help="third config file path that overwrites the settings "
+        "in `--config` and `--config2`.",
+    )
+
+    parser.add_argument("--ngpu", default=0, type=int, help="Number of GPUs")
+    parser.add_argument(
+        "--backend",
+        default="chainer",
+        type=str,
+        choices=["chainer", "pytorch"],
+        help="Backend library",
+    )
+    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
+    parser.add_argument("--seed", default=1, type=int, help="Random seed")
+    parser.add_argument("--verbose", "-V", default=1, type=int, help="Verbose option")
+    parser.add_argument(
+        "--batchsize",
+        default=1,
+        type=int,
+        help="Batch size for beam search (0: means no batch processing)",
+    )
+    parser.add_argument(
+        "--preprocess-conf",
+        type=str,
+        default=None,
+        help="The configuration file for the pre-processing",
+    )
+    # task related
+    parser.add_argument(
+        "--recog-json", type=str, help="Filename of recognition data (json)"
+    )
+    # model (parameter) related
+    parser.add_argument(
+        "--model", type=str, required=True, help="Model file parameters to read"
+    )
+    parser.add_argument(
+        "--model-conf", type=str, default=None, help="Model config file"
+    )
+
+    # Outputs configuration
+    parser.add_argument(
+        "--enh-wspecifier",
+        type=str,
+        default=None,
+        help="Specify the output way for enhanced speech."
+        "e.g. ark,scp:outdir,wav.scp",
+    )
+    parser.add_argument(
+        "--enh-filetype",
+        type=str,
+        default="sound",
+        choices=["mat", "hdf5", "sound.hdf5", "sound"],
+        help="Specify the file format for enhanced speech. "
+        '"mat" is the matrix format in kaldi',
+    )
+    parser.add_argument("--fs", type=int, default=16000, help="The sample frequency")
+    parser.add_argument(
+        "--keep-length",
+        type=strtobool,
+        default=True,
+        help="Adjust the output length to match " "with the input for enhanced speech",
+    )
+    parser.add_argument(
+        "--image-dir", type=str, default=None, help="The directory saving the images."
+    )
+    parser.add_argument(
+        "--num-images",
+        type=int,
+        default=20,
+        help="The number of images files to be saved. "
+        "If negative, all samples are to be saved.",
+    )
+
+    # IStft
+    parser.add_argument(
+        "--apply-istft",
+        type=strtobool,
+        default=True,
+        help="Apply istft to the output from the network",
+    )
+    parser.add_argument(
+        "--istft-win-length",
+        type=int,
+        default=512,
+        help="The window length for istft. "
+        "This option is ignored "
+        "if stft is found in the preprocess-conf",
+    )
+    parser.add_argument(
+        "--istft-n-shift",
+        type=str,
+        default=256,
+        help="The window type for istft. "
+        "This option is ignored "
+        "if stft is found in the preprocess-conf",
+    )
+    parser.add_argument(
+        "--istft-window",
+        type=str,
+        default="hann",
+        help="The window type for istft. "
+        "This option is ignored "
+        "if stft is found in the preprocess-conf",
+    )
+    return parser
+
+
+def main(args):
+    parser = get_parser()
+    args = parser.parse_args(args)
+
+    # logging info
+    if args.verbose == 1:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    elif args.verbose == 2:
+        logging.basicConfig(
+            level=logging.DEBUG,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    else:
+        logging.basicConfig(
+            level=logging.WARN,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+        logging.warning("Skip DEBUG/INFO messages")
+
+    # check CUDA_VISIBLE_DEVICES
+    if args.ngpu > 0:
+        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
+        if cvd is None:
+            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
+        elif args.ngpu != len(cvd.split(",")):
+            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
+            sys.exit(1)
+
+        # TODO(kamo): support of multiple GPUs
+        if args.ngpu > 1:
+            logging.error("The program only supports ngpu=1.")
+            sys.exit(1)
+
+    # display PYTHONPATH
+    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
+
+    # seed setting
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    logging.info("set random seed = %d" % args.seed)
+
+    # recog
+    logging.info("backend = " + args.backend)
+    if args.backend == "pytorch":
+        enhance(args)
+    else:
+        raise ValueError("Only pytorch is supported.")
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
--- a/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/asr_recog.py
+++ b/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/asr_recog.py
+#!/usr/bin/env python3
+# encoding: utf-8
+
+# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""End-to-end speech recognition model decoding script."""
+
+import logging
+import os
+import random
+import sys
+
+import configargparse
+import numpy as np
+
+from espnet.utils.cli_utils import strtobool
+
+# NOTE: you need this func to generate our sphinx doc
+
+
+def get_parser():
+    """Get default arguments."""
+    parser = configargparse.ArgumentParser(
+        description="Transcribe text from speech using "
+        "a speech recognition model on one CPU or GPU",
+        config_file_parser_class=configargparse.YAMLConfigFileParser,
+        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
+    )
+    # general configuration
+    parser.add("--config", is_config_file=True, help="Config file path")
+    parser.add(
+        "--config2",
+        is_config_file=True,
+        help="Second config file path that overwrites the settings in `--config`",
+    )
+    parser.add(
+        "--config3",
+        is_config_file=True,
+        help="Third config file path that overwrites the settings "
+        "in `--config` and `--config2`",
+    )
+
+    parser.add_argument("--ngpu", type=int, default=0, help="Number of GPUs")
+    parser.add_argument(
+        "--dtype",
+        choices=("float16", "float32", "float64"),
+        default="float32",
+        help="Float precision (only available in --api v2)",
+    )
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="chainer",
+        choices=["chainer", "pytorch"],
+        help="Backend library",
+    )
+    parser.add_argument("--debugmode", type=int, default=1, help="Debugmode")
+    parser.add_argument("--seed", type=int, default=1, help="Random seed")
+    parser.add_argument("--verbose", "-V", type=int, default=1, help="Verbose option")
+    parser.add_argument(
+        "--batchsize",
+        type=int,
+        default=1,
+        help="Batch size for beam search (0: means no batch processing)",
+    )
+    parser.add_argument(
+        "--preprocess-conf",
+        type=str,
+        default=None,
+        help="The configuration file for the pre-processing",
+    )
+    parser.add_argument(
+        "--api",
+        default="v1",
+        choices=["v1", "v2"],
+        help="Beam search APIs "
+        "v1: Default API. It only supports the ASRInterface.recognize method "
+        "and DefaultRNNLM. "
+        "v2: Experimental API. It supports any models that implements ScorerInterface.",
+    )
+    # task related
+    parser.add_argument(
+        "--recog-json", type=str, help="Filename of recognition data (json)"
+    )
+    parser.add_argument(
+        "--result-label",
+        type=str,
+        required=True,
+        help="Filename of result label data (json)",
+    )
+    # model (parameter) related
+    parser.add_argument(
+        "--model", type=str, required=True, help="Model file parameters to read"
+    )
+    parser.add_argument(
+        "--model-conf", type=str, default=None, help="Model config file"
+    )
+    parser.add_argument(
+        "--num-spkrs",
+        type=int,
+        default=1,
+        choices=[1, 2],
+        help="Number of speakers in the speech",
+    )
+    parser.add_argument(
+        "--num-encs", default=1, type=int, help="Number of encoders in the model."
+    )
+    # search related
+    parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
+    parser.add_argument("--beam-size", type=int, default=1, help="Beam size")
+    parser.add_argument("--penalty", type=float, default=0.0, help="Incertion penalty")
+    parser.add_argument(
+        "--maxlenratio",
+        type=float,
+        default=0.0,
+        help="""Input length ratio to obtain max output length.
+                        If maxlenratio=0.0 (default), it uses a end-detect function
+                        to automatically find maximum hypothesis lengths.
+                        If maxlenratio<0.0, its absolute value is interpreted
+                        as a constant max output length""",
+    )
+    parser.add_argument(
+        "--minlenratio",
+        type=float,
+        default=0.0,
+        help="Input length ratio to obtain min output length",
+    )
+    parser.add_argument(
+        "--ctc-weight", type=float, default=0.0, help="CTC weight in joint decoding"
+    )
+    parser.add_argument(
+        "--weights-ctc-dec",
+        type=float,
+        action="append",
+        help="ctc weight assigned to each encoder during decoding."
+        "[in multi-encoder mode only]",
+    )
+    parser.add_argument(
+        "--ctc-window-margin",
+        type=int,
+        default=0,
+        help="""Use CTC window with margin parameter to accelerate
+                        CTC/attention decoding especially on GPU. Smaller magin
+                        makes decoding faster, but may increase search errors.
+                        If margin=0 (default), this function is disabled""",
+    )
+    # transducer related
+    parser.add_argument(
+        "--search-type",
+        type=str,
+        default="default",
+        choices=["default", "nsc", "tsd", "alsd", "maes"],
+        help="""Type of beam search implementation to use during inference.
+        Can be either: default beam search ("default"),
+        N-Step Constrained beam search ("nsc"), Time-Synchronous Decoding ("tsd"),
+        Alignment-Length Synchronous Decoding ("alsd") or
+        modified Adaptive Expansion Search ("maes").""",
+    )
+    parser.add_argument(
+        "--nstep",
+        type=int,
+        default=1,
+        help="""Number of expansion steps allowed in NSC beam search or mAES
+        (nstep > 0 for NSC and nstep > 1 for mAES).""",
+    )
+    parser.add_argument(
+        "--prefix-alpha",
+        type=int,
+        default=2,
+        help="Length prefix difference allowed in NSC beam search or mAES.",
+    )
+    parser.add_argument(
+        "--max-sym-exp",
+        type=int,
+        default=2,
+        help="Number of symbol expansions allowed in TSD.",
+    )
+    parser.add_argument(
+        "--u-max",
+        type=int,
+        default=400,
+        help="Length prefix difference allowed in ALSD.",
+    )
+    parser.add_argument(
+        "--expansion-gamma",
+        type=float,
+        default=2.3,
+        help="Allowed logp difference for prune-by-value method in mAES.",
+    )
+    parser.add_argument(
+        "--expansion-beta",
+        type=int,
+        default=2,
+        help="""Number of additional candidates for expanded hypotheses
+                selection in mAES.""",
+    )
+    parser.add_argument(
+        "--score-norm",
+        type=strtobool,
+        nargs="?",
+        default=True,
+        help="Normalize final hypotheses' score by length",
+    )
+    parser.add_argument(
+        "--softmax-temperature",
+        type=float,
+        default=1.0,
+        help="Penalization term for softmax function.",
+    )
+    # rnnlm related
+    parser.add_argument(
+        "--rnnlm", type=str, default=None, help="RNNLM model file to read"
+    )
+    parser.add_argument(
+        "--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read"
+    )
+    parser.add_argument(
+        "--word-rnnlm", type=str, default=None, help="Word RNNLM model file to read"
+    )
+    parser.add_argument(
+        "--word-rnnlm-conf",
+        type=str,
+        default=None,
+        help="Word RNNLM model config file to read",
+    )
+    parser.add_argument("--word-dict", type=str, default=None, help="Word list to read")
+    parser.add_argument("--lm-weight", type=float, default=0.1, help="RNNLM weight")
+    # ngram related
+    parser.add_argument(
+        "--ngram-model", type=str, default=None, help="ngram model file to read"
+    )
+    parser.add_argument("--ngram-weight", type=float, default=0.1, help="ngram weight")
+    parser.add_argument(
+        "--ngram-scorer",
+        type=str,
+        default="part",
+        choices=("full", "part"),
+        help="""if the ngram is set as a part scorer, similar with CTC scorer,
+                ngram scorer only scores topK hypethesis.
+                if the ngram is set as full scorer, ngram scorer scores all hypthesis
+                the decoding speed of part scorer is musch faster than full one""",
+    )
+    # streaming related
+    parser.add_argument(
+        "--streaming-mode",
+        type=str,
+        default=None,
+        choices=["window", "segment"],
+        help="""Use streaming recognizer for inference.
+                        `--batchsize` must be set to 0 to enable this mode""",
+    )
+    parser.add_argument("--streaming-window", type=int, default=10, help="Window size")
+    parser.add_argument(
+        "--streaming-min-blank-dur",
+        type=int,
+        default=10,
+        help="Minimum blank duration threshold",
+    )
+    parser.add_argument(
+        "--streaming-onset-margin", type=int, default=1, help="Onset margin"
+    )
+    parser.add_argument(
+        "--streaming-offset-margin", type=int, default=1, help="Offset margin"
+    )
+    # non-autoregressive related
+    # Mask CTC related. See https://arxiv.org/abs/2005.08700 for the detail.
+    parser.add_argument(
+        "--maskctc-n-iterations",
+        type=int,
+        default=10,
+        help="Number of decoding iterations."
+        "For Mask CTC, set 0 to predict 1 mask/iter.",
+    )
+    parser.add_argument(
+        "--maskctc-probability-threshold",
+        type=float,
+        default=0.999,
+        help="Threshold probability for CTC output",
+    )
+    # quantize model related
+    parser.add_argument(
+        "--quantize-config",
+        nargs="*",
+        help="""Config for dynamic quantization provided as a list of modules,
+        separated by a comma. E.g.: --quantize-config=[Linear,LSTM,GRU].
+        Each specified module should be an attribute of 'torch.nn', e.g.:
+        torch.nn.Linear, torch.nn.LSTM, torch.nn.GRU, ...""",
+    )
+    parser.add_argument(
+        "--quantize-dtype",
+        type=str,
+        default="qint8",
+        choices=["float16", "qint8"],
+        help="Dtype for dynamic quantization.",
+    )
+    parser.add_argument(
+        "--quantize-asr-model",
+        type=bool,
+        default=False,
+        help="Apply dynamic quantization to ASR model.",
+    )
+    parser.add_argument(
+        "--quantize-lm-model",
+        type=bool,
+        default=False,
+        help="Apply dynamic quantization to LM.",
+    )
+    return parser
+
+
+def main(args):
+    """Run the main decoding function.
+
+    Args:
+        args (list): Command-line arguments, parsed with :func:`get_parser`.
+
+    Raises:
+        ValueError: If float16 is requested together with ``--ngpu 0`` or the
+            backend is not supported for the requested number of speakers.
+        NotImplementedError: If the selected API cannot handle the requested
+            dtype or number of encoders.
+
+    """
+    args = get_parser().parse_args(args)
+
+    if args.ngpu == 0 and args.dtype == "float16":
+        raise ValueError(f"--dtype {args.dtype} does not support the CPU backend.")
+
+    # logging info: --verbose 1 -> INFO, 2 -> DEBUG, anything else -> WARN
+    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
+    if args.verbose == 1:
+        log_level = logging.INFO
+    elif args.verbose == 2:
+        log_level = logging.DEBUG
+    else:
+        log_level = logging.WARN
+    logging.basicConfig(level=log_level, format=log_format)
+    if log_level == logging.WARN:
+        logging.warning("Skip DEBUG/INFO messages")
+
+    # check CUDA_VISIBLE_DEVICES
+    if args.ngpu > 0:
+        visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
+        if visible_devices is None:
+            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
+        elif len(visible_devices.split(",")) != args.ngpu:
+            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
+            sys.exit(1)
+
+        # TODO(mn5k): support of multiple GPUs
+        if args.ngpu > 1:
+            logging.error("The program only supports ngpu=1.")
+            sys.exit(1)
+
+    # display PYTHONPATH
+    python_path = os.environ.get("PYTHONPATH", "(None)")
+    logging.info("python path = " + python_path)
+
+    # seed setting
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    logging.info("set random seed = %d" % args.seed)
+
+    # validate rnn options: the character-level and word-level LM are exclusive
+    if not (args.rnnlm is None or args.word_rnnlm is None):
+        logging.error(
+            "It seems that both --rnnlm and --word-rnnlm are specified. "
+            "Please use either option."
+        )
+        sys.exit(1)
+
+    # recog
+    logging.info("backend = " + args.backend)
+    if args.num_spkrs == 1:
+        if args.backend == "chainer":
+            from espnet.asr.chainer_backend.asr import recog
+
+            recog(args)
+        elif args.backend == "pytorch":
+            single_encoder = args.num_encs == 1
+            if args.api == "v2":
+                # Experimental API that supports custom LMs (single encoder only)
+                if not single_encoder:
+                    raise NotImplementedError(
+                        f"--num-encs {args.num_encs} > 1 is not supported in --api v2"
+                    )
+                from espnet.asr.pytorch_backend.recog import recog_v2
+
+                recog_v2(args)
+            else:
+                from espnet.asr.pytorch_backend.asr import recog
+
+                # the dtype restriction is enforced for single-encoder models only
+                if single_encoder and args.dtype != "float32":
+                    raise NotImplementedError(
+                        f"`--dtype {args.dtype}` is only available with `--api v2`"
+                    )
+                recog(args)
+        else:
+            raise ValueError("Only chainer and pytorch are supported.")
+    elif args.num_spkrs == 2:
+        if args.backend != "pytorch":
+            raise ValueError("Only pytorch is supported.")
+        from espnet.asr.pytorch_backend.asr_mix import recog
+
+        recog(args)
+
+
+if __name__ == "__main__":  # run only when executed as a script
+    main(sys.argv[1:])  # forward the CLI arguments without the program name
--- a/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/asr_train.py
+++ b/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/asr_train.py
+#!/usr/bin/env python3
+# encoding: utf-8
+
+# Copyright 2017 Tomoki Hayashi (Nagoya University)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""Automatic speech recognition model training script."""
+
+import logging
+import os
+import random
+import subprocess
+import sys
+
+import configargparse
+import numpy as np
+
+from espnet import __version__
+from espnet.utils.cli_utils import strtobool
+from espnet.utils.training.batchfy import BATCH_COUNT_CHOICES
+
+
+# NOTE: you need this func to generate our sphinx doc
+def get_parser(parser=None, required=True):
+    """Get default arguments.
+
+    Args:
+        parser: Argument parser to extend. If ``None``, a new
+            ``configargparse.ArgumentParser`` that reads YAML config files
+            is created.
+        required (bool): Passed as ``required`` to ``--outdir`` and ``--dict``.
+
+    Returns:
+        The parser with the ASR training options added.
+
+    """
+    if parser is None:
+        parser = configargparse.ArgumentParser(
+            description="Train an automatic speech recognition (ASR) model on one CPU, "
+            "one or multiple GPUs",
+            config_file_parser_class=configargparse.YAMLConfigFileParser,
+            formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
+        )
+    # Encoder types offered for both mask estimators (--wtype and --btype).
+    mask_estimator_types = (
+        "lstm", "blstm", "lstmp", "blstmp",
+        "vgglstmp", "vggblstmp", "vgglstm", "vggblstm",
+        "gru", "bgru", "grup", "bgrup",
+        "vgggrup", "vggbgrup", "vgggru", "vggbgru",
+    )
+    # general configuration
+    parser.add("--config", is_config_file=True, help="config file path")
+    parser.add(
+        "--config2",
+        is_config_file=True,
+        help="second config file path that overwrites the settings in `--config`.",
+    )
+    parser.add(
+        "--config3",
+        is_config_file=True,
+        help="third config file path that overwrites the settings in "
+        "`--config` and `--config2`.",
+    )
+
+    parser.add_argument(
+        "--ngpu",
+        default=None,
+        type=int,
+        help="Number of GPUs. If not given, use all visible devices",
+    )
+    parser.add_argument(
+        "--use-ddp",
+        default=False,
+        action="store_true",
+        help="Enable process-based data parallel. "
+        "--ngpu's GPUs will be used. "
+        "If --ngpu is not given, this tries to identify "
+        "how many GPUs can be used. But, if it fails, "
+        "the application will abort. "
+        "And, currently, single node multi GPUs job is only supported.",
+    )
+    parser.add_argument(
+        "--train-dtype",
+        default="float32",
+        choices=["float16", "float32", "float64", "O0", "O1", "O2", "O3"],
+        help="Data type for training (only pytorch backend). "
+        "O0,O1,.. flags require apex. "
+        "See https://nvidia.github.io/apex/amp.html#opt-levels",
+    )
+    parser.add_argument(
+        "--backend",
+        default="chainer",
+        type=str,
+        choices=["chainer", "pytorch"],
+        help="Backend library",
+    )
+    parser.add_argument(
+        "--outdir", type=str, required=required, help="Output directory"
+    )
+    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
+    parser.add_argument("--dict", required=required, help="Dictionary")
+    parser.add_argument("--seed", default=1, type=int, help="Random seed")
+    parser.add_argument("--debugdir", type=str, help="Output directory for debugging")
+    parser.add_argument(
+        "--resume",
+        "-r",
+        default="",
+        nargs="?",
+        help="Resume the training from snapshot",
+    )
+    parser.add_argument(
+        "--minibatches",
+        "-N",
+        type=int,
+        # an int default, so the value has the declared type even before parsing
+        default=-1,
+        help="Process only N minibatches (for debug)",
+    )
+    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
+    parser.add_argument(
+        "--tensorboard-dir",
+        default=None,
+        type=str,
+        nargs="?",
+        help="Tensorboard log dir path",
+    )
+    parser.add_argument(
+        "--report-interval-iters",
+        default=100,
+        type=int,
+        help="Report interval iterations",
+    )
+    parser.add_argument(
+        "--save-interval-iters",
+        default=0,
+        type=int,
+        help="Save snapshot interval iterations",
+    )
+    # task related
+    parser.add_argument(
+        "--train-json",
+        type=str,
+        default=None,
+        help="Filename of train label data (json)",
+    )
+    parser.add_argument(
+        "--valid-json",
+        type=str,
+        default=None,
+        help="Filename of validation label data (json)",
+    )
+    # network architecture
+    parser.add_argument(
+        "--model-module",
+        type=str,
+        default=None,
+        help="model defined module (default: espnet.nets.xxx_backend.e2e_asr:E2E)",
+    )
+    # encoder
+    parser.add_argument(
+        "--num-encs", default=1, type=int, help="Number of encoders in the model."
+    )
+    # loss related
+    parser.add_argument(
+        "--ctc_type",
+        default="builtin",
+        type=str,
+        choices=["builtin", "gtnctc", "cudnnctc"],
+        help="Type of CTC implementation to calculate loss.",
+    )
+    parser.add_argument(
+        "--mtlalpha",
+        default=0.5,
+        type=float,
+        help="Multitask learning coefficient, "
+        "alpha: alpha*ctc_loss + (1-alpha)*att_loss ",
+    )
+    parser.add_argument(
+        "--lsm-weight", default=0.0, type=float, help="Label smoothing weight"
+    )
+    # recognition options to compute CER/WER
+    parser.add_argument(
+        "--report-cer",
+        default=False,
+        action="store_true",
+        help="Compute CER on development set",
+    )
+    parser.add_argument(
+        "--report-wer",
+        default=False,
+        action="store_true",
+        help="Compute WER on development set",
+    )
+    parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
+    parser.add_argument("--beam-size", type=int, default=4, help="Beam size")
+    parser.add_argument("--penalty", default=0.0, type=float, help="Incertion penalty")
+    parser.add_argument(
+        "--maxlenratio",
+        default=0.0,
+        type=float,
+        help="""Input length ratio to obtain max output length.
+                        If maxlenratio=0.0 (default), it uses a end-detect function
+                        to automatically find maximum hypothesis lengths""",
+    )
+    parser.add_argument(
+        "--minlenratio",
+        default=0.0,
+        type=float,
+        help="Input length ratio to obtain min output length",
+    )
+    parser.add_argument(
+        "--ctc-weight", default=0.3, type=float, help="CTC weight in joint decoding"
+    )
+    parser.add_argument(
+        "--rnnlm", type=str, default=None, help="RNNLM model file to read"
+    )
+    parser.add_argument(
+        "--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read"
+    )
+    parser.add_argument("--lm-weight", default=0.1, type=float, help="RNNLM weight.")
+    parser.add_argument("--sym-space", default="<space>", type=str, help="Space symbol")
+    parser.add_argument("--sym-blank", default="<blank>", type=str, help="Blank symbol")
+    # minibatch related
+    parser.add_argument(
+        "--sortagrad",
+        default=0,
+        type=int,
+        nargs="?",
+        help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
+    )
+    parser.add_argument(
+        "--batch-count",
+        default="auto",
+        choices=BATCH_COUNT_CHOICES,
+        help="How to count batch_size. "
+        "The default (auto) will find how to count by args.",
+    )
+    parser.add_argument(
+        "--batch-size",
+        "--batch-seqs",
+        "-b",
+        default=0,
+        type=int,
+        help="Maximum seqs in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--batch-bins",
+        default=0,
+        type=int,
+        help="Maximum bins in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--batch-frames-in",
+        default=0,
+        type=int,
+        help="Maximum input frames in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--batch-frames-out",
+        default=0,
+        type=int,
+        help="Maximum output frames in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--batch-frames-inout",
+        default=0,
+        type=int,
+        help="Maximum input+output frames in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--maxlen-in",
+        "--batch-seq-maxlen-in",
+        default=800,
+        type=int,
+        metavar="ML",
+        help="When --batch-count=seq, "
+        "batch size is reduced if the input sequence length > ML.",
+    )
+    parser.add_argument(
+        "--maxlen-out",
+        "--batch-seq-maxlen-out",
+        default=150,
+        type=int,
+        metavar="ML",
+        help="When --batch-count=seq, "
+        "batch size is reduced if the output sequence length > ML",
+    )
+    parser.add_argument(
+        "--n-iter-processes",
+        default=0,
+        type=int,
+        help="Number of processes of iterator",
+    )
+    parser.add_argument(
+        "--preprocess-conf",
+        type=str,
+        default=None,
+        nargs="?",
+        help="The configuration file for the pre-processing",
+    )
+    # optimization related
+    parser.add_argument(
+        "--opt",
+        default="adadelta",
+        type=str,
+        choices=["adadelta", "adam", "noam"],
+        help="Optimizer",
+    )
+    parser.add_argument(
+        "--accum-grad", default=1, type=int, help="Number of gradient accumuration"
+    )
+    parser.add_argument(
+        "--eps", default=1e-8, type=float, help="Epsilon constant for optimizer"
+    )
+    parser.add_argument(
+        "--eps-decay", default=0.01, type=float, help="Decaying ratio of epsilon"
+    )
+    parser.add_argument(
+        "--weight-decay", default=0.0, type=float, help="Weight decay ratio"
+    )
+    parser.add_argument(
+        "--criterion",
+        default="acc",
+        type=str,
+        choices=["loss", "loss_eps_decay_only", "acc"],
+        help="Criterion to perform epsilon decay",
+    )
+    parser.add_argument(
+        "--threshold", default=1e-4, type=float, help="Threshold to stop iteration"
+    )
+    parser.add_argument(
+        "--epochs", "-e", default=30, type=int, help="Maximum number of epochs"
+    )
+    parser.add_argument(
+        "--early-stop-criterion",
+        default="validation/main/acc",
+        type=str,
+        nargs="?",
+        help="Value to monitor to trigger an early stopping of the training",
+    )
+    parser.add_argument(
+        "--patience",
+        default=3,
+        type=int,
+        nargs="?",
+        help="Number of epochs to wait without improvement "
+        "before stopping the training",
+    )
+    parser.add_argument(
+        "--grad-clip", default=5, type=float, help="Gradient norm threshold to clip"
+    )
+    parser.add_argument(
+        "--num-save-attention",
+        default=3,
+        type=int,
+        help="Number of samples of attention to be saved",
+    )
+    parser.add_argument(
+        "--num-save-ctc",
+        default=3,
+        type=int,
+        help="Number of samples of CTC probability to be saved",
+    )
+    parser.add_argument(
+        "--grad-noise",
+        type=strtobool,
+        default=False,
+        help="The flag to switch to use noise injection to gradients during training",
+    )
+    # asr_mix related
+    parser.add_argument(
+        "--num-spkrs",
+        default=1,
+        type=int,
+        choices=[1, 2],
+        help="Number of speakers in the speech.",
+    )
+    # decoder related
+    parser.add_argument(
+        "--context-residual",
+        default=False,
+        type=strtobool,
+        nargs="?",
+        help="The flag to switch to use context vector residual in the decoder network",
+    )
+    # finetuning related
+    # NOTE: string defaults below are converted by argparse through ``type``,
+    # so the parsed values are lists of module names.
+    parser.add_argument(
+        "--enc-init",
+        default=None,
+        type=str,
+        help="Pre-trained ASR model to initialize encoder.",
+    )
+    parser.add_argument(
+        "--enc-init-mods",
+        default="enc.enc.",
+        type=_split_module_names,
+        help="List of encoder modules to initialize, separated by a comma.",
+    )
+    parser.add_argument(
+        "--dec-init",
+        default=None,
+        type=str,
+        help="Pre-trained ASR, MT or LM model to initialize decoder.",
+    )
+    parser.add_argument(
+        "--dec-init-mods",
+        default="att.,dec.",
+        type=_split_module_names,
+        help="List of decoder modules to initialize, separated by a comma.",
+    )
+    parser.add_argument(
+        "--freeze-mods",
+        default=None,
+        type=_split_module_names,
+        help="List of modules to freeze, separated by a comma.",
+    )
+    # front end related
+    parser.add_argument(
+        "--use-frontend",
+        type=strtobool,
+        default=False,
+        help="The flag to switch to use frontend system.",
+    )
+
+    # WPE related
+    parser.add_argument(
+        "--use-wpe",
+        type=strtobool,
+        default=False,
+        help="Apply Weighted Prediction Error",
+    )
+    parser.add_argument(
+        "--wtype",
+        default="blstmp",
+        type=str,
+        choices=list(mask_estimator_types),
+        help="Type of encoder network architecture "
+        "of the mask estimator for WPE. "
+        "",
+    )
+    parser.add_argument("--wlayers", type=int, default=2, help="")
+    parser.add_argument("--wunits", type=int, default=300, help="")
+    parser.add_argument("--wprojs", type=int, default=300, help="")
+    parser.add_argument("--wdropout-rate", type=float, default=0.0, help="")
+    parser.add_argument("--wpe-taps", type=int, default=5, help="")
+    parser.add_argument("--wpe-delay", type=int, default=3, help="")
+    parser.add_argument(
+        "--use-dnn-mask-for-wpe",
+        type=strtobool,
+        default=False,
+        help="Use DNN to estimate the power spectrogram. "
+        "This option is experimental.",
+    )
+    # Beamformer related
+    parser.add_argument("--use-beamformer", type=strtobool, default=True, help="")
+    parser.add_argument(
+        "--btype",
+        default="blstmp",
+        type=str,
+        choices=list(mask_estimator_types),
+        help="Type of encoder network architecture "
+        "of the mask estimator for Beamformer.",
+    )
+    parser.add_argument("--blayers", type=int, default=2, help="")
+    parser.add_argument("--bunits", type=int, default=300, help="")
+    parser.add_argument("--bprojs", type=int, default=300, help="")
+    parser.add_argument("--badim", type=int, default=320, help="")
+    parser.add_argument(
+        "--bnmask",
+        type=int,
+        default=2,
+        help="Number of beamforming masks, " "default is 2 for [speech, noise].",
+    )
+    parser.add_argument(
+        "--ref-channel",
+        type=int,
+        default=-1,
+        help="The reference channel used for beamformer. "
+        "By default, the channel is estimated by DNN.",
+    )
+    parser.add_argument("--bdropout-rate", type=float, default=0.0, help="")
+    # Feature transform: Normalization
+    parser.add_argument(
+        "--stats-file",
+        type=str,
+        default=None,
+        help="The stats file for the feature normalization",
+    )
+    parser.add_argument(
+        "--apply-uttmvn",
+        type=strtobool,
+        default=True,
+        help="Apply utterance level mean " "variance normalization.",
+    )
+    parser.add_argument("--uttmvn-norm-means", type=strtobool, default=True, help="")
+    parser.add_argument("--uttmvn-norm-vars", type=strtobool, default=False, help="")
+    # Feature transform: Fbank
+    parser.add_argument(
+        "--fbank-fs",
+        type=int,
+        default=16000,
+        help="The sample frequency used for " "the mel-fbank creation.",
+    )
+    parser.add_argument(
+        "--n-mels", type=int, default=80, help="The number of mel-frequency bins."
+    )
+    parser.add_argument("--fbank-fmin", type=float, default=0.0, help="")
+    parser.add_argument("--fbank-fmax", type=float, default=None, help="")
+    return parser
+
+
+def _split_module_names(spec):
+    """Split a comma-separated module list, dropping empty entries.
+
+    Empty entries (from a trailing or doubled comma) are removed so that no
+    empty module name reaches the caller.
+
+    Args:
+        spec (str): Value given on the command line, e.g. ``"att.,dec."``.
+
+    Returns:
+        list: The non-empty module names in their original order.
+
+    """
+    return [name for name in spec.split(",") if name != ""]
+
+
+def setup_logging(verbose):
+    """Configure the root logger from the verbosity level.
+
+    Args:
+        verbose (int): INFO level is used when this is positive; otherwise
+            WARN level is used and a warning about skipped messages is logged.
+
+    """
+    show_info = verbose > 0
+    logging.basicConfig(
+        level=logging.INFO if show_info else logging.WARN,
+        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+    )
+    if not show_info:
+        logging.warning("Skip DEBUG/INFO messages")
+
+
+def main(cmd_args):
+    """Run the main training function.
+
+    Args:
+        cmd_args (list): Command-line arguments without the program name.
+
+    Raises:
+        NotImplementedError: If the chainer backend is combined with a
+            ``--train-dtype`` other than float32.
+        ValueError: If a reduced-precision ``--train-dtype`` is used with
+            ``--ngpu 0``, DDP is requested without any GPU, or the backend /
+            speaker combination is not supported.
+
+    """
+    parser = get_parser()
+    # First pass: only the generic options are known; model options come later.
+    args, _ = parser.parse_known_args(cmd_args)
+    if args.backend == "chainer" and args.train_dtype != "float32":
+        raise NotImplementedError(
+            f"chainer backend does not support --train-dtype {args.train_dtype}. "
+            "Use --train-dtype float32."
+        )
+    if args.ngpu == 0 and args.train_dtype in ("O0", "O1", "O2", "O3", "float16"):
+        raise ValueError(
+            f"--train-dtype {args.train_dtype} does not support the CPU backend."
+        )
+
+    from espnet.utils.dynamic_import import dynamic_import
+
+    if args.model_module is None:
+        if args.num_spkrs == 1:
+            model_module = "espnet.nets." + args.backend + "_backend.e2e_asr:E2E"
+        else:
+            model_module = "espnet.nets." + args.backend + "_backend.e2e_asr_mix:E2E"
+    else:
+        model_module = args.model_module
+    model_class = dynamic_import(model_module)
+    model_class.add_arguments(parser)
+
+    # Second pass: the model-specific options are now registered.
+    args = parser.parse_args(cmd_args)
+    args.model_module = model_module
+    if "chainer_backend" in args.model_module:
+        args.backend = "chainer"
+    if "pytorch_backend" in args.model_module:
+        args.backend = "pytorch"
+
+    # add version info in args
+    args.version = __version__
+
+    # logging info
+    setup_logging(args.verbose)
+
+    # If --ngpu is not given,
+    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
+    #   2. if nvidia-smi exists, use all devices
+    #   3. else ngpu=0
+    if args.ngpu is None:
+        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
+        if cvd is not None:
+            # An empty variable lists no device, so ignore empty entries.
+            ngpu = len([dev for dev in cvd.split(",") if dev.strip() != ""])
+        else:
+            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
+            try:
+                p = subprocess.run(
+                    ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
+                )
+            except OSError:
+                # nvidia-smi is missing or cannot be executed
+                ngpu = 0
+            else:
+                # The device list is written to stdout; a failing command
+                # (non-zero exit status) is treated as "no GPU".
+                if p.returncode == 0:
+                    ngpu = _count_listed_gpus(p.stdout.decode(errors="replace"))
+                else:
+                    ngpu = 0
+        # Store the detected count so later users of ``args`` (e.g. the DDP
+        # launch below) do not receive None.
+        args.ngpu = ngpu
+    else:
+        if args.ngpu != 1:
+            logging.debug(
+                "There are some bugs with multi-GPU processing in PyTorch 1.2+"
+                + " (see https://github.com/pytorch/pytorch/issues/21108)"
+            )
+        ngpu = args.ngpu
+    if args.use_ddp and ngpu <= 0:
+        raise ValueError("DDP requires at least 1 GPU.")
+    logging.info(f"ngpu: {ngpu}")
+
+    # display PYTHONPATH
+    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
+
+    # set random seed
+    logging.info("random seed = %d" % args.seed)
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+
+    # load dictionary for debug log
+    if args.dict is not None:
+        with open(args.dict, "rb") as f:
+            dictionary = f.readlines()
+        # the token is the first space-separated field of each line
+        char_list = [entry.decode("utf-8").split(" ")[0] for entry in dictionary]
+        char_list.insert(0, "<blank>")
+        char_list.append("<eos>")
+        # for non-autoregressive maskctc model
+        if "maskctc" in args.model_module:
+            char_list.append("<mask>")
+        args.char_list = char_list
+    else:
+        args.char_list = None
+
+    # train
+    logging.info("backend = " + args.backend)
+
+    if args.use_ddp:
+        # When using DDP, only PyTorch is supported.
+        # Chainer is out-of-scope.
+        if args.num_spkrs == 1:
+            if args.backend == "chainer":
+                raise ValueError("Chainer with DDP is not supported.")
+            from espnet.distributed.pytorch_backend.launch import (
+                launch,
+                set_start_method,
+            )
+
+            # NOTE: it's necessary to set "spawn" as a multiprocessing
+            # start method. Because, in this use case, CUDA initialization
+            # procedure has been already done, but CUDA context can't be
+            # shared with processes.
+            # By default, multiprocessing tries to launch a process with
+            # "fork" method. But, it will make processes which share
+            # memory address spaces with a parent process.
+            # To ensure a separate memory space, "spawn" method is required.
+            set_start_method("spawn")
+            launch(_reinitialize_logging_and_call_train, args, args.ngpu)
+        else:
+            raise ValueError("Single speaker is only supported when using DDP.")
+    else:
+        if args.num_spkrs == 1:
+            if args.backend == "chainer":
+                from espnet.asr.chainer_backend.asr import train
+
+                train(args)
+            elif args.backend == "pytorch":
+                from espnet.asr.pytorch_backend.asr import train
+
+                train(args)
+            else:
+                raise ValueError("Only chainer and pytorch are supported.")
+        else:
+            # FIXME(kamo): Support --model-module
+            if args.backend == "pytorch":
+                from espnet.asr.pytorch_backend.asr_mix import train
+
+                train(args)
+            else:
+                raise ValueError("Only pytorch is supported.")
+
+
+def _count_listed_gpus(listing):
+    """Count the devices in the text printed by ``nvidia-smi -L``.
+
+    Only lines that begin with ``GPU `` are counted, so blank lines and
+    indented sub-device lines do not inflate the result.
+
+    Args:
+        listing (str): Decoded standard output of ``nvidia-smi -L``.
+
+    Returns:
+        int: Number of listed GPUs.
+
+    """
+    return sum(1 for line in listing.splitlines() if line.startswith("GPU "))
+
+
+def _reinitialize_logging_and_call_train(args):
+    # NOTE: it looks like logging setting is cleared
+    # by launching processes with "spawn" method.
+    # Within each worker process,
+    # logging configuraiton must be set again.
+    from espnet.asr.pytorch_backend.asr import train
+
+    setup_logging(args.verbose)
+    train(args)
+
+
+# Script entry point: forward the CLI arguments (without the program name).
+if __name__ == "__main__":
+    main(sys.argv[1:])
--- a/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/lm_train.py
+++ b/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/lm_train.py
+#!/usr/bin/env python3
+
+# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+# This code is ported from the following implementation written in Torch.
+# https://github.com/chainer/chainer/blob/master/examples/ptb/train_ptb_custom_loop.py
+
+"""Language model training script."""
+
+import logging
+import os
+import random
+import subprocess
+import sys
+
+import configargparse
+import numpy as np
+
+from espnet import __version__
+from espnet.nets.lm_interface import dynamic_import_lm
+from espnet.optimizer.factory import dynamic_import_optimizer
+from espnet.scheduler.scheduler import dynamic_import_scheduler
+
+
+# NOTE: you need this func to generate our sphinx doc
+def get_parser(parser=None, required=True):
+    """Get parser."""
+    if parser is None:
+        parser = configargparse.ArgumentParser(
+            description="Train a new language model on one CPU or one GPU",
+            config_file_parser_class=configargparse.YAMLConfigFileParser,
+            formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
+        )
+    # general configuration
+    parser.add("--config", is_config_file=True, help="config file path")
+    parser.add(
+        "--config2",
+        is_config_file=True,
+        help="second config file path that overwrites the settings in `--config`.",
+    )
+    parser.add(
+        "--config3",
+        is_config_file=True,
+        help="third config file path that overwrites the settings "
+        "in `--config` and `--config2`.",
+    )
+
+    parser.add_argument(
+        "--ngpu",
+        default=None,
+        type=int,
+        help="Number of GPUs. If not given, use all visible devices",
+    )
+    parser.add_argument(
+        "--train-dtype",
+        default="float32",
+        choices=["float16", "float32", "float64", "O0", "O1", "O2", "O3"],
+        help="Data type for training (only pytorch backend). "
+        "O0,O1,.. flags require apex. "
+        "See https://nvidia.github.io/apex/amp.html#opt-levels",
+    )
+    parser.add_argument(
+        "--backend",
+        default="chainer",
+        type=str,
+        choices=["chainer", "pytorch"],
+        help="Backend library",
+    )
+    parser.add_argument(
+        "--outdir", type=str, required=required, help="Output directory"
+    )
+    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
+    parser.add_argument("--dict", type=str, required=required, help="Dictionary")
+    parser.add_argument("--seed", default=1, type=int, help="Random seed")
+    parser.add_argument(
+        "--resume",
+        "-r",
+        default="",
+        nargs="?",
+        help="Resume the training from snapshot",
+    )
+    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
+    parser.add_argument(
+        "--tensorboard-dir",
+        default=None,
+        type=str,
+        nargs="?",
+        help="Tensorboard log dir path",
+    )
+    parser.add_argument(
+        "--report-interval-iters",
+        default=100,
+        type=int,
+        help="Report interval iterations",
+    )
+    # task related
+    parser.add_argument(
+        "--train-label",
+        type=str,
+        required=required,
+        help="Filename of train label data",
+    )
+    parser.add_argument(
+        "--valid-label",
+        type=str,
+        required=required,
+        help="Filename of validation label data",
+    )
+    parser.add_argument("--test-label", type=str, help="Filename of test label data")
+    parser.add_argument(
+        "--dump-hdf5-path",
+        type=str,
+        default=None,
+        help="Path to dump a preprocessed dataset as hdf5",
+    )
+    # training configuration
+    parser.add_argument("--opt", default="sgd", type=str, help="Optimizer")
+    parser.add_argument(
+        "--sortagrad",
+        default=0,
+        type=int,
+        nargs="?",
+        help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
+    )
+    parser.add_argument(
+        "--batchsize",
+        "-b",
+        type=int,
+        default=300,
+        help="Number of examples in each mini-batch",
+    )
+    parser.add_argument(
+        "--accum-grad", type=int, default=1, help="Number of gradient accumueration"
+    )
+    parser.add_argument(
+        "--epoch",
+        "-e",
+        type=int,
+        default=20,
+        help="Number of sweeps over the dataset to train",
+    )
+    parser.add_argument(
+        "--early-stop-criterion",
+        default="validation/main/loss",
+        type=str,
+        nargs="?",
+        help="Value to monitor to trigger an early stopping of the training",
+    )
+    parser.add_argument(
+        "--patience",
+        default=3,
+        type=int,
+        nargs="?",
+        help="Number of epochs "
+        "to wait without improvement before stopping the training",
+    )
+    parser.add_argument(
+        "--schedulers",
+        default=None,
+        action="append",
+        type=lambda kv: kv.split("="),
+        help="optimizer schedulers, you can configure params like:"
+        " <optimizer-param>-<scheduler-name>-<schduler-param>"
+        ' e.g., "--schedulers lr=noam --lr-noam-warmup 1000".',
+    )
+    parser.add_argument(
+        "--gradclip",
+        "-c",
+        type=float,
+        default=5,
+        help="Gradient norm threshold to clip",
+    )
+    parser.add_argument(
+        "--maxlen",
+        type=int,
+        default=40,
+        help="Batch size is reduced if the input sequence > ML",
+    )
+    parser.add_argument(
+        "--model-module",
+        type=str,
+        default="default",
+        help="model defined module "
+        "(default: espnet.nets.xxx_backend.lm.default:DefaultRNNLM)",
+    )
+    return parser
+
+
+def main(cmd_args):
+    """Train LM."""
+    parser = get_parser()
+    args, _ = parser.parse_known_args(cmd_args)
+    if args.backend == "chainer" and args.train_dtype != "float32":
+        raise NotImplementedError(
+            f"chainer backend does not support --train-dtype {args.train_dtype}."
+            "Use --dtype float32."
+        )
+    if args.ngpu == 0 and args.train_dtype in ("O0", "O1", "O2", "O3", "float16"):
+        raise ValueError(
+            f"--train-dtype {args.train_dtype} does not support the CPU backend."
+        )
+
+    # parse arguments dynamically
+    model_class = dynamic_import_lm(args.model_module, args.backend)
+    model_class.add_arguments(parser)
+    if args.schedulers is not None:
+        for k, v in args.schedulers:
+            scheduler_class = dynamic_import_scheduler(v)
+            scheduler_class.add_arguments(k, parser)
+
+    opt_class = dynamic_import_optimizer(args.opt, args.backend)
+    opt_class.add_arguments(parser)
+
+    args = parser.parse_args(cmd_args)
+
+    # add version info in args
+    args.version = __version__
+
+    # logging info
+    if args.verbose > 0:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    else:
+        logging.basicConfig(
+            level=logging.WARN,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+        logging.warning("Skip DEBUG/INFO messages")
+
+    # If --ngpu is not given,
+    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
+    #   2. if nvidia-smi exists, use all devices
+    #   3. else ngpu=0
+    if args.ngpu is None:
+        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
+        if cvd is not None:
+            ngpu = len(cvd.split(","))
+        else:
+            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
+            try:
+                p = subprocess.run(
+                    ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
+                )
+            except (subprocess.CalledProcessError, FileNotFoundError):
+                ngpu = 0
+            else:
+                ngpu = len(p.stderr.decode().split("\n")) - 1
+        args.ngpu = ngpu
+    else:
+        ngpu = args.ngpu
+    logging.info(f"ngpu: {ngpu}")
+
+    # display PYTHONPATH
+    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
+
+    # seed setting
+    nseed = args.seed
+    random.seed(nseed)
+    np.random.seed(nseed)
+
+    # load dictionary
+    with open(args.dict, "rb") as f:
+        dictionary = f.readlines()
+    char_list = [entry.decode("utf-8").split(" ")[0] for entry in dictionary]
+    char_list.insert(0, "<blank>")
+    char_list.append("<eos>")
+    args.char_list_dict = {x: i for i, x in enumerate(char_list)}
+    args.n_vocab = len(char_list)
+
+    # train
+    logging.info("backend = " + args.backend)
+    if args.backend == "chainer":
+        from espnet.lm.chainer_backend.lm import train
+
+        train(args)
+    elif args.backend == "pytorch":
+        from espnet.lm.pytorch_backend.lm import train
+
+        train(args)
+    else:
+        raise ValueError("Only chainer and pytorch are supported.")
+
+
+# Script entry point: forward the CLI arguments (without the program name).
+if __name__ == "__main__":
+    main(sys.argv[1:])
--- a/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/mt_train.py
+++ b/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/mt_train.py
+#!/usr/bin/env python3
+# encoding: utf-8
+
+# Copyright 2019 Kyoto University (Hirofumi Inaguma)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""Neural machine translation model training script."""
+
+import logging
+import os
+import random
+import subprocess
+import sys
+
+import configargparse
+import numpy as np
+
+from espnet import __version__
+from espnet.utils.cli_utils import strtobool
+from espnet.utils.training.batchfy import BATCH_COUNT_CHOICES
+
+
+# NOTE: you need this func to generate our sphinx doc
+def get_parser(parser=None, required=True):
+    """Get default arguments."""
+    if parser is None:
+        parser = configargparse.ArgumentParser(
+            description="Train a neural machine translation (NMT) model on one CPU, "
+            "one or multiple GPUs",
+            config_file_parser_class=configargparse.YAMLConfigFileParser,
+            formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
+        )
+    # general configuration
+    parser.add("--config", is_config_file=True, help="config file path")
+    parser.add(
+        "--config2",
+        is_config_file=True,
+        help="second config file path that overwrites the settings in `--config`.",
+    )
+    parser.add(
+        "--config3",
+        is_config_file=True,
+        help="third config file path that overwrites the settings "
+        "in `--config` and `--config2`.",
+    )
+
+    parser.add_argument(
+        "--ngpu",
+        default=None,
+        type=int,
+        help="Number of GPUs. If not given, use all visible devices",
+    )
+    parser.add_argument(
+        "--train-dtype",
+        default="float32",
+        choices=["float16", "float32", "float64", "O0", "O1", "O2", "O3"],
+        help="Data type for training (only pytorch backend). "
+        "O0,O1,.. flags require apex. "
+        "See https://nvidia.github.io/apex/amp.html#opt-levels",
+    )
+    parser.add_argument(
+        "--backend",
+        default="chainer",
+        type=str,
+        choices=["chainer", "pytorch"],
+        help="Backend library",
+    )
+    parser.add_argument(
+        "--outdir", type=str, required=required, help="Output directory"
+    )
+    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
+    parser.add_argument(
+        "--dict", required=required, help="Dictionary for source/target languages"
+    )
+    parser.add_argument("--seed", default=1, type=int, help="Random seed")
+    parser.add_argument("--debugdir", type=str, help="Output directory for debugging")
+    parser.add_argument(
+        "--resume",
+        "-r",
+        default="",
+        nargs="?",
+        help="Resume the training from snapshot",
+    )
+    parser.add_argument(
+        "--minibatches",
+        "-N",
+        type=int,
+        default="-1",
+        help="Process only N minibatches (for debug)",
+    )
+    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
+    parser.add_argument(
+        "--tensorboard-dir",
+        default=None,
+        type=str,
+        nargs="?",
+        help="Tensorboard log dir path",
+    )
+    parser.add_argument(
+        "--report-interval-iters",
+        default=100,
+        type=int,
+        help="Report interval iterations",
+    )
+    parser.add_argument(
+        "--save-interval-iters",
+        default=0,
+        type=int,
+        help="Save snapshot interval iterations",
+    )
+    # task related
+    parser.add_argument(
+        "--train-json",
+        type=str,
+        default=None,
+        help="Filename of train label data (json)",
+    )
+    parser.add_argument(
+        "--valid-json",
+        type=str,
+        default=None,
+        help="Filename of validation label data (json)",
+    )
+    # network architecture
+    parser.add_argument(
+        "--model-module",
+        type=str,
+        default=None,
+        help="model defined module (default: espnet.nets.xxx_backend.e2e_mt:E2E)",
+    )
+    # loss related
+    parser.add_argument(
+        "--lsm-weight", default=0.0, type=float, help="Label smoothing weight"
+    )
+    # translations options to compute BLEU
+    parser.add_argument(
+        "--report-bleu",
+        default=True,
+        action="store_true",
+        help="Compute BLEU on development set",
+    )
+    parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
+    parser.add_argument("--beam-size", type=int, default=4, help="Beam size")
+    parser.add_argument("--penalty", default=0.0, type=float, help="Incertion penalty")
+    parser.add_argument(
+        "--maxlenratio",
+        default=0.0,
+        type=float,
+        help="""Input length ratio to obtain max output length.
+                        If maxlenratio=0.0 (default), it uses a end-detect function
+                        to automatically find maximum hypothesis lengths""",
+    )
+    parser.add_argument(
+        "--minlenratio",
+        default=0.0,
+        type=float,
+        help="Input length ratio to obtain min output length",
+    )
+    parser.add_argument(
+        "--rnnlm", type=str, default=None, help="RNNLM model file to read"
+    )
+    parser.add_argument(
+        "--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read"
+    )
+    parser.add_argument("--lm-weight", default=0.0, type=float, help="RNNLM weight.")
+    parser.add_argument("--sym-space", default="<space>", type=str, help="Space symbol")
+    parser.add_argument("--sym-blank", default="<blank>", type=str, help="Blank symbol")
+    # minibatch related
+    parser.add_argument(
+        "--sortagrad",
+        default=0,
+        type=int,
+        nargs="?",
+        help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
+    )
+    parser.add_argument(
+        "--batch-count",
+        default="auto",
+        choices=BATCH_COUNT_CHOICES,
+        help="How to count batch_size. "
+        "The default (auto) will find how to count by args.",
+    )
+    parser.add_argument(
+        "--batch-size",
+        "--batch-seqs",
+        "-b",
+        default=0,
+        type=int,
+        help="Maximum seqs in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--batch-bins",
+        default=0,
+        type=int,
+        help="Maximum bins in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--batch-frames-in",
+        default=0,
+        type=int,
+        help="Maximum input frames in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--batch-frames-out",
+        default=0,
+        type=int,
+        help="Maximum output frames in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--batch-frames-inout",
+        default=0,
+        type=int,
+        help="Maximum input+output frames in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--maxlen-in",
+        "--batch-seq-maxlen-in",
+        default=100,
+        type=int,
+        metavar="ML",
+        help="When --batch-count=seq, "
+        "batch size is reduced if the input sequence length > ML.",
+    )
+    parser.add_argument(
+        "--maxlen-out",
+        "--batch-seq-maxlen-out",
+        default=100,
+        type=int,
+        metavar="ML",
+        help="When --batch-count=seq, "
+        "batch size is reduced if the output sequence length > ML",
+    )
+    parser.add_argument(
+        "--n-iter-processes",
+        default=0,
+        type=int,
+        help="Number of processes of iterator",
+    )
+    # optimization related
+    parser.add_argument(
+        "--opt",
+        default="adadelta",
+        type=str,
+        choices=["adadelta", "adam", "noam"],
+        help="Optimizer",
+    )
+    parser.add_argument(
+        "--accum-grad", default=1, type=int, help="Number of gradient accumuration"
+    )
+    parser.add_argument(
+        "--eps", default=1e-8, type=float, help="Epsilon constant for optimizer"
+    )
+    parser.add_argument(
+        "--eps-decay", default=0.01, type=float, help="Decaying ratio of epsilon"
+    )
+    parser.add_argument(
+        "--lr", default=1e-3, type=float, help="Learning rate for optimizer"
+    )
+    parser.add_argument(
+        "--lr-decay", default=1.0, type=float, help="Decaying ratio of learning rate"
+    )
+    parser.add_argument(
+        "--weight-decay", default=0.0, type=float, help="Weight decay ratio"
+    )
+    parser.add_argument(
+        "--criterion",
+        default="acc",
+        type=str,
+        choices=["loss", "acc"],
+        help="Criterion to perform epsilon decay",
+    )
+    parser.add_argument(
+        "--threshold", default=1e-4, type=float, help="Threshold to stop iteration"
+    )
+    parser.add_argument(
+        "--epochs", "-e", default=30, type=int, help="Maximum number of epochs"
+    )
+    parser.add_argument(
+        "--early-stop-criterion",
+        default="validation/main/acc",
+        type=str,
+        nargs="?",
+        help="Value to monitor to trigger an early stopping of the training",
+    )
+    parser.add_argument(
+        "--patience",
+        default=3,
+        type=int,
+        nargs="?",
+        help="Number of epochs to wait "
+        "without improvement before stopping the training",
+    )
+    parser.add_argument(
+        "--grad-clip", default=5, type=float, help="Gradient norm threshold to clip"
+    )
+    parser.add_argument(
+        "--num-save-attention",
+        default=3,
+        type=int,
+        help="Number of samples of attention to be saved",
+    )
+    # decoder related
+    parser.add_argument(
+        "--context-residual",
+        default=False,
+        type=strtobool,
+        nargs="?",
+        help="The flag to switch to use context vector residual in the decoder network",
+    )
+    parser.add_argument(
+        "--tie-src-tgt-embedding",
+        default=False,
+        type=strtobool,
+        nargs="?",
+        help="Tie parameters of source embedding and target embedding.",
+    )
+    parser.add_argument(
+        "--tie-classifier",
+        default=False,
+        type=strtobool,
+        nargs="?",
+        help="Tie parameters of target embedding and output projection layer.",
+    )
+    # finetuning related
+    parser.add_argument(
+        "--enc-init",
+        default=None,
+        type=str,
+        nargs="?",
+        help="Pre-trained ASR model to initialize encoder.",
+    )
+    parser.add_argument(
+        "--enc-init-mods",
+        default="enc.enc.",
+        type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
+        help="List of encoder modules to initialize, separated by a comma.",
+    )
+    parser.add_argument(
+        "--dec-init",
+        default=None,
+        type=str,
+        nargs="?",
+        help="Pre-trained ASR, MT or LM model to initialize decoder.",
+    )
+    parser.add_argument(
+        "--dec-init-mods",
+        default="att., dec.",
+        type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
+        help="List of decoder modules to initialize, separated by a comma.",
+    )
+    # multilingual related
+    parser.add_argument(
+        "--multilingual",
+        default=False,
+        type=strtobool,
+        help="Prepend target language ID to the source sentence. "
+        "Both source/target language IDs must be prepend in the pre-processing stage.",
+    )
+    parser.add_argument(
+        "--replace-sos",
+        default=False,
+        type=strtobool,
+        help="Replace <sos> in the decoder with a target language ID "
+        "(the first token in the target sequence)",
+    )
+
+    return parser
+
+
+def main(cmd_args):
+    """Run the main training function."""
+    parser = get_parser()
+    args, _ = parser.parse_known_args(cmd_args)
+    if args.backend == "chainer" and args.train_dtype != "float32":
+        raise NotImplementedError(
+            f"chainer backend does not support --train-dtype {args.train_dtype}."
+            "Use --dtype float32."
+        )
+    if args.ngpu == 0 and args.train_dtype in ("O0", "O1", "O2", "O3", "float16"):
+        raise ValueError(
+            f"--train-dtype {args.train_dtype} does not support the CPU backend."
+        )
+
+    from espnet.utils.dynamic_import import dynamic_import
+
+    if args.model_module is None:
+        model_module = "espnet.nets." + args.backend + "_backend.e2e_mt:E2E"
+    else:
+        model_module = args.model_module
+    model_class = dynamic_import(model_module)
+    model_class.add_arguments(parser)
+
+    args = parser.parse_args(cmd_args)
+    args.model_module = model_module
+    if "chainer_backend" in args.model_module:
+        args.backend = "chainer"
+    if "pytorch_backend" in args.model_module:
+        args.backend = "pytorch"
+
+    # add version info in args
+    args.version = __version__
+
+    # logging info
+    if args.verbose > 0:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    else:
+        logging.basicConfig(
+            level=logging.WARN,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+        logging.warning("Skip DEBUG/INFO messages")
+
+    # If --ngpu is not given,
+    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
+    #   2. if nvidia-smi exists, use all devices
+    #   3. else ngpu=0
+    if args.ngpu is None:
+        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
+        if cvd is not None:
+            ngpu = len(cvd.split(","))
+        else:
+            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
+            try:
+                p = subprocess.run(
+                    ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
+                )
+            except (subprocess.CalledProcessError, FileNotFoundError):
+                ngpu = 0
+            else:
+                ngpu = len(p.stderr.decode().split("\n")) - 1
+        args.ngpu = ngpu
+    else:
+        if args.ngpu != 1:
+            logging.debug(
+                "There are some bugs with multi-GPU processing in PyTorch 1.2+"
+                + " (see https://github.com/pytorch/pytorch/issues/21108)"
+            )
+        ngpu = args.ngpu
+    logging.info(f"ngpu: {ngpu}")
+
+    # display PYTHONPATH
+    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
+
+    # set random seed
+    logging.info("random seed = %d" % args.seed)
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+
+    # load dictionary for debug log
+    if args.dict is not None:
+        with open(args.dict, "rb") as f:
+            dictionary = f.readlines()
+        char_list = [entry.decode("utf-8").split(" ")[0] for entry in dictionary]
+        char_list.insert(0, "<blank>")
+        char_list.append("<eos>")
+        args.char_list = char_list
+    else:
+        args.char_list = None
+
+    # train
+    logging.info("backend = " + args.backend)
+
+    if args.backend == "pytorch":
+        from espnet.mt.pytorch_backend.mt import train
+
+        train(args)
+    else:
+        raise ValueError("Only pytorch are supported.")
+
+
+# Script entry point: forward the CLI arguments (without the program name).
+if __name__ == "__main__":
+    main(sys.argv[1:])
--- a/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/mt_trans.py
+++ b/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/mt_trans.py
+#!/usr/bin/env python3
+# encoding: utf-8
+
+# Copyright 2019 Kyoto University (Hirofumi Inaguma)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""Neural machine translation model decoding script."""
+
+import logging
+import os
+import random
+import sys
+
+import configargparse
+import numpy as np
+
+
+# NOTE: you need this func to generate our sphinx doc
+def get_parser():
+    """Get default arguments."""
+    parser = configargparse.ArgumentParser(
+        description="Translate text from speech "
+        "using a speech translation model on one CPU or GPU",
+        config_file_parser_class=configargparse.YAMLConfigFileParser,
+        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
+    )
+    # general configuration
+    parser.add("--config", is_config_file=True, help="Config file path")
+    parser.add(
+        "--config2",
+        is_config_file=True,
+        help="Second config file path that overwrites the settings in `--config`",
+    )
+    parser.add(
+        "--config3",
+        is_config_file=True,
+        help="Third config file path "
+        "that overwrites the settings in `--config` and `--config2`",
+    )
+
+    parser.add_argument("--ngpu", type=int, default=0, help="Number of GPUs")
+    parser.add_argument(
+        "--dtype",
+        choices=("float16", "float32", "float64"),
+        default="float32",
+        help="Float precision (only available in --api v2)",
+    )
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="chainer",
+        choices=["chainer", "pytorch"],
+        help="Backend library",
+    )
+    parser.add_argument("--debugmode", type=int, default=1, help="Debugmode")
+    parser.add_argument("--seed", type=int, default=1, help="Random seed")
+    parser.add_argument("--verbose", "-V", type=int, default=1, help="Verbose option")
+    parser.add_argument(
+        "--batchsize",
+        type=int,
+        default=1,
+        help="Batch size for beam search (0: means no batch processing)",
+    )
+    parser.add_argument(
+        "--preprocess-conf",
+        type=str,
+        default=None,
+        help="The configuration file for the pre-processing",
+    )
+    parser.add_argument(
+        "--api",
+        default="v1",
+        choices=["v1", "v2"],
+        help="Beam search APIs "
+        "v1: Default API. It only supports "
+        "the ASRInterface.recognize method and DefaultRNNLM. "
+        "v2: Experimental API. "
+        "It supports any models that implements ScorerInterface.",
+    )
+    # task related
+    parser.add_argument(
+        "--trans-json", type=str, help="Filename of translation data (json)"
+    )
+    parser.add_argument(
+        "--result-label",
+        type=str,
+        required=True,
+        help="Filename of result label data (json)",
+    )
+    # model (parameter) related
+    parser.add_argument(
+        "--model", type=str, required=True, help="Model file parameters to read"
+    )
+    parser.add_argument(
+        "--model-conf", type=str, default=None, help="Model config file"
+    )
+    # search related
+    parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
+    parser.add_argument("--beam-size", type=int, default=1, help="Beam size")
+    parser.add_argument("--penalty", type=float, default=0.1, help="Incertion penalty")
+    parser.add_argument(
+        "--maxlenratio",
+        type=float,
+        default=3.0,
+        help="""Input length ratio to obtain max output length.
+                        If maxlenratio=0.0 (default), it uses a end-detect function
+                        to automatically find maximum hypothesis lengths""",
+    )
+    parser.add_argument(
+        "--minlenratio",
+        type=float,
+        default=0.0,
+        help="Input length ratio to obtain min output length",
+    )
+    # multilingual related
+    parser.add_argument(
+        "--tgt-lang",
+        default=False,
+        type=str,
+        help="target language ID (e.g., <en>, <de>, and <fr> etc.)",
+    )
+    return parser
+
+
+def main(args):
+    """Run the main decoding function."""
+    parser = get_parser()
+    args = parser.parse_args(args)
+
+    # logging info
+    if args.verbose == 1:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    elif args.verbose == 2:
+        logging.basicConfig(
+            level=logging.DEBUG,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    else:
+        logging.basicConfig(
+            level=logging.WARN,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+        logging.warning("Skip DEBUG/INFO messages")
+
+    # check CUDA_VISIBLE_DEVICES
+    if args.ngpu > 0:
+        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
+        if cvd is None:
+            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
+        elif args.ngpu != len(cvd.split(",")):
+            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
+            sys.exit(1)
+
+        # TODO(mn5k): support of multiple GPUs
+        if args.ngpu > 1:
+            logging.error("The program only supports ngpu=1.")
+            sys.exit(1)
+
+    # display PYTHONPATH
+    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
+
+    # seed setting
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    logging.info("set random seed = %d" % args.seed)
+
+    # trans
+    logging.info("backend = " + args.backend)
+    if args.backend == "pytorch":
+        # Experimental API that supports custom LMs
+        from espnet.mt.pytorch_backend.mt import trans
+
+        if args.dtype != "float32":
+            raise NotImplementedError(
+                f"`--dtype {args.dtype}` is only available with `--api v2`"
+            )
+        trans(args)
+    else:
+        raise ValueError("Only pytorch are supported.")
+
+
+# Script entry point: forward the CLI arguments (without the program name).
+if __name__ == "__main__":
+    main(sys.argv[1:])
--- a/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/st_train.py
+++ b/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/st_train.py
+#!/usr/bin/env python3
+# encoding: utf-8
+
+# Copyright 2019 Kyoto University (Hirofumi Inaguma)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""End-to-end speech translation model training script."""
+
+import logging
+import os
+import random
+import subprocess
+import sys
+
+import configargparse
+import numpy as np
+
+from espnet import __version__
+from espnet.utils.cli_utils import strtobool
+from espnet.utils.training.batchfy import BATCH_COUNT_CHOICES
+
+
+# NOTE: you need this func to generate our sphinx doc
+def get_parser(parser=None, required=True):
+    """Get default arguments."""
+    if parser is None:
+        parser = configargparse.ArgumentParser(
+            description="Train a speech translation (ST) model on one CPU, "
+            "one or multiple GPUs",
+            config_file_parser_class=configargparse.YAMLConfigFileParser,
+            formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
+        )
+    # general configuration
+    parser.add("--config", is_config_file=True, help="config file path")
+    parser.add(
+        "--config2",
+        is_config_file=True,
+        help="second config file path that overwrites the settings in `--config`.",
+    )
+    parser.add(
+        "--config3",
+        is_config_file=True,
+        help="third config file path that overwrites the settings "
+        "in `--config` and `--config2`.",
+    )
+
+    parser.add_argument(
+        "--ngpu",
+        default=None,
+        type=int,
+        help="Number of GPUs. If not given, use all visible devices",
+    )
+    parser.add_argument(
+        "--train-dtype",
+        default="float32",
+        choices=["float16", "float32", "float64", "O0", "O1", "O2", "O3"],
+        help="Data type for training (only pytorch backend). "
+        "O0,O1,.. flags require apex. "
+        "See https://nvidia.github.io/apex/amp.html#opt-levels",
+    )
+    parser.add_argument(
+        "--backend",
+        default="chainer",
+        type=str,
+        choices=["chainer", "pytorch"],
+        help="Backend library",
+    )
+    parser.add_argument(
+        "--outdir", type=str, required=required, help="Output directory"
+    )
+    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
+    parser.add_argument("--dict", required=required, help="Dictionary")
+    parser.add_argument("--seed", default=1, type=int, help="Random seed")
+    parser.add_argument("--debugdir", type=str, help="Output directory for debugging")
+    parser.add_argument(
+        "--resume",
+        "-r",
+        default="",
+        nargs="?",
+        help="Resume the training from snapshot",
+    )
+    parser.add_argument(
+        "--minibatches",
+        "-N",
+        type=int,
+        default="-1",
+        help="Process only N minibatches (for debug)",
+    )
+    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
+    parser.add_argument(
+        "--tensorboard-dir",
+        default=None,
+        type=str,
+        nargs="?",
+        help="Tensorboard log dir path",
+    )
+    parser.add_argument(
+        "--report-interval-iters",
+        default=100,
+        type=int,
+        help="Report interval iterations",
+    )
+    parser.add_argument(
+        "--save-interval-iters",
+        default=0,
+        type=int,
+        help="Save snapshot interval iterations",
+    )
+    # task related
+    parser.add_argument(
+        "--train-json",
+        type=str,
+        default=None,
+        help="Filename of train label data (json)",
+    )
+    parser.add_argument(
+        "--valid-json",
+        type=str,
+        default=None,
+        help="Filename of validation label data (json)",
+    )
+    # network architecture
+    parser.add_argument(
+        "--model-module",
+        type=str,
+        default=None,
+        help="model defined module (default: espnet.nets.xxx_backend.e2e_st:E2E)",
+    )
+    # loss related
+    parser.add_argument(
+        "--ctc_type",
+        default="builtin",
+        type=str,
+        choices=["builtin", "gtnctc", "cudnnctc"],
+        help="Type of CTC implementation to calculate loss.",
+    )
+    parser.add_argument(
+        "--mtlalpha",
+        default=0.0,
+        type=float,
+        help="Multitask learning coefficient, alpha: \
+                                alpha*ctc_loss + (1-alpha)*att_loss",
+    )
+    parser.add_argument(
+        "--asr-weight",
+        default=0.0,
+        type=float,
+        help="Multitask learning coefficient for ASR task, weight: "
+        " asr_weight*(alpha*ctc_loss + (1-alpha)*att_loss)"
+        " + (1-asr_weight-mt_weight)*st_loss",
+    )
+    parser.add_argument(
+        "--mt-weight",
+        default=0.0,
+        type=float,
+        help="Multitask learning coefficient for MT task, weight: \
+                                mt_weight*mt_loss + (1-mt_weight-asr_weight)*st_loss",
+    )
+    parser.add_argument(
+        "--lsm-weight", default=0.0, type=float, help="Label smoothing weight"
+    )
+    # recognition options to compute CER/WER
+    parser.add_argument(
+        "--report-cer",
+        default=False,
+        action="store_true",
+        help="Compute CER on development set",
+    )
+    parser.add_argument(
+        "--report-wer",
+        default=False,
+        action="store_true",
+        help="Compute WER on development set",
+    )
+    # translations options to compute BLEU
+    parser.add_argument(
+        "--report-bleu",
+        default=True,
+        action="store_true",
+        help="Compute BLEU on development set",
+    )
+    parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
+    parser.add_argument("--beam-size", type=int, default=4, help="Beam size")
+    parser.add_argument("--penalty", default=0.0, type=float, help="Incertion penalty")
+    parser.add_argument(
+        "--maxlenratio",
+        default=0.0,
+        type=float,
+        help="""Input length ratio to obtain max output length.
+                        If maxlenratio=0.0 (default), it uses a end-detect function
+                        to automatically find maximum hypothesis lengths""",
+    )
+    parser.add_argument(
+        "--minlenratio",
+        default=0.0,
+        type=float,
+        help="Input length ratio to obtain min output length",
+    )
+    parser.add_argument(
+        "--rnnlm", type=str, default=None, help="RNNLM model file to read"
+    )
+    parser.add_argument(
+        "--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read"
+    )
+    parser.add_argument("--lm-weight", default=0.0, type=float, help="RNNLM weight.")
+    parser.add_argument("--sym-space", default="<space>", type=str, help="Space symbol")
+    parser.add_argument("--sym-blank", default="<blank>", type=str, help="Blank symbol")
+    # minibatch related
+    parser.add_argument(
+        "--sortagrad",
+        default=0,
+        type=int,
+        nargs="?",
+        help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
+    )
+    parser.add_argument(
+        "--batch-count",
+        default="auto",
+        choices=BATCH_COUNT_CHOICES,
+        help="How to count batch_size. "
+        "The default (auto) will find how to count by args.",
+    )
+    parser.add_argument(
+        "--batch-size",
+        "--batch-seqs",
+        "-b",
+        default=0,
+        type=int,
+        help="Maximum seqs in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--batch-bins",
+        default=0,
+        type=int,
+        help="Maximum bins in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--batch-frames-in",
+        default=0,
+        type=int,
+        help="Maximum input frames in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--batch-frames-out",
+        default=0,
+        type=int,
+        help="Maximum output frames in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--batch-frames-inout",
+        default=0,
+        type=int,
+        help="Maximum input+output frames in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--maxlen-in",
+        "--batch-seq-maxlen-in",
+        default=800,
+        type=int,
+        metavar="ML",
+        help="When --batch-count=seq, batch size is reduced "
+        "if the input sequence length > ML.",
+    )
+    parser.add_argument(
+        "--maxlen-out",
+        "--batch-seq-maxlen-out",
+        default=150,
+        type=int,
+        metavar="ML",
+        help="When --batch-count=seq, "
+        "batch size is reduced if the output sequence length > ML",
+    )
+    parser.add_argument(
+        "--n-iter-processes",
+        default=0,
+        type=int,
+        help="Number of processes of iterator",
+    )
+    parser.add_argument(
+        "--preprocess-conf",
+        type=str,
+        default=None,
+        nargs="?",
+        help="The configuration file for the pre-processing",
+    )
+    # optimization related
+    parser.add_argument(
+        "--opt",
+        default="adadelta",
+        type=str,
+        choices=["adadelta", "adam", "noam"],
+        help="Optimizer",
+    )
+    parser.add_argument(
+        "--accum-grad", default=1, type=int, help="Number of gradient accumuration"
+    )
+    parser.add_argument(
+        "--eps", default=1e-8, type=float, help="Epsilon constant for optimizer"
+    )
+    parser.add_argument(
+        "--eps-decay", default=0.01, type=float, help="Decaying ratio of epsilon"
+    )
+    parser.add_argument(
+        "--lr", default=1e-3, type=float, help="Learning rate for optimizer"
+    )
+    parser.add_argument(
+        "--lr-decay", default=1.0, type=float, help="Decaying ratio of learning rate"
+    )
+    parser.add_argument(
+        "--weight-decay", default=0.0, type=float, help="Weight decay ratio"
+    )
+    parser.add_argument(
+        "--criterion",
+        default="acc",
+        type=str,
+        choices=["loss", "acc"],
+        help="Criterion to perform epsilon decay",
+    )
+    parser.add_argument(
+        "--threshold", default=1e-4, type=float, help="Threshold to stop iteration"
+    )
+    parser.add_argument(
+        "--epochs", "-e", default=30, type=int, help="Maximum number of epochs"
+    )
+    parser.add_argument(
+        "--early-stop-criterion",
+        default="validation/main/acc",
+        type=str,
+        nargs="?",
+        help="Value to monitor to trigger an early stopping of the training",
+    )
+    parser.add_argument(
+        "--patience",
+        default=3,
+        type=int,
+        nargs="?",
+        help="Number of epochs to wait "
+        "without improvement before stopping the training",
+    )
+    parser.add_argument(
+        "--grad-clip", default=5, type=float, help="Gradient norm threshold to clip"
+    )
+    parser.add_argument(
+        "--num-save-attention",
+        default=3,
+        type=int,
+        help="Number of samples of attention to be saved",
+    )
+    parser.add_argument(
+        "--num-save-ctc",
+        default=3,
+        type=int,
+        help="Number of samples of CTC probability to be saved",
+    )
+    parser.add_argument(
+        "--grad-noise",
+        type=strtobool,
+        default=False,
+        help="The flag to switch to use noise injection to gradients during training",
+    )
+    # speech translation related
+    parser.add_argument(
+        "--context-residual",
+        default=False,
+        type=strtobool,
+        nargs="?",
+        help="The flag to switch to use context vector residual in the decoder network",
+    )
+    # finetuning related
+    parser.add_argument(
+        "--enc-init",
+        default=None,
+        type=str,
+        nargs="?",
+        help="Pre-trained ASR model to initialize encoder.",
+    )
+    parser.add_argument(
+        "--enc-init-mods",
+        default="enc.enc.",
+        type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
+        help="List of encoder modules to initialize, separated by a comma.",
+    )
+    parser.add_argument(
+        "--dec-init",
+        default=None,
+        type=str,
+        nargs="?",
+        help="Pre-trained ASR, MT or LM model to initialize decoder.",
+    )
+    parser.add_argument(
+        "--dec-init-mods",
+        default="att., dec.",
+        type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
+        help="List of decoder modules to initialize, separated by a comma.",
+    )
+    # multilingual related
+    parser.add_argument(
+        "--multilingual",
+        default=False,
+        type=strtobool,
+        help="Prepend target language ID to the source sentence. "
+        " Both source/target language IDs must be prepend in the pre-processing stage.",
+    )
+    parser.add_argument(
+        "--replace-sos",
+        default=False,
+        type=strtobool,
+        help="Replace <sos> in the decoder with a target language ID \
+                              (the first token in the target sequence)",
+    )
+    # Feature transform: Normalization
+    parser.add_argument(
+        "--stats-file",
+        type=str,
+        default=None,
+        help="The stats file for the feature normalization",
+    )
+    parser.add_argument(
+        "--apply-uttmvn",
+        type=strtobool,
+        default=True,
+        help="Apply utterance level mean " "variance normalization.",
+    )
+    parser.add_argument("--uttmvn-norm-means", type=strtobool, default=True, help="")
+    parser.add_argument("--uttmvn-norm-vars", type=strtobool, default=False, help="")
+    # Feature transform: Fbank
+    parser.add_argument(
+        "--fbank-fs",
+        type=int,
+        default=16000,
+        help="The sample frequency used for " "the mel-fbank creation.",
+    )
+    parser.add_argument(
+        "--n-mels", type=int, default=80, help="The number of mel-frequency bins."
+    )
+    parser.add_argument("--fbank-fmin", type=float, default=0.0, help="")
+    parser.add_argument("--fbank-fmax", type=float, default=None, help="")
+    return parser
+
+
+def main(cmd_args):
+    """Run the main training function."""
+    parser = get_parser()
+    args, _ = parser.parse_known_args(cmd_args)
+    if args.backend == "chainer" and args.train_dtype != "float32":
+        raise NotImplementedError(
+            f"chainer backend does not support --train-dtype {args.train_dtype}."
+            "Use --dtype float32."
+        )
+    if args.ngpu == 0 and args.train_dtype in ("O0", "O1", "O2", "O3", "float16"):
+        raise ValueError(
+            f"--train-dtype {args.train_dtype} does not support the CPU backend."
+        )
+
+    from espnet.utils.dynamic_import import dynamic_import
+
+    if args.model_module is None:
+        model_module = "espnet.nets." + args.backend + "_backend.e2e_st:E2E"
+    else:
+        model_module = args.model_module
+    model_class = dynamic_import(model_module)
+    model_class.add_arguments(parser)
+
+    args = parser.parse_args(cmd_args)
+    args.model_module = model_module
+    if "chainer_backend" in args.model_module:
+        args.backend = "chainer"
+    if "pytorch_backend" in args.model_module:
+        args.backend = "pytorch"
+
+    # add version info in args
+    args.version = __version__
+
+    # logging info
+    if args.verbose > 0:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    else:
+        logging.basicConfig(
+            level=logging.WARN,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+        logging.warning("Skip DEBUG/INFO messages")
+
+    # If --ngpu is not given,
+    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
+    #   2. if nvidia-smi exists, use all devices
+    #   3. else ngpu=0
+    if args.ngpu is None:
+        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
+        if cvd is not None:
+            ngpu = len(cvd.split(","))
+        else:
+            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
+            try:
+                p = subprocess.run(
+                    ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
+                )
+            except (subprocess.CalledProcessError, FileNotFoundError):
+                ngpu = 0
+            else:
+                ngpu = len(p.stderr.decode().split("\n")) - 1
+        args.ngpu = ngpu
+    else:
+        if args.ngpu != 1:
+            logging.debug(
+                "There are some bugs with multi-GPU processing in PyTorch 1.2+"
+                + " (see https://github.com/pytorch/pytorch/issues/21108)"
+            )
+        ngpu = args.ngpu
+    logging.info(f"ngpu: {ngpu}")
+
+    # display PYTHONPATH
+    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
+
+    # set random seed
+    logging.info("random seed = %d" % args.seed)
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+
+    # load dictionary for debug log
+    if args.dict is not None:
+        with open(args.dict, "rb") as f:
+            dictionary = f.readlines()
+        char_list = [entry.decode("utf-8").split(" ")[0] for entry in dictionary]
+        char_list.insert(0, "<blank>")
+        char_list.append("<eos>")
+        args.char_list = char_list
+    else:
+        args.char_list = None
+
+    # train
+    logging.info("backend = " + args.backend)
+
+    if args.backend == "pytorch":
+        from espnet.st.pytorch_backend.st import train
+
+        train(args)
+    else:
+        raise ValueError("Only pytorch are supported.")
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
--- a/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/st_trans.py
+++ b/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/st_trans.py
+#!/usr/bin/env python3
+# encoding: utf-8
+
+# Copyright 2019 Kyoto University (Hirofumi Inaguma)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""End-to-end speech translation model decoding script."""
+
+import logging
+import os
+import random
+import sys
+
+import configargparse
+import numpy as np
+
+
+# NOTE: you need this func to generate our sphinx doc
+def get_parser():
+    """Get default arguments."""
+    parser = configargparse.ArgumentParser(
+        description="Translate text from speech using a speech translation "
+        "model on one CPU or GPU",
+        config_file_parser_class=configargparse.YAMLConfigFileParser,
+        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
+    )
+    # general configuration
+    parser.add("--config", is_config_file=True, help="Config file path")
+    parser.add(
+        "--config2",
+        is_config_file=True,
+        help="Second config file path that overwrites the settings in `--config`",
+    )
+    parser.add(
+        "--config3",
+        is_config_file=True,
+        help="Third config file path that overwrites "
+        "the settings in `--config` and `--config2`",
+    )
+
+    parser.add_argument("--ngpu", type=int, default=0, help="Number of GPUs")
+    parser.add_argument(
+        "--dtype",
+        choices=("float16", "float32", "float64"),
+        default="float32",
+        help="Float precision (only available in --api v2)",
+    )
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="chainer",
+        choices=["chainer", "pytorch"],
+        help="Backend library",
+    )
+    parser.add_argument("--debugmode", type=int, default=1, help="Debugmode")
+    parser.add_argument("--seed", type=int, default=1, help="Random seed")
+    parser.add_argument("--verbose", "-V", type=int, default=1, help="Verbose option")
+    parser.add_argument(
+        "--batchsize",
+        type=int,
+        default=1,
+        help="Batch size for beam search (0: means no batch processing)",
+    )
+    parser.add_argument(
+        "--preprocess-conf",
+        type=str,
+        default=None,
+        help="The configuration file for the pre-processing",
+    )
+    parser.add_argument(
+        "--api",
+        default="v1",
+        choices=["v1", "v2"],
+        help="Beam search APIs "
+        "v1: Default API. "
+        "It only supports the ASRInterface.recognize method and DefaultRNNLM. "
+        "v2: Experimental API. "
+        "It supports any models that implements ScorerInterface.",
+    )
+    # task related
+    parser.add_argument(
+        "--trans-json", type=str, help="Filename of translation data (json)"
+    )
+    parser.add_argument(
+        "--result-label",
+        type=str,
+        required=True,
+        help="Filename of result label data (json)",
+    )
+    # model (parameter) related
+    parser.add_argument(
+        "--model", type=str, required=True, help="Model file parameters to read"
+    )
+    # search related
+    parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
+    parser.add_argument("--beam-size", type=int, default=1, help="Beam size")
+    parser.add_argument("--penalty", type=float, default=0.0, help="Incertion penalty")
+    parser.add_argument(
+        "--maxlenratio",
+        type=float,
+        default=0.0,
+        help="""Input length ratio to obtain max output length.
+                        If maxlenratio=0.0 (default), it uses a end-detect function
+                        to automatically find maximum hypothesis lengths""",
+    )
+    parser.add_argument(
+        "--minlenratio",
+        type=float,
+        default=0.0,
+        help="Input length ratio to obtain min output length",
+    )
+    # multilingual related
+    parser.add_argument(
+        "--tgt-lang",
+        default=False,
+        type=str,
+        help="target language ID (e.g., <en>, <de>, and <fr> etc.)",
+    )
+    return parser
+
+
+def main(args):
+    """Run the main decoding function."""
+    parser = get_parser()
+    args = parser.parse_args(args)
+
+    # logging info
+    if args.verbose == 1:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    elif args.verbose == 2:
+        logging.basicConfig(
+            level=logging.DEBUG,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    else:
+        logging.basicConfig(
+            level=logging.WARN,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+        logging.warning("Skip DEBUG/INFO messages")
+
+    # check CUDA_VISIBLE_DEVICES
+    if args.ngpu > 0:
+        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
+        if cvd is None:
+            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
+        elif args.ngpu != len(cvd.split(",")):
+            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
+            sys.exit(1)
+
+        # TODO(mn5k): support of multiple GPUs
+        if args.ngpu > 1:
+            logging.error("The program only supports ngpu=1.")
+            sys.exit(1)
+
+    # display PYTHONPATH
+    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
+
+    # seed setting
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    logging.info("set random seed = %d" % args.seed)
+
+    # trans
+    logging.info("backend = " + args.backend)
+    if args.backend == "pytorch":
+        # Experimental API that supports custom LMs
+        from espnet.st.pytorch_backend.st import trans
+
+        if args.dtype != "float32":
+            raise NotImplementedError(
+                f"`--dtype {args.dtype}` is only available with `--api v2`"
+            )
+        trans(args)
+    else:
+        raise ValueError("Only pytorch are supported.")
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
--- a/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/tts_decode.py
+++ b/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/tts_decode.py
+#!/usr/bin/env python3
+
+# Copyright 2018 Nagoya University (Tomoki Hayashi)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""TTS decoding script."""
+
+import logging
+import os
+import subprocess
+import sys
+
+import configargparse
+
+from espnet.utils.cli_utils import strtobool
+
+
+# NOTE: you need this func to generate our sphinx doc
+def get_parser():
+    """Get parser of decoding arguments."""
+    parser = configargparse.ArgumentParser(
+        description="Synthesize speech from text using a TTS model on one CPU",
+        config_file_parser_class=configargparse.YAMLConfigFileParser,
+        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
+    )
+    # general configuration
+    parser.add("--config", is_config_file=True, help="config file path")
+    parser.add(
+        "--config2",
+        is_config_file=True,
+        help="second config file path that overwrites the settings in `--config`.",
+    )
+    parser.add(
+        "--config3",
+        is_config_file=True,
+        help="third config file path that overwrites "
+        "the settings in `--config` and `--config2`.",
+    )
+
+    parser.add_argument("--ngpu", default=0, type=int, help="Number of GPUs")
+    parser.add_argument(
+        "--backend",
+        default="pytorch",
+        type=str,
+        choices=["chainer", "pytorch"],
+        help="Backend library",
+    )
+    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
+    parser.add_argument("--seed", default=1, type=int, help="Random seed")
+    parser.add_argument("--out", type=str, required=True, help="Output filename")
+    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
+    parser.add_argument(
+        "--preprocess-conf",
+        type=str,
+        default=None,
+        help="The configuration file for the pre-processing",
+    )
+    # task related
+    parser.add_argument(
+        "--json", type=str, required=True, help="Filename of train label data (json)"
+    )
+    parser.add_argument(
+        "--model", type=str, required=True, help="Model file parameters to read"
+    )
+    parser.add_argument(
+        "--model-conf", type=str, default=None, help="Model config file"
+    )
+    # decoding related
+    parser.add_argument(
+        "--maxlenratio", type=float, default=5, help="Maximum length ratio in decoding"
+    )
+    parser.add_argument(
+        "--minlenratio", type=float, default=0, help="Minimum length ratio in decoding"
+    )
+    parser.add_argument(
+        "--threshold", type=float, default=0.5, help="Threshold value in decoding"
+    )
+    parser.add_argument(
+        "--use-att-constraint",
+        type=strtobool,
+        default=False,
+        help="Whether to use the attention constraint",
+    )
+    parser.add_argument(
+        "--backward-window",
+        type=int,
+        default=1,
+        help="Backward window size in the attention constraint",
+    )
+    parser.add_argument(
+        "--forward-window",
+        type=int,
+        default=3,
+        help="Forward window size in the attention constraint",
+    )
+    parser.add_argument(
+        "--fastspeech-alpha",
+        type=float,
+        default=1.0,
+        help="Alpha to change the speed for FastSpeech",
+    )
+    # save related
+    parser.add_argument(
+        "--save-durations",
+        default=False,
+        type=strtobool,
+        help="Whether to save durations converted from attentions",
+    )
+    parser.add_argument(
+        "--save-focus-rates",
+        default=False,
+        type=strtobool,
+        help="Whether to save focus rates of attentions",
+    )
+    return parser
+
+
+def main(args):
+    """Run deocding."""
+    parser = get_parser()
+    args = parser.parse_args(args)
+
+    # logging info
+    if args.verbose > 0:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    else:
+        logging.basicConfig(
+            level=logging.WARN,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+        logging.warning("Skip DEBUG/INFO messages")
+
+    # check CUDA_VISIBLE_DEVICES
+    if args.ngpu > 0:
+        if "clsp.jhu.edu" in subprocess.check_output(["hostname", "-f"]).decode():
+            cvd = (
+                subprocess.check_output(
+                    ["/usr/local/bin/free-gpu", "-n", str(args.ngpu)]
+                )
+                .decode()
+                .strip()
+            )
+            logging.info("CLSP: use gpu" + cvd)
+            os.environ["CUDA_VISIBLE_DEVICES"] = cvd
+
+        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
+        if cvd is None:
+            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
+        elif args.ngpu != len(cvd.split(",")):
+            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
+            sys.exit(1)
+
+    # display PYTHONPATH
+    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
+
+    # extract
+    logging.info("backend = " + args.backend)
+    if args.backend == "pytorch":
+        from espnet.tts.pytorch_backend.tts import decode
+
+        decode(args)
+    else:
+        raise NotImplementedError("Only pytorch is supported.")
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
--- a/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/tts_train.py
+++ b/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/tts_train.py
+#!/usr/bin/env python3
+
+# Copyright 2018 Nagoya University (Tomoki Hayashi)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""Text-to-speech model training script."""
+
+import logging
+import os
+import random
+import subprocess
+import sys
+
+import configargparse
+import numpy as np
+
+from espnet import __version__
+from espnet.nets.tts_interface import TTSInterface
+from espnet.utils.cli_utils import strtobool
+from espnet.utils.training.batchfy import BATCH_COUNT_CHOICES
+
+
+# NOTE: you need this func to generate our sphinx doc
+def get_parser():
+    """Get parser of training arguments."""
+    parser = configargparse.ArgumentParser(
+        description="Train a new text-to-speech (TTS) model on one CPU, "
+        "one or multiple GPUs",
+        config_file_parser_class=configargparse.YAMLConfigFileParser,
+        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
+    )
+
+    # general configuration
+    parser.add("--config", is_config_file=True, help="config file path")
+    parser.add(
+        "--config2",
+        is_config_file=True,
+        help="second config file path that overwrites the settings in `--config`.",
+    )
+    parser.add(
+        "--config3",
+        is_config_file=True,
+        help="third config file path that overwrites "
+        "the settings in `--config` and `--config2`.",
+    )
+
+    parser.add_argument(
+        "--ngpu",
+        default=None,
+        type=int,
+        help="Number of GPUs. If not given, use all visible devices",
+    )
+    parser.add_argument(
+        "--backend",
+        default="pytorch",
+        type=str,
+        choices=["chainer", "pytorch"],
+        help="Backend library",
+    )
+    parser.add_argument("--outdir", type=str, required=True, help="Output directory")
+    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
+    parser.add_argument("--seed", default=1, type=int, help="Random seed")
+    parser.add_argument(
+        "--resume",
+        "-r",
+        default="",
+        type=str,
+        nargs="?",
+        help="Resume the training from snapshot",
+    )
+    parser.add_argument(
+        "--minibatches",
+        "-N",
+        type=int,
+        default="-1",
+        help="Process only N minibatches (for debug)",
+    )
+    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
+    parser.add_argument(
+        "--tensorboard-dir",
+        default=None,
+        type=str,
+        nargs="?",
+        help="Tensorboard log directory path",
+    )
+    parser.add_argument(
+        "--eval-interval-epochs", default=1, type=int, help="Evaluation interval epochs"
+    )
+    parser.add_argument(
+        "--save-interval-epochs", default=1, type=int, help="Save interval epochs"
+    )
+    parser.add_argument(
+        "--report-interval-iters",
+        default=100,
+        type=int,
+        help="Report interval iterations",
+    )
+    # task related
+    parser.add_argument(
+        "--train-json", type=str, required=True, help="Filename of training json"
+    )
+    parser.add_argument(
+        "--valid-json", type=str, required=True, help="Filename of validation json"
+    )
+    # network architecture
+    parser.add_argument(
+        "--model-module",
+        type=str,
+        default="espnet.nets.pytorch_backend.e2e_tts_tacotron2:Tacotron2",
+        help="model defined module",
+    )
+    # minibatch related
+    parser.add_argument(
+        "--sortagrad",
+        default=0,
+        type=int,
+        nargs="?",
+        help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
+    )
+    parser.add_argument(
+        "--batch-sort-key",
+        default="shuffle",
+        type=str,
+        choices=["shuffle", "output", "input"],
+        nargs="?",
+        help='Batch sorting key. "shuffle" only work with --batch-count "seq".',
+    )
+    parser.add_argument(
+        "--batch-count",
+        default="auto",
+        choices=BATCH_COUNT_CHOICES,
+        help="How to count batch_size. "
+        "The default (auto) will find how to count by args.",
+    )
+    parser.add_argument(
+        "--batch-size",
+        "--batch-seqs",
+        "-b",
+        default=0,
+        type=int,
+        help="Maximum seqs in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--batch-bins",
+        default=0,
+        type=int,
+        help="Maximum bins in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--batch-frames-in",
+        default=0,
+        type=int,
+        help="Maximum input frames in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--batch-frames-out",
+        default=0,
+        type=int,
+        help="Maximum output frames in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--batch-frames-inout",
+        default=0,
+        type=int,
+        help="Maximum input+output frames in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--maxlen-in",
+        "--batch-seq-maxlen-in",
+        default=100,
+        type=int,
+        metavar="ML",
+        help="When --batch-count=seq, "
+        "batch size is reduced if the input sequence length > ML.",
+    )
+    parser.add_argument(
+        "--maxlen-out",
+        "--batch-seq-maxlen-out",
+        default=200,
+        type=int,
+        metavar="ML",
+        help="When --batch-count=seq, "
+        "batch size is reduced if the output sequence length > ML",
+    )
+    parser.add_argument(
+        "--num-iter-processes",
+        default=0,
+        type=int,
+        help="Number of processes of iterator",
+    )
+    parser.add_argument(
+        "--preprocess-conf",
+        type=str,
+        default=None,
+        help="The configuration file for the pre-processing",
+    )
+    parser.add_argument(
+        "--use-speaker-embedding",
+        default=False,
+        type=strtobool,
+        help="Whether to use speaker embedding",
+    )
+    parser.add_argument(
+        "--use-second-target",
+        default=False,
+        type=strtobool,
+        help="Whether to use second target",
+    )
+    # optimization related
+    parser.add_argument(
+        "--opt", default="adam", type=str, choices=["adam", "noam"], help="Optimizer"
+    )
+    parser.add_argument(
+        "--accum-grad", default=1, type=int, help="Number of gradient accumuration"
+    )
+    parser.add_argument(
+        "--lr", default=1e-3, type=float, help="Learning rate for optimizer"
+    )
+    parser.add_argument("--eps", default=1e-6, type=float, help="Epsilon for optimizer")
+    parser.add_argument(
+        "--weight-decay",
+        default=1e-6,
+        type=float,
+        help="Weight decay coefficient for optimizer",
+    )
+    parser.add_argument(
+        "--epochs", "-e", default=30, type=int, help="Number of maximum epochs"
+    )
+    parser.add_argument(
+        "--early-stop-criterion",
+        default="validation/main/loss",
+        type=str,
+        nargs="?",
+        help="Value to monitor to trigger an early stopping of the training",
+    )
+    parser.add_argument(
+        "--patience",
+        default=3,
+        type=int,
+        nargs="?",
+        help="Number of epochs to wait "
+        "without improvement before stopping the training",
+    )
+    parser.add_argument(
+        "--grad-clip", default=1, type=float, help="Gradient norm threshold to clip"
+    )
+    parser.add_argument(
+        "--num-save-attention",
+        default=5,
+        type=int,
+        help="Number of samples of attention to be saved",
+    )
+    parser.add_argument(
+        "--keep-all-data-on-mem",
+        default=False,
+        type=strtobool,
+        help="Whether to keep all data on memory",
+    )
+    # finetuning related
+    parser.add_argument(
+        "--enc-init",
+        default=None,
+        type=str,
+        help="Pre-trained TTS model path to initialize encoder.",
+    )
+    parser.add_argument(
+        "--enc-init-mods",
+        default="enc.",
+        type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
+        help="List of encoder modules to initialize, separated by a comma.",
+    )
+    parser.add_argument(
+        "--dec-init",
+        default=None,
+        type=str,
+        help="Pre-trained TTS model path to initialize decoder.",
+    )
+    parser.add_argument(
+        "--dec-init-mods",
+        default="dec.",
+        type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
+        help="List of decoder modules to initialize, separated by a comma.",
+    )
+    parser.add_argument(
+        "--freeze-mods",
+        default=None,
+        type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
+        help="List of modules to freeze (not to train), separated by a comma.",
+    )
+
+    return parser
+
+
+def main(cmd_args):
+    """Run training."""
+    parser = get_parser()
+    args, _ = parser.parse_known_args(cmd_args)
+
+    from espnet.utils.dynamic_import import dynamic_import
+
+    model_class = dynamic_import(args.model_module)
+    assert issubclass(model_class, TTSInterface)
+    model_class.add_arguments(parser)
+    args = parser.parse_args(cmd_args)
+
+    # add version info in args
+    args.version = __version__
+
+    # logging info
+    if args.verbose > 0:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    else:
+        logging.basicConfig(
+            level=logging.WARN,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+        logging.warning("Skip DEBUG/INFO messages")
+
+    # If --ngpu is not given,
+    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
+    #   2. if nvidia-smi exists, use all devices
+    #   3. else ngpu=0
+    if args.ngpu is None:
+        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
+        if cvd is not None:
+            ngpu = len(cvd.split(","))
+        else:
+            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
+            try:
+                p = subprocess.run(
+                    ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
+                )
+            except (subprocess.CalledProcessError, FileNotFoundError):
+                ngpu = 0
+            else:
+                ngpu = len(p.stderr.decode().split("\n")) - 1
+        args.ngpu = ngpu
+    else:
+        ngpu = args.ngpu
+    logging.info(f"ngpu: {ngpu}")
+
+    # set random seed
+    logging.info("random seed = %d" % args.seed)
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+
+    if args.backend == "pytorch":
+        from espnet.tts.pytorch_backend.tts import train
+
+        train(args)
+    else:
+        raise NotImplementedError("Only pytorch is supported.")
+
+
+# Script entry point: pass the CLI arguments (without the program name) to main().
+if __name__ == "__main__":
+    main(sys.argv[1:])
--- a/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/vc_decode.py
+++ b/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/vc_decode.py
+#!/usr/bin/env python3
+
+# Copyright 2020 Nagoya University (Wen-Chin Huang)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""VC decoding script."""
+
+import logging
+import os
+import subprocess
+import sys
+
+import configargparse
+
+from espnet.utils.cli_utils import strtobool
+
+
+# NOTE: you need this func to generate our sphinx doc
+def get_parser():
+    """Get parser of decoding arguments."""
+    parser = configargparse.ArgumentParser(
+        description="Converting speech using a VC model on one CPU",
+        config_file_parser_class=configargparse.YAMLConfigFileParser,
+        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
+    )
+    # general configuration
+    parser.add("--config", is_config_file=True, help="config file path")
+    parser.add(
+        "--config2",
+        is_config_file=True,
+        help="second config file path that overwrites the settings in `--config`.",
+    )
+    parser.add(
+        "--config3",
+        is_config_file=True,
+        help="third config file path that overwrites the settings "
+        "in `--config` and `--config2`.",
+    )
+
+    parser.add_argument("--ngpu", default=0, type=int, help="Number of GPUs")
+    parser.add_argument(
+        "--backend",
+        default="pytorch",
+        type=str,
+        choices=["chainer", "pytorch"],
+        help="Backend library",
+    )
+    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
+    parser.add_argument("--seed", default=1, type=int, help="Random seed")
+    parser.add_argument("--out", type=str, required=True, help="Output filename")
+    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
+    parser.add_argument(
+        "--preprocess-conf",
+        type=str,
+        default=None,
+        help="The configuration file for the pre-processing",
+    )
+    # task related
+    parser.add_argument(
+        "--json", type=str, required=True, help="Filename of train label data (json)"
+    )
+    parser.add_argument(
+        "--model", type=str, required=True, help="Model file parameters to read"
+    )
+    parser.add_argument(
+        "--model-conf", type=str, default=None, help="Model config file"
+    )
+    # decoding related
+    parser.add_argument(
+        "--maxlenratio", type=float, default=5, help="Maximum length ratio in decoding"
+    )
+    parser.add_argument(
+        "--minlenratio", type=float, default=0, help="Minimum length ratio in decoding"
+    )
+    parser.add_argument(
+        "--threshold", type=float, default=0.5, help="Threshold value in decoding"
+    )
+    parser.add_argument(
+        "--use-att-constraint",
+        type=strtobool,
+        default=False,
+        help="Whether to use the attention constraint",
+    )
+    parser.add_argument(
+        "--backward-window",
+        type=int,
+        default=1,
+        help="Backward window size in the attention constraint",
+    )
+    parser.add_argument(
+        "--forward-window",
+        type=int,
+        default=3,
+        help="Forward window size in the attention constraint",
+    )
+    # save related
+    parser.add_argument(
+        "--save-durations",
+        default=False,
+        type=strtobool,
+        help="Whether to save durations converted from attentions",
+    )
+    parser.add_argument(
+        "--save-focus-rates",
+        default=False,
+        type=strtobool,
+        help="Whether to save focus rates of attentions",
+    )
+    return parser
+
+
+def main(args):
+    """Run deocding."""
+    parser = get_parser()
+    args = parser.parse_args(args)
+
+    # logging info
+    if args.verbose > 0:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    else:
+        logging.basicConfig(
+            level=logging.WARN,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+        logging.warning("Skip DEBUG/INFO messages")
+
+    # check CUDA_VISIBLE_DEVICES
+    if args.ngpu > 0:
+        if "clsp.jhu.edu" in subprocess.check_output(["hostname", "-f"]).decode():
+            cvd = (
+                subprocess.check_output(
+                    ["/usr/local/bin/free-gpu", "-n", str(args.ngpu)]
+                )
+                .decode()
+                .strip()
+            )
+            logging.info("CLSP: use gpu" + cvd)
+            os.environ["CUDA_VISIBLE_DEVICES"] = cvd
+
+        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
+        if cvd is None:
+            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
+        elif args.ngpu != len(cvd.split(",")):
+            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
+            sys.exit(1)
+
+    # display PYTHONPATH
+    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
+
+    # extract
+    logging.info("backend = " + args.backend)
+    if args.backend == "pytorch":
+        from espnet.vc.pytorch_backend.vc import decode
+
+        decode(args)
+    else:
+        raise NotImplementedError("Only pytorch is supported.")
+
+
+# Script entry point: pass the CLI arguments (without the program name) to main().
+if __name__ == "__main__":
+    main(sys.argv[1:])
--- a/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/vc_train.py
+++ b/conformer/espnet-v.202304_20240621/build/lib/espnet/bin/vc_train.py
+#!/usr/bin/env python3
+
+# Copyright 2020 Nagoya University (Wen-Chin Huang)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""Voice conversion model training script."""
+
+import logging
+import os
+import random
+import subprocess
+import sys
+
+import configargparse
+import numpy as np
+
+from espnet import __version__
+from espnet.nets.tts_interface import TTSInterface
+from espnet.utils.cli_utils import strtobool
+from espnet.utils.training.batchfy import BATCH_COUNT_CHOICES
+
+
+# NOTE: you need this func to generate our sphinx doc
+def get_parser():
+    """Get parser of training arguments."""
+    parser = configargparse.ArgumentParser(
+        description="Train a new voice conversion (VC) model on one CPU, "
+        "one or multiple GPUs",
+        config_file_parser_class=configargparse.YAMLConfigFileParser,
+        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
+    )
+
+    # general configuration
+    parser.add("--config", is_config_file=True, help="config file path")
+    parser.add(
+        "--config2",
+        is_config_file=True,
+        help="second config file path that overwrites the settings in `--config`.",
+    )
+    parser.add(
+        "--config3",
+        is_config_file=True,
+        help="third config file path that overwrites the settings "
+        "in `--config` and `--config2`.",
+    )
+
+    parser.add_argument(
+        "--ngpu",
+        default=None,
+        type=int,
+        help="Number of GPUs. If not given, use all visible devices",
+    )
+    parser.add_argument(
+        "--backend",
+        default="pytorch",
+        type=str,
+        choices=["chainer", "pytorch"],
+        help="Backend library",
+    )
+    parser.add_argument("--outdir", type=str, required=True, help="Output directory")
+    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
+    parser.add_argument("--seed", default=1, type=int, help="Random seed")
+    parser.add_argument(
+        "--resume",
+        "-r",
+        default="",
+        type=str,
+        nargs="?",
+        help="Resume the training from snapshot",
+    )
+    parser.add_argument(
+        "--minibatches",
+        "-N",
+        type=int,
+        default="-1",
+        help="Process only N minibatches (for debug)",
+    )
+    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
+    parser.add_argument(
+        "--tensorboard-dir",
+        default=None,
+        type=str,
+        nargs="?",
+        help="Tensorboard log directory path",
+    )
+    parser.add_argument(
+        "--eval-interval-epochs",
+        default=100,
+        type=int,
+        help="Evaluation interval epochs",
+    )
+    parser.add_argument(
+        "--save-interval-epochs", default=1, type=int, help="Save interval epochs"
+    )
+    parser.add_argument(
+        "--report-interval-iters",
+        default=10,
+        type=int,
+        help="Report interval iterations",
+    )
+    # task related
+    parser.add_argument("--srcspk", type=str, help="Source speaker")
+    parser.add_argument("--trgspk", type=str, help="Target speaker")
+    parser.add_argument(
+        "--train-json", type=str, required=True, help="Filename of training json"
+    )
+    parser.add_argument(
+        "--valid-json", type=str, required=True, help="Filename of validation json"
+    )
+
+    # network architecture
+    parser.add_argument(
+        "--model-module",
+        type=str,
+        default="espnet.nets.pytorch_backend.e2e_tts_tacotron2:Tacotron2",
+        help="model defined module",
+    )
+    # minibatch related
+    parser.add_argument(
+        "--sortagrad",
+        default=0,
+        type=int,
+        nargs="?",
+        help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
+    )
+    parser.add_argument(
+        "--batch-sort-key",
+        default="shuffle",
+        type=str,
+        choices=["shuffle", "output", "input"],
+        nargs="?",
+        help='Batch sorting key. "shuffle" only work with --batch-count "seq".',
+    )
+    parser.add_argument(
+        "--batch-count",
+        default="auto",
+        choices=BATCH_COUNT_CHOICES,
+        help="How to count batch_size. "
+        "The default (auto) will find how to count by args.",
+    )
+    parser.add_argument(
+        "--batch-size",
+        "--batch-seqs",
+        "-b",
+        default=0,
+        type=int,
+        help="Maximum seqs in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--batch-bins",
+        default=0,
+        type=int,
+        help="Maximum bins in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--batch-frames-in",
+        default=0,
+        type=int,
+        help="Maximum input frames in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--batch-frames-out",
+        default=0,
+        type=int,
+        help="Maximum output frames in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--batch-frames-inout",
+        default=0,
+        type=int,
+        help="Maximum input+output frames in a minibatch (0 to disable)",
+    )
+    parser.add_argument(
+        "--maxlen-in",
+        "--batch-seq-maxlen-in",
+        default=100,
+        type=int,
+        metavar="ML",
+        help="When --batch-count=seq, "
+        "batch size is reduced if the input sequence length > ML.",
+    )
+    parser.add_argument(
+        "--maxlen-out",
+        "--batch-seq-maxlen-out",
+        default=200,
+        type=int,
+        metavar="ML",
+        help="When --batch-count=seq, "
+        "batch size is reduced if the output sequence length > ML",
+    )
+    parser.add_argument(
+        "--num-iter-processes",
+        default=0,
+        type=int,
+        help="Number of processes of iterator",
+    )
+    parser.add_argument(
+        "--preprocess-conf",
+        type=str,
+        default=None,
+        help="The configuration file for the pre-processing",
+    )
+    parser.add_argument(
+        "--use-speaker-embedding",
+        default=False,
+        type=strtobool,
+        help="Whether to use speaker embedding",
+    )
+    parser.add_argument(
+        "--use-second-target",
+        default=False,
+        type=strtobool,
+        help="Whether to use second target",
+    )
+    # optimization related
+    parser.add_argument(
+        "--opt",
+        default="adam",
+        type=str,
+        choices=["adam", "noam", "lamb"],
+        help="Optimizer",
+    )
+    parser.add_argument(
+        "--accum-grad", default=1, type=int, help="Number of gradient accumuration"
+    )
+    parser.add_argument(
+        "--lr", default=1e-3, type=float, help="Learning rate for optimizer"
+    )
+    parser.add_argument("--eps", default=1e-6, type=float, help="Epsilon for optimizer")
+    parser.add_argument(
+        "--weight-decay",
+        default=1e-6,
+        type=float,
+        help="Weight decay coefficient for optimizer",
+    )
+    parser.add_argument(
+        "--epochs", "-e", default=30, type=int, help="Number of maximum epochs"
+    )
+    parser.add_argument(
+        "--early-stop-criterion",
+        default="validation/main/loss",
+        type=str,
+        nargs="?",
+        help="Value to monitor to trigger an early stopping of the training",
+    )
+    parser.add_argument(
+        "--patience",
+        default=3,
+        type=int,
+        nargs="?",
+        help="Number of epochs to wait without improvement "
+        "before stopping the training",
+    )
+    parser.add_argument(
+        "--grad-clip", default=1, type=float, help="Gradient norm threshold to clip"
+    )
+    parser.add_argument(
+        "--num-save-attention",
+        default=5,
+        type=int,
+        help="Number of samples of attention to be saved",
+    )
+    parser.add_argument(
+        "--keep-all-data-on-mem",
+        default=False,
+        type=strtobool,
+        help="Whether to keep all data on memory",
+    )
+
+    parser.add_argument(
+        "--enc-init",
+        default=None,
+        type=str,
+        help="Pre-trained model path to initialize encoder.",
+    )
+    parser.add_argument(
+        "--enc-init-mods",
+        default="enc.",
+        type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
+        help="List of encoder modules to initialize, separated by a comma.",
+    )
+    parser.add_argument(
+        "--dec-init",
+        default=None,
+        type=str,
+        help="Pre-trained model path to initialize decoder.",
+    )
+    parser.add_argument(
+        "--dec-init-mods",
+        default="dec.",
+        type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
+        help="List of decoder modules to initialize, separated by a comma.",
+    )
+    parser.add_argument(
+        "--freeze-mods",
+        default=None,
+        type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
+        help="List of modules to freeze (not to train), separated by a comma.",
+    )
+
+    return parser
+
+
+def main(cmd_args):
+    """Run training."""
+    parser = get_parser()
+    args, _ = parser.parse_known_args(cmd_args)
+
+    from espnet.utils.dynamic_import import dynamic_import
+
+    model_class = dynamic_import(args.model_module)
+    assert issubclass(model_class, TTSInterface)
+    model_class.add_arguments(parser)
+    args = parser.parse_args(cmd_args)
+
+    # add version info in args
+    args.version = __version__
+
+    # logging info
+    if args.verbose > 0:
+        logging.basicConfig(
+            level=logging.INFO,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+    else:
+        logging.basicConfig(
+            level=logging.WARN,
+            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+        )
+        logging.warning("Skip DEBUG/INFO messages")
+
+    # If --ngpu is not given,
+    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
+    #   2. if nvidia-smi exists, use all devices
+    #   3. else ngpu=0
+    if args.ngpu is None:
+        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
+        if cvd is not None:
+            ngpu = len(cvd.split(","))
+        else:
+            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
+            try:
+                p = subprocess.run(
+                    ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
+                )
+            except (subprocess.CalledProcessError, FileNotFoundError):
+                ngpu = 0
+            else:
+                ngpu = len(p.stderr.decode().split("\n")) - 1
+    else:
+        ngpu = args.ngpu
+    logging.info(f"ngpu: {ngpu}")
+
+    # set random seed
+    logging.info("random seed = %d" % args.seed)
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+
+    if args.backend == "pytorch":
+        from espnet.vc.pytorch_backend.vc import train
+
+        train(args)
+    else:
+        raise NotImplementedError("Only pytorch is supported.")
+
+
+# Script entry point: pass the CLI arguments (without the program name) to main().
+if __name__ == "__main__":
+    main(sys.argv[1:])
--- a/conformer/espnet-v.202304_20240621/build/lib/espnet/distributed/__init__.py
+++ b/conformer/espnet-v.202304_20240621/build/lib/espnet/distributed/__init__.py
+#
+# SPDX-FileCopyrightText:
+#   Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+
+"""Initialize sub package."""
--- a/conformer/espnet-v.202304_20240621/build/lib/espnet/lm/__init__.py
+++ b/conformer/espnet-v.202304_20240621/build/lib/espnet/lm/__init__.py
+"""Initialize sub package."""
--- a/conformer/espnet-v.202304_20240621/build/lib/espnet/lm/chainer_backend/__init__.py
+++ b/conformer/espnet-v.202304_20240621/build/lib/espnet/lm/chainer_backend/__init__.py
+"""Initialize sub package."""
--- a/conformer/espnet-v.202304_20240621/build/lib/espnet/lm/chainer_backend/extlm.py
+++ b/conformer/espnet-v.202304_20240621/build/lib/espnet/lm/chainer_backend/extlm.py
+#!/usr/bin/env python3
+
+# Copyright 2018 Mitsubishi Electric Research Laboratories (Takaaki Hori)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+
+import math
+
+import chainer
+import chainer.functions as F
+
+from espnet.lm.lm_utils import make_lexical_tree
+
+
+# Definition of a multi-level (subword/word) language model
+class MultiLevelLM(chainer.Chain):
+    logzero = -10000000000.0
+    zero = 1.0e-10
+
+    def __init__(
+        self,
+        wordlm,
+        subwordlm,
+        word_dict,
+        subword_dict,
+        subwordlm_weight=0.8,
+        oov_penalty=1.0,
+        open_vocab=True,
+    ):
+        super(MultiLevelLM, self).__init__()
+        self.wordlm = wordlm
+        self.subwordlm = subwordlm
+        self.word_eos = word_dict["<eos>"]
+        self.word_unk = word_dict["<unk>"]
+        self.xp_word_eos = self.xp.full(1, self.word_eos, "i")
+        self.xp_word_unk = self.xp.full(1, self.word_unk, "i")
+        self.space = subword_dict["<space>"]
+        self.eos = subword_dict["<eos>"]
+        self.lexroot = make_lexical_tree(word_dict, subword_dict, self.word_unk)
+        self.log_oov_penalty = math.log(oov_penalty)
+        self.open_vocab = open_vocab
+        self.subword_dict_size = len(subword_dict)
+        self.subwordlm_weight = subwordlm_weight
+        self.normalized = True
+
+    def __call__(self, state, x):
+        # update state with input label x
+        if state is None:  # make initial states and log-prob vectors
+            wlm_state, z_wlm = self.wordlm(None, self.xp_word_eos)
+            wlm_logprobs = F.log_softmax(z_wlm).data
+            clm_state, z_clm = self.subwordlm(None, x)
+            log_y = F.log_softmax(z_clm).data * self.subwordlm_weight
+            new_node = self.lexroot
+            clm_logprob = 0.0
+            xi = self.space
+        else:
+            clm_state, wlm_state, wlm_logprobs, node, log_y, clm_logprob = state
+            xi = int(x)
+            if xi == self.space:  # inter-word transition
+                if node is not None and node[1] >= 0:  # check if the node is word end
+                    w = self.xp.full(1, node[1], "i")
+                else:  # this node is not a word end, which means <unk>
+                    w = self.xp_word_unk
+                # update wordlm state and log-prob vector
+                wlm_state, z_wlm = self.wordlm(wlm_state, w)
+                wlm_logprobs = F.log_softmax(z_wlm).data
+                new_node = self.lexroot  # move to the tree root
+                clm_logprob = 0.0
+            elif node is not None and xi in node[0]:  # intra-word transition
+                new_node = node[0][xi]
+                clm_logprob += log_y[0, xi]
+            elif self.open_vocab:  # if no path in the tree, enter open-vocabulary mode
+                new_node = None
+                clm_logprob += log_y[0, xi]
+            else:  # if open_vocab flag is disabled, return 0 probabilities
+                log_y = self.xp.full((1, self.subword_dict_size), self.logzero, "f")
+                return (clm_state, wlm_state, None, log_y, 0.0), log_y
+
+            clm_state, z_clm = self.subwordlm(clm_state, x)
+            log_y = F.log_softmax(z_clm).data * self.subwordlm_weight
+
+        # apply word-level probabilies for <space> and <eos> labels
+        if xi != self.space:
+            if new_node is not None and new_node[1] >= 0:  # if new node is word end
+                wlm_logprob = wlm_logprobs[:, new_node[1]] - clm_logprob
+            else:
+                wlm_logprob = wlm_logprobs[:, self.word_unk] + self.log_oov_penalty
+            log_y[:, self.space] = wlm_logprob
+            log_y[:, self.eos] = wlm_logprob
+        else:
+            log_y[:, self.space] = self.logzero
+            log_y[:, self.eos] = self.logzero
+
+        return (clm_state, wlm_state, wlm_logprobs, new_node, log_y, clm_logprob), log_y
+
+    def final(self, state):
+        clm_state, wlm_state, wlm_logprobs, node, log_y, clm_logprob = state
+        if node is not None and node[1] >= 0:  # check if the node is word end
+            w = self.xp.full(1, node[1], "i")
+        else:  # this node is not a word end, which means <unk>
+            w = self.xp_word_unk
+        wlm_state, z_wlm = self.wordlm(wlm_state, w)
+        return F.log_softmax(z_wlm).data[:, self.word_eos]
+
+
+# Definition of a look-ahead word language model
+class LookAheadWordLM(chainer.Chain):
+    """Word language model applied to subword labels through a lexical tree.
+
+    A state is the tuple ``(wlm_state, cumsum_probs, node)``: the wrapped word
+    LM's state, the cumulative sum of its word probabilities, and the current
+    lexical-tree node ``(successors, word id, word-id range)``. ``node`` is
+    None once the label sequence has left the tree.
+
+    :param wordlm: word-level LM called as ``wordlm(state, word_id)``
+    :param dict word_dict: word-to-id map containing "<eos>" and "<unk>"
+    :param dict subword_dict: subword-to-id map containing "<space>" and "<eos>"
+    :param float oov_penalty: factor applied to the <unk> probability
+    :param bool open_vocab: whether labels outside the tree are accepted
+    """
+
+    logzero = -10000000000.0
+    zero = 1.0e-10
+
+    def __init__(
+        self, wordlm, word_dict, subword_dict, oov_penalty=0.0001, open_vocab=True
+    ):
+        super(LookAheadWordLM, self).__init__()
+        self.wordlm = wordlm
+        # special word ids and their one-element array forms
+        self.word_eos = word_dict["<eos>"]
+        self.word_unk = word_dict["<unk>"]
+        self.xp_word_eos = self.xp.full(1, self.word_eos, "i")
+        self.xp_word_unk = self.xp.full(1, self.word_unk, "i")
+        # special subword ids
+        self.space = subword_dict["<space>"]
+        self.eos = subword_dict["<eos>"]
+        self.lexroot = make_lexical_tree(word_dict, subword_dict, self.word_unk)
+        self.subword_dict_size = len(subword_dict)
+        self.oov_penalty = oov_penalty
+        self.open_vocab = open_vocab
+        # __call__ already returns log probabilities
+        self.normalized = True
+
+    def _pending_word(self, node):
+        """Return the word-id array for ``node``; non-word-end nodes give <unk>."""
+        if node is not None and node[1] >= 0:
+            return self.xp.full(1, node[1], "i")
+        return self.xp_word_unk
+
+    def _step_wordlm(self, wlm_state, word):
+        """Feed ``word`` to the word LM; return (new state, cumulative probs)."""
+        wlm_state, logits = self.wordlm(wlm_state, word)
+        return wlm_state, self.xp.cumsum(F.softmax(logits).data, axis=1)
+
+    def _logzero_row(self):
+        """Return a (1, subword_dict_size) float array filled with ``logzero``."""
+        return self.xp.full((1, self.subword_dict_size), self.logzero, "f")
+
+    def __call__(self, state, x):
+        """Advance the state with label ``x`` and score the next subword labels.
+
+        :param state: previous state tuple, or None to start a sentence
+        :param x: input subword label (converted with ``int``)
+        :return: tuple of the new state and a (1, subword_dict_size) array of
+            log probabilities
+        """
+        if state is None:
+            # start of sentence: condition the word LM on <eos>
+            wlm_state, cum = self._step_wordlm(None, self.xp_word_eos)
+            next_node = self.lexroot
+            label = self.space
+        else:
+            wlm_state, cum, node = state
+            label = int(x)
+            if label == self.space:
+                # inter-word transition: commit the finished word, restart at root
+                wlm_state, cum = self._step_wordlm(wlm_state, self._pending_word(node))
+                next_node = self.lexroot
+            elif node is not None and label in node[0]:
+                # intra-word transition
+                next_node = node[0][label]
+            elif self.open_vocab:
+                # no path in the tree: enter open-vocabulary mode
+                next_node = None
+            else:
+                # closed vocabulary: every label gets logzero
+                return (wlm_state, None, None), self._logzero_row()
+
+        if next_node is None:
+            # outside the tree every transition has probability one
+            log_y = self.xp.zeros((1, self.subword_dict_size), "f")
+            return (wlm_state, cum, next_node), log_y
+
+        children, wid, wids = next_node
+        # probability mass of all words reachable through this node
+        if wids is not None:
+            parent_prob = cum[:, wids[1]] - cum[:, wids[0]]
+        else:
+            parent_prob = 1.0
+        if parent_prob < self.zero:
+            return (wlm_state, cum, next_node), self._logzero_row()
+
+        # every label starts from the penalized <unk> probability
+        unk_prob = cum[:, self.word_unk] - cum[:, self.word_unk - 1]
+        y = self.xp.full((1, self.subword_dict_size), unk_prob * self.oov_penalty, "f")
+        # labels leading to a child node get that child's share of the mass
+        for cid, child in children.items():
+            span = child[2]
+            y[:, cid] = (cum[:, span[1]] - cum[:, span[0]]) / parent_prob
+        # <space> and <eos> carry the word-level probability
+        if wid >= 0:
+            word_prob = (cum[:, wid] - cum[:, wid - 1]) / parent_prob
+            for boundary in (self.space, self.eos):
+                y[:, boundary] = word_prob
+        elif label == self.space:
+            for boundary in (self.space, self.eos):
+                y[:, boundary] = self.zero
+        # clip before the log to avoid log(0)
+        log_y = self.xp.log(self.xp.clip(y, self.zero, None))
+        return (wlm_state, cum, next_node), log_y
+
+    def final(self, state):
+        """Return the word LM's <eos> log probability after the pending word.
+
+        :param state: state tuple ``(wlm_state, cumsum_probs, node)``
+        :return: the ``word_eos`` column of the word LM's log-softmax output
+        """
+        wlm_state, _, node = state
+        _, logits = self.wordlm(wlm_state, self._pending_word(node))
+        return F.log_softmax(logits).data[:, self.word_eos]
--- a/conformer/espnet-v.202304_20240621/build/lib/espnet/lm/chainer_backend/lm.py
+++ b/conformer/espnet-v.202304_20240621/build/lib/espnet/lm/chainer_backend/lm.py
+#!/usr/bin/env python3
+
+# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+# This code is ported from the following Chainer example implementation.
+# https://github.com/chainer/chainer/blob/master/examples/ptb/train_ptb_custom_loop.py
+
+
+import copy
+import json
+import logging
+
+import chainer
+import chainer.functions as F
+import chainer.links as L
+import numpy as np
+from chainer import link, reporter, training
+from chainer.dataset import convert
+
+# for classifier link
+from chainer.functions.loss import softmax_cross_entropy
+from chainer.training import extensions
+
+import espnet.nets.chainer_backend.deterministic_embed_id as DL
+from espnet.lm.lm_utils import (
+    MakeSymlinkToBestModel,
+    ParallelSentenceIterator,
+    compute_perplexity,
+    count_tokens,
+    read_tokens,
+)
+from espnet.nets.lm_interface import LMInterface
+from espnet.optimizer.factory import dynamic_import_optimizer
+from espnet.scheduler.chainer import ChainerScheduler
+from espnet.scheduler.scheduler import dynamic_import_scheduler
+from espnet.utils.deterministic_utils import set_deterministic_chainer
+from espnet.utils.training.evaluator import BaseEvaluator
+from espnet.utils.training.iterators import ShufflingEnabler
+from espnet.utils.training.tensorboard_logger import TensorboardLogger
+from espnet.utils.training.train_utils import check_early_stop, set_early_stop
+
+
+# TODO(karita): reimplement RNNLM with new interface
+class DefaultRNNLM(LMInterface, link.Chain):
+    """Default RNNLM wrapper to compute reduce framewise loss values.
+
+    This class defines no ``__init__`` of its own; it only registers the
+    command-line options of the default RNNLM.
+
+    Args:
+        n_vocab (int): The size of the vocabulary
+        args (argparse.Namespace): configurations. see `add_arguments`
+    """
+
+    @staticmethod
+    def add_arguments(parser):
+        """Add the RNNLM options to ``parser`` and return it."""
+        rnn_types = ["lstm", "gru"]
+        parser.add_argument(
+            "--type",
+            type=str,
+            default="lstm",
+            nargs="?",
+            choices=rnn_types,
+            help="Which type of RNN to use",
+        )
+        # integer size options: (flags, default, help)
+        size_options = (
+            (("--layer", "-l"), 2, "Number of hidden layers"),
+            (("--unit", "-u"), 650, "Number of hidden units"),
+        )
+        for flags, default, help_text in size_options:
+            parser.add_argument(*flags, type=int, default=default, help=help_text)
+        return parser
+
+
+class ClassifierWithState(link.Chain):
+    """Pair a stateful predictor (the RNNLM) with a loss function.
+
+    :param link.Chain predictor : The RNNLM
+    :param function lossfun: The loss function to use
+    :param int/str label_key: position in ``args`` (int) or key in ``kwargs``
+        (str) of the ground-truth labels
+    """
+
+    def __init__(
+        self,
+        predictor,
+        lossfun=softmax_cross_entropy.softmax_cross_entropy,
+        label_key=-1,
+    ):
+        if not isinstance(label_key, (int, str)):
+            raise TypeError("label_key must be int or str, but is %s" % type(label_key))
+
+        super(ClassifierWithState, self).__init__()
+        # only the predictor is registered as a child link
+        with self.init_scope():
+            self.predictor = predictor
+
+        self.lossfun = lossfun
+        self.label_key = label_key
+        self.y = None
+        self.loss = None
+
+    def __call__(self, state, *args, **kwargs):
+        """Compute the loss for one input/label pair.
+
+            The labels are taken out of ``args`` (int ``label_key``) or
+            ``kwargs`` (str ``label_key``); everything left is passed to the
+            predictor as features. The predictor output is kept in ``self.y``
+            and the loss in ``self.loss``.
+
+        :param state : The LM state
+        :param list[chainer.Variable] args : Input minibatch
+        :param dict[chainer.Variable] kwargs : Input minibatch
+        :return tuple of the new state and the loss value
+        :raises ValueError: if the label key is out of bounds or missing
+        """
+        key = self.label_key
+        if isinstance(key, int):
+            if not (-len(args) <= key < len(args)):
+                raise ValueError("Label key %d is out of bounds" % key)
+            t = args[key]
+            # -1 needs its own slice: args[0:] would put every element back
+            args = args[:-1] if key == -1 else args[:key] + args[key + 1 :]
+        elif isinstance(key, str):
+            if key not in kwargs:
+                raise ValueError('Label key "%s" is not found' % key)
+            t = kwargs.pop(key)
+
+        # reset first so a failing predictor leaves no stale results behind
+        self.y = None
+        self.loss = None
+        state, self.y = self.predictor(state, *args, **kwargs)
+        self.loss = self.lossfun(self.y, t)
+        return state, self.loss
+
+    def predict(self, state, x):
+        """Return the next state and log probabilities for input ``x``.
+
+        A predictor with a true ``normalized`` attribute is returned as is;
+        otherwise log-softmax is applied to its output.
+
+        :param state : the state
+        :param x : the input
+        :return a tuple (state, log prob vector)
+        """
+        if getattr(self.predictor, "normalized", False):
+            return self.predictor(state, x)
+        state, logits = self.predictor(state, x)
+        return state, F.log_softmax(logits).data
+
+    def final(self, state):
+        """Return the predictor's final score for ``state``.
+
+        :param state : the state
+        :return ``predictor.final(state)``, or 0.0 if the predictor has no
+            ``final`` attribute
+        """
+        if not hasattr(self.predictor, "final"):
+            return 0.0
+        return self.predictor.final(state)
+
+
+# Definition of a recurrent net for language modeling
+class RNNLM(chainer.Chain):
+    """A chainer RNNLM
+
+    :param int n_vocab: The size of the vocabulary
+    :param int n_layers: The number of layers to create
+    :param int n_units: The number of units per layer
+    :param str typ: The RNN type ("lstm"; any other value builds GRU layers)
+    """
+
+    def __init__(self, n_vocab, n_layers, n_units, typ="lstm"):
+        super(RNNLM, self).__init__()
+        cell = L.StatelessLSTM if typ == "lstm" else L.StatelessGRU
+        with self.init_scope():
+            self.embed = DL.EmbedID(n_vocab, n_units)
+            self.rnn = chainer.ChainList(
+                *[cell(n_units, n_units) for _ in range(n_layers)]
+            )
+            self.lo = L.Linear(n_units, n_vocab)
+
+        # overwrite every parameter with uniform noise in [-0.1, 0.1)
+        for param in self.params():
+            param.data[...] = np.random.uniform(-0.1, 0.1, param.data.shape)
+        self.typ = typ
+        self.n_layers = n_layers
+        self.n_units = n_units
+
+    def _zero_hidden(self, like):
+        """Return a zero hidden-state Variable with ``like``'s batch size and dtype."""
+        xp = self.xp
+        with chainer.backends.cuda.get_device_from_id(self._device_id):
+            return chainer.Variable(
+                xp.zeros((like.shape[0], self.n_units), dtype=like.dtype)
+            )
+
+    def __call__(self, state, x):
+        """Run one step of the network.
+
+        :param state: dict with per-layer lists under "h" (and "c" for LSTM),
+            or None to start from an empty state
+        :param x: input passed to the embedding layer
+        :return: tuple of the new state dict and the output of the final
+            linear layer
+        """
+        is_lstm = self.typ == "lstm"
+        if state is None:
+            state = {"h": [None] * self.n_layers}
+            if is_lstm:
+                state = {"c": [None] * self.n_layers, "h": [None] * self.n_layers}
+
+        h = [None] * self.n_layers
+        layer_in = self.embed(x)  # layer 0 reads the embedding, layer n reads h[n - 1]
+        if is_lstm:
+            c = [None] * self.n_layers
+            for n in range(self.n_layers):
+                c[n], h[n] = self.rnn[n](
+                    state["c"][n], state["h"][n], F.dropout(layer_in)
+                )
+                layer_in = h[n]
+            state = {"c": c, "h": h}
+        else:
+            prev_h = state["h"]
+            for n in range(self.n_layers):
+                if prev_h[n] is None:
+                    # a missing hidden state is filled in as zeros
+                    prev_h[n] = self._zero_hidden(layer_in)
+                h[n] = self.rnn[n](prev_h[n], F.dropout(layer_in))
+                layer_in = h[n]
+            state = {"h": h}
+        y = self.lo(F.dropout(h[-1]))
+        return state, y
+
+
+class BPTTUpdater(training.updaters.StandardUpdater):
+    """An updater for a chainer LM
+
+    :param chainer.dataset.Iterator train_iter : The train iterator
+    :param optimizer: The optimizer; its ``target`` is the model being trained
+    :param schedulers: Schedulers handed to ``ChainerScheduler``
+    :param int device : The device id
+    :param int accum_grad : Number of minibatches accumulated per update
+    """
+
+    def __init__(self, train_iter, optimizer, schedulers, device, accum_grad):
+        super(BPTTUpdater, self).__init__(train_iter, optimizer, device=device)
+        self.accum_grad = accum_grad
+        self.scheduler = ChainerScheduler(schedulers, optimizer)
+
+    def update_core(self):
+        """Accumulate gradients over ``accum_grad`` minibatches, then update."""
+        # A single iterator/optimizer given to StandardUpdater is named "main".
+        train_iter = self.get_iterator("main")
+        optimizer = self.get_optimizer("main")
+        model = optimizer.target
+
+        n_tokens = 0
+        total_loss = 0
+        model.cleargrads()
+        for _ in range(self.accum_grad):
+            batch = next(train_iter)
+            # stack the sentences into matrices on the target device;
+            # inputs are padded with 0 and labels with -1
+            x, t = convert.concat_examples(batch, device=self.device, padding=(0, -1))
+            xp = chainer.backends.cuda.get_array_module(x)
+            batch_size, seq_len = x.shape
+            state, loss = None, 0
+            for step in range(seq_len):
+                inputs, labels = x[:, step], t[:, step]
+                state, step_loss = model(
+                    state, chainer.Variable(inputs), chainer.Variable(labels)
+                )
+                # weight the step loss by the number of non-zero input ids
+                active = xp.count_nonzero(inputs)
+                loss += step_loss * active
+                n_tokens += int(active)
+            loss /= batch_size * self.accum_grad  # normalized by batch size
+            total_loss += float(loss.data)
+            loss.backward()
+            loss.unchain_backward()  # truncate the graph
+
+        reporter.report({"loss": total_loss}, model)
+        reporter.report({"count": n_tokens}, model)
+        optimizer.update()
+        self.scheduler.step(self.iteration)
+
+
+class LMEvaluator(BaseEvaluator):
+    """A custom evaluator for a chainer LM
+
+    :param chainer.dataset.Iterator val_iter : The validation iterator
+    :param eval_model : The model to evaluate
+    :param int device : The device id to use
+    """
+
+    def __init__(self, val_iter, eval_model, device):
+        super(LMEvaluator, self).__init__(val_iter, eval_model, device=device)
+
+    def evaluate(self):
+        """Compute the average per-token loss over the "main" iterator.
+
+        Each step loss is weighted by the number of non-zero input ids at
+        that step, and the sum is divided by the total number of such ids.
+
+        :return: observation dict holding the reported "loss" of the target
+        :rtype: dict
+        """
+        val_iter = self.get_iterator("main")
+        target = self.get_target("main")
+        loss = 0
+        count = 0
+        # Evaluation never back-propagates. Without no_backprop_mode the
+        # recurrent state keeps the graph of every time step of a batch alive.
+        with chainer.no_backprop_mode():
+            # iterate over a copy so the original iterator is not advanced
+            for batch in copy.copy(val_iter):
+                # inputs are padded with 0 and labels with -1
+                x, t = convert.concat_examples(
+                    batch, device=self.device, padding=(0, -1)
+                )
+                xp = chainer.backends.cuda.get_array_module(x)
+                state = None
+                for i in range(len(x[0])):
+                    state, loss_batch = target(state, x[:, i], t[:, i])
+                    non_zeros = xp.count_nonzero(x[:, i])
+                    loss += loss_batch.data * non_zeros
+                    count += int(non_zeros)
+        # report validation loss
+        observation = {}
+        with reporter.report_scope(observation):
+            reporter.report({"loss": float(loss / count)}, target)
+        return observation
+
+
+def train(args):
+    """Train with the given args
+
+    Reads the training and validation token files, builds an RNNLM wrapped in
+    ClassifierWithState, writes the arguments to ``<outdir>/model.json``, runs
+    the chainer trainer and, when ``args.test_label`` is set, logs the test
+    perplexity of ``<outdir>/rnnlm.model.best``.
+
+    :param Namespace args: The program arguments
+    :raises NotImplementedError: if ``args.model_module`` is not "default"
+    """
+    # TODO(karita): support this
+    if args.model_module != "default":
+        raise NotImplementedError("chainer backend does not support --model-module")
+
+    # display chainer version
+    logging.info("chainer version = " + chainer.__version__)
+
+    set_deterministic_chainer(args)
+
+    # check cuda and cudnn availability
+    if not chainer.cuda.available:
+        logging.warning("cuda is not available")
+    if not chainer.cuda.cudnn_enabled:
+        logging.warning("cudnn is not available")
+
+    # get special label ids
+    unk = args.char_list_dict["<unk>"]
+    eos = args.char_list_dict["<eos>"]
+    # read tokens as a sequence of sentences
+    train = read_tokens(args.train_label, args.char_list_dict)
+    val = read_tokens(args.valid_label, args.char_list_dict)
+    # count tokens
+    n_train_tokens, n_train_oovs = count_tokens(train, unk)
+    n_val_tokens, n_val_oovs = count_tokens(val, unk)
+    logging.info("#vocab = " + str(args.n_vocab))
+    logging.info("#sentences in the training data = " + str(len(train)))
+    logging.info("#tokens in the training data = " + str(n_train_tokens))
+    logging.info(
+        "oov rate in the training data = %.2f %%"
+        % (n_train_oovs / n_train_tokens * 100)
+    )
+    logging.info("#sentences in the validation data = " + str(len(val)))
+    logging.info("#tokens in the validation data = " + str(n_val_tokens))
+    logging.info(
+        "oov rate in the validation data = %.2f %%" % (n_val_oovs / n_val_tokens * 100)
+    )
+
+    # sortagrad is on for -1 or any positive value; the training iterator then
+    # starts unshuffled and ShufflingEnabler (below) turns shuffling on later
+    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
+
+    # Create the dataset iterators
+    # (the <eos> id is passed as both the sos and the eos label)
+    train_iter = ParallelSentenceIterator(
+        train,
+        args.batchsize,
+        max_length=args.maxlen,
+        sos=eos,
+        eos=eos,
+        shuffle=not use_sortagrad,
+    )
+    val_iter = ParallelSentenceIterator(
+        val, args.batchsize, max_length=args.maxlen, sos=eos, eos=eos, repeat=False
+    )
+    # one update consumes accum_grad minibatches
+    epoch_iters = int(len(train_iter.batch_indices) / args.accum_grad)
+    logging.info("#iterations per epoch = %d" % epoch_iters)
+    logging.info("#total iterations = " + str(args.epoch * epoch_iters))
+    # Prepare an RNNLM model
+    rnn = RNNLM(args.n_vocab, args.layer, args.unit, args.type)
+    model = ClassifierWithState(rnn)
+    if args.ngpu > 1:
+        logging.warning("currently, multi-gpu is not supported. use single gpu.")
+    if args.ngpu > 0:
+        # Make the specified GPU current
+        # (always GPU 0, whatever the value of ngpu)
+        gpu_id = 0
+        chainer.cuda.get_device_from_id(gpu_id).use()
+        model.to_gpu()
+    else:
+        # -1 is passed on as the device id when no GPU is requested
+        gpu_id = -1
+
+    # Save model conf to json
+    model_conf = args.outdir + "/model.json"
+    with open(model_conf, "wb") as f:
+        logging.info("writing a model config file to " + model_conf)
+        f.write(
+            json.dumps(vars(args), indent=4, ensure_ascii=False, sort_keys=True).encode(
+                "utf_8"
+            )
+        )
+
+    # Set up an optimizer
+    opt_class = dynamic_import_optimizer(args.opt, args.backend)
+    optimizer = opt_class.from_args(model, args)
+    if args.schedulers is None:
+        schedulers = []
+    else:
+        # args.schedulers is iterated as (key, scheduler name) pairs
+        schedulers = [dynamic_import_scheduler(v)(k, args) for k, v in args.schedulers]
+
+    optimizer.setup(model)
+    optimizer.add_hook(chainer.optimizer.GradientClipping(args.gradclip))
+
+    updater = BPTTUpdater(train_iter, optimizer, schedulers, gpu_id, args.accum_grad)
+    trainer = training.Trainer(updater, (args.epoch, "epoch"), out=args.outdir)
+    trainer.extend(LMEvaluator(val_iter, model, device=gpu_id))
+    trainer.extend(
+        extensions.LogReport(
+            postprocess=compute_perplexity,
+            trigger=(args.report_interval_iters, "iteration"),
+        )
+    )
+    trainer.extend(
+        extensions.PrintReport(
+            ["epoch", "iteration", "perplexity", "val_perplexity", "elapsed_time"]
+        ),
+        trigger=(args.report_interval_iters, "iteration"),
+    )
+    trainer.extend(extensions.ProgressBar(update_interval=args.report_interval_iters))
+    # snapshot the whole trainer and the model under per-epoch file names
+    trainer.extend(extensions.snapshot(filename="snapshot.ep.{.updater.epoch}"))
+    trainer.extend(extensions.snapshot_object(model, "rnnlm.model.{.updater.epoch}"))
+    # MEMO(Hori): wants to use MinValueTrigger, but it seems to fail in resuming
+    trainer.extend(MakeSymlinkToBestModel("validation/main/loss", "rnnlm.model"))
+
+    if use_sortagrad:
+        # sortagrad == -1 sets the trigger to the last epoch (args.epoch)
+        trainer.extend(
+            ShufflingEnabler([train_iter]),
+            trigger=(args.sortagrad if args.sortagrad != -1 else args.epoch, "epoch"),
+        )
+
+    if args.resume:
+        logging.info("resumed from %s" % args.resume)
+        chainer.serializers.load_npz(args.resume, trainer)
+
+    set_early_stop(trainer, args, is_lm=True)
+    if args.tensorboard_dir is not None and args.tensorboard_dir != "":
+        # tensorboardX is imported only when a tensorboard directory is given
+        try:
+            from tensorboardX import SummaryWriter
+        except Exception:
+            logging.error("Please install tensorboardx")
+            raise
+        writer = SummaryWriter(args.tensorboard_dir)
+        trainer.extend(
+            TensorboardLogger(writer), trigger=(args.report_interval_iters, "iteration")
+        )
+
+    trainer.run()
+    check_early_stop(trainer, args.epoch)
+
+    # compute perplexity for test set
+    if args.test_label:
+        logging.info("test the best model")
+        # presumably this file is the link kept by MakeSymlinkToBestModel above
+        # -- TODO confirm against espnet.lm.lm_utils
+        chainer.serializers.load_npz(args.outdir + "/rnnlm.model.best", model)
+        test = read_tokens(args.test_label, args.char_list_dict)
+        n_test_tokens, n_test_oovs = count_tokens(test, unk)
+        logging.info("#sentences in the test data = " + str(len(test)))
+        logging.info("#tokens in the test data = " + str(n_test_tokens))
+        logging.info(
+            "oov rate in the test data = %.2f %%" % (n_test_oovs / n_test_tokens * 100)
+        )
+        test_iter = ParallelSentenceIterator(
+            test, args.batchsize, max_length=args.maxlen, sos=eos, eos=eos, repeat=False
+        )
+        evaluator = LMEvaluator(test_iter, model, device=gpu_id)
+        # evaluate with the "train" config switched off
+        with chainer.using_config("train", False):
+            result = evaluator()
+        logging.info("test perplexity: " + str(np.exp(float(result["main/loss"]))))