Commit 60a2c57a authored by sunzhq2's avatar sunzhq2 Committed by xuxo
Browse files

update conformer

parent 4a699441
"""V2 backend for `asr_recog.py` using py:class:`espnet.nets.beam_search.BeamSearch`."""
import json
import logging
import torch
from packaging.version import parse as V
from espnet.asr.asr_utils import add_results_to_json, get_model_conf, torch_load
from espnet.asr.pytorch_backend.asr import load_trained_model
from espnet.nets.asr_interface import ASRInterface
from espnet.nets.batch_beam_search import BatchBeamSearch
from espnet.nets.beam_search import BeamSearch
from espnet.nets.lm_interface import dynamic_import_lm
from espnet.nets.scorer_interface import BatchScorerInterface
from espnet.nets.scorers.length_bonus import LengthBonus
from espnet.utils.deterministic_utils import set_deterministic_pytorch
from espnet.utils.io_utils import LoadInputsAndTargets
def recog_v2(args):
    """Decode with custom models that implement ScorerInterface.

    Loads the trained ASR model (plus an optional LM and n-gram scorer),
    builds a beam search over all scorers, decodes every utterance listed in
    ``args.recog_json`` one by one and writes the n-best results as JSON to
    ``args.result_label``.

    Notes:
        The previous backend espnet.asr.pytorch_backend.asr.recog
        only supports E2E and RNNLM

    Args:
        args (namespace): The program arguments.
            See py:func:`espnet.bin.asr_recog.get_parser` for details

    Raises:
        NotImplementedError: If ``args.batchsize > 1``, a streaming mode or a
            word LM is requested, or ``args.ngpu > 1``.
        ValueError: If the requested dynamic quantization is not supported by
            the installed torch version.

    """
    logging.warning("experimental API for custom LMs is selected by --api v2")
    if args.batchsize > 1:
        raise NotImplementedError("multi-utt batch decoding is not implemented")
    if args.streaming_mode is not None:
        raise NotImplementedError("streaming mode is not implemented")
    if args.word_rnnlm:
        raise NotImplementedError("word LM is not implemented")

    set_deterministic_pytorch(args)
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, ASRInterface)

    # Module types that dynamic quantization is applied to.
    if args.quantize_config is not None:
        q_config = {getattr(torch.nn, q) for q in args.quantize_config}
    else:
        q_config = {torch.nn.Linear}

    if args.quantize_asr_model:
        logging.info("Use quantized asr model for decoding")

        # See https://github.com/espnet/espnet/pull/3616 for more information.
        if (
            V(torch.__version__) < V("1.4.0")
            and "lstm" in train_args.etype
            and torch.nn.LSTM in q_config
        ):
            raise ValueError(
                "Quantized LSTM in ESPnet is only supported with torch 1.4+."
            )

        # NOTE: despite the wording of the message, there is no automatic
        # fallback to qint8 here; decoding is aborted.
        if args.quantize_dtype == "float16" and V(torch.__version__) < V("1.5.0"):
            raise ValueError(
                "float16 dtype for dynamic quantization is not supported with torch "
                "version < 1.5.0. Switching to qint8 dtype instead."
            )

        dtype = getattr(torch, args.quantize_dtype)
        model = torch.quantization.quantize_dynamic(model, q_config, dtype=dtype)

    model.eval()

    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=False,
        sort_in_input_length=False,
        # A decoding-time preprocess config overrides the training one.
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={"train": False},
    )

    if args.rnnlm:
        lm_args = get_model_conf(args.rnnlm, args.rnnlm_conf)
        # NOTE: for a compatibility with less than 0.5.0 version models
        lm_model_module = getattr(lm_args, "model_module", "default")
        lm_class = dynamic_import_lm(lm_model_module, lm_args.backend)
        lm = lm_class(len(train_args.char_list), lm_args)
        torch_load(args.rnnlm, lm)
        if args.quantize_lm_model:
            logging.info("Use quantized lm model")
            dtype = getattr(torch, args.quantize_dtype)
            lm = torch.quantization.quantize_dynamic(lm, q_config, dtype=dtype)
        lm.eval()
    else:
        lm = None

    if args.ngram_model:
        from espnet.nets.scorers.ngram import NgramFullScorer, NgramPartScorer

        if args.ngram_scorer == "full":
            ngram = NgramFullScorer(args.ngram_model, train_args.char_list)
        else:
            ngram = NgramPartScorer(args.ngram_model, train_args.char_list)
    else:
        ngram = None

    # Combine the model's own scorers with the optional external ones.
    scorers = model.scorers()
    scorers["lm"] = lm
    scorers["ngram"] = ngram
    scorers["length_bonus"] = LengthBonus(len(train_args.char_list))
    weights = dict(
        decoder=1.0 - args.ctc_weight,
        ctc=args.ctc_weight,
        lm=args.lm_weight,
        ngram=args.ngram_weight,
        length_bonus=args.penalty,
    )
    beam_search = BeamSearch(
        beam_size=args.beam_size,
        vocab_size=len(train_args.char_list),
        weights=weights,
        scorers=scorers,
        sos=model.sos,
        eos=model.eos,
        token_list=train_args.char_list,
        pre_beam_score_key=None if args.ctc_weight == 1.0 else "full",
    )

    # TODO(karita): make all scorers batchfied
    if args.batchsize == 1:
        non_batch = [
            k
            for k, v in beam_search.full_scorers.items()
            if not isinstance(v, BatchScorerInterface)
        ]
        if len(non_batch) == 0:
            # Swap in the batched implementation on the existing instance.
            beam_search.__class__ = BatchBeamSearch
            logging.info("BatchBeamSearch implementation is selected.")
        else:
            logging.warning(
                f"As non-batch scorers {non_batch} are found, "
                f"fall back to non-batch implementation."
            )

    if args.ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")
    if args.ngpu == 1:
        device = "cuda"
    else:
        device = "cpu"
    dtype = getattr(torch, args.dtype)
    logging.info(f"Decoding device={device}, dtype={dtype}")
    model.to(device=device, dtype=dtype).eval()
    beam_search.to(device=device, dtype=dtype).eval()

    # read json data
    with open(args.recog_json, "rb") as f:
        js = json.load(f)["utts"]

    new_js = {}
    with torch.no_grad():
        for idx, name in enumerate(js.keys(), 1):
            # Pass the name as a logging argument: concatenating it into the
            # format string breaks when the name contains a "%".
            logging.info("(%d/%d) decoding %s", idx, len(js), name)
            batch = [(name, js[name])]
            feat = load_inputs_and_targets(batch)[0][0]
            enc = model.encode(torch.as_tensor(feat).to(device=device, dtype=dtype))
            nbest_hyps = beam_search(
                x=enc, maxlenratio=args.maxlenratio, minlenratio=args.minlenratio
            )
            # Slicing already clamps to the number of available hypotheses.
            nbest_hyps = [h.asdict() for h in nbest_hyps[: args.nbest]]
            new_js[name] = add_results_to_json(
                js[name], nbest_hyps, train_args.char_list
            )

    with open(args.result_label, "wb") as f:
        f.write(
            json.dumps(
                {"utts": new_js}, indent=4, ensure_ascii=False, sort_keys=True
            ).encode("utf_8")
        )
#!/usr/bin/env python3
# encoding: utf-8
# Copyright 2020 Johns Hopkins University (Xuankai Chang)
# 2020, Technische Universität München; Dominik Winkelbauer, Ludwig Kürzinger
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""
This program performs CTC segmentation to align utterances within audio files.
Inputs:
`--data-json`:
A json containing list of utterances and audio files
`--model`:
An already trained ASR model
Output:
`--output`:
A plain `segments` file with utterance positions in the audio files.
Selected parameters:
`--min-window-size`:
Minimum window size considered for a single utterance. The current default value
should be OK in most cases. Larger values might give better results; too large
values cause IndexErrors.
`--subsampling-factor`:
If the encoder sub-samples its input, the number of frames at the CTC layer is
reduced by this factor.
`--frame-duration`:
This is the non-overlapping duration of a single frame in milliseconds (the
inverse of frames per millisecond).
`--set-blank`:
In the rare case that the blank token does not have index 0 in the character
dictionary, this parameter sets the index of the blank token.
`--gratis-blank`:
Sets the transition cost for blank tokens to zero. Useful if there are longer
unrelated segments between segments.
`--replace-spaces-with-blanks`:
Spaces are replaced with blanks. Helps to model pauses between words. May
increase length of ground truth. May lead to misaligned segments when combined
with the option `--gratis-blank`.
"""
import json
import logging
import os
import sys
import configargparse
import torch
# imports for CTC segmentation
from ctc_segmentation import (
CtcSegmentationParameters,
ctc_segmentation,
determine_utterance_segments,
prepare_text,
)
# imports for inference
from espnet.asr.pytorch_backend.asr_init import load_trained_model
from espnet.nets.asr_interface import ASRInterface
from espnet.utils.io_utils import LoadInputsAndTargets
# NOTE: you need this func to generate our sphinx doc
def get_parser():
    """Get default arguments.

    Returns:
        configargparse.ArgumentParser: Parser for the CTC segmentation script.
        Options can also be supplied through a YAML file given by ``--config``.

    """
    parser = configargparse.ArgumentParser(
        # The two literals used to be joined without a separating space
        # ("...segmentation.using a pre-trained...").
        description="Align text to audio using CTC segmentation "
        "and a pre-trained speech recognition model.",
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
    )
    # general configuration
    parser.add("--config", is_config_file=True, help="Decoding config file path.")
    parser.add_argument(
        "--ngpu", type=int, default=0, help="Number of GPUs (max. 1 is supported)"
    )
    parser.add_argument(
        "--dtype",
        choices=("float16", "float32", "float64"),
        default="float32",
        help="Float precision (only available in --api v2)",
    )
    parser.add_argument(
        "--backend",
        type=str,
        default="pytorch",
        choices=["pytorch"],
        help="Backend library",
    )
    parser.add_argument("--debugmode", type=int, default=1, help="Debugmode")
    parser.add_argument("--verbose", "-V", type=int, default=1, help="Verbose option")
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
        help="The configuration file for the pre-processing",
    )
    # task related
    parser.add_argument(
        "--data-json", type=str, help="Json of recognition data for audio and text"
    )
    parser.add_argument("--utt-text", type=str, help="Text separated into utterances")
    # model (parameter) related
    parser.add_argument(
        "--model", type=str, required=True, help="Model file parameters to read"
    )
    parser.add_argument(
        "--model-conf", type=str, default=None, help="Model config file"
    )
    parser.add_argument(
        "--num-encs", default=1, type=int, help="Number of encoders in the model."
    )
    # ctc-segmentation related
    # These options default to None so that an unset option leaves the
    # corresponding segmentation parameter untouched.
    parser.add_argument(
        "--subsampling-factor",
        type=int,
        default=None,
        help="Subsampling factor."
        " If the encoder sub-samples its input, the number of frames at the CTC layer"
        " is reduced by this factor. For example, a BLSTMP with subsampling 1_2_2_1_1"
        " has a subsampling factor of 4.",
    )
    parser.add_argument(
        "--frame-duration",
        type=int,
        default=None,
        help="Non-overlapping duration of a single frame in milliseconds.",
    )
    parser.add_argument(
        "--min-window-size",
        type=int,
        default=None,
        help="Minimum window size considered for utterance.",
    )
    parser.add_argument(
        "--max-window-size",
        type=int,
        default=None,
        help="Maximum window size considered for utterance.",
    )
    parser.add_argument(
        "--use-dict-blank",
        type=int,
        default=None,
        help="DEPRECATED.",
    )
    parser.add_argument(
        "--set-blank",
        type=int,
        default=None,
        help="Index of model dictionary for blank token (default: 0).",
    )
    parser.add_argument(
        "--gratis-blank",
        type=int,
        default=None,
        help="Set the transition cost of the blank token to zero. Audio sections"
        " labeled with blank tokens can then be skipped without penalty. Useful"
        " if there are unrelated audio segments between utterances.",
    )
    parser.add_argument(
        "--replace-spaces-with-blanks",
        type=int,
        default=None,
        help="Fill blanks in between words to better model pauses between words."
        " Segments can be misaligned if this option is combined with --gratis-blank."
        " May increase length of ground truth.",
    )
    parser.add_argument(
        "--scoring-length",
        type=int,
        default=None,
        help="Changes partitioning length L for calculation of the confidence score.",
    )
    parser.add_argument(
        "--output",
        type=configargparse.FileType("w"),
        required=True,
        help="Output segments file",
    )
    return parser
def main(args):
    """Parse the command line, configure logging and run the CTC alignment.

    Unknown command-line options are tolerated. The process always terminates
    through ``sys.exit``: status 1 when more than one GPU is requested,
    status 0 after a completed alignment.
    """
    args, _unparsed = get_parser().parse_known_args(args)

    # Map the verbosity flag onto a logging level; anything else means WARN.
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    verbosity_levels = {1: logging.INFO, 2: logging.DEBUG}
    quiet = args.verbose not in verbosity_levels
    logging.basicConfig(
        level=logging.WARN if quiet else verbosity_levels[args.verbose],
        format=log_format,
    )
    if quiet:
        logging.warning("Skip DEBUG/INFO messages")

    if args.dtype == "float16" and args.ngpu == 0:
        raise ValueError(f"--dtype {args.dtype} does not support the CPU backend.")

    # Select the inference device; at most one GPU is accepted.
    if args.ngpu > 1:
        logging.error("Decoding only supports ngpu=1.")
        sys.exit(1)
    use_gpu = args.ngpu == 1
    device = "cuda" if use_gpu else "cpu"
    if use_gpu and os.environ.get("CUDA_VISIBLE_DEVICES") is None:
        logging.warning("CUDA_VISIBLE_DEVICES is not set.")

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # Run the alignment with the selected backend.
    logging.info("backend = " + args.backend)
    if args.backend != "pytorch":
        raise ValueError("Only pytorch is supported.")
    ctc_align(args, device)
    sys.exit(0)
def ctc_align(args, device):
    """ESPnet-specific interface for CTC segmentation.

    Parses configuration, infers the CTC posterior probabilities,
    and then aligns start and end of utterances using CTC segmentation.
    Results are written to the output file given in the args.

    :param args: given configuration
    :param device: for inference; one of ['cuda', 'cpu']
    :return: 0 on success
    """
    model, train_args = load_trained_model(args.model)
    assert isinstance(model, ASRInterface)
    load_inputs_and_targets = LoadInputsAndTargets(
        mode="asr",
        load_output=True,
        sort_in_input_length=False,
        # A config given on the command line overrides the training one.
        preprocess_conf=train_args.preprocess_conf
        if args.preprocess_conf is None
        else args.preprocess_conf,
        preprocess_args={"train": False},
    )
    logging.info(f"Decoding device={device}")

    # Warn for nets with high memory consumption on long audio files
    if hasattr(model, "enc"):
        encoder_module = model.enc.__class__.__module__
    elif hasattr(model, "encoder"):
        encoder_module = model.encoder.__class__.__module__
    else:
        encoder_module = "Unknown"
    logging.info(f"Encoder module: {encoder_module}")
    logging.info(f"CTC module: {model.ctc.__class__.__module__}")
    if "rnn" not in encoder_module:
        logging.warning("No BLSTM model detected; memory consumption may be high.")
    model.to(device=device).eval()

    # read audio and text json data
    with open(args.data_json, "rb") as f:
        js = json.load(f)["utts"]
    with open(args.utt_text, "r", encoding="utf-8") as f:
        lines = f.readlines()

    # Group the utterance lines by audio name. Lines are consumed in order,
    # so the text file must list utterances in the same order as the json,
    # each line being "<segment name> <text>" with the audio name as prefix.
    i = 0
    text = {}
    segment_names = {}
    for name in js.keys():
        text_per_audio = []
        segment_names_per_audio = []
        while i < len(lines) and lines[i].startswith(name):
            first_space = lines[i].find(" ")
            text_per_audio.append(lines[i][first_space + 1 :])
            segment_names_per_audio.append(lines[i][:first_space])
            i += 1
        text[name] = text_per_audio
        segment_names[name] = segment_names_per_audio

    # apply configuration
    config = CtcSegmentationParameters()
    subsampling_factor = 1
    frame_duration_ms = 10
    if args.subsampling_factor is not None:
        subsampling_factor = args.subsampling_factor
    if args.frame_duration is not None:
        frame_duration_ms = args.frame_duration
    # Backwards compatibility to ctc_segmentation <= 1.5.3
    if hasattr(config, "index_duration"):
        config.index_duration = frame_duration_ms * subsampling_factor / 1000
    else:
        config.subsampling_factor = subsampling_factor
        config.frame_duration_ms = frame_duration_ms
    if args.min_window_size is not None:
        config.min_window_size = args.min_window_size
    if args.max_window_size is not None:
        config.max_window_size = args.max_window_size
    config.char_list = train_args.char_list
    if args.use_dict_blank is not None:
        logging.warning(
            "The option --use-dict-blank is deprecated. If needed,"
            " use --set-blank instead."
        )
    if args.set_blank is not None:
        config.blank = args.set_blank
    if args.replace_spaces_with_blanks is not None:
        if args.replace_spaces_with_blanks:
            config.replace_spaces_with_blanks = True
        else:
            config.replace_spaces_with_blanks = False
    if args.gratis_blank:
        config.blank_transition_cost_zero = True
    if config.blank_transition_cost_zero and args.replace_spaces_with_blanks:
        logging.error(
            "Blanks are inserted between words, and also the transition cost of blank"
            " is zero. This configuration may lead to misalignments!"
        )
    if args.scoring_length is not None:
        config.score_min_mean_over_L = args.scoring_length
    logging.info(f"Frame timings: {frame_duration_ms}ms * {subsampling_factor}")

    # Iterate over audio files to decode and align
    for idx, name in enumerate(js.keys(), 1):
        # Pass the name as a logging argument: concatenating it into the
        # format string breaks when the name contains a "%".
        logging.info("(%d/%d) Aligning %s", idx, len(js), name)
        batch = [(name, js[name])]
        feat, _ = load_inputs_and_targets(batch)
        feat = feat[0]
        with torch.no_grad():
            # Encode input frames
            enc_output = model.encode(torch.as_tensor(feat).to(device)).unsqueeze(0)
            # Apply ctc layer to obtain log character probabilities
            lpz = model.ctc.log_softmax(enc_output)[0].cpu().numpy()
        # Prepare the text for aligning
        ground_truth_mat, utt_begin_indices = prepare_text(config, text[name])
        # Align using CTC segmentation
        timings, char_probs, state_list = ctc_segmentation(
            config, lpz, ground_truth_mat
        )
        logging.debug(f"state_list = {state_list}")
        # Obtain list of utterances with time intervals and confidence score
        segments = determine_utterance_segments(
            config, utt_begin_indices, char_probs, timings, text[name]
        )
        # Write to "segments" file
        for seg_idx, boundary in enumerate(segments):
            utt_segment = (
                f"{segment_names[name][seg_idx]} {name} {boundary[0]:.2f}"
                f" {boundary[1]:.2f} {boundary[2]:.9f}\n"
            )
            args.output.write(utt_segment)
    return 0
# Script entry point: pass the command-line arguments (without the program
# name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])
#!/usr/bin/env python3
import logging
import os
import random
import sys
from distutils.util import strtobool
import configargparse
import numpy as np
from espnet.asr.pytorch_backend.asr import enhance
# NOTE: you need this func to generate our sphinx doc
def get_parser():
    """Get default arguments for the speech enhancement script.

    Returns:
        configargparse.ArgumentParser: Parser for the enhancement options.
        Options can also be supplied through up to three YAML config files.

    """
    parser = configargparse.ArgumentParser(
        description="Enhance noisy speech for speech recognition",
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
    )
    # general configuration
    parser.add("--config", is_config_file=True, help="config file path")
    parser.add(
        "--config2",
        is_config_file=True,
        help="second config file path that overwrites the settings in `--config`.",
    )
    parser.add(
        "--config3",
        is_config_file=True,
        help="third config file path that overwrites the settings "
        "in `--config` and `--config2`.",
    )
    parser.add_argument("--ngpu", default=0, type=int, help="Number of GPUs")
    parser.add_argument(
        "--backend",
        default="chainer",
        type=str,
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
    parser.add_argument("--seed", default=1, type=int, help="Random seed")
    parser.add_argument("--verbose", "-V", default=1, type=int, help="Verbose option")
    parser.add_argument(
        "--batchsize",
        default=1,
        type=int,
        help="Batch size for beam search (0: means no batch processing)",
    )
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
        help="The configuration file for the pre-processing",
    )
    # task related
    parser.add_argument(
        "--recog-json", type=str, help="Filename of recognition data (json)"
    )
    # model (parameter) related
    parser.add_argument(
        "--model", type=str, required=True, help="Model file parameters to read"
    )
    parser.add_argument(
        "--model-conf", type=str, default=None, help="Model config file"
    )
    # Outputs configuration
    parser.add_argument(
        "--enh-wspecifier",
        type=str,
        default=None,
        # A space was missing between the two sentences of this help text.
        help="Specify the output way for enhanced speech. "
        "e.g. ark,scp:outdir,wav.scp",
    )
    parser.add_argument(
        "--enh-filetype",
        type=str,
        default="sound",
        choices=["mat", "hdf5", "sound.hdf5", "sound"],
        help="Specify the file format for enhanced speech. "
        '"mat" is the matrix format in kaldi',
    )
    parser.add_argument("--fs", type=int, default=16000, help="The sample frequency")
    parser.add_argument(
        "--keep-length",
        type=strtobool,
        default=True,
        help="Adjust the output length to match with the input for enhanced speech",
    )
    parser.add_argument(
        "--image-dir", type=str, default=None, help="The directory saving the images."
    )
    parser.add_argument(
        "--num-images",
        type=int,
        default=20,
        help="The number of images files to be saved. "
        "If negative, all samples are to be saved.",
    )
    # IStft
    parser.add_argument(
        "--apply-istft",
        type=strtobool,
        default=True,
        help="Apply istft to the output from the network",
    )
    parser.add_argument(
        "--istft-win-length",
        type=int,
        default=512,
        help="The window length for istft. "
        "This option is ignored "
        "if stft is found in the preprocess-conf",
    )
    parser.add_argument(
        "--istft-n-shift",
        # The shift is a sample count: parse it as int so that a value given
        # on the command line has the same type as the integer default.
        type=int,
        default=256,
        help="The number of shift points for istft. "
        "This option is ignored "
        "if stft is found in the preprocess-conf",
    )
    parser.add_argument(
        "--istft-window",
        type=str,
        default="hann",
        help="The window type for istft. "
        "This option is ignored "
        "if stft is found in the preprocess-conf",
    )
    return parser
def main(args):
    """Parse arguments, configure logging, seed the RNGs and run enhancement.

    Exits with status 1 when the requested number of GPUs does not match
    CUDA_VISIBLE_DEVICES or exceeds one; raises ValueError for any backend
    other than pytorch.
    """
    args = get_parser().parse_args(args)

    # Map the verbosity flag onto a logging level; anything else means WARN.
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    verbosity_levels = {1: logging.INFO, 2: logging.DEBUG}
    quiet = args.verbose not in verbosity_levels
    logging.basicConfig(
        level=logging.WARN if quiet else verbosity_levels[args.verbose],
        format=log_format,
    )
    if quiet:
        logging.warning("Skip DEBUG/INFO messages")

    # Validate the GPU request against the visible devices.
    if args.ngpu > 0:
        visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
        if visible_devices is None:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
        elif len(visible_devices.split(",")) != args.ngpu:
            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
            sys.exit(1)

        # TODO(kamo): support of multiple GPUs
        if args.ngpu > 1:
            logging.error("The program only supports ngpu=1.")
            sys.exit(1)

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # Seed both the stdlib and the numpy random generators.
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(args.seed)
    logging.info("set random seed = %d" % args.seed)

    # Dispatch to the backend implementation.
    logging.info("backend = " + args.backend)
    if args.backend != "pytorch":
        raise ValueError("Only pytorch is supported.")
    enhance(args)
# Script entry point: pass the command-line arguments (without the program
# name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])
#!/usr/bin/env python3
# encoding: utf-8
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""End-to-end speech recognition model decoding script."""
import logging
import os
import random
import sys
import configargparse
import numpy as np
from espnet.utils.cli_utils import strtobool
# NOTE: you need this func to generate our sphinx doc
def get_parser():
    """Get default arguments.

    Returns:
        configargparse.ArgumentParser: Parser for the decoding options.
        Options can also be supplied through up to three YAML config files.

    """
    parser = configargparse.ArgumentParser(
        description="Transcribe text from speech using "
        "a speech recognition model on one CPU or GPU",
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
    )
    # general configuration
    parser.add("--config", is_config_file=True, help="Config file path")
    parser.add(
        "--config2",
        is_config_file=True,
        help="Second config file path that overwrites the settings in `--config`",
    )
    parser.add(
        "--config3",
        is_config_file=True,
        help="Third config file path that overwrites the settings "
        "in `--config` and `--config2`",
    )

    parser.add_argument("--ngpu", type=int, default=0, help="Number of GPUs")
    parser.add_argument(
        "--dtype",
        choices=("float16", "float32", "float64"),
        default="float32",
        help="Float precision (only available in --api v2)",
    )
    parser.add_argument(
        "--backend",
        type=str,
        default="chainer",
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    parser.add_argument("--debugmode", type=int, default=1, help="Debugmode")
    parser.add_argument("--seed", type=int, default=1, help="Random seed")
    parser.add_argument("--verbose", "-V", type=int, default=1, help="Verbose option")
    parser.add_argument(
        "--batchsize",
        type=int,
        default=1,
        help="Batch size for beam search (0: means no batch processing)",
    )
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
        help="The configuration file for the pre-processing",
    )
    parser.add_argument(
        "--api",
        default="v1",
        choices=["v1", "v2"],
        help="Beam search APIs "
        "v1: Default API. It only supports the ASRInterface.recognize method "
        "and DefaultRNNLM. "
        "v2: Experimental API. It supports any models that implements ScorerInterface.",
    )
    # task related
    parser.add_argument(
        "--recog-json", type=str, help="Filename of recognition data (json)"
    )
    parser.add_argument(
        "--result-label",
        type=str,
        required=True,
        help="Filename of result label data (json)",
    )
    # model (parameter) related
    parser.add_argument(
        "--model", type=str, required=True, help="Model file parameters to read"
    )
    parser.add_argument(
        "--model-conf", type=str, default=None, help="Model config file"
    )
    parser.add_argument(
        "--num-spkrs",
        type=int,
        default=1,
        choices=[1, 2],
        help="Number of speakers in the speech",
    )
    parser.add_argument(
        "--num-encs", default=1, type=int, help="Number of encoders in the model."
    )
    # search related
    parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
    parser.add_argument("--beam-size", type=int, default=1, help="Beam size")
    parser.add_argument("--penalty", type=float, default=0.0, help="Incertion penalty")
    parser.add_argument(
        "--maxlenratio",
        type=float,
        default=0.0,
        help="""Input length ratio to obtain max output length.
                        If maxlenratio=0.0 (default), it uses a end-detect function
                        to automatically find maximum hypothesis lengths.
                        If maxlenratio<0.0, its absolute value is interpreted
                        as a constant max output length""",
    )
    parser.add_argument(
        "--minlenratio",
        type=float,
        default=0.0,
        help="Input length ratio to obtain min output length",
    )
    parser.add_argument(
        "--ctc-weight", type=float, default=0.0, help="CTC weight in joint decoding"
    )
    parser.add_argument(
        "--weights-ctc-dec",
        type=float,
        action="append",
        help="ctc weight assigned to each encoder during decoding."
        "[in multi-encoder mode only]",
    )
    parser.add_argument(
        "--ctc-window-margin",
        type=int,
        default=0,
        help="""Use CTC window with margin parameter to accelerate
                        CTC/attention decoding especially on GPU. Smaller magin
                        makes decoding faster, but may increase search errors.
                        If margin=0 (default), this function is disabled""",
    )
    # transducer related
    parser.add_argument(
        "--search-type",
        type=str,
        default="default",
        choices=["default", "nsc", "tsd", "alsd", "maes"],
        help="""Type of beam search implementation to use during inference.
        Can be either: default beam search ("default"),
        N-Step Constrained beam search ("nsc"), Time-Synchronous Decoding ("tsd"),
        Alignment-Length Synchronous Decoding ("alsd") or
        modified Adaptive Expansion Search ("maes").""",
    )
    parser.add_argument(
        "--nstep",
        type=int,
        default=1,
        help="""Number of expansion steps allowed in NSC beam search or mAES
        (nstep > 0 for NSC and nstep > 1 for mAES).""",
    )
    parser.add_argument(
        "--prefix-alpha",
        type=int,
        default=2,
        help="Length prefix difference allowed in NSC beam search or mAES.",
    )
    parser.add_argument(
        "--max-sym-exp",
        type=int,
        default=2,
        help="Number of symbol expansions allowed in TSD.",
    )
    parser.add_argument(
        "--u-max",
        type=int,
        default=400,
        help="Length prefix difference allowed in ALSD.",
    )
    parser.add_argument(
        "--expansion-gamma",
        type=float,
        default=2.3,
        help="Allowed logp difference for prune-by-value method in mAES.",
    )
    parser.add_argument(
        "--expansion-beta",
        type=int,
        default=2,
        help="""Number of additional candidates for expanded hypotheses
                selection in mAES.""",
    )
    parser.add_argument(
        "--score-norm",
        type=strtobool,
        nargs="?",
        default=True,
        help="Normalize final hypotheses' score by length",
    )
    parser.add_argument(
        "--softmax-temperature",
        type=float,
        default=1.0,
        help="Penalization term for softmax function.",
    )
    # rnnlm related
    parser.add_argument(
        "--rnnlm", type=str, default=None, help="RNNLM model file to read"
    )
    parser.add_argument(
        "--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read"
    )
    parser.add_argument(
        "--word-rnnlm", type=str, default=None, help="Word RNNLM model file to read"
    )
    parser.add_argument(
        "--word-rnnlm-conf",
        type=str,
        default=None,
        help="Word RNNLM model config file to read",
    )
    parser.add_argument("--word-dict", type=str, default=None, help="Word list to read")
    parser.add_argument("--lm-weight", type=float, default=0.1, help="RNNLM weight")
    # ngram related
    parser.add_argument(
        "--ngram-model", type=str, default=None, help="ngram model file to read"
    )
    parser.add_argument("--ngram-weight", type=float, default=0.1, help="ngram weight")
    parser.add_argument(
        "--ngram-scorer",
        type=str,
        default="part",
        choices=("full", "part"),
        help="""if the ngram is set as a part scorer, similar with CTC scorer,
                ngram scorer only scores topK hypethesis.
                if the ngram is set as full scorer, ngram scorer scores all hypthesis
                the decoding speed of part scorer is musch faster than full one""",
    )
    # streaming related
    parser.add_argument(
        "--streaming-mode",
        type=str,
        default=None,
        choices=["window", "segment"],
        help="""Use streaming recognizer for inference.
                        `--batchsize` must be set to 0 to enable this mode""",
    )
    parser.add_argument("--streaming-window", type=int, default=10, help="Window size")
    parser.add_argument(
        "--streaming-min-blank-dur",
        type=int,
        default=10,
        help="Minimum blank duration threshold",
    )
    parser.add_argument(
        "--streaming-onset-margin", type=int, default=1, help="Onset margin"
    )
    parser.add_argument(
        "--streaming-offset-margin", type=int, default=1, help="Offset margin"
    )
    # non-autoregressive related
    # Mask CTC related. See https://arxiv.org/abs/2005.08700 for the detail.
    parser.add_argument(
        "--maskctc-n-iterations",
        type=int,
        default=10,
        help="Number of decoding iterations."
        "For Mask CTC, set 0 to predict 1 mask/iter.",
    )
    parser.add_argument(
        "--maskctc-probability-threshold",
        type=float,
        default=0.999,
        help="Threshold probability for CTC output",
    )
    # quantize model related
    parser.add_argument(
        "--quantize-config",
        nargs="*",
        help="""Config for dynamic quantization provided as a list of modules,
        separated by a comma. E.g.: --quantize-config=[Linear,LSTM,GRU].
        Each specified module should be an attribute of 'torch.nn', e.g.:
        torch.nn.Linear, torch.nn.LSTM, torch.nn.GRU, ...""",
    )
    parser.add_argument(
        "--quantize-dtype",
        type=str,
        default="qint8",
        choices=["float16", "qint8"],
        help="Dtype for dynamic quantization.",
    )
    # NOTE: `type=bool` would turn every non-empty value, including "false"
    # and "0", into True. strtobool parses the textual value instead, as
    # already done for --score-norm.
    parser.add_argument(
        "--quantize-asr-model",
        type=strtobool,
        default=False,
        help="Apply dynamic quantization to ASR model.",
    )
    parser.add_argument(
        "--quantize-lm-model",
        type=strtobool,
        default=False,
        help="Apply dynamic quantization to LM.",
    )
    return parser
def main(args):
    """Parse arguments, set up the environment and dispatch to a decoder.

    The decoder is chosen from the number of speakers, the backend, the
    number of encoders and the beam search API given on the command line.
    """
    args = get_parser().parse_args(args)

    if args.dtype == "float16" and args.ngpu == 0:
        raise ValueError(f"--dtype {args.dtype} does not support the CPU backend.")

    # Map the verbosity flag onto a logging level; anything else means WARN.
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    verbosity_levels = {1: logging.INFO, 2: logging.DEBUG}
    quiet = args.verbose not in verbosity_levels
    logging.basicConfig(
        level=logging.WARN if quiet else verbosity_levels[args.verbose],
        format=log_format,
    )
    if quiet:
        logging.warning("Skip DEBUG/INFO messages")

    # Validate the GPU request against the visible devices.
    if args.ngpu > 0:
        visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
        if visible_devices is None:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
        elif len(visible_devices.split(",")) != args.ngpu:
            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
            sys.exit(1)

        # TODO(mn5k): support of multiple GPUs
        if args.ngpu > 1:
            logging.error("The program only supports ngpu=1.")
            sys.exit(1)

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # Seed both the stdlib and the numpy random generators.
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(args.seed)
    logging.info("set random seed = %d" % args.seed)

    # A character-level and a word-level RNNLM are mutually exclusive.
    if args.rnnlm is not None and args.word_rnnlm is not None:
        logging.error(
            "It seems that both --rnnlm and --word-rnnlm are specified. "
            "Please use either option."
        )
        sys.exit(1)

    logging.info("backend = " + args.backend)

    # Two-speaker decoding: pytorch only.
    if args.num_spkrs == 2:
        if args.backend != "pytorch":
            raise ValueError("Only pytorch is supported.")
        from espnet.asr.pytorch_backend.asr_mix import recog

        recog(args)
        return

    if args.num_spkrs != 1:
        return

    # Single-speaker decoding.
    if args.backend == "chainer":
        from espnet.asr.chainer_backend.asr import recog

        recog(args)
        return

    if args.backend != "pytorch":
        raise ValueError("Only chainer and pytorch are supported.")

    use_v2_api = args.api == "v2"

    if args.num_encs != 1:
        # Multi-encoder models are handled by the v1 API only.
        if use_v2_api:
            raise NotImplementedError(
                f"--num-encs {args.num_encs} > 1 is not supported in --api v2"
            )
        from espnet.asr.pytorch_backend.asr import recog

        recog(args)
        return

    if use_v2_api:
        # Experimental API that supports custom LMs
        from espnet.asr.pytorch_backend.recog import recog_v2

        recog_v2(args)
        return

    from espnet.asr.pytorch_backend.asr import recog

    if args.dtype != "float32":
        raise NotImplementedError(
            f"`--dtype {args.dtype}` is only available with `--api v2`"
        )
    recog(args)
# Script entry point: when executed directly, pass the command-line arguments
# (without the program name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])
#!/usr/bin/env python3
# encoding: utf-8
# Copyright 2017 Tomoki Hayashi (Nagoya University)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""Automatic speech recognition model training script."""
import logging
import os
import random
import subprocess
import sys
import configargparse
import numpy as np
from espnet import __version__
from espnet.utils.cli_utils import strtobool
from espnet.utils.training.batchfy import BATCH_COUNT_CHOICES
# NOTE: you need this func to generate our sphinx doc
def get_parser(parser=None, required=True):
    """Build the argument parser for ASR training.

    Args:
        parser: An existing parser to extend. When None, a new
            ``configargparse.ArgumentParser`` is created with YAML config-file
            support and defaults shown in the help text.
        required (bool): Whether ``--outdir`` and ``--dict`` are registered
            as required options.

    Returns:
        The parser with all ASR training options registered.

    """

    def split_mods(s):
        """Split a comma-separated module list, dropping blank entries."""
        # The previous converter filtered on the whole string (`s != ""`)
        # rather than on each item, so "a,,b" or a trailing comma produced
        # empty module names. Surrounding whitespace is removed as well.
        return [mod for mod in (part.strip() for part in s.split(",")) if mod]

    # Encoder types accepted for the WPE / beamformer mask estimators.
    mask_estimator_types = [
        "lstm",
        "blstm",
        "lstmp",
        "blstmp",
        "vgglstmp",
        "vggblstmp",
        "vgglstm",
        "vggblstm",
        "gru",
        "bgru",
        "grup",
        "bgrup",
        "vgggrup",
        "vggbgrup",
        "vgggru",
        "vggbgru",
    ]

    if parser is None:
        parser = configargparse.ArgumentParser(
            description="Train an automatic speech recognition (ASR) model on one CPU, "
            "one or multiple GPUs",
            config_file_parser_class=configargparse.YAMLConfigFileParser,
            formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
        )
    # general configuration
    parser.add("--config", is_config_file=True, help="config file path")
    parser.add(
        "--config2",
        is_config_file=True,
        help="second config file path that overwrites the settings in `--config`.",
    )
    parser.add(
        "--config3",
        is_config_file=True,
        help="third config file path that overwrites the settings in "
        "`--config` and `--config2`.",
    )

    parser.add_argument(
        "--ngpu",
        default=None,
        type=int,
        help="Number of GPUs. If not given, use all visible devices",
    )
    parser.add_argument(
        "--use-ddp",
        default=False,
        action="store_true",
        help="Enable process-based data parallel. "
        "--ngpu's GPUs will be used. "
        "If --ngpu is not given, this tries to identify "
        "how many GPUs can be used. But, if it fails, "
        "the application will abort. "
        "And, currently, single node multi GPUs job is only supported.",
    )
    parser.add_argument(
        "--train-dtype",
        default="float32",
        choices=["float16", "float32", "float64", "O0", "O1", "O2", "O3"],
        help="Data type for training (only pytorch backend). "
        "O0,O1,.. flags require apex. "
        "See https://nvidia.github.io/apex/amp.html#opt-levels",
    )
    parser.add_argument(
        "--backend",
        default="chainer",
        type=str,
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    parser.add_argument(
        "--outdir", type=str, required=required, help="Output directory"
    )
    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
    parser.add_argument("--dict", required=required, help="Dictionary")
    parser.add_argument("--seed", default=1, type=int, help="Random seed")
    parser.add_argument("--debugdir", type=str, help="Output directory for debugging")
    parser.add_argument(
        "--resume",
        "-r",
        default="",
        nargs="?",
        help="Resume the training from snapshot",
    )
    parser.add_argument(
        "--minibatches",
        "-N",
        type=int,
        # An int default; the former string "-1" only worked because argparse
        # runs string defaults through `type`.
        default=-1,
        help="Process only N minibatches (for debug)",
    )
    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--tensorboard-dir",
        default=None,
        type=str,
        nargs="?",
        help="Tensorboard log dir path",
    )
    parser.add_argument(
        "--report-interval-iters",
        default=100,
        type=int,
        help="Report interval iterations",
    )
    parser.add_argument(
        "--save-interval-iters",
        default=0,
        type=int,
        help="Save snapshot interval iterations",
    )
    # task related
    parser.add_argument(
        "--train-json",
        type=str,
        default=None,
        help="Filename of train label data (json)",
    )
    parser.add_argument(
        "--valid-json",
        type=str,
        default=None,
        help="Filename of validation label data (json)",
    )
    # network architecture
    parser.add_argument(
        "--model-module",
        type=str,
        default=None,
        help="model defined module (default: espnet.nets.xxx_backend.e2e_asr:E2E)",
    )
    # encoder
    parser.add_argument(
        "--num-encs", default=1, type=int, help="Number of encoders in the model."
    )
    # loss related
    parser.add_argument(
        "--ctc_type",
        default="builtin",
        type=str,
        choices=["builtin", "gtnctc", "cudnnctc"],
        help="Type of CTC implementation to calculate loss.",
    )
    parser.add_argument(
        "--mtlalpha",
        default=0.5,
        type=float,
        help="Multitask learning coefficient, "
        "alpha: alpha*ctc_loss + (1-alpha)*att_loss ",
    )
    parser.add_argument(
        "--lsm-weight", default=0.0, type=float, help="Label smoothing weight"
    )
    # recognition options to compute CER/WER
    parser.add_argument(
        "--report-cer",
        default=False,
        action="store_true",
        help="Compute CER on development set",
    )
    parser.add_argument(
        "--report-wer",
        default=False,
        action="store_true",
        help="Compute WER on development set",
    )
    parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
    parser.add_argument("--beam-size", type=int, default=4, help="Beam size")
    parser.add_argument("--penalty", default=0.0, type=float, help="Incertion penalty")
    parser.add_argument(
        "--maxlenratio",
        default=0.0,
        type=float,
        help="""Input length ratio to obtain max output length.
If maxlenratio=0.0 (default), it uses a end-detect function
to automatically find maximum hypothesis lengths""",
    )
    parser.add_argument(
        "--minlenratio",
        default=0.0,
        type=float,
        help="Input length ratio to obtain min output length",
    )
    parser.add_argument(
        "--ctc-weight", default=0.3, type=float, help="CTC weight in joint decoding"
    )
    parser.add_argument(
        "--rnnlm", type=str, default=None, help="RNNLM model file to read"
    )
    parser.add_argument(
        "--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read"
    )
    parser.add_argument("--lm-weight", default=0.1, type=float, help="RNNLM weight.")
    parser.add_argument("--sym-space", default="<space>", type=str, help="Space symbol")
    parser.add_argument("--sym-blank", default="<blank>", type=str, help="Blank symbol")
    # minibatch related
    parser.add_argument(
        "--sortagrad",
        default=0,
        type=int,
        nargs="?",
        help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
    )
    parser.add_argument(
        "--batch-count",
        default="auto",
        choices=BATCH_COUNT_CHOICES,
        help="How to count batch_size. "
        "The default (auto) will find how to count by args.",
    )
    parser.add_argument(
        "--batch-size",
        "--batch-seqs",
        "-b",
        default=0,
        type=int,
        help="Maximum seqs in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-bins",
        default=0,
        type=int,
        help="Maximum bins in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-in",
        default=0,
        type=int,
        help="Maximum input frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-out",
        default=0,
        type=int,
        help="Maximum output frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-inout",
        default=0,
        type=int,
        help="Maximum input+output frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--maxlen-in",
        "--batch-seq-maxlen-in",
        default=800,
        type=int,
        metavar="ML",
        help="When --batch-count=seq, "
        "batch size is reduced if the input sequence length > ML.",
    )
    parser.add_argument(
        "--maxlen-out",
        "--batch-seq-maxlen-out",
        default=150,
        type=int,
        metavar="ML",
        help="When --batch-count=seq, "
        "batch size is reduced if the output sequence length > ML",
    )
    parser.add_argument(
        "--n-iter-processes",
        default=0,
        type=int,
        help="Number of processes of iterator",
    )
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
        nargs="?",
        help="The configuration file for the pre-processing",
    )
    # optimization related
    parser.add_argument(
        "--opt",
        default="adadelta",
        type=str,
        choices=["adadelta", "adam", "noam"],
        help="Optimizer",
    )
    parser.add_argument(
        "--accum-grad", default=1, type=int, help="Number of gradient accumuration"
    )
    parser.add_argument(
        "--eps", default=1e-8, type=float, help="Epsilon constant for optimizer"
    )
    parser.add_argument(
        "--eps-decay", default=0.01, type=float, help="Decaying ratio of epsilon"
    )
    parser.add_argument(
        "--weight-decay", default=0.0, type=float, help="Weight decay ratio"
    )
    parser.add_argument(
        "--criterion",
        default="acc",
        type=str,
        choices=["loss", "loss_eps_decay_only", "acc"],
        help="Criterion to perform epsilon decay",
    )
    parser.add_argument(
        "--threshold", default=1e-4, type=float, help="Threshold to stop iteration"
    )
    parser.add_argument(
        "--epochs", "-e", default=30, type=int, help="Maximum number of epochs"
    )
    parser.add_argument(
        "--early-stop-criterion",
        default="validation/main/acc",
        type=str,
        nargs="?",
        help="Value to monitor to trigger an early stopping of the training",
    )
    parser.add_argument(
        "--patience",
        default=3,
        type=int,
        nargs="?",
        help="Number of epochs to wait without improvement "
        "before stopping the training",
    )
    parser.add_argument(
        "--grad-clip", default=5, type=float, help="Gradient norm threshold to clip"
    )
    parser.add_argument(
        "--num-save-attention",
        default=3,
        type=int,
        help="Number of samples of attention to be saved",
    )
    parser.add_argument(
        "--num-save-ctc",
        default=3,
        type=int,
        help="Number of samples of CTC probability to be saved",
    )
    parser.add_argument(
        "--grad-noise",
        type=strtobool,
        default=False,
        help="The flag to switch to use noise injection to gradients during training",
    )
    # asr_mix related
    parser.add_argument(
        "--num-spkrs",
        default=1,
        type=int,
        choices=[1, 2],
        help="Number of speakers in the speech.",
    )
    # decoder related
    parser.add_argument(
        "--context-residual",
        default=False,
        type=strtobool,
        nargs="?",
        help="The flag to switch to use context vector residual in the decoder network",
    )
    # finetuning related
    parser.add_argument(
        "--enc-init",
        default=None,
        type=str,
        help="Pre-trained ASR model to initialize encoder.",
    )
    parser.add_argument(
        "--enc-init-mods",
        default="enc.enc.",
        type=split_mods,
        help="List of encoder modules to initialize, separated by a comma.",
    )
    parser.add_argument(
        "--dec-init",
        default=None,
        type=str,
        help="Pre-trained ASR, MT or LM model to initialize decoder.",
    )
    parser.add_argument(
        "--dec-init-mods",
        default="att.,dec.",
        type=split_mods,
        help="List of decoder modules to initialize, separated by a comma.",
    )
    parser.add_argument(
        "--freeze-mods",
        default=None,
        type=split_mods,
        help="List of modules to freeze, separated by a comma.",
    )
    # front end related
    parser.add_argument(
        "--use-frontend",
        type=strtobool,
        default=False,
        help="The flag to switch to use frontend system.",
    )
    # WPE related
    parser.add_argument(
        "--use-wpe",
        type=strtobool,
        default=False,
        help="Apply Weighted Prediction Error",
    )
    parser.add_argument(
        "--wtype",
        default="blstmp",
        type=str,
        choices=list(mask_estimator_types),
        help="Type of encoder network architecture "
        "of the mask estimator for WPE. "
        "",
    )
    parser.add_argument("--wlayers", type=int, default=2, help="")
    parser.add_argument("--wunits", type=int, default=300, help="")
    parser.add_argument("--wprojs", type=int, default=300, help="")
    parser.add_argument("--wdropout-rate", type=float, default=0.0, help="")
    parser.add_argument("--wpe-taps", type=int, default=5, help="")
    parser.add_argument("--wpe-delay", type=int, default=3, help="")
    parser.add_argument(
        "--use-dnn-mask-for-wpe",
        type=strtobool,
        default=False,
        help="Use DNN to estimate the power spectrogram. "
        "This option is experimental.",
    )
    # Beamformer related
    parser.add_argument("--use-beamformer", type=strtobool, default=True, help="")
    parser.add_argument(
        "--btype",
        default="blstmp",
        type=str,
        choices=list(mask_estimator_types),
        help="Type of encoder network architecture "
        "of the mask estimator for Beamformer.",
    )
    parser.add_argument("--blayers", type=int, default=2, help="")
    parser.add_argument("--bunits", type=int, default=300, help="")
    parser.add_argument("--bprojs", type=int, default=300, help="")
    parser.add_argument("--badim", type=int, default=320, help="")
    parser.add_argument(
        "--bnmask",
        type=int,
        default=2,
        help="Number of beamforming masks, " "default is 2 for [speech, noise].",
    )
    parser.add_argument(
        "--ref-channel",
        type=int,
        default=-1,
        help="The reference channel used for beamformer. "
        "By default, the channel is estimated by DNN.",
    )
    parser.add_argument("--bdropout-rate", type=float, default=0.0, help="")
    # Feature transform: Normalization
    parser.add_argument(
        "--stats-file",
        type=str,
        default=None,
        help="The stats file for the feature normalization",
    )
    parser.add_argument(
        "--apply-uttmvn",
        type=strtobool,
        default=True,
        help="Apply utterance level mean " "variance normalization.",
    )
    parser.add_argument("--uttmvn-norm-means", type=strtobool, default=True, help="")
    parser.add_argument("--uttmvn-norm-vars", type=strtobool, default=False, help="")
    # Feature transform: Fbank
    parser.add_argument(
        "--fbank-fs",
        type=int,
        default=16000,
        help="The sample frequency used for " "the mel-fbank creation.",
    )
    parser.add_argument(
        "--n-mels", type=int, default=80, help="The number of mel-frequency bins."
    )
    parser.add_argument("--fbank-fmin", type=float, default=0.0, help="")
    parser.add_argument("--fbank-fmax", type=float, default=None, help="")
    return parser
def setup_logging(verbose):
    """Configure the root logger according to the verbosity level.

    Args:
        verbose (int): A value above 0 selects INFO level; anything else
            selects WARN level and emits a warning that DEBUG/INFO messages
            are skipped.

    """
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if verbose > 0:
        logging.basicConfig(level=logging.INFO, format=log_format)
        return
    logging.basicConfig(level=logging.WARN, format=log_format)
    logging.warning("Skip DEBUG/INFO messages")
def main(cmd_args):
    """Run the main training function.

    Args:
        cmd_args (list[str]): Command-line arguments without the program name.

    Raises:
        NotImplementedError: If the chainer backend is combined with a
            ``--train-dtype`` other than float32.
        ValueError: If a half-precision/apex dtype is requested with
            ``--ngpu 0``, if DDP is requested without a GPU, with chainer or
            with more than one speaker, or if the backend is unsupported.

    """
    parser = get_parser()
    # First pass: only the generic options are known at this point, so
    # model-specific options are tolerated and ignored.
    args, _ = parser.parse_known_args(cmd_args)
    if args.backend == "chainer" and args.train_dtype != "float32":
        # The option is --train-dtype (this parser has no --dtype), and the
        # two sentences need a separating space.
        raise NotImplementedError(
            f"chainer backend does not support --train-dtype {args.train_dtype}. "
            "Use --train-dtype float32."
        )
    if args.ngpu == 0 and args.train_dtype in ("O0", "O1", "O2", "O3", "float16"):
        raise ValueError(
            f"--train-dtype {args.train_dtype} does not support the CPU backend."
        )

    from espnet.utils.dynamic_import import dynamic_import

    if args.model_module is None:
        if args.num_spkrs == 1:
            model_module = "espnet.nets." + args.backend + "_backend.e2e_asr:E2E"
        else:
            model_module = "espnet.nets." + args.backend + "_backend.e2e_asr_mix:E2E"
    else:
        model_module = args.model_module
    model_class = dynamic_import(model_module)
    # Let the model class register its own options, then parse strictly.
    model_class.add_arguments(parser)

    args = parser.parse_args(cmd_args)
    args.model_module = model_module
    # The backend is derived from the module path when it names one.
    if "chainer_backend" in args.model_module:
        args.backend = "chainer"
    if "pytorch_backend" in args.model_module:
        args.backend = "pytorch"

    # add version info in args
    args.version = __version__

    # logging info
    setup_logging(args.verbose)

    # If --ngpu is not given,
    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
    #   2. if nvidia-smi exists, use all devices
    #   3. else ngpu=0
    if args.ngpu is None:
        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
        if cvd is not None:
            # Ignore blank entries so that an empty variable counts as 0 GPUs.
            ngpu = len([dev for dev in cvd.split(",") if dev.strip()])
        else:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
            try:
                p = subprocess.run(
                    ["nvidia-smi", "-L"],
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                )
            except (subprocess.CalledProcessError, FileNotFoundError):
                ngpu = 0
            else:
                # `nvidia-smi -L` prints one "GPU <index>: ..." line per
                # device on stdout; stderr (counted previously) holds no
                # device list, so detection always produced 0.
                gpu_lines = [
                    line
                    for line in p.stdout.decode(errors="replace").splitlines()
                    if line.startswith("GPU ")
                ]
                ngpu = len(gpu_lines) if p.returncode == 0 else 0
        # Store the resolved count: args.ngpu is handed to launch() and
        # train() below and must not stay None.
        args.ngpu = ngpu
    else:
        if args.ngpu != 1:
            logging.debug(
                "There are some bugs with multi-GPU processing in PyTorch 1.2+"
                + " (see https://github.com/pytorch/pytorch/issues/21108)"
            )
        ngpu = args.ngpu

    if args.use_ddp and ngpu <= 0:
        raise ValueError("DDP requires at least 1 GPU.")

    logging.info(f"ngpu: {ngpu}")

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # set random seed
    logging.info("random seed = %d" % args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

    # load dictionary for debug log
    if args.dict is not None:
        with open(args.dict, "rb") as f:
            dictionary = f.readlines()
        # The first space-separated field of every line is the token.
        char_list = [entry.decode("utf-8").split(" ")[0] for entry in dictionary]
        char_list.insert(0, "<blank>")
        char_list.append("<eos>")
        # for non-autoregressive maskctc model
        if "maskctc" in args.model_module:
            char_list.append("<mask>")
        args.char_list = char_list
    else:
        args.char_list = None

    # train
    logging.info("backend = " + args.backend)

    if args.use_ddp:
        # When using DDP, only PyTorch is supported.
        # Chainer is out-of-scope.
        if args.num_spkrs == 1:
            if args.backend == "chainer":
                raise ValueError("Chainer with DDP is not supported.")
            from espnet.distributed.pytorch_backend.launch import (
                launch,
                set_start_method,
            )

            # NOTE: it's necessary to set "spawn" as a multiprocessing
            # start method. Because, in this use case, CUDA initialization
            # procedure has been already done, but CUDA context can't be
            # shared with processes.
            # By default, multiprocessing tries to launch a process with
            # "fork" method. But, it will make processes which share
            # memory address spaces with a parent process.
            # To ensure a separate memory space, "spawn" method is required.
            set_start_method("spawn")
            launch(_reinitialize_logging_and_call_train, args, args.ngpu)
        else:
            raise ValueError("Single speaker is only supported when using DDP.")
    else:
        if args.num_spkrs == 1:
            if args.backend == "chainer":
                from espnet.asr.chainer_backend.asr import train

                train(args)
            elif args.backend == "pytorch":
                from espnet.asr.pytorch_backend.asr import train

                train(args)
            else:
                raise ValueError("Only chainer and pytorch are supported.")
        else:
            # FIXME(kamo): Support --model-module
            if args.backend == "pytorch":
                from espnet.asr.pytorch_backend.asr_mix import train

                train(args)
            else:
                raise ValueError("Only pytorch is supported.")
def _reinitialize_logging_and_call_train(args):
    """Set up logging again, then run PyTorch ASR training.

    Passed to ``launch`` as the per-process entry point when DDP is enabled.

    Args:
        args (namespace): The parsed program arguments; ``args.verbose``
            selects the log level.

    """
    # NOTE: the logging setup appears to be cleared when processes are
    # started with the "spawn" method, so each worker process has to
    # configure logging again before training.
    from espnet.asr.pytorch_backend.asr import train as run_training

    setup_logging(args.verbose)
    run_training(args)
# Script entry point: when executed directly, pass the command-line arguments
# (without the program name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])
#!/usr/bin/env python3
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
# This code is ported from the following implementation written in Torch.
# https://github.com/chainer/chainer/blob/master/examples/ptb/train_ptb_custom_loop.py
"""Language model training script."""
import logging
import os
import random
import subprocess
import sys
import configargparse
import numpy as np
from espnet import __version__
from espnet.nets.lm_interface import dynamic_import_lm
from espnet.optimizer.factory import dynamic_import_optimizer
from espnet.scheduler.scheduler import dynamic_import_scheduler
# NOTE: you need this func to generate our sphinx doc
def get_parser(parser=None, required=True):
    """Build the argument parser for language model training.

    Args:
        parser: An existing parser to extend. When None, a new
            ``configargparse.ArgumentParser`` is created with YAML config-file
            support and defaults shown in the help text.
        required (bool): Whether ``--outdir``, ``--dict``, ``--train-label``
            and ``--valid-label`` are registered as required options.

    Returns:
        The parser with all LM training options registered.

    """

    def key_value(kv):
        """Split one ``--schedulers`` item on "=" characters."""
        return kv.split("=")

    if parser is None:
        parser = configargparse.ArgumentParser(
            formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
            config_file_parser_class=configargparse.YAMLConfigFileParser,
            description="Train a new language model on one CPU or one GPU",
        )

    # ---- general configuration ----
    parser.add("--config", help="config file path", is_config_file=True)
    parser.add(
        "--config2",
        help="second config file path that overwrites "
        "the settings in `--config`.",
        is_config_file=True,
    )
    parser.add(
        "--config3",
        help="third config file path that overwrites "
        "the settings in `--config` and `--config2`.",
        is_config_file=True,
    )
    parser.add_argument(
        "--ngpu",
        type=int,
        default=None,
        help="Number of GPUs. If not given, use all visible devices",
    )
    parser.add_argument(
        "--train-dtype",
        choices=["float16", "float32", "float64", "O0", "O1", "O2", "O3"],
        default="float32",
        help="Data type for training (only pytorch backend). "
        "O0,O1,.. flags require apex. See "
        "https://nvidia.github.io/apex/amp.html#opt-levels",
    )
    parser.add_argument(
        "--backend",
        type=str,
        choices=["chainer", "pytorch"],
        default="chainer",
        help="Backend library",
    )
    parser.add_argument(
        "--outdir",
        type=str,
        required=required,
        help="Output directory",
    )
    parser.add_argument("--debugmode", type=int, default=1, help="Debugmode")
    parser.add_argument(
        "--dict",
        type=str,
        required=required,
        help="Dictionary",
    )
    parser.add_argument("--seed", type=int, default=1, help="Random seed")
    parser.add_argument(
        "--resume",
        "-r",
        nargs="?",
        default="",
        help="Resume the training from snapshot",
    )
    parser.add_argument(
        "--verbose",
        "-V",
        type=int,
        default=0,
        help="Verbose option",
    )
    parser.add_argument(
        "--tensorboard-dir",
        type=str,
        nargs="?",
        default=None,
        help="Tensorboard log dir path",
    )
    parser.add_argument(
        "--report-interval-iters",
        type=int,
        default=100,
        help="Report interval iterations",
    )

    # ---- task related ----
    parser.add_argument(
        "--train-label",
        required=required,
        type=str,
        help="Filename of train label data",
    )
    parser.add_argument(
        "--valid-label",
        required=required,
        type=str,
        help="Filename of validation label data",
    )
    parser.add_argument(
        "--test-label",
        type=str,
        help="Filename of test label data",
    )
    parser.add_argument(
        "--dump-hdf5-path",
        default=None,
        type=str,
        help="Path to dump a preprocessed dataset as hdf5",
    )

    # ---- training configuration ----
    parser.add_argument("--opt", type=str, default="sgd", help="Optimizer")
    parser.add_argument(
        "--sortagrad",
        type=int,
        nargs="?",
        default=0,
        help="How many epochs to use sortagrad for. "
        "0 = deactivated, -1 = all epochs",
    )
    parser.add_argument(
        "--batchsize",
        "-b",
        default=300,
        type=int,
        help="Number of examples in each mini-batch",
    )
    parser.add_argument(
        "--accum-grad",
        default=1,
        type=int,
        help="Number of gradient accumueration",
    )
    parser.add_argument(
        "--epoch",
        "-e",
        default=20,
        type=int,
        help="Number of sweeps over the dataset to train",
    )
    parser.add_argument(
        "--early-stop-criterion",
        type=str,
        nargs="?",
        default="validation/main/loss",
        help="Value to monitor to trigger an early stopping of the training",
    )
    parser.add_argument(
        "--patience",
        type=int,
        nargs="?",
        default=3,
        help="Number of epochs to wait without improvement "
        "before stopping the training",
    )
    parser.add_argument(
        "--schedulers",
        action="append",
        type=key_value,
        default=None,
        help="optimizer schedulers, you can configure params like: "
        "<optimizer-param>-<scheduler-name>-<schduler-param> "
        'e.g., "--schedulers lr=noam --lr-noam-warmup 1000".',
    )
    parser.add_argument(
        "--gradclip",
        "-c",
        default=5,
        type=float,
        help="Gradient norm threshold to clip",
    )
    parser.add_argument(
        "--maxlen",
        default=40,
        type=int,
        help="Batch size is reduced if the input sequence > ML",
    )
    parser.add_argument(
        "--model-module",
        default="default",
        type=str,
        help="model defined module (default: "
        "espnet.nets.xxx_backend.lm.default:DefaultRNNLM)",
    )
    return parser
def main(cmd_args):
    """Train LM.

    Args:
        cmd_args (list[str]): Command-line arguments without the program name.

    Raises:
        NotImplementedError: If the chainer backend is combined with a
            ``--train-dtype`` other than float32.
        ValueError: If a half-precision/apex dtype is requested with
            ``--ngpu 0``, or if the backend is unsupported.

    """
    parser = get_parser()
    # First pass: only the generic options are known at this point, so
    # model/optimizer/scheduler options are tolerated and ignored.
    args, _ = parser.parse_known_args(cmd_args)
    if args.backend == "chainer" and args.train_dtype != "float32":
        # The option is --train-dtype (this parser has no --dtype), and the
        # two sentences need a separating space.
        raise NotImplementedError(
            f"chainer backend does not support --train-dtype {args.train_dtype}. "
            "Use --train-dtype float32."
        )
    if args.ngpu == 0 and args.train_dtype in ("O0", "O1", "O2", "O3", "float16"):
        raise ValueError(
            f"--train-dtype {args.train_dtype} does not support the CPU backend."
        )

    # parse arguments dynamically
    model_class = dynamic_import_lm(args.model_module, args.backend)
    model_class.add_arguments(parser)
    if args.schedulers is not None:
        # Each entry is the "=" split of one --schedulers value.
        for k, v in args.schedulers:
            scheduler_class = dynamic_import_scheduler(v)
            scheduler_class.add_arguments(k, parser)

    opt_class = dynamic_import_optimizer(args.opt, args.backend)
    opt_class.add_arguments(parser)

    # Second pass: every dynamically registered option is now known.
    args = parser.parse_args(cmd_args)

    # add version info in args
    args.version = __version__

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # If --ngpu is not given,
    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
    #   2. if nvidia-smi exists, use all devices
    #   3. else ngpu=0
    if args.ngpu is None:
        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
        if cvd is not None:
            # Ignore blank entries so that an empty variable counts as 0 GPUs.
            ngpu = len([dev for dev in cvd.split(",") if dev.strip()])
        else:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
            try:
                p = subprocess.run(
                    ["nvidia-smi", "-L"],
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                )
            except (subprocess.CalledProcessError, FileNotFoundError):
                ngpu = 0
            else:
                # `nvidia-smi -L` prints one "GPU <index>: ..." line per
                # device on stdout; stderr (counted previously) holds no
                # device list, so detection always produced 0.
                gpu_lines = [
                    line
                    for line in p.stdout.decode(errors="replace").splitlines()
                    if line.startswith("GPU ")
                ]
                ngpu = len(gpu_lines) if p.returncode == 0 else 0
        args.ngpu = ngpu
    else:
        ngpu = args.ngpu
    logging.info(f"ngpu: {ngpu}")

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # seed setting
    nseed = args.seed
    random.seed(nseed)
    np.random.seed(nseed)

    # load dictionary
    with open(args.dict, "rb") as f:
        dictionary = f.readlines()
    # The first space-separated field of every line is the token.
    char_list = [entry.decode("utf-8").split(" ")[0] for entry in dictionary]
    char_list.insert(0, "<blank>")
    char_list.append("<eos>")
    args.char_list_dict = {x: i for i, x in enumerate(char_list)}
    args.n_vocab = len(char_list)

    # train
    logging.info("backend = " + args.backend)
    if args.backend == "chainer":
        from espnet.lm.chainer_backend.lm import train

        train(args)
    elif args.backend == "pytorch":
        from espnet.lm.pytorch_backend.lm import train

        train(args)
    else:
        raise ValueError("Only chainer and pytorch are supported.")
# Script entry point: when executed directly, pass the command-line arguments
# (without the program name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])
#!/usr/bin/env python3
# encoding: utf-8
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""Neural machine translation model training script."""
import logging
import os
import random
import subprocess
import sys
import configargparse
import numpy as np
from espnet import __version__
from espnet.utils.cli_utils import strtobool
from espnet.utils.training.batchfy import BATCH_COUNT_CHOICES
# NOTE: you need this func to generate our sphinx doc
def get_parser(parser=None, required=True):
    """Build the argument parser for NMT training.

    Args:
        parser: An existing parser to extend. When None, a new
            ``configargparse.ArgumentParser`` is created with YAML config-file
            support and defaults shown in the help text.
        required (bool): Whether ``--outdir`` and ``--dict`` are registered
            as required options.

    Returns:
        The parser with all NMT training options registered.

    """

    def split_mods(s):
        """Split a comma-separated module list, dropping blank entries."""
        # Items are stripped because the default "att., dec." used to parse
        # into " dec." with a leading space. The previous converter also
        # filtered on the whole string (`s != ""`) rather than on each item,
        # so "a,,b" or a trailing comma produced empty module names.
        return [mod for mod in (part.strip() for part in s.split(",")) if mod]

    if parser is None:
        parser = configargparse.ArgumentParser(
            description="Train a neural machine translation (NMT) model on one CPU, "
            "one or multiple GPUs",
            config_file_parser_class=configargparse.YAMLConfigFileParser,
            formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
        )
    # general configuration
    parser.add("--config", is_config_file=True, help="config file path")
    parser.add(
        "--config2",
        is_config_file=True,
        help="second config file path that overwrites the settings in `--config`.",
    )
    parser.add(
        "--config3",
        is_config_file=True,
        help="third config file path that overwrites the settings "
        "in `--config` and `--config2`.",
    )

    parser.add_argument(
        "--ngpu",
        default=None,
        type=int,
        help="Number of GPUs. If not given, use all visible devices",
    )
    parser.add_argument(
        "--train-dtype",
        default="float32",
        choices=["float16", "float32", "float64", "O0", "O1", "O2", "O3"],
        help="Data type for training (only pytorch backend). "
        "O0,O1,.. flags require apex. "
        "See https://nvidia.github.io/apex/amp.html#opt-levels",
    )
    parser.add_argument(
        "--backend",
        default="chainer",
        type=str,
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    parser.add_argument(
        "--outdir", type=str, required=required, help="Output directory"
    )
    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
    parser.add_argument(
        "--dict", required=required, help="Dictionary for source/target languages"
    )
    parser.add_argument("--seed", default=1, type=int, help="Random seed")
    parser.add_argument("--debugdir", type=str, help="Output directory for debugging")
    parser.add_argument(
        "--resume",
        "-r",
        default="",
        nargs="?",
        help="Resume the training from snapshot",
    )
    parser.add_argument(
        "--minibatches",
        "-N",
        type=int,
        # An int default; the former string "-1" only worked because argparse
        # runs string defaults through `type`.
        default=-1,
        help="Process only N minibatches (for debug)",
    )
    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--tensorboard-dir",
        default=None,
        type=str,
        nargs="?",
        help="Tensorboard log dir path",
    )
    parser.add_argument(
        "--report-interval-iters",
        default=100,
        type=int,
        help="Report interval iterations",
    )
    parser.add_argument(
        "--save-interval-iters",
        default=0,
        type=int,
        help="Save snapshot interval iterations",
    )
    # task related
    parser.add_argument(
        "--train-json",
        type=str,
        default=None,
        help="Filename of train label data (json)",
    )
    parser.add_argument(
        "--valid-json",
        type=str,
        default=None,
        help="Filename of validation label data (json)",
    )
    # network architecture
    parser.add_argument(
        "--model-module",
        type=str,
        default=None,
        help="model defined module (default: espnet.nets.xxx_backend.e2e_mt:E2E)",
    )
    # loss related
    parser.add_argument(
        "--lsm-weight", default=0.0, type=float, help="Label smoothing weight"
    )
    # translations options to compute BLEU
    # NOTE(review): with action="store_true" and default=True this flag can
    # never be switched off from the command line. Left unchanged because
    # altering it would change the CLI.
    parser.add_argument(
        "--report-bleu",
        default=True,
        action="store_true",
        help="Compute BLEU on development set",
    )
    parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
    parser.add_argument("--beam-size", type=int, default=4, help="Beam size")
    parser.add_argument("--penalty", default=0.0, type=float, help="Incertion penalty")
    parser.add_argument(
        "--maxlenratio",
        default=0.0,
        type=float,
        help="""Input length ratio to obtain max output length.
If maxlenratio=0.0 (default), it uses a end-detect function
to automatically find maximum hypothesis lengths""",
    )
    parser.add_argument(
        "--minlenratio",
        default=0.0,
        type=float,
        help="Input length ratio to obtain min output length",
    )
    parser.add_argument(
        "--rnnlm", type=str, default=None, help="RNNLM model file to read"
    )
    parser.add_argument(
        "--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read"
    )
    parser.add_argument("--lm-weight", default=0.0, type=float, help="RNNLM weight.")
    parser.add_argument("--sym-space", default="<space>", type=str, help="Space symbol")
    parser.add_argument("--sym-blank", default="<blank>", type=str, help="Blank symbol")
    # minibatch related
    parser.add_argument(
        "--sortagrad",
        default=0,
        type=int,
        nargs="?",
        help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
    )
    parser.add_argument(
        "--batch-count",
        default="auto",
        choices=BATCH_COUNT_CHOICES,
        help="How to count batch_size. "
        "The default (auto) will find how to count by args.",
    )
    parser.add_argument(
        "--batch-size",
        "--batch-seqs",
        "-b",
        default=0,
        type=int,
        help="Maximum seqs in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-bins",
        default=0,
        type=int,
        help="Maximum bins in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-in",
        default=0,
        type=int,
        help="Maximum input frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-out",
        default=0,
        type=int,
        help="Maximum output frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-inout",
        default=0,
        type=int,
        help="Maximum input+output frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--maxlen-in",
        "--batch-seq-maxlen-in",
        default=100,
        type=int,
        metavar="ML",
        help="When --batch-count=seq, "
        "batch size is reduced if the input sequence length > ML.",
    )
    parser.add_argument(
        "--maxlen-out",
        "--batch-seq-maxlen-out",
        default=100,
        type=int,
        metavar="ML",
        help="When --batch-count=seq, "
        "batch size is reduced if the output sequence length > ML",
    )
    parser.add_argument(
        "--n-iter-processes",
        default=0,
        type=int,
        help="Number of processes of iterator",
    )
    # optimization related
    parser.add_argument(
        "--opt",
        default="adadelta",
        type=str,
        choices=["adadelta", "adam", "noam"],
        help="Optimizer",
    )
    parser.add_argument(
        "--accum-grad", default=1, type=int, help="Number of gradient accumuration"
    )
    parser.add_argument(
        "--eps", default=1e-8, type=float, help="Epsilon constant for optimizer"
    )
    parser.add_argument(
        "--eps-decay", default=0.01, type=float, help="Decaying ratio of epsilon"
    )
    parser.add_argument(
        "--lr", default=1e-3, type=float, help="Learning rate for optimizer"
    )
    parser.add_argument(
        "--lr-decay", default=1.0, type=float, help="Decaying ratio of learning rate"
    )
    parser.add_argument(
        "--weight-decay", default=0.0, type=float, help="Weight decay ratio"
    )
    parser.add_argument(
        "--criterion",
        default="acc",
        type=str,
        choices=["loss", "acc"],
        help="Criterion to perform epsilon decay",
    )
    parser.add_argument(
        "--threshold", default=1e-4, type=float, help="Threshold to stop iteration"
    )
    parser.add_argument(
        "--epochs", "-e", default=30, type=int, help="Maximum number of epochs"
    )
    parser.add_argument(
        "--early-stop-criterion",
        default="validation/main/acc",
        type=str,
        nargs="?",
        help="Value to monitor to trigger an early stopping of the training",
    )
    parser.add_argument(
        "--patience",
        default=3,
        type=int,
        nargs="?",
        help="Number of epochs to wait "
        "without improvement before stopping the training",
    )
    parser.add_argument(
        "--grad-clip", default=5, type=float, help="Gradient norm threshold to clip"
    )
    parser.add_argument(
        "--num-save-attention",
        default=3,
        type=int,
        help="Number of samples of attention to be saved",
    )
    # decoder related
    parser.add_argument(
        "--context-residual",
        default=False,
        type=strtobool,
        nargs="?",
        help="The flag to switch to use context vector residual in the decoder network",
    )
    parser.add_argument(
        "--tie-src-tgt-embedding",
        default=False,
        type=strtobool,
        nargs="?",
        help="Tie parameters of source embedding and target embedding.",
    )
    parser.add_argument(
        "--tie-classifier",
        default=False,
        type=strtobool,
        nargs="?",
        help="Tie parameters of target embedding and output projection layer.",
    )
    # finetuning related
    parser.add_argument(
        "--enc-init",
        default=None,
        type=str,
        nargs="?",
        help="Pre-trained ASR model to initialize encoder.",
    )
    parser.add_argument(
        "--enc-init-mods",
        default="enc.enc.",
        type=split_mods,
        help="List of encoder modules to initialize, separated by a comma.",
    )
    parser.add_argument(
        "--dec-init",
        default=None,
        type=str,
        nargs="?",
        help="Pre-trained ASR, MT or LM model to initialize decoder.",
    )
    parser.add_argument(
        "--dec-init-mods",
        # The literal is kept so the help text is unchanged; split_mods
        # removes the space after the comma when the default is converted.
        default="att., dec.",
        type=split_mods,
        help="List of decoder modules to initialize, separated by a comma.",
    )
    # multilingual related
    parser.add_argument(
        "--multilingual",
        default=False,
        type=strtobool,
        help="Prepend target language ID to the source sentence. "
        "Both source/target language IDs must be prepend in the pre-processing stage.",
    )
    parser.add_argument(
        "--replace-sos",
        default=False,
        type=strtobool,
        help="Replace <sos> in the decoder with a target language ID "
        "(the first token in the target sequence)",
    )
    return parser
def main(cmd_args):
    """Run the main training function.

    Args:
        cmd_args (list): Command-line arguments without the program name
            (e.g. ``sys.argv[1:]``).

    Raises:
        NotImplementedError: If the chainer backend is requested together
            with a ``--train-dtype`` other than ``float32``.
        ValueError: If a GPU-only ``--train-dtype`` is requested with
            ``--ngpu 0``, or if the resolved backend is not pytorch.

    """
    parser = get_parser()
    # First pass tolerates unknown options: the model-specific arguments are
    # only registered below, once the model class has been imported.
    args, _ = parser.parse_known_args(cmd_args)
    if args.backend == "chainer" and args.train_dtype != "float32":
        raise NotImplementedError(
            f"chainer backend does not support --train-dtype {args.train_dtype}. "
            "Use --train-dtype float32."
        )
    if args.ngpu == 0 and args.train_dtype in ("O0", "O1", "O2", "O3", "float16"):
        raise ValueError(
            f"--train-dtype {args.train_dtype} does not support the CPU backend."
        )

    from espnet.utils.dynamic_import import dynamic_import

    if args.model_module is None:
        model_module = "espnet.nets." + args.backend + "_backend.e2e_mt:E2E"
    else:
        model_module = args.model_module
    model_class = dynamic_import(model_module)
    model_class.add_arguments(parser)

    # Second pass: strict parsing now that every option is registered.
    args = parser.parse_args(cmd_args)
    args.model_module = model_module
    if "chainer_backend" in args.model_module:
        args.backend = "chainer"
    if "pytorch_backend" in args.model_module:
        args.backend = "pytorch"

    # add version info in args
    args.version = __version__

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # If --ngpu is not given,
    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
    #   2. if nvidia-smi exists, use all devices
    #   3. else ngpu=0
    if args.ngpu is None:
        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
        if cvd is not None:
            # Count only non-empty entries so that CUDA_VISIBLE_DEVICES=""
            # (no visible device) yields 0 instead of 1.
            ngpu = len([dev for dev in cvd.split(",") if dev.strip()])
        else:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
            try:
                p = subprocess.run(
                    ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
                )
            except (subprocess.CalledProcessError, FileNotFoundError):
                ngpu = 0
            else:
                if p.returncode != 0:
                    # nvidia-smi is installed but failed (e.g. no driver);
                    # its output is an error message, not a device list.
                    ngpu = 0
                else:
                    # The device list is written to stdout, one line per GPU.
                    ngpu = len(
                        [ln for ln in p.stdout.decode().splitlines() if ln.strip()]
                    )
        args.ngpu = ngpu
    else:
        if args.ngpu != 1:
            logging.debug(
                "There are some bugs with multi-GPU processing in PyTorch 1.2+"
                + " (see https://github.com/pytorch/pytorch/issues/21108)"
            )
        ngpu = args.ngpu
    logging.info(f"ngpu: {ngpu}")

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # set random seed
    logging.info("random seed = %d" % args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

    # load dictionary for debug log
    if args.dict is not None:
        with open(args.dict, "rb") as f:
            dictionary = f.readlines()
        # The first space-separated field of each line is the token itself.
        char_list = [entry.decode("utf-8").split(" ")[0] for entry in dictionary]
        char_list.insert(0, "<blank>")
        char_list.append("<eos>")
        args.char_list = char_list
    else:
        args.char_list = None

    # train
    logging.info("backend = " + args.backend)
    if args.backend == "pytorch":
        from espnet.mt.pytorch_backend.mt import train

        train(args)
    else:
        raise ValueError("Only pytorch are supported.")


if __name__ == "__main__":
    main(sys.argv[1:])
#!/usr/bin/env python3
# encoding: utf-8
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""Neural machine translation model decoding script."""
import logging
import os
import random
import sys
import configargparse
import numpy as np
# NOTE: you need this func to generate our sphinx doc
def get_parser():
    """Get default arguments.

    Returns:
        configargparse.ArgumentParser: Parser holding the options of the
            machine translation decoding script.

    """
    parser = configargparse.ArgumentParser(
        description="Translate text using a neural machine translation model "
        "on one CPU or GPU",
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
    )
    # general configuration
    parser.add("--config", is_config_file=True, help="Config file path")
    parser.add(
        "--config2",
        is_config_file=True,
        help="Second config file path that overwrites the settings in `--config`",
    )
    parser.add(
        "--config3",
        is_config_file=True,
        help="Third config file path "
        "that overwrites the settings in `--config` and `--config2`",
    )

    parser.add_argument("--ngpu", type=int, default=0, help="Number of GPUs")
    parser.add_argument(
        "--dtype",
        choices=("float16", "float32", "float64"),
        default="float32",
        help="Float precision (only available in --api v2)",
    )
    parser.add_argument(
        "--backend",
        type=str,
        default="chainer",
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    parser.add_argument("--debugmode", type=int, default=1, help="Debugmode")
    parser.add_argument("--seed", type=int, default=1, help="Random seed")
    parser.add_argument("--verbose", "-V", type=int, default=1, help="Verbose option")
    parser.add_argument(
        "--batchsize",
        type=int,
        default=1,
        help="Batch size for beam search (0: means no batch processing)",
    )
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
        help="The configuration file for the pre-processing",
    )
    parser.add_argument(
        "--api",
        default="v1",
        choices=["v1", "v2"],
        help="Beam search APIs "
        "v1: Default API. It only supports "
        "the ASRInterface.recognize method and DefaultRNNLM. "
        "v2: Experimental API. "
        "It supports any models that implements ScorerInterface.",
    )
    # task related
    parser.add_argument(
        "--trans-json", type=str, help="Filename of translation data (json)"
    )
    parser.add_argument(
        "--result-label",
        type=str,
        required=True,
        help="Filename of result label data (json)",
    )
    # model (parameter) related
    parser.add_argument(
        "--model", type=str, required=True, help="Model file parameters to read"
    )
    parser.add_argument(
        "--model-conf", type=str, default=None, help="Model config file"
    )
    # search related
    parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
    parser.add_argument("--beam-size", type=int, default=1, help="Beam size")
    parser.add_argument("--penalty", type=float, default=0.1, help="Insertion penalty")
    # The default here is 3.0, so the help text must not call 0.0 the default.
    parser.add_argument(
        "--maxlenratio",
        type=float,
        default=3.0,
        help="Input length ratio to obtain max output length. "
        "If maxlenratio=0.0, it uses a end-detect function "
        "to automatically find maximum hypothesis lengths",
    )
    parser.add_argument(
        "--minlenratio",
        type=float,
        default=0.0,
        help="Input length ratio to obtain min output length",
    )
    # multilingual related
    parser.add_argument(
        "--tgt-lang",
        default=False,
        type=str,
        help="target language ID (e.g., <en>, <de>, and <fr> etc.)",
    )
    return parser
def main(args):
    """Parse the decoding options and launch machine translation decoding.

    Args:
        args (list): Command-line arguments without the program name.

    Raises:
        NotImplementedError: If ``--dtype`` is not ``float32``.
        ValueError: If the selected backend is not pytorch.

    """
    args = get_parser().parse_args(args)

    # logging info: 1 -> INFO, 2 -> DEBUG, anything else -> WARN
    verbosity_levels = {1: logging.INFO, 2: logging.DEBUG}
    is_quiet = args.verbose not in verbosity_levels
    logging.basicConfig(
        level=logging.WARN if is_quiet else verbosity_levels[args.verbose],
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )
    if is_quiet:
        logging.warning("Skip DEBUG/INFO messages")

    # check CUDA_VISIBLE_DEVICES
    if args.ngpu > 0:
        visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
        if visible_devices is None:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
        elif len(visible_devices.split(",")) != args.ngpu:
            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
            sys.exit(1)

        # TODO(mn5k): support of multiple GPUs
        if args.ngpu > 1:
            logging.error("The program only supports ngpu=1.")
            sys.exit(1)

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # seed setting
    random.seed(args.seed)
    np.random.seed(args.seed)
    logging.info("set random seed = %d" % args.seed)

    # trans
    logging.info("backend = " + args.backend)
    if args.backend != "pytorch":
        raise ValueError("Only pytorch are supported.")

    from espnet.mt.pytorch_backend.mt import trans

    # Only single precision is accepted by this entry point.
    if args.dtype != "float32":
        raise NotImplementedError(
            f"`--dtype {args.dtype}` is only available with `--api v2`"
        )
    trans(args)


if __name__ == "__main__":
    main(sys.argv[1:])
#!/usr/bin/env python3
# encoding: utf-8
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""End-to-end speech translation model training script."""
import logging
import os
import random
import subprocess
import sys
import configargparse
import numpy as np
from espnet import __version__
from espnet.utils.cli_utils import strtobool
from espnet.utils.training.batchfy import BATCH_COUNT_CHOICES
# NOTE: you need this func to generate our sphinx doc
def get_parser(parser=None, required=True):
    """Get default arguments.

    Args:
        parser (configargparse.ArgumentParser, optional): Parser to extend.
            A new parser is created when None.
        required (bool): Value passed as ``required`` for ``--outdir`` and
            ``--dict``.

    Returns:
        configargparse.ArgumentParser: Parser holding the options of the
            speech translation training script.

    """
    if parser is None:
        parser = configargparse.ArgumentParser(
            description="Train a speech translation (ST) model on one CPU, "
            "one or multiple GPUs",
            config_file_parser_class=configargparse.YAMLConfigFileParser,
            formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
        )
    # general configuration
    parser.add("--config", is_config_file=True, help="config file path")
    parser.add(
        "--config2",
        is_config_file=True,
        help="second config file path that overwrites the settings in `--config`.",
    )
    parser.add(
        "--config3",
        is_config_file=True,
        help="third config file path that overwrites the settings "
        "in `--config` and `--config2`.",
    )

    parser.add_argument(
        "--ngpu",
        default=None,
        type=int,
        help="Number of GPUs. If not given, use all visible devices",
    )
    parser.add_argument(
        "--train-dtype",
        default="float32",
        choices=["float16", "float32", "float64", "O0", "O1", "O2", "O3"],
        help="Data type for training (only pytorch backend). "
        "O0,O1,.. flags require apex. "
        "See https://nvidia.github.io/apex/amp.html#opt-levels",
    )
    parser.add_argument(
        "--backend",
        default="chainer",
        type=str,
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    parser.add_argument(
        "--outdir", type=str, required=required, help="Output directory"
    )
    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
    parser.add_argument("--dict", required=required, help="Dictionary")
    parser.add_argument("--seed", default=1, type=int, help="Random seed")
    parser.add_argument("--debugdir", type=str, help="Output directory for debugging")
    parser.add_argument(
        "--resume",
        "-r",
        default="",
        nargs="?",
        help="Resume the training from snapshot",
    )
    parser.add_argument(
        "--minibatches",
        "-N",
        type=int,
        default=-1,
        help="Process only N minibatches (for debug)",
    )
    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--tensorboard-dir",
        default=None,
        type=str,
        nargs="?",
        help="Tensorboard log dir path",
    )
    parser.add_argument(
        "--report-interval-iters",
        default=100,
        type=int,
        help="Report interval iterations",
    )
    parser.add_argument(
        "--save-interval-iters",
        default=0,
        type=int,
        help="Save snapshot interval iterations",
    )
    # task related
    parser.add_argument(
        "--train-json",
        type=str,
        default=None,
        help="Filename of train label data (json)",
    )
    parser.add_argument(
        "--valid-json",
        type=str,
        default=None,
        help="Filename of validation label data (json)",
    )
    # network architecture
    parser.add_argument(
        "--model-module",
        type=str,
        default=None,
        help="model defined module (default: espnet.nets.xxx_backend.e2e_st:E2E)",
    )
    # loss related
    parser.add_argument(
        "--ctc_type",
        default="builtin",
        type=str,
        choices=["builtin", "gtnctc", "cudnnctc"],
        help="Type of CTC implementation to calculate loss.",
    )
    parser.add_argument(
        "--mtlalpha",
        default=0.0,
        type=float,
        help="Multitask learning coefficient, alpha: "
        "alpha*ctc_loss + (1-alpha)*att_loss",
    )
    parser.add_argument(
        "--asr-weight",
        default=0.0,
        type=float,
        help="Multitask learning coefficient for ASR task, weight: "
        " asr_weight*(alpha*ctc_loss + (1-alpha)*att_loss)"
        " + (1-asr_weight-mt_weight)*st_loss",
    )
    parser.add_argument(
        "--mt-weight",
        default=0.0,
        type=float,
        help="Multitask learning coefficient for MT task, weight: "
        "mt_weight*mt_loss + (1-mt_weight-asr_weight)*st_loss",
    )
    parser.add_argument(
        "--lsm-weight", default=0.0, type=float, help="Label smoothing weight"
    )
    # recognition options to compute CER/WER
    parser.add_argument(
        "--report-cer",
        default=False,
        action="store_true",
        help="Compute CER on development set",
    )
    parser.add_argument(
        "--report-wer",
        default=False,
        action="store_true",
        help="Compute WER on development set",
    )
    # translations options to compute BLEU
    # NOTE: with default=True a plain store_true flag could never be switched
    # off. Taking an optional boolean keeps a bare `--report-bleu` meaning
    # True while allowing `--report-bleu false`.
    parser.add_argument(
        "--report-bleu",
        default=True,
        type=strtobool,
        nargs="?",
        const=True,
        help="Compute BLEU on development set",
    )
    parser.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
    parser.add_argument("--beam-size", type=int, default=4, help="Beam size")
    parser.add_argument("--penalty", default=0.0, type=float, help="Insertion penalty")
    parser.add_argument(
        "--maxlenratio",
        default=0.0,
        type=float,
        help="Input length ratio to obtain max output length. "
        "If maxlenratio=0.0 (default), it uses a end-detect function "
        "to automatically find maximum hypothesis lengths",
    )
    parser.add_argument(
        "--minlenratio",
        default=0.0,
        type=float,
        help="Input length ratio to obtain min output length",
    )
    parser.add_argument(
        "--rnnlm", type=str, default=None, help="RNNLM model file to read"
    )
    parser.add_argument(
        "--rnnlm-conf", type=str, default=None, help="RNNLM model config file to read"
    )
    parser.add_argument("--lm-weight", default=0.0, type=float, help="RNNLM weight.")
    parser.add_argument("--sym-space", default="<space>", type=str, help="Space symbol")
    parser.add_argument("--sym-blank", default="<blank>", type=str, help="Blank symbol")
    # minibatch related
    parser.add_argument(
        "--sortagrad",
        default=0,
        type=int,
        nargs="?",
        help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
    )
    parser.add_argument(
        "--batch-count",
        default="auto",
        choices=BATCH_COUNT_CHOICES,
        help="How to count batch_size. "
        "The default (auto) will find how to count by args.",
    )
    parser.add_argument(
        "--batch-size",
        "--batch-seqs",
        "-b",
        default=0,
        type=int,
        help="Maximum seqs in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-bins",
        default=0,
        type=int,
        help="Maximum bins in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-in",
        default=0,
        type=int,
        help="Maximum input frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-out",
        default=0,
        type=int,
        help="Maximum output frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-inout",
        default=0,
        type=int,
        help="Maximum input+output frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--maxlen-in",
        "--batch-seq-maxlen-in",
        default=800,
        type=int,
        metavar="ML",
        help="When --batch-count=seq, batch size is reduced "
        "if the input sequence length > ML.",
    )
    parser.add_argument(
        "--maxlen-out",
        "--batch-seq-maxlen-out",
        default=150,
        type=int,
        metavar="ML",
        help="When --batch-count=seq, "
        "batch size is reduced if the output sequence length > ML",
    )
    parser.add_argument(
        "--n-iter-processes",
        default=0,
        type=int,
        help="Number of processes of iterator",
    )
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
        nargs="?",
        help="The configuration file for the pre-processing",
    )
    # optimization related
    parser.add_argument(
        "--opt",
        default="adadelta",
        type=str,
        choices=["adadelta", "adam", "noam"],
        help="Optimizer",
    )
    parser.add_argument(
        "--accum-grad", default=1, type=int, help="Number of gradient accumulation"
    )
    parser.add_argument(
        "--eps", default=1e-8, type=float, help="Epsilon constant for optimizer"
    )
    parser.add_argument(
        "--eps-decay", default=0.01, type=float, help="Decaying ratio of epsilon"
    )
    parser.add_argument(
        "--lr", default=1e-3, type=float, help="Learning rate for optimizer"
    )
    parser.add_argument(
        "--lr-decay", default=1.0, type=float, help="Decaying ratio of learning rate"
    )
    parser.add_argument(
        "--weight-decay", default=0.0, type=float, help="Weight decay ratio"
    )
    parser.add_argument(
        "--criterion",
        default="acc",
        type=str,
        choices=["loss", "acc"],
        help="Criterion to perform epsilon decay",
    )
    parser.add_argument(
        "--threshold", default=1e-4, type=float, help="Threshold to stop iteration"
    )
    parser.add_argument(
        "--epochs", "-e", default=30, type=int, help="Maximum number of epochs"
    )
    parser.add_argument(
        "--early-stop-criterion",
        default="validation/main/acc",
        type=str,
        nargs="?",
        help="Value to monitor to trigger an early stopping of the training",
    )
    parser.add_argument(
        "--patience",
        default=3,
        type=int,
        nargs="?",
        help="Number of epochs to wait "
        "without improvement before stopping the training",
    )
    parser.add_argument(
        "--grad-clip", default=5, type=float, help="Gradient norm threshold to clip"
    )
    parser.add_argument(
        "--num-save-attention",
        default=3,
        type=int,
        help="Number of samples of attention to be saved",
    )
    parser.add_argument(
        "--num-save-ctc",
        default=3,
        type=int,
        help="Number of samples of CTC probability to be saved",
    )
    parser.add_argument(
        "--grad-noise",
        type=strtobool,
        default=False,
        help="The flag to switch to use noise injection to gradients during training",
    )
    # speech translation related
    parser.add_argument(
        "--context-residual",
        default=False,
        type=strtobool,
        nargs="?",
        help="The flag to switch to use context vector residual in the decoder network",
    )
    # finetuning related
    parser.add_argument(
        "--enc-init",
        default=None,
        type=str,
        nargs="?",
        help="Pre-trained ASR model to initialize encoder.",
    )
    parser.add_argument(
        "--enc-init-mods",
        default="enc.enc.",
        type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
        help="List of encoder modules to initialize, separated by a comma.",
    )
    parser.add_argument(
        "--dec-init",
        default=None,
        type=str,
        nargs="?",
        help="Pre-trained ASR, MT or LM model to initialize decoder.",
    )
    parser.add_argument(
        "--dec-init-mods",
        default="att., dec.",
        type=lambda s: [str(mod) for mod in s.split(",") if s != ""],
        help="List of decoder modules to initialize, separated by a comma.",
    )
    # multilingual related
    parser.add_argument(
        "--multilingual",
        default=False,
        type=strtobool,
        help="Prepend target language ID to the source sentence. "
        " Both source/target language IDs must be prepend in the pre-processing stage.",
    )
    parser.add_argument(
        "--replace-sos",
        default=False,
        type=strtobool,
        help="Replace <sos> in the decoder with a target language ID "
        "(the first token in the target sequence)",
    )
    # Feature transform: Normalization
    parser.add_argument(
        "--stats-file",
        type=str,
        default=None,
        help="The stats file for the feature normalization",
    )
    parser.add_argument(
        "--apply-uttmvn",
        type=strtobool,
        default=True,
        help="Apply utterance level mean " "variance normalization.",
    )
    parser.add_argument("--uttmvn-norm-means", type=strtobool, default=True, help="")
    parser.add_argument("--uttmvn-norm-vars", type=strtobool, default=False, help="")
    # Feature transform: Fbank
    parser.add_argument(
        "--fbank-fs",
        type=int,
        default=16000,
        help="The sample frequency used for " "the mel-fbank creation.",
    )
    parser.add_argument(
        "--n-mels", type=int, default=80, help="The number of mel-frequency bins."
    )
    parser.add_argument("--fbank-fmin", type=float, default=0.0, help="")
    parser.add_argument("--fbank-fmax", type=float, default=None, help="")
    return parser
def main(cmd_args):
    """Run the main training function.

    Args:
        cmd_args (list): Command-line arguments without the program name
            (e.g. ``sys.argv[1:]``).

    Raises:
        NotImplementedError: If the chainer backend is requested together
            with a ``--train-dtype`` other than ``float32``.
        ValueError: If a GPU-only ``--train-dtype`` is requested with
            ``--ngpu 0``, or if the resolved backend is not pytorch.

    """
    parser = get_parser()
    # First pass tolerates unknown options: the model-specific arguments are
    # only registered below, once the model class has been imported.
    args, _ = parser.parse_known_args(cmd_args)
    if args.backend == "chainer" and args.train_dtype != "float32":
        raise NotImplementedError(
            f"chainer backend does not support --train-dtype {args.train_dtype}. "
            "Use --train-dtype float32."
        )
    if args.ngpu == 0 and args.train_dtype in ("O0", "O1", "O2", "O3", "float16"):
        raise ValueError(
            f"--train-dtype {args.train_dtype} does not support the CPU backend."
        )

    from espnet.utils.dynamic_import import dynamic_import

    if args.model_module is None:
        model_module = "espnet.nets." + args.backend + "_backend.e2e_st:E2E"
    else:
        model_module = args.model_module
    model_class = dynamic_import(model_module)
    model_class.add_arguments(parser)

    # Second pass: strict parsing now that every option is registered.
    args = parser.parse_args(cmd_args)
    args.model_module = model_module
    if "chainer_backend" in args.model_module:
        args.backend = "chainer"
    if "pytorch_backend" in args.model_module:
        args.backend = "pytorch"

    # add version info in args
    args.version = __version__

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # If --ngpu is not given,
    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
    #   2. if nvidia-smi exists, use all devices
    #   3. else ngpu=0
    if args.ngpu is None:
        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
        if cvd is not None:
            # Count only non-empty entries so that CUDA_VISIBLE_DEVICES=""
            # (no visible device) yields 0 instead of 1.
            ngpu = len([dev for dev in cvd.split(",") if dev.strip()])
        else:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
            try:
                p = subprocess.run(
                    ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
                )
            except (subprocess.CalledProcessError, FileNotFoundError):
                ngpu = 0
            else:
                if p.returncode != 0:
                    # nvidia-smi is installed but failed (e.g. no driver);
                    # its output is an error message, not a device list.
                    ngpu = 0
                else:
                    # The device list is written to stdout, one line per GPU.
                    ngpu = len(
                        [ln for ln in p.stdout.decode().splitlines() if ln.strip()]
                    )
        args.ngpu = ngpu
    else:
        if args.ngpu != 1:
            logging.debug(
                "There are some bugs with multi-GPU processing in PyTorch 1.2+"
                + " (see https://github.com/pytorch/pytorch/issues/21108)"
            )
        ngpu = args.ngpu
    logging.info(f"ngpu: {ngpu}")

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # set random seed
    logging.info("random seed = %d" % args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

    # load dictionary for debug log
    if args.dict is not None:
        with open(args.dict, "rb") as f:
            dictionary = f.readlines()
        # The first space-separated field of each line is the token itself.
        char_list = [entry.decode("utf-8").split(" ")[0] for entry in dictionary]
        char_list.insert(0, "<blank>")
        char_list.append("<eos>")
        args.char_list = char_list
    else:
        args.char_list = None

    # train
    logging.info("backend = " + args.backend)
    if args.backend == "pytorch":
        from espnet.st.pytorch_backend.st import train

        train(args)
    else:
        raise ValueError("Only pytorch are supported.")


if __name__ == "__main__":
    main(sys.argv[1:])
#!/usr/bin/env python3
# encoding: utf-8
# Copyright 2019 Kyoto University (Hirofumi Inaguma)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""End-to-end speech translation model decoding script."""
import logging
import os
import random
import sys
import configargparse
import numpy as np
# NOTE: you need this func to generate our sphinx doc
def get_parser():
    """Build the argument parser of the speech translation decoding script.

    Returns:
        configargparse.ArgumentParser: Parser holding the decoding options.

    """
    parser = configargparse.ArgumentParser(
        description=(
            "Translate text from speech using a speech translation model "
            "on one CPU or GPU"
        ),
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
    )
    add_opt = parser.add_argument

    # general configuration
    parser.add("--config", help="Config file path", is_config_file=True)
    parser.add(
        "--config2",
        help="Second config file path that overwrites the settings in `--config`",
        is_config_file=True,
    )
    parser.add(
        "--config3",
        help=(
            "Third config file path that overwrites the settings "
            "in `--config` and `--config2`"
        ),
        is_config_file=True,
    )

    add_opt("--ngpu", default=0, type=int, help="Number of GPUs")
    add_opt(
        "--dtype",
        default="float32",
        choices=("float16", "float32", "float64"),
        help="Float precision (only available in --api v2)",
    )
    add_opt(
        "--backend",
        default="chainer",
        type=str,
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    add_opt("--debugmode", default=1, type=int, help="Debugmode")
    add_opt("--seed", default=1, type=int, help="Random seed")
    add_opt("--verbose", "-V", default=1, type=int, help="Verbose option")
    add_opt(
        "--batchsize",
        default=1,
        type=int,
        help="Batch size for beam search (0: means no batch processing)",
    )
    add_opt(
        "--preprocess-conf",
        default=None,
        type=str,
        help="The configuration file for the pre-processing",
    )
    add_opt(
        "--api",
        default="v1",
        choices=["v1", "v2"],
        help=(
            "Beam search APIs v1: Default API. "
            "It only supports the ASRInterface.recognize method and DefaultRNNLM. "
            "v2: Experimental API. "
            "It supports any models that implements ScorerInterface."
        ),
    )

    # task related
    add_opt("--trans-json", type=str, help="Filename of translation data (json)")
    add_opt(
        "--result-label",
        required=True,
        type=str,
        help="Filename of result label data (json)",
    )

    # model (parameter) related
    add_opt("--model", required=True, type=str, help="Model file parameters to read")

    # search related
    add_opt("--nbest", default=1, type=int, help="Output N-best hypotheses")
    add_opt("--beam-size", default=1, type=int, help="Beam size")
    add_opt("--penalty", default=0.0, type=float, help="Incertion penalty")
    add_opt(
        "--maxlenratio",
        default=0.0,
        type=float,
        help=(
            "Input length ratio to obtain max output length.\n"
            "If maxlenratio=0.0 (default), it uses a end-detect function\n"
            "to automatically find maximum hypothesis lengths"
        ),
    )
    add_opt(
        "--minlenratio",
        default=0.0,
        type=float,
        help="Input length ratio to obtain min output length",
    )

    # multilingual related
    add_opt(
        "--tgt-lang",
        type=str,
        default=False,
        help="target language ID (e.g., <en>, <de>, and <fr> etc.)",
    )
    return parser
def main(args):
    """Parse the decoding options and launch speech translation decoding.

    Args:
        args (list): Command-line arguments without the program name.

    Raises:
        NotImplementedError: If ``--dtype`` is not ``float32``.
        ValueError: If the selected backend is not pytorch.

    """
    args = get_parser().parse_args(args)

    # logging info: 1 -> INFO, 2 -> DEBUG, anything else -> WARN
    verbosity_levels = {1: logging.INFO, 2: logging.DEBUG}
    is_quiet = args.verbose not in verbosity_levels
    logging.basicConfig(
        level=logging.WARN if is_quiet else verbosity_levels[args.verbose],
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )
    if is_quiet:
        logging.warning("Skip DEBUG/INFO messages")

    # check CUDA_VISIBLE_DEVICES
    if args.ngpu > 0:
        visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES")
        if visible_devices is None:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
        elif len(visible_devices.split(",")) != args.ngpu:
            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
            sys.exit(1)

        # TODO(mn5k): support of multiple GPUs
        if args.ngpu > 1:
            logging.error("The program only supports ngpu=1.")
            sys.exit(1)

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # seed setting
    random.seed(args.seed)
    np.random.seed(args.seed)
    logging.info("set random seed = %d" % args.seed)

    # trans
    logging.info("backend = " + args.backend)
    if args.backend != "pytorch":
        raise ValueError("Only pytorch are supported.")

    from espnet.st.pytorch_backend.st import trans

    # Only single precision is accepted by this entry point.
    if args.dtype != "float32":
        raise NotImplementedError(
            f"`--dtype {args.dtype}` is only available with `--api v2`"
        )
    trans(args)


if __name__ == "__main__":
    main(sys.argv[1:])
#!/usr/bin/env python3
# Copyright 2018 Nagoya University (Tomoki Hayashi)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""TTS decoding script."""
import logging
import os
import subprocess
import sys
import configargparse
from espnet.utils.cli_utils import strtobool
# NOTE: you need this func to generate our sphinx doc
def get_parser():
    """Build the argument parser of the TTS decoding script.

    Returns:
        configargparse.ArgumentParser: Parser holding the decoding options.

    """
    parser = configargparse.ArgumentParser(
        description="Synthesize speech from text using a TTS model on one CPU",
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
    )
    add_opt = parser.add_argument

    # general configuration
    parser.add("--config", help="config file path", is_config_file=True)
    parser.add(
        "--config2",
        help="second config file path that overwrites the settings in `--config`.",
        is_config_file=True,
    )
    parser.add(
        "--config3",
        help=(
            "third config file path that overwrites the settings "
            "in `--config` and `--config2`."
        ),
        is_config_file=True,
    )

    add_opt("--ngpu", type=int, default=0, help="Number of GPUs")
    add_opt(
        "--backend",
        type=str,
        default="pytorch",
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    add_opt("--debugmode", type=int, default=1, help="Debugmode")
    add_opt("--seed", type=int, default=1, help="Random seed")
    add_opt("--out", required=True, type=str, help="Output filename")
    add_opt("--verbose", "-V", type=int, default=0, help="Verbose option")
    add_opt(
        "--preprocess-conf",
        default=None,
        type=str,
        help="The configuration file for the pre-processing",
    )

    # task related
    add_opt(
        "--json", required=True, type=str, help="Filename of train label data (json)"
    )
    add_opt("--model", required=True, type=str, help="Model file parameters to read")
    add_opt("--model-conf", default=None, type=str, help="Model config file")

    # decoding related
    add_opt(
        "--maxlenratio", default=5, type=float, help="Maximum length ratio in decoding"
    )
    add_opt(
        "--minlenratio", default=0, type=float, help="Minimum length ratio in decoding"
    )
    add_opt("--threshold", default=0.5, type=float, help="Threshold value in decoding")
    add_opt(
        "--use-att-constraint",
        default=False,
        type=strtobool,
        help="Whether to use the attention constraint",
    )
    add_opt(
        "--backward-window",
        default=1,
        type=int,
        help="Backward window size in the attention constraint",
    )
    add_opt(
        "--forward-window",
        default=3,
        type=int,
        help="Forward window size in the attention constraint",
    )
    add_opt(
        "--fastspeech-alpha",
        default=1.0,
        type=float,
        help="Alpha to change the speed for FastSpeech",
    )

    # save related
    add_opt(
        "--save-durations",
        type=strtobool,
        default=False,
        help="Whether to save durations converted from attentions",
    )
    add_opt(
        "--save-focus-rates",
        type=strtobool,
        default=False,
        help="Whether to save focus rates of attentions",
    )
    return parser
def main(args):
    """Run decoding.

    Args:
        args (list): Command-line arguments without the program name.

    Raises:
        NotImplementedError: If the selected backend is not pytorch.

    """
    parser = get_parser()
    args = parser.parse_args(args)

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # check CUDA_VISIBLE_DEVICES
    if args.ngpu > 0:
        # The host name is probed only to enable a site-specific GPU
        # reservation; a missing or failing `hostname -f` must not abort
        # decoding on other machines, so it is treated as "not that site".
        try:
            hostname = subprocess.check_output(["hostname", "-f"]).decode()
        except (subprocess.CalledProcessError, OSError):
            hostname = ""
        if "clsp.jhu.edu" in hostname:
            cvd = (
                subprocess.check_output(
                    ["/usr/local/bin/free-gpu", "-n", str(args.ngpu)]
                )
                .decode()
                .strip()
            )
            logging.info("CLSP: use gpu" + cvd)
            os.environ["CUDA_VISIBLE_DEVICES"] = cvd

        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
        if cvd is None:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
        elif args.ngpu != len(cvd.split(",")):
            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
            sys.exit(1)

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # extract
    logging.info("backend = " + args.backend)
    if args.backend == "pytorch":
        from espnet.tts.pytorch_backend.tts import decode

        decode(args)
    else:
        raise NotImplementedError("Only pytorch is supported.")


if __name__ == "__main__":
    main(sys.argv[1:])
#!/usr/bin/env python3
# Copyright 2018 Nagoya University (Tomoki Hayashi)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""Text-to-speech model training script."""
import logging
import os
import random
import subprocess
import sys
import configargparse
import numpy as np
from espnet import __version__
from espnet.nets.tts_interface import TTSInterface
from espnet.utils.cli_utils import strtobool
from espnet.utils.training.batchfy import BATCH_COUNT_CHOICES
# NOTE: you need this func to generate our sphinx doc
def get_parser():
    """Get parser of training arguments.

    Returns:
        configargparse.ArgumentParser: Parser with the general, task,
            network-architecture, minibatch, optimization and finetuning
            options of TTS training registered.
    """

    def split_modules(s):
        """Split a comma-separated module list, dropping empty entries.

        Filtering is done per entry, so a stray comma ("enc.,") cannot
        yield an empty module name.
        """
        return [str(mod) for mod in s.split(",") if mod != ""]

    parser = configargparse.ArgumentParser(
        description="Train a new text-to-speech (TTS) model on one CPU, "
        "one or multiple GPUs",
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
    )

    # general configuration
    parser.add("--config", is_config_file=True, help="config file path")
    parser.add(
        "--config2",
        is_config_file=True,
        help="second config file path that overwrites the settings in `--config`.",
    )
    parser.add(
        "--config3",
        is_config_file=True,
        help="third config file path that overwrites "
        "the settings in `--config` and `--config2`.",
    )

    parser.add_argument(
        "--ngpu",
        default=None,
        type=int,
        help="Number of GPUs. If not given, use all visible devices",
    )
    parser.add_argument(
        "--backend",
        default="pytorch",
        type=str,
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    parser.add_argument("--outdir", type=str, required=True, help="Output directory")
    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
    parser.add_argument("--seed", default=1, type=int, help="Random seed")
    parser.add_argument(
        "--resume",
        "-r",
        default="",
        type=str,
        nargs="?",
        help="Resume the training from snapshot",
    )
    parser.add_argument(
        "--minibatches",
        "-N",
        type=int,
        default=-1,
        help="Process only N minibatches (for debug)",
    )
    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--tensorboard-dir",
        default=None,
        type=str,
        nargs="?",
        help="Tensorboard log directory path",
    )
    parser.add_argument(
        "--eval-interval-epochs", default=1, type=int, help="Evaluation interval epochs"
    )
    parser.add_argument(
        "--save-interval-epochs", default=1, type=int, help="Save interval epochs"
    )
    parser.add_argument(
        "--report-interval-iters",
        default=100,
        type=int,
        help="Report interval iterations",
    )

    # task related
    parser.add_argument(
        "--train-json", type=str, required=True, help="Filename of training json"
    )
    parser.add_argument(
        "--valid-json", type=str, required=True, help="Filename of validation json"
    )

    # network architecture
    parser.add_argument(
        "--model-module",
        type=str,
        default="espnet.nets.pytorch_backend.e2e_tts_tacotron2:Tacotron2",
        help="model defined module",
    )

    # minibatch related
    parser.add_argument(
        "--sortagrad",
        default=0,
        type=int,
        nargs="?",
        help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
    )
    parser.add_argument(
        "--batch-sort-key",
        default="shuffle",
        type=str,
        choices=["shuffle", "output", "input"],
        nargs="?",
        help='Batch sorting key. "shuffle" only work with --batch-count "seq".',
    )
    parser.add_argument(
        "--batch-count",
        default="auto",
        choices=BATCH_COUNT_CHOICES,
        help="How to count batch_size. "
        "The default (auto) will find how to count by args.",
    )
    parser.add_argument(
        "--batch-size",
        "--batch-seqs",
        "-b",
        default=0,
        type=int,
        help="Maximum seqs in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-bins",
        default=0,
        type=int,
        help="Maximum bins in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-in",
        default=0,
        type=int,
        help="Maximum input frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-out",
        default=0,
        type=int,
        help="Maximum output frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-inout",
        default=0,
        type=int,
        help="Maximum input+output frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--maxlen-in",
        "--batch-seq-maxlen-in",
        default=100,
        type=int,
        metavar="ML",
        help="When --batch-count=seq, "
        "batch size is reduced if the input sequence length > ML.",
    )
    parser.add_argument(
        "--maxlen-out",
        "--batch-seq-maxlen-out",
        default=200,
        type=int,
        metavar="ML",
        help="When --batch-count=seq, "
        "batch size is reduced if the output sequence length > ML",
    )
    parser.add_argument(
        "--num-iter-processes",
        default=0,
        type=int,
        help="Number of processes of iterator",
    )
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
        help="The configuration file for the pre-processing",
    )
    parser.add_argument(
        "--use-speaker-embedding",
        default=False,
        type=strtobool,
        help="Whether to use speaker embedding",
    )
    parser.add_argument(
        "--use-second-target",
        default=False,
        type=strtobool,
        help="Whether to use second target",
    )

    # optimization related
    parser.add_argument(
        "--opt", default="adam", type=str, choices=["adam", "noam"], help="Optimizer"
    )
    parser.add_argument(
        "--accum-grad", default=1, type=int, help="Number of gradient accumuration"
    )
    parser.add_argument(
        "--lr", default=1e-3, type=float, help="Learning rate for optimizer"
    )
    parser.add_argument("--eps", default=1e-6, type=float, help="Epsilon for optimizer")
    parser.add_argument(
        "--weight-decay",
        default=1e-6,
        type=float,
        help="Weight decay coefficient for optimizer",
    )
    parser.add_argument(
        "--epochs", "-e", default=30, type=int, help="Number of maximum epochs"
    )
    parser.add_argument(
        "--early-stop-criterion",
        default="validation/main/loss",
        type=str,
        nargs="?",
        help="Value to monitor to trigger an early stopping of the training",
    )
    parser.add_argument(
        "--patience",
        default=3,
        type=int,
        nargs="?",
        help="Number of epochs to wait "
        "without improvement before stopping the training",
    )
    parser.add_argument(
        "--grad-clip", default=1, type=float, help="Gradient norm threshold to clip"
    )
    parser.add_argument(
        "--num-save-attention",
        default=5,
        type=int,
        help="Number of samples of attention to be saved",
    )
    parser.add_argument(
        "--keep-all-data-on-mem",
        default=False,
        type=strtobool,
        help="Whether to keep all data on memory",
    )

    # finetuning related
    parser.add_argument(
        "--enc-init",
        default=None,
        type=str,
        help="Pre-trained TTS model path to initialize encoder.",
    )
    # NOTE: argparse runs string defaults through `type`, so the default
    # "enc." / "dec." below is delivered as a one-element list.
    parser.add_argument(
        "--enc-init-mods",
        default="enc.",
        type=split_modules,
        help="List of encoder modules to initialize, separated by a comma.",
    )
    parser.add_argument(
        "--dec-init",
        default=None,
        type=str,
        help="Pre-trained TTS model path to initialize decoder.",
    )
    parser.add_argument(
        "--dec-init-mods",
        default="dec.",
        type=split_modules,
        help="List of decoder modules to initialize, separated by a comma.",
    )
    parser.add_argument(
        "--freeze-mods",
        default=None,
        type=split_modules,
        help="List of modules to freeze (not to train), separated by a comma.",
    )

    return parser
def main(cmd_args):
    """Run training.

    Args:
        cmd_args (list[str]): Command-line arguments without the program name.

    Raises:
        NotImplementedError: If the selected backend is not ``pytorch``.
    """
    parser = get_parser()
    # First pass tolerates unknown options: only --model-module is needed to
    # find the model class, which then registers its own options.
    args, _ = parser.parse_known_args(cmd_args)

    from espnet.utils.dynamic_import import dynamic_import

    model_class = dynamic_import(args.model_module)
    assert issubclass(model_class, TTSInterface)
    model_class.add_arguments(parser)
    args = parser.parse_args(cmd_args)

    # add version info in args
    args.version = __version__

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # If --ngpu is not given,
    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
    #   2. if nvidia-smi exists, use all devices
    #   3. else ngpu=0
    if args.ngpu is None:
        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
        if cvd is not None:
            # An empty value hides every device, so only count real entries.
            ngpu = len([dev for dev in cvd.split(",") if dev.strip()])
        else:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
            try:
                p = subprocess.run(
                    ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
                )
            except OSError:
                # nvidia-smi is missing or cannot be executed
                ngpu = 0
            else:
                if p.returncode != 0:
                    # run() without check=True does not raise on failure
                    ngpu = 0
                else:
                    # the device list is written to stdout, one
                    # "GPU <index>: ..." line per device
                    ngpu = sum(
                        1
                        for line in p.stdout.decode().splitlines()
                        if line.startswith("GPU ")
                    )
        args.ngpu = ngpu
    else:
        ngpu = args.ngpu
    logging.info(f"ngpu: {ngpu}")

    # set random seed
    logging.info("random seed = %d" % args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

    if args.backend == "pytorch":
        from espnet.tts.pytorch_backend.tts import train

        train(args)
    else:
        raise NotImplementedError("Only pytorch is supported.")
# Script entry point: forward the command-line arguments
# (without the program name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])
#!/usr/bin/env python3
# Copyright 2020 Nagoya University (Wen-Chin Huang)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""VC decoding script."""
import logging
import os
import subprocess
import sys
import configargparse
from espnet.utils.cli_utils import strtobool
# NOTE: you need this func to generate our sphinx doc
def get_parser():
    """Build the command-line parser for VC decoding.

    Returns:
        configargparse.ArgumentParser: Parser with the general, task,
            decoding and save-related options registered.
    """
    parser = configargparse.ArgumentParser(
        description="Converting speech using a VC model on one CPU",
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
        config_file_parser_class=configargparse.YAMLConfigFileParser,
    )
    add_arg = parser.add_argument

    # general configuration: up to three layered config files
    parser.add("--config", help="config file path", is_config_file=True)
    parser.add(
        "--config2",
        help="second config file path that overwrites the settings in `--config`.",
        is_config_file=True,
    )
    parser.add(
        "--config3",
        help="third config file path that overwrites the settings "
        "in `--config` and `--config2`.",
        is_config_file=True,
    )

    add_arg("--ngpu", type=int, default=0, help="Number of GPUs")
    add_arg(
        "--backend",
        type=str,
        default="pytorch",
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    add_arg("--debugmode", type=int, default=1, help="Debugmode")
    add_arg("--seed", type=int, default=1, help="Random seed")
    add_arg("--out", type=str, required=True, help="Output filename")
    add_arg("--verbose", "-V", type=int, default=0, help="Verbose option")
    add_arg(
        "--preprocess-conf",
        default=None,
        type=str,
        help="The configuration file for the pre-processing",
    )

    # task related
    add_arg(
        "--json", required=True, type=str, help="Filename of train label data (json)"
    )
    add_arg(
        "--model", required=True, type=str, help="Model file parameters to read"
    )
    add_arg("--model-conf", default=None, type=str, help="Model config file")

    # decoding related
    add_arg(
        "--maxlenratio", default=5, type=float, help="Maximum length ratio in decoding"
    )
    add_arg(
        "--minlenratio", default=0, type=float, help="Minimum length ratio in decoding"
    )
    add_arg("--threshold", default=0.5, type=float, help="Threshold value in decoding")
    add_arg(
        "--use-att-constraint",
        default=False,
        type=strtobool,
        help="Whether to use the attention constraint",
    )
    add_arg(
        "--backward-window",
        default=1,
        type=int,
        help="Backward window size in the attention constraint",
    )
    add_arg(
        "--forward-window",
        default=3,
        type=int,
        help="Forward window size in the attention constraint",
    )

    # save related
    add_arg(
        "--save-durations",
        type=strtobool,
        default=False,
        help="Whether to save durations converted from attentions",
    )
    add_arg(
        "--save-focus-rates",
        type=strtobool,
        default=False,
        help="Whether to save focus rates of attentions",
    )

    return parser
def main(args):
    """Run decoding.

    Args:
        args (list[str]): Command-line arguments without the program name.

    Raises:
        NotImplementedError: If the selected backend is not ``pytorch``.

    Exits with status 1 when ``--ngpu`` is positive and does not match the
    number of comma-separated entries in ``CUDA_VISIBLE_DEVICES``.
    """
    args = get_parser().parse_args(args)

    # logging info: INFO when verbose, otherwise WARN (and say so once)
    log_format = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    is_verbose = args.verbose > 0
    logging.basicConfig(
        level=logging.INFO if is_verbose else logging.WARN, format=log_format
    )
    if not is_verbose:
        logging.warning("Skip DEBUG/INFO messages")

    # check CUDA_VISIBLE_DEVICES
    if args.ngpu > 0:
        fqdn = subprocess.check_output(["hostname", "-f"]).decode()
        if "clsp.jhu.edu" in fqdn:
            # on this cluster the device list comes from the free-gpu tool
            reserved = (
                subprocess.check_output(
                    ["/usr/local/bin/free-gpu", "-n", str(args.ngpu)]
                )
                .decode()
                .strip()
            )
            logging.info("CLSP: use gpu" + reserved)
            os.environ["CUDA_VISIBLE_DEVICES"] = reserved

        visible = os.environ.get("CUDA_VISIBLE_DEVICES")
        if visible is None:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
        elif len(visible.split(",")) != args.ngpu:
            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
            sys.exit(1)

    # display PYTHONPATH
    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))

    # extract
    logging.info("backend = " + args.backend)
    if args.backend != "pytorch":
        raise NotImplementedError("Only pytorch is supported.")

    # imported lazily so the backend is only loaded when it is actually used
    from espnet.vc.pytorch_backend.vc import decode

    decode(args)
# Script entry point: forward the command-line arguments
# (without the program name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])
#!/usr/bin/env python3
# Copyright 2020 Nagoya University (Wen-Chin Huang)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
"""Voice conversion model training script."""
import logging
import os
import random
import subprocess
import sys
import configargparse
import numpy as np
from espnet import __version__
from espnet.nets.tts_interface import TTSInterface
from espnet.utils.cli_utils import strtobool
from espnet.utils.training.batchfy import BATCH_COUNT_CHOICES
# NOTE: you need this func to generate our sphinx doc
def get_parser():
    """Get parser of training arguments.

    Returns:
        configargparse.ArgumentParser: Parser with the general, task,
            network-architecture, minibatch, optimization and
            model-initialization options of VC training registered.
    """

    def split_modules(s):
        """Split a comma-separated module list, dropping empty entries.

        Filtering is done per entry, so a stray comma ("enc.,") cannot
        yield an empty module name.
        """
        return [str(mod) for mod in s.split(",") if mod != ""]

    parser = configargparse.ArgumentParser(
        description="Train a new voice conversion (VC) model on one CPU, "
        "one or multiple GPUs",
        config_file_parser_class=configargparse.YAMLConfigFileParser,
        formatter_class=configargparse.ArgumentDefaultsHelpFormatter,
    )

    # general configuration
    parser.add("--config", is_config_file=True, help="config file path")
    parser.add(
        "--config2",
        is_config_file=True,
        help="second config file path that overwrites the settings in `--config`.",
    )
    parser.add(
        "--config3",
        is_config_file=True,
        help="third config file path that overwrites the settings "
        "in `--config` and `--config2`.",
    )

    parser.add_argument(
        "--ngpu",
        default=None,
        type=int,
        help="Number of GPUs. If not given, use all visible devices",
    )
    parser.add_argument(
        "--backend",
        default="pytorch",
        type=str,
        choices=["chainer", "pytorch"],
        help="Backend library",
    )
    parser.add_argument("--outdir", type=str, required=True, help="Output directory")
    parser.add_argument("--debugmode", default=1, type=int, help="Debugmode")
    parser.add_argument("--seed", default=1, type=int, help="Random seed")
    parser.add_argument(
        "--resume",
        "-r",
        default="",
        type=str,
        nargs="?",
        help="Resume the training from snapshot",
    )
    parser.add_argument(
        "--minibatches",
        "-N",
        type=int,
        default=-1,
        help="Process only N minibatches (for debug)",
    )
    parser.add_argument("--verbose", "-V", default=0, type=int, help="Verbose option")
    parser.add_argument(
        "--tensorboard-dir",
        default=None,
        type=str,
        nargs="?",
        help="Tensorboard log directory path",
    )
    parser.add_argument(
        "--eval-interval-epochs",
        default=100,
        type=int,
        help="Evaluation interval epochs",
    )
    parser.add_argument(
        "--save-interval-epochs", default=1, type=int, help="Save interval epochs"
    )
    parser.add_argument(
        "--report-interval-iters",
        default=10,
        type=int,
        help="Report interval iterations",
    )

    # task related
    parser.add_argument("--srcspk", type=str, help="Source speaker")
    parser.add_argument("--trgspk", type=str, help="Target speaker")
    parser.add_argument(
        "--train-json", type=str, required=True, help="Filename of training json"
    )
    parser.add_argument(
        "--valid-json", type=str, required=True, help="Filename of validation json"
    )

    # network architecture
    parser.add_argument(
        "--model-module",
        type=str,
        default="espnet.nets.pytorch_backend.e2e_tts_tacotron2:Tacotron2",
        help="model defined module",
    )

    # minibatch related
    parser.add_argument(
        "--sortagrad",
        default=0,
        type=int,
        nargs="?",
        help="How many epochs to use sortagrad for. 0 = deactivated, -1 = all epochs",
    )
    parser.add_argument(
        "--batch-sort-key",
        default="shuffle",
        type=str,
        choices=["shuffle", "output", "input"],
        nargs="?",
        help='Batch sorting key. "shuffle" only work with --batch-count "seq".',
    )
    parser.add_argument(
        "--batch-count",
        default="auto",
        choices=BATCH_COUNT_CHOICES,
        help="How to count batch_size. "
        "The default (auto) will find how to count by args.",
    )
    parser.add_argument(
        "--batch-size",
        "--batch-seqs",
        "-b",
        default=0,
        type=int,
        help="Maximum seqs in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-bins",
        default=0,
        type=int,
        help="Maximum bins in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-in",
        default=0,
        type=int,
        help="Maximum input frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-out",
        default=0,
        type=int,
        help="Maximum output frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--batch-frames-inout",
        default=0,
        type=int,
        help="Maximum input+output frames in a minibatch (0 to disable)",
    )
    parser.add_argument(
        "--maxlen-in",
        "--batch-seq-maxlen-in",
        default=100,
        type=int,
        metavar="ML",
        help="When --batch-count=seq, "
        "batch size is reduced if the input sequence length > ML.",
    )
    parser.add_argument(
        "--maxlen-out",
        "--batch-seq-maxlen-out",
        default=200,
        type=int,
        metavar="ML",
        help="When --batch-count=seq, "
        "batch size is reduced if the output sequence length > ML",
    )
    parser.add_argument(
        "--num-iter-processes",
        default=0,
        type=int,
        help="Number of processes of iterator",
    )
    parser.add_argument(
        "--preprocess-conf",
        type=str,
        default=None,
        help="The configuration file for the pre-processing",
    )
    parser.add_argument(
        "--use-speaker-embedding",
        default=False,
        type=strtobool,
        help="Whether to use speaker embedding",
    )
    parser.add_argument(
        "--use-second-target",
        default=False,
        type=strtobool,
        help="Whether to use second target",
    )

    # optimization related
    parser.add_argument(
        "--opt",
        default="adam",
        type=str,
        choices=["adam", "noam", "lamb"],
        help="Optimizer",
    )
    parser.add_argument(
        "--accum-grad", default=1, type=int, help="Number of gradient accumuration"
    )
    parser.add_argument(
        "--lr", default=1e-3, type=float, help="Learning rate for optimizer"
    )
    parser.add_argument("--eps", default=1e-6, type=float, help="Epsilon for optimizer")
    parser.add_argument(
        "--weight-decay",
        default=1e-6,
        type=float,
        help="Weight decay coefficient for optimizer",
    )
    parser.add_argument(
        "--epochs", "-e", default=30, type=int, help="Number of maximum epochs"
    )
    parser.add_argument(
        "--early-stop-criterion",
        default="validation/main/loss",
        type=str,
        nargs="?",
        help="Value to monitor to trigger an early stopping of the training",
    )
    parser.add_argument(
        "--patience",
        default=3,
        type=int,
        nargs="?",
        help="Number of epochs to wait without improvement "
        "before stopping the training",
    )
    parser.add_argument(
        "--grad-clip", default=1, type=float, help="Gradient norm threshold to clip"
    )
    parser.add_argument(
        "--num-save-attention",
        default=5,
        type=int,
        help="Number of samples of attention to be saved",
    )
    parser.add_argument(
        "--keep-all-data-on-mem",
        default=False,
        type=strtobool,
        help="Whether to keep all data on memory",
    )

    # model initialization / freezing
    parser.add_argument(
        "--enc-init",
        default=None,
        type=str,
        help="Pre-trained model path to initialize encoder.",
    )
    # NOTE: argparse runs string defaults through `type`, so the default
    # "enc." / "dec." below is delivered as a one-element list.
    parser.add_argument(
        "--enc-init-mods",
        default="enc.",
        type=split_modules,
        help="List of encoder modules to initialize, separated by a comma.",
    )
    parser.add_argument(
        "--dec-init",
        default=None,
        type=str,
        help="Pre-trained model path to initialize decoder.",
    )
    parser.add_argument(
        "--dec-init-mods",
        default="dec.",
        type=split_modules,
        help="List of decoder modules to initialize, separated by a comma.",
    )
    parser.add_argument(
        "--freeze-mods",
        default=None,
        type=split_modules,
        help="List of modules to freeze (not to train), separated by a comma.",
    )

    return parser
def main(cmd_args):
    """Run training.

    Args:
        cmd_args (list[str]): Command-line arguments without the program name.

    Raises:
        NotImplementedError: If the selected backend is not ``pytorch``.
    """
    parser = get_parser()
    # First pass tolerates unknown options: only --model-module is needed to
    # find the model class, which then registers its own options.
    args, _ = parser.parse_known_args(cmd_args)

    from espnet.utils.dynamic_import import dynamic_import

    model_class = dynamic_import(args.model_module)
    assert issubclass(model_class, TTSInterface)
    model_class.add_arguments(parser)
    args = parser.parse_args(cmd_args)

    # add version info in args
    args.version = __version__

    # logging info
    if args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")

    # If --ngpu is not given,
    #   1. if CUDA_VISIBLE_DEVICES is set, all visible devices
    #   2. if nvidia-smi exists, use all devices
    #   3. else ngpu=0
    if args.ngpu is None:
        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
        if cvd is not None:
            # An empty value hides every device, so only count real entries.
            ngpu = len([dev for dev in cvd.split(",") if dev.strip()])
        else:
            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
            try:
                p = subprocess.run(
                    ["nvidia-smi", "-L"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
                )
            except OSError:
                # nvidia-smi is missing or cannot be executed
                ngpu = 0
            else:
                if p.returncode != 0:
                    # run() without check=True does not raise on failure
                    ngpu = 0
                else:
                    # the device list is written to stdout, one
                    # "GPU <index>: ..." line per device
                    ngpu = sum(
                        1
                        for line in p.stdout.decode().splitlines()
                        if line.startswith("GPU ")
                    )
        # Store the detected count so code reading args.ngpu gets an
        # integer instead of None.
        args.ngpu = ngpu
    else:
        ngpu = args.ngpu
    logging.info(f"ngpu: {ngpu}")

    # set random seed
    logging.info("random seed = %d" % args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)

    if args.backend == "pytorch":
        from espnet.vc.pytorch_backend.vc import train

        train(args)
    else:
        raise NotImplementedError("Only pytorch is supported.")
# Script entry point: forward the command-line arguments
# (without the program name) to main().
if __name__ == "__main__":
    main(sys.argv[1:])
#
# SPDX-FileCopyrightText:
# Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
"""Initialize sub package."""
#!/usr/bin/env python3
# Copyright 2018 Mitsubishi Electric Research Laboratories (Takaaki Hori)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
import math
import chainer
import chainer.functions as F
from espnet.lm.lm_utils import make_lexical_tree
# Definition of a multi-level (subword/word) language model
class MultiLevelLM(chainer.Chain):
    """Multi-level (subword/word) language model.

    Scores subword labels with a subword LM (scaled by ``subwordlm_weight``)
    while tracking the position in a lexical tree built from the word
    dictionary. The scores of ``<space>`` and ``<eos>`` are replaced by
    word-LM based scores once a word has been started.

    The decoding state is a 6-tuple
    ``(clm_state, wlm_state, wlm_logprobs, node, log_y, clm_logprob)``.

    Args:
        wordlm: Word-level LM, called as ``wordlm(state, x) -> (state, z)``.
        subwordlm: Subword-level LM with the same call convention.
        word_dict (dict): Word-to-id mapping; must contain ``<eos>`` and
            ``<unk>``.
        subword_dict (dict): Subword-to-id mapping; must contain ``<space>``
            and ``<eos>``.
        subwordlm_weight (float): Scale applied to the subword log-probs.
        oov_penalty (float): Its logarithm is added to the ``<unk>`` word
            log-prob when the current prefix is not a word end.
        open_vocab (bool): If True, subword sequences that leave the lexical
            tree are still scored; otherwise they get ``logzero``.
    """

    logzero = -10000000000.0
    zero = 1.0e-10

    def __init__(
        self,
        wordlm,
        subwordlm,
        word_dict,
        subword_dict,
        subwordlm_weight=0.8,
        oov_penalty=1.0,
        open_vocab=True,
    ):
        super(MultiLevelLM, self).__init__()
        self.wordlm = wordlm
        self.subwordlm = subwordlm
        self.word_eos = word_dict["<eos>"]
        self.word_unk = word_dict["<unk>"]
        # one-element int arrays used as word-LM inputs
        self.xp_word_eos = self.xp.full(1, self.word_eos, "i")
        self.xp_word_unk = self.xp.full(1, self.word_unk, "i")
        self.space = subword_dict["<space>"]
        self.eos = subword_dict["<eos>"]
        self.lexroot = make_lexical_tree(word_dict, subword_dict, self.word_unk)
        self.log_oov_penalty = math.log(oov_penalty)
        self.open_vocab = open_vocab
        self.subword_dict_size = len(subword_dict)
        self.subwordlm_weight = subwordlm_weight
        # outputs are already log-probabilities
        self.normalized = True

    def __call__(self, state, x):
        """Advance the LM by one subword label.

        Args:
            state: Previous 6-tuple state, or None to start a new sequence.
            x: Current subword label; passed to the subword LM and
                converted with ``int(x)``.

        Returns:
            tuple: ``(new_state, log_y)`` where ``log_y`` holds the scores
                of the next subword labels.
        """
        # update state with input label x
        if state is None:  # make initial states and log-prob vectors
            wlm_state, z_wlm = self.wordlm(None, self.xp_word_eos)
            wlm_logprobs = F.log_softmax(z_wlm).data
            clm_state, z_clm = self.subwordlm(None, x)
            log_y = F.log_softmax(z_clm).data * self.subwordlm_weight
            new_node = self.lexroot
            clm_logprob = 0.0
            xi = self.space
        else:
            clm_state, wlm_state, wlm_logprobs, node, log_y, clm_logprob = state
            xi = int(x)
            if xi == self.space:  # inter-word transition
                if node is not None and node[1] >= 0:  # check if the node is word end
                    w = self.xp.full(1, node[1], "i")
                else:  # this node is not a word end, which means <unk>
                    w = self.xp_word_unk
                # update wordlm state and log-prob vector
                wlm_state, z_wlm = self.wordlm(wlm_state, w)
                wlm_logprobs = F.log_softmax(z_wlm).data
                new_node = self.lexroot  # move to the tree root
                clm_logprob = 0.0
            elif node is not None and xi in node[0]:  # intra-word transition
                new_node = node[0][xi]
                clm_logprob += log_y[0, xi]
            elif self.open_vocab:  # if no path in the tree, enter open-vocabulary mode
                new_node = None
                clm_logprob += log_y[0, xi]
            else:  # if open_vocab flag is disabled, return 0 probabilities
                log_y = self.xp.full((1, self.subword_dict_size), self.logzero, "f")
                # Keep the 6-element layout (wlm_logprobs included) so the
                # state can be unpacked by the next __call__ or by final().
                return (clm_state, wlm_state, wlm_logprobs, None, log_y, 0.0), log_y

            clm_state, z_clm = self.subwordlm(clm_state, x)
            log_y = F.log_softmax(z_clm).data * self.subwordlm_weight

        # apply word-level probabilities for <space> and <eos> labels
        if xi != self.space:
            if new_node is not None and new_node[1] >= 0:  # if new node is word end
                # subtract the subword score accumulated inside this word
                wlm_logprob = wlm_logprobs[:, new_node[1]] - clm_logprob
            else:
                wlm_logprob = wlm_logprobs[:, self.word_unk] + self.log_oov_penalty
            log_y[:, self.space] = wlm_logprob
            log_y[:, self.eos] = wlm_logprob
        else:
            # directly after <space>, another <space> or <eos> is ruled out
            log_y[:, self.space] = self.logzero
            log_y[:, self.eos] = self.logzero

        return (clm_state, wlm_state, wlm_logprobs, new_node, log_y, clm_logprob), log_y

    def final(self, state):
        """Return the word-LM log-probability of ``<eos>`` after the last word.

        Args:
            state: 6-tuple state as returned by ``__call__``.
        """
        clm_state, wlm_state, wlm_logprobs, node, log_y, clm_logprob = state
        if node is not None and node[1] >= 0:  # check if the node is word end
            w = self.xp.full(1, node[1], "i")
        else:  # this node is not a word end, which means <unk>
            w = self.xp_word_unk
        wlm_state, z_wlm = self.wordlm(wlm_state, w)
        return F.log_softmax(z_wlm).data[:, self.word_eos]
# Definition of a look-ahead word language model
class LookAheadWordLM(chainer.Chain):
    """Look-ahead word language model.

    Turns word-LM probabilities into subword-label scores by walking a
    lexical tree: each tree node carries a range of word ids, and the
    cumulative sum of the word probabilities gives the probability mass of
    that range.

    The decoding state is the 3-tuple ``(wlm_state, cumsum_probs, node)``.

    Args:
        wordlm: Word-level LM, called as ``wordlm(state, x) -> (state, z)``.
        word_dict (dict): Word-to-id mapping; must contain ``<eos>`` and
            ``<unk>``.
        subword_dict (dict): Subword-to-id mapping; must contain ``<space>``
            and ``<eos>``.
        oov_penalty (float): Factor applied to the ``<unk>`` probability that
            is used as the default score of every subword label.
        open_vocab (bool): If True, leaving the lexical tree is allowed and
            scored with log-probability zero; otherwise with ``logzero``.
    """

    logzero = -10000000000.0
    zero = 1.0e-10

    def __init__(
        self, wordlm, word_dict, subword_dict, oov_penalty=0.0001, open_vocab=True
    ):
        super().__init__()
        self.wordlm = wordlm
        self.oov_penalty = oov_penalty
        self.open_vocab = open_vocab
        # word-level special symbols, also kept as one-element int arrays
        self.word_eos = word_dict["<eos>"]
        self.word_unk = word_dict["<unk>"]
        self.xp_word_eos = self.xp.full(1, self.word_eos, "i")
        self.xp_word_unk = self.xp.full(1, self.word_unk, "i")
        # subword-level special symbols
        self.space = subword_dict["<space>"]
        self.eos = subword_dict["<eos>"]
        self.subword_dict_size = len(subword_dict)
        self.lexroot = make_lexical_tree(word_dict, subword_dict, self.word_unk)
        # outputs are already log-probabilities
        self.normalized = True

    def _logzero_scores(self):
        """Return a score vector filled with ``logzero``."""
        return self.xp.full((1, self.subword_dict_size), self.logzero, "f")

    def _word_input(self, node):
        """Return the word-LM input for ``node``: its word id, or <unk>."""
        if node is None or node[1] < 0:
            # this node is not a word end, which means <unk>
            return self.xp_word_unk
        return self.xp.full(1, node[1], "i")

    def _step_wordlm(self, wlm_state, w):
        """Feed ``w`` to the word LM; return new state and cumulative probs."""
        wlm_state, z_wlm = self.wordlm(wlm_state, w)
        return wlm_state, self.xp.cumsum(F.softmax(z_wlm).data, axis=1)

    def __call__(self, state, x):
        """Advance the LM by one subword label.

        Args:
            state: Previous 3-tuple state, or None to start a new sequence.
            x: Current subword label, converted with ``int(x)``.

        Returns:
            tuple: ``(new_state, log_y)`` where ``log_y`` holds the scores
                of the next subword labels.
        """
        if state is None:
            # initial step: condition the word LM on <eos>, start at the root
            wlm_state, cumsum_probs = self._step_wordlm(None, self.xp_word_eos)
            new_node = self.lexroot
            xi = self.space
        else:
            wlm_state, cumsum_probs, node = state
            xi = int(x)
            if xi == self.space:
                # inter-word transition: consume the finished word and
                # move back to the tree root
                wlm_state, cumsum_probs = self._step_wordlm(
                    wlm_state, self._word_input(node)
                )
                new_node = self.lexroot
            elif node is not None and xi in node[0]:
                # intra-word transition
                new_node = node[0][xi]
            elif self.open_vocab:
                # no path in the tree: enter open-vocabulary mode
                new_node = None
            else:
                # open_vocab disabled: return 0 probabilities
                return (wlm_state, None, None), self._logzero_scores()

        if new_node is None:
            # outside the tree every transition has probability one
            log_y = self.xp.zeros((1, self.subword_dict_size), "f")
            return (wlm_state, cumsum_probs, new_node), log_y

        succ, wid, wids = new_node
        # probability mass of the words below the current node
        if wids is None:
            sum_prob = 1.0
        else:
            sum_prob = cumsum_probs[:, wids[1]] - cumsum_probs[:, wids[0]]
        if sum_prob < self.zero:
            return (wlm_state, cumsum_probs, new_node), self._logzero_scores()

        # <unk> probability (scaled by the penalty) is the default score
        unk_prob = cumsum_probs[:, self.word_unk] - cumsum_probs[:, self.word_unk - 1]
        y = self.xp.full((1, self.subword_dict_size), unk_prob * self.oov_penalty, "f")

        # transition probabilities to child nodes
        for cid, child in succ.items():
            lo, hi = child[2][0], child[2][1]
            y[:, cid] = (cumsum_probs[:, hi] - cumsum_probs[:, lo]) / sum_prob

        # word-level probabilities for <space> and <eos> labels
        if wid >= 0:
            wlm_prob = (cumsum_probs[:, wid] - cumsum_probs[:, wid - 1]) / sum_prob
            y[:, self.space] = wlm_prob
            y[:, self.eos] = wlm_prob
        elif xi == self.space:
            y[:, self.space] = self.zero
            y[:, self.eos] = self.zero

        # clip to avoid log(0)
        log_y = self.xp.log(self.xp.clip(y, self.zero, None))
        return (wlm_state, cumsum_probs, new_node), log_y

    def final(self, state):
        """Return the word-LM log-probability of ``<eos>`` after the last word.

        Args:
            state: 3-tuple state as returned by ``__call__``.
        """
        wlm_state, _, node = state
        _, z_wlm = self.wordlm(wlm_state, self._word_input(node))
        return F.log_softmax(z_wlm).data[:, self.word_eos]
#!/usr/bin/env python3
# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
# This code is ported from the following implementation written in Torch.
# https://github.com/chainer/chainer/blob/master/examples/ptb/train_ptb_custom_loop.py
import copy
import json
import logging
import chainer
import chainer.functions as F
import chainer.links as L
import numpy as np
from chainer import link, reporter, training
from chainer.dataset import convert
# for classifier link
from chainer.functions.loss import softmax_cross_entropy
from chainer.training import extensions
import espnet.nets.chainer_backend.deterministic_embed_id as DL
from espnet.lm.lm_utils import (
MakeSymlinkToBestModel,
ParallelSentenceIterator,
compute_perplexity,
count_tokens,
read_tokens,
)
from espnet.nets.lm_interface import LMInterface
from espnet.optimizer.factory import dynamic_import_optimizer
from espnet.scheduler.chainer import ChainerScheduler
from espnet.scheduler.scheduler import dynamic_import_scheduler
from espnet.utils.deterministic_utils import set_deterministic_chainer
from espnet.utils.training.evaluator import BaseEvaluator
from espnet.utils.training.iterators import ShufflingEnabler
from espnet.utils.training.tensorboard_logger import TensorboardLogger
from espnet.utils.training.train_utils import check_early_stop, set_early_stop
# TODO(karita): reimplement RNNLM with new interface
class DefaultRNNLM(LMInterface, link.Chain):
    """Default RNNLM wrapper to compute reduce framewise loss values.

    Only the registration of the command-line options is implemented in
    this class body (see the TODO above the class).

    Args:
        n_vocab (int): The size of the vocabulary
        args (argparse.Namespace): configurations. see `add_arguments`
    """

    @staticmethod
    def add_arguments(parser):
        """Register the RNNLM options on ``parser`` and return it."""
        add_arg = parser.add_argument
        add_arg(
            "--type",
            nargs="?",
            type=str,
            default="lstm",
            choices=["lstm", "gru"],
            help="Which type of RNN to use",
        )
        add_arg("--layer", "-l", default=2, type=int, help="Number of hidden layers")
        add_arg("--unit", "-u", default=650, type=int, help="Number of hidden units")
        return parser
class ClassifierWithState(link.Chain):
"""A wrapper for a chainer RNNLM
:param link.Chain predictor : The RNNLM
:param function lossfun: The loss function to use
:param int/str label_key:
"""
def __init__(
self,
predictor,
lossfun=softmax_cross_entropy.softmax_cross_entropy,
label_key=-1,
):
if not (isinstance(label_key, (int, str))):
raise TypeError("label_key must be int or str, but is %s" % type(label_key))
super(ClassifierWithState, self).__init__()
self.lossfun = lossfun
self.y = None
self.loss = None
self.label_key = label_key
with self.init_scope():
self.predictor = predictor
def __call__(self, state, *args, **kwargs):
"""Computes the loss value for an input and label pair.
It also computes accuracy and stores it to the attribute.
When ``label_key`` is ``int``, the corresponding element in ``args``
is treated as ground truth labels. And when it is ``str``, the
element in ``kwargs`` is used.
The all elements of ``args`` and ``kwargs`` except the groundtruth
labels are features.
It feeds features to the predictor and compare the result
with ground truth labels.
:param state : The LM state
:param list[chainer.Variable] args : Input minibatch
:param dict[chainer.Variable] kwargs : Input minibatch
:return loss value
:rtype chainer.Variable
"""
if isinstance(self.label_key, int):
if not (-len(args) <= self.label_key < len(args)):
msg = "Label key %d is out of bounds" % self.label_key
raise ValueError(msg)
t = args[self.label_key]
if self.label_key == -1:
args = args[:-1]
else:
args = args[: self.label_key] + args[self.label_key + 1 :]
elif isinstance(self.label_key, str):
if self.label_key not in kwargs:
msg = 'Label key "%s" is not found' % self.label_key
raise ValueError(msg)
t = kwargs[self.label_key]
del kwargs[self.label_key]
self.y = None
self.loss = None
state, self.y = self.predictor(state, *args, **kwargs)
self.loss = self.lossfun(self.y, t)
return state, self.loss
def predict(self, state, x):
"""Predict log probabilities for given state and input x using the predictor
:param state : the state
:param x : the input
:return a tuple (state, log prob vector)
:rtype cupy/numpy array
"""
if hasattr(self.predictor, "normalized") and self.predictor.normalized:
return self.predictor(state, x)
else:
state, z = self.predictor(state, x)
return state, F.log_softmax(z).data
def final(self, state):
    """Predict final log probabilities for the given state.

    Delegates to ``self.predictor.final`` when the predictor provides it;
    a predictor without a ``final`` attribute yields ``0.0``.

    :param state : the state
    :return log probability vector (or ``0.0`` when the predictor has no
        ``final``)
    """
    predictor = self.predictor
    if not hasattr(predictor, "final"):
        return 0.0
    return predictor.final(state)
# Definition of a recurrent net for language modeling
class RNNLM(chainer.Chain):
    """A chainer RNNLM

    :param int n_vocab: The size of the vocabulary
    :param int n_layers: The number of layers to create
    :param int n_units: The number of units per layer
    :param str typ: The RNN type; ``"lstm"`` builds ``StatelessLSTM``
        layers, any other value builds ``StatelessGRU`` layers
    """

    def __init__(self, n_vocab, n_layers, n_units, typ="lstm"):
        super(RNNLM, self).__init__()
        # Every recurrent layer maps n_units -> n_units; only the cell differs.
        cell_cls = L.StatelessLSTM if typ == "lstm" else L.StatelessGRU
        with self.init_scope():
            self.embed = DL.EmbedID(n_vocab, n_units)
            self.rnn = chainer.ChainList(
                *[cell_cls(n_units, n_units) for _ in range(n_layers)]
            )
            self.lo = L.Linear(n_units, n_vocab)
        # Overwrite every registered parameter with uniform noise drawn
        # from np.random.uniform(-0.1, 0.1).
        for param in self.params():
            param.data[...] = np.random.uniform(-0.1, 0.1, param.data.shape)
        self.n_layers = n_layers
        self.n_units = n_units
        self.typ = typ

    def __call__(self, state, x):
        """Run one step of the network.

        :param state : dict of per-layer states (keys ``"c"`` and ``"h"``
            for LSTM, ``"h"`` only otherwise), or ``None`` to start from
            empty states
        :param x : the input passed to the embedding layer
        :return a tuple (new state dict, output of the final linear layer)
        """
        n_layers = self.n_layers
        is_lstm = self.typ == "lstm"
        if state is None:
            if is_lstm:
                state = {"c": [None] * n_layers, "h": [None] * n_layers}
            else:
                state = {"h": [None] * n_layers}

        h = [None] * n_layers
        # Input of the current layer: the embedding for layer 0, then the
        # hidden output of the layer below.
        layer_in = self.embed(x)
        if is_lstm:
            c = [None] * n_layers
            for n in range(n_layers):
                c[n], h[n] = self.rnn[n](
                    state["c"][n], state["h"][n], F.dropout(layer_in)
                )
                layer_in = h[n]
            state = {"c": c, "h": h}
        else:
            for n in range(n_layers):
                if state["h"][n] is None:
                    # A missing hidden state is replaced by zeros shaped
                    # (rows of the layer input, n_units); this writes into
                    # the state dict that was passed in.
                    xp = self.xp
                    with chainer.backends.cuda.get_device_from_id(self._device_id):
                        state["h"][n] = chainer.Variable(
                            xp.zeros(
                                (layer_in.shape[0], self.n_units),
                                dtype=layer_in.dtype,
                            )
                        )
                h[n] = self.rnn[n](state["h"][n], F.dropout(layer_in))
                layer_in = h[n]
            state = {"h": h}

        y = self.lo(F.dropout(h[-1]))
        return state, y
class BPTTUpdater(training.updaters.StandardUpdater):
    """An updater for a chainer LM

    :param chainer.dataset.Iterator train_iter : The train iterator
    :param optimizer: The optimizer forwarded to ``StandardUpdater``
    :param schedulers: Schedulers wrapped in a ``ChainerScheduler``
    :param int device : The device id
    :param int accum_grad : Number of minibatches whose gradients are
        accumulated before a single parameter update
    """

    def __init__(self, train_iter, optimizer, schedulers, device, accum_grad):
        super(BPTTUpdater, self).__init__(train_iter, optimizer, device=device)
        self.scheduler = ChainerScheduler(schedulers, optimizer)
        self.accum_grad = accum_grad

    # The core part of the update routine can be customized by overriding.
    def update_core(self):
        """Accumulate gradients over ``accum_grad`` batches, then update."""
        # A single iterator/optimizer passed to StandardUpdater.__init__
        # is registered under the name 'main'.
        iterator = self.get_iterator("main")
        optimizer = self.get_optimizer("main")
        model = optimizer.target

        n_tokens = 0
        total_loss = 0
        model.cleargrads()  # start from zeroed gradients
        for _ in range(self.accum_grad):
            # Fetch the next minibatch of sentences and stack it into
            # matrices on the target device (x padded with 0, t with -1).
            minibatch = iterator.__next__()
            x, t = convert.concat_examples(
                minibatch, device=self.device, padding=(0, -1)
            )
            xp = chainer.backends.cuda.get_array_module(x)
            n_sentences, n_steps = x.shape

            state = None
            loss = 0
            for step in range(n_steps):
                inputs = x[:, step]
                # Loss at this time step, weighted by the number of
                # non-zero input ids in the column.
                state, step_loss = model(
                    state, chainer.Variable(inputs), chainer.Variable(t[:, step])
                )
                n_valid = xp.count_nonzero(inputs)
                loss += step_loss * n_valid
                n_tokens += int(n_valid)

            # Normalise by batch size and number of accumulation steps.
            loss /= n_sentences * self.accum_grad
            total_loss += float(loss.data)
            loss.backward()  # backprop
            loss.unchain_backward()  # truncate the graph

        reporter.report({"loss": total_loss}, model)
        reporter.report({"count": n_tokens}, model)
        optimizer.update()  # apply the accumulated gradients
        self.scheduler.step(self.iteration)
class LMEvaluator(BaseEvaluator):
    """A custom evaluator for a chainer LM

    :param chainer.dataset.Iterator val_iter : The validation iterator
    :param eval_model : The model to evaluate
    :param int device : The device id to use
    """

    def __init__(self, val_iter, eval_model, device):
        super(LMEvaluator, self).__init__(val_iter, eval_model, device=device)

    def evaluate(self):
        """Return an observation holding the token-averaged loss.

        The loss of every time step is weighted by the number of non-zero
        input ids in that step and the sum is divided by the total count.
        """
        iterator = self.get_iterator("main")
        model = self.get_target("main")

        total_loss = 0
        n_tokens = 0
        # Iterate over a shallow copy so the registered iterator itself is
        # not advanced.
        for minibatch in copy.copy(iterator):
            x, t = convert.concat_examples(
                minibatch, device=self.device, padding=(0, -1)
            )
            xp = chainer.backends.cuda.get_array_module(x)
            n_steps = len(x[0])
            state = None
            for step in range(n_steps):
                inputs = x[:, step]
                state, step_loss = model(state, inputs, t[:, step])
                n_valid = xp.count_nonzero(inputs)
                total_loss += step_loss.data * n_valid
                n_tokens += int(n_valid)

        # report validation loss
        observation = {}
        with reporter.report_scope(observation):
            reporter.report({"loss": float(total_loss / n_tokens)}, model)
        return observation
def train(args):
    """Train a chainer RNNLM with the given args.

    Reads the train/validation token files, builds an ``RNNLM`` wrapped in
    ``ClassifierWithState``, writes the run configuration to
    ``<outdir>/model.json``, runs a chainer ``Trainer`` with evaluation,
    reporting and snapshot extensions, and finally (when
    ``args.test_label`` is set) logs the test perplexity of the model
    stored at ``<outdir>/rnnlm.model.best``.

    :param Namespace args: The program arguments
    :raises NotImplementedError: if ``args.model_module`` is not "default"
    """
    # TODO(karita): support this
    if args.model_module != "default":
        raise NotImplementedError("chainer backend does not support --model-module")

    # display chainer version
    logging.info("chainer version = " + chainer.__version__)

    set_deterministic_chainer(args)

    # check cuda and cudnn availability (only warns; training continues)
    if not chainer.cuda.available:
        logging.warning("cuda is not available")
    if not chainer.cuda.cudnn_enabled:
        logging.warning("cudnn is not available")

    # get special label ids
    unk = args.char_list_dict["<unk>"]
    eos = args.char_list_dict["<eos>"]
    # read tokens as a sequence of sentences
    train = read_tokens(args.train_label, args.char_list_dict)
    val = read_tokens(args.valid_label, args.char_list_dict)
    # count tokens
    n_train_tokens, n_train_oovs = count_tokens(train, unk)
    n_val_tokens, n_val_oovs = count_tokens(val, unk)
    logging.info("#vocab = " + str(args.n_vocab))
    logging.info("#sentences in the training data = " + str(len(train)))
    logging.info("#tokens in the training data = " + str(n_train_tokens))
    logging.info(
        "oov rate in the training data = %.2f %%"
        % (n_train_oovs / n_train_tokens * 100)
    )
    logging.info("#sentences in the validation data = " + str(len(val)))
    logging.info("#tokens in the validation data = " + str(n_val_tokens))
    logging.info(
        "oov rate in the validation data = %.2f %%" % (n_val_oovs / n_val_tokens * 100)
    )

    # sortagrad is active for -1 or any positive value; while it is active
    # the training iterator is created with shuffling turned off.
    use_sortagrad = args.sortagrad == -1 or args.sortagrad > 0
    # Create the dataset iterators
    # NOTE: the <eos> id is passed as both the sos and the eos symbol.
    train_iter = ParallelSentenceIterator(
        train,
        args.batchsize,
        max_length=args.maxlen,
        sos=eos,
        eos=eos,
        shuffle=not use_sortagrad,
    )
    val_iter = ParallelSentenceIterator(
        val, args.batchsize, max_length=args.maxlen, sos=eos, eos=eos, repeat=False
    )
    # One trainer iteration consumes accum_grad minibatches.
    epoch_iters = int(len(train_iter.batch_indices) / args.accum_grad)
    logging.info("#iterations per epoch = %d" % epoch_iters)
    logging.info("#total iterations = " + str(args.epoch * epoch_iters))
    # Prepare an RNNLM model
    rnn = RNNLM(args.n_vocab, args.layer, args.unit, args.type)
    model = ClassifierWithState(rnn)
    if args.ngpu > 1:
        logging.warning("currently, multi-gpu is not supported. use single gpu.")
    if args.ngpu > 0:
        # Make the specified GPU current
        gpu_id = 0
        chainer.cuda.get_device_from_id(gpu_id).use()
        model.to_gpu()
    else:
        # gpu_id -1 is used when no GPU was requested
        gpu_id = -1

    # Save model conf to json
    model_conf = args.outdir + "/model.json"
    with open(model_conf, "wb") as f:
        logging.info("writing a model config file to " + model_conf)
        f.write(
            json.dumps(vars(args), indent=4, ensure_ascii=False, sort_keys=True).encode(
                "utf_8"
            )
        )

    # Set up an optimizer
    opt_class = dynamic_import_optimizer(args.opt, args.backend)
    optimizer = opt_class.from_args(model, args)
    if args.schedulers is None:
        schedulers = []
    else:
        # args.schedulers is iterated as (k, v) pairs: v selects the
        # scheduler class to import, k and args are its constructor arguments.
        schedulers = [dynamic_import_scheduler(v)(k, args) for k, v in args.schedulers]
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.gradclip))

    # Build the trainer and register its extensions.
    updater = BPTTUpdater(train_iter, optimizer, schedulers, gpu_id, args.accum_grad)
    trainer = training.Trainer(updater, (args.epoch, "epoch"), out=args.outdir)
    trainer.extend(LMEvaluator(val_iter, model, device=gpu_id))
    trainer.extend(
        extensions.LogReport(
            postprocess=compute_perplexity,
            trigger=(args.report_interval_iters, "iteration"),
        )
    )
    trainer.extend(
        extensions.PrintReport(
            ["epoch", "iteration", "perplexity", "val_perplexity", "elapsed_time"]
        ),
        trigger=(args.report_interval_iters, "iteration"),
    )
    trainer.extend(extensions.ProgressBar(update_interval=args.report_interval_iters))
    # Snapshot both the whole trainer and the bare model, named by epoch.
    trainer.extend(extensions.snapshot(filename="snapshot.ep.{.updater.epoch}"))
    trainer.extend(extensions.snapshot_object(model, "rnnlm.model.{.updater.epoch}"))
    # MEMO(Hori): wants to use MinValueTrigger, but it seems to fail in resuming
    trainer.extend(MakeSymlinkToBestModel("validation/main/loss", "rnnlm.model"))

    if use_sortagrad:
        # ShufflingEnabler fires at epoch args.sortagrad, or at args.epoch
        # when sortagrad is -1.
        trainer.extend(
            ShufflingEnabler([train_iter]),
            trigger=(args.sortagrad if args.sortagrad != -1 else args.epoch, "epoch"),
        )
    if args.resume:
        # Restore the complete trainer state from the given snapshot file.
        logging.info("resumed from %s" % args.resume)
        chainer.serializers.load_npz(args.resume, trainer)

    set_early_stop(trainer, args, is_lm=True)
    if args.tensorboard_dir is not None and args.tensorboard_dir != "":
        # tensorboardX is imported only when a tensorboard directory is set;
        # a failed import is logged and then re-raised.
        try:
            from tensorboardX import SummaryWriter
        except Exception:
            logging.error("Please install tensorboardx")
            raise
        writer = SummaryWriter(args.tensorboard_dir)
        trainer.extend(
            TensorboardLogger(writer), trigger=(args.report_interval_iters, "iteration")
        )

    trainer.run()
    check_early_stop(trainer, args.epoch)

    # compute perplexity for test set
    if args.test_label:
        logging.info("test the best model")
        # Load the weights stored under <outdir>/rnnlm.model.best into the model.
        chainer.serializers.load_npz(args.outdir + "/rnnlm.model.best", model)
        test = read_tokens(args.test_label, args.char_list_dict)
        n_test_tokens, n_test_oovs = count_tokens(test, unk)
        logging.info("#sentences in the test data = " + str(len(test)))
        logging.info("#tokens in the test data = " + str(n_test_tokens))
        logging.info(
            "oov rate in the test data = %.2f %%" % (n_test_oovs / n_test_tokens * 100)
        )
        test_iter = ParallelSentenceIterator(
            test, args.batchsize, max_length=args.maxlen, sos=eos, eos=eos, repeat=False
        )
        evaluator = LMEvaluator(test_iter, model, device=gpu_id)
        # Evaluate with chainer's "train" config switched off.
        with chainer.using_config("train", False):
            result = evaluator()
        # Perplexity is exp of the loss reported by the evaluator.
        logging.info("test perplexity: " + str(np.exp(float(result["main/loss"]))))
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment