"vscode:/vscode.git/clone" did not exist on "0ba8a3b0d0139ad94cebf0f4596beae355fa437c"
Commit 9dcc7a15 authored by flyingdown's avatar flyingdown
Browse files

init v0.10.0

parent db2b0b79
Pipeline #254 failed with stages
in 0 seconds
# Libtorchaudio Examples
* [Augmentation](./augmentation)
* [Speech Recognition with wav2vec2.0](./speech_recognition)
## Build
The example applications in this directory depend on `libtorch` and `libtorchaudio`.
If you have a working `PyTorch`, you already have `libtorch`.
Please refer to [this tutorial](https://pytorch.org/tutorials/advanced/torch_script_custom_classes.html) for the use of `libtorch` and TorchScript.
`libtorchaudio` is the library of torchaudio's C++ components, without the Python components.
It is currently not distributed; it will be built alongside the applications.
The following commands will build `libtorchaudio` and the applications.
```bash
git submodule update
mkdir build
cd build
cmake -GNinja \
-DCMAKE_PREFIX_PATH="$(python -c 'import torch;print(torch.utils.cmake_prefix_path)')" \
-DBUILD_SOX=ON \
-DBUILD_KALDI=OFF \
-DBUILD_RNNT=ON \
..
cmake --build .
```
For the usage of each application, refer to the corresponding application directory.
add_executable(augment main.cpp)
target_link_libraries(augment "${TORCH_LIBRARIES}" "${TORCHAUDIO_LIBRARY}")
set_property(TARGET augment PROPERTY CXX_STANDARD 14)
# Augmentation
This example demonstrates how you can use torchaudio's I/O features and augmentations in a C++ application.
**NOTE**
This example uses the `"sox_io"` backend, and thus does not work on Windows.
## Steps
### 1. Create augmentation pipeline TorchScript file.
First, we implement our data processing pipeline in regular Python and save it as a TorchScript object.
We will load and execute it in our C++ application. The C++ code is found in [`main.cpp`](./main.cpp).
```bash
python create_jittable_pipeline.py \
--rir-path "../data/rir.wav" \
--output-path "./pipeline.zip"
```
### 2. Build the application
Please refer to [the top level README.md](../README.md)
### 3. Run the application
Now we run the C++ application `augment` with the TorchScript object we created in Step 1 and an input audio file.
In [the top level directory](../):
```bash
input_audio_file="./data/input.wav"
./build/augmentation/augment ./augmentation/pipeline.zip "${input_audio_file}" "output.wav"
```
When you give a clean speech file, the output audio sounds like it's a phone conversation.
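Before moving to C++, the same `pipeline.zip` can also be exercised from Python, which is a convenient way to verify the TorchScript object. A minimal sketch, assuming the pipeline was created as in Step 1 and that it is run from the top level directory:
```python
import torch

# Load the scripted pipeline and run it; Pipeline.forward takes an input path
# and an output path (see create_jittable_pipeline.py).
pipeline = torch.jit.load("./augmentation/pipeline.zip")
pipeline("./data/input.wav", "output.wav")
```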
#!/usr/bin/env python3
"""
Create a data preprocessing pipeline that can be run with libtorchaudio
"""
import os
import argparse
import torch
import torchaudio
class Pipeline(torch.nn.Module):
"""Example audio process pipeline.
This example loads a waveform from a file, applies effects, and saves the result to a file.
"""
def __init__(self, rir_path: str):
super().__init__()
rir, sample_rate = torchaudio.load(rir_path)
self.register_buffer('rir', rir)
self.rir_sample_rate: int = sample_rate
def forward(self, input_path: str, output_path: str):
torchaudio.sox_effects.init_sox_effects()
# 1. load audio
waveform, sample_rate = torchaudio.load(input_path)
# 2. Add background noise
alpha = 0.01
waveform = alpha * torch.randn_like(waveform) + (1 - alpha) * waveform
# 3. Resample the RIR filter to match the audio sample rate
rir, _ = torchaudio.sox_effects.apply_effects_tensor(
self.rir, self.rir_sample_rate, effects=[["rate", str(sample_rate)]])
rir = rir / torch.norm(rir, p=2)
rir = torch.flip(rir, [1])
# 4. Apply RIR filter
waveform = torch.nn.functional.pad(waveform, (rir.shape[1] - 1, 0))
waveform = torch.nn.functional.conv1d(waveform[None, ...], rir[None, ...])[0]
# Save
torchaudio.save(output_path, waveform, sample_rate)
def _create_jit_pipeline(rir_path, output_path):
module = torch.jit.script(Pipeline(rir_path))
print("*" * 40)
print("* Pipeline code")
print("*" * 40)
print()
print(module.code)
print("*" * 40)
module.save(output_path)
def _get_path(*paths):
return os.path.join(os.path.dirname(__file__), *paths)
def _parse_args():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--rir-path",
default=_get_path("..", "data", "rir.wav"),
help="Audio dara for room impulse response."
)
parser.add_argument(
"--output-path",
default=_get_path("pipeline.zip"),
help="Output JIT file."
)
return parser.parse_args()
def _main():
args = _parse_args()
_create_jit_pipeline(args.rir_path, args.output_path)
if __name__ == '__main__':
_main()
#include <torch/script.h>
int main(int argc, char* argv[]) {
if (argc != 4) {
std::cerr << "Usage: " << argv[0] << " <JIT_OBJECT> <INPUT_FILE> <OUTPUT_FILE>" << std::endl;
return -1;
}
torch::jit::script::Module module;
std::cout << "Loading module from: " << argv[1] << std::endl;
try {
module = torch::jit::load(argv[1]);
} catch (const c10::Error &error) {
std::cerr << "Failed to load the module:" << error.what() << std::endl;
return -1;
}
std::cout << "Performing the process ..." << std::endl;
module.forward({c10::IValue(argv[2]), c10::IValue(argv[3])});
std::cout << "Done." << std::endl;
}
#!/usr/bin/env bash
set -eux
this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
build_dir="${this_dir}/build"
mkdir -p "${build_dir}"
cd "${build_dir}"
git submodule update
cmake -GNinja \
-DCMAKE_PREFIX_PATH="$(python -c 'import torch;print(torch.utils.cmake_prefix_path)')" \
-DBUILD_SOX=ON \
-DBUILD_KALDI=OFF \
..
cmake --build .
The files in this directory originate from the [VOiCES](https://iqtlabs.github.io/voices/) dataset, which is licensed under Creative Commons BY 4.0. They are modified to fit into the tutorial.
* `input.wav`: `VOiCES_devkit/source-16k/train/sp0307/Lab41-SRI-VOiCES-src-sp0307-ch127535-sg0042.wav`
* `rir.wav`: `VOiCES_devkit/distant-16k/room-response/rm1/impulse/Lab41-SRI-VOiCES-rm1-impulse-mc01-stu-clo.wav`
add_executable(transcribe transcribe.cpp)
add_executable(transcribe_list transcribe_list.cpp)
target_link_libraries(transcribe "${TORCH_LIBRARIES}" "${TORCHAUDIO_LIBRARY}")
target_link_libraries(transcribe_list "${TORCH_LIBRARIES}" "${TORCHAUDIO_LIBRARY}")
set_property(TARGET transcribe PROPERTY CXX_STANDARD 14)
set_property(TARGET transcribe_list PROPERTY CXX_STANDARD 14)
# Speech Recognition with wav2vec2.0
This example demonstrates how you can use torchaudio's I/O features and models to run speech recognition in a C++ application.
**NOTE**
This example uses the `"sox_io"` backend for loading audio, which does not work on Windows. To make it work on
Windows, you need to replace the part that loads the audio and converts it to a Tensor object.
## 1. Create a transcription pipeline TorchScript file
We will create TorchScript objects that perform the following processes:
1. Load audio from a file.
1. Pass the audio to an encoder, which produces a sequence of probability distributions over the labels.
1. Pass the encoder output to a decoder, which generates the transcript.
To build the encoder, we borrow pre-trained weights published by `fairseq` or Hugging Face Transformers and convert them to `torchaudio`'s format, which supports TorchScript.
### 1.1. From `fairseq`
For `fairseq` models, you can download pre-trained weights from the [`fairseq` repository](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec). Here, we will use the `Base / 960h` model. You also need to download [the letter dictionary file](https://github.com/pytorch/fairseq/tree/main/examples/wav2vec#evaluating-a-ctc-model).
For the decoder part, we use [simple_ctc](https://github.com/mthrok/ctcdecode), which also supports TorchScript.
```bash
mkdir -p pipeline-fairseq
python build_pipeline_from_fairseq.py \
--model-file "wav2vec_small_960.pt" \
--dict-dir <DIRECTORY_WHERE_dict.ltr.txt_IS_FOUND> \
--output-path "./pipeline-fairseq/"
```
The above command should create the following TorchScript object files in the output directory.
```
decoder.zip encoder.zip loader.zip
```
* `loader.zip` loads an audio file and generates a waveform Tensor.
* `encoder.zip` receives a waveform Tensor and generates a sequence of probability distributions over the labels.
* `decoder.zip` receives the probability distributions over the labels and generates a transcript.
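Before wiring them into the C++ application, the three objects can be sanity-checked from Python as well. A minimal sketch, assuming the files were written to `./pipeline-fairseq` as in the command above and that the sample audio from [the data directory](../data) is used:
```python
import torch

# Load the three TorchScript objects and chain them, mirroring transcribe.cpp.
loader = torch.jit.load("./pipeline-fairseq/loader.zip")
encoder = torch.jit.load("./pipeline-fairseq/encoder.zip")
decoder = torch.jit.load("./pipeline-fairseq/decoder.zip")

waveform = loader("../data/input.wav")  # waveform Tensor
emission = encoder(waveform)            # per-frame probability distributions over the labels
print(decoder(emission))                # transcript string
```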
### 1.2. From Hugging Face Transformers
[Hugging Face Transformers](https://huggingface.co/transformers/index.html) and the [Hugging Face Model Hub](https://huggingface.co/models) provide `wav2vec2.0` models fine-tuned on a variety of datasets and languages.
We can also import a model published on the Hugging Face Hub and run it in our C++ application.
In the following example, we will try the German model ([facebook/wav2vec2-large-xlsr-53-german](https://huggingface.co/facebook/wav2vec2-large-xlsr-53-german/tree/main)) on the [VoxForge German dataset](http://www.voxforge.org/de/downloads).
```bash
mkdir -p pipeline-hf
python build_pipeline_from_huggingface_transformers.py \
--model facebook/wav2vec2-large-xlsr-53-german \
--output-path ./pipeline-hf/
```
The resulting TorchScript object files should be the same as in the `fairseq` example.
## 2. Build the application
Please refer to [the top level README.md](../README.md)
## 3. Run the application
Now we run the C++ application [`transcribe`](./transcribe.cpp) with the TorchScript objects we created in Step 1.1 and an input audio file.
```bash
../build/speech_recognition/transcribe ./pipeline-fairseq ../data/input.wav
```
This will output something like the following.
```
Loading module from: ./pipeline-fairseq/loader.zip
Loading module from: ./pipeline-fairseq/encoder.zip
Loading module from: ./pipeline-fairseq/decoder.zip
Loading the audio
Running inference
Generating the transcription
I HAD THAT CURIOSITY BESIDE ME AT THIS MOMENT
Done.
```
## 4. Evaluate the pipeline on Librispeech dataset
Let's evaluate the word error rate (WER) of this application using the [Librispeech dataset](https://www.openslr.org/12).
### 4.1. Create a list of audio paths
For the sake of simplifying our C++ code, we will first parse the Librispeech dataset to get a list of audio paths.
```bash
python parse_librispeech.py <PATH_TO_YOUR_DATASET>/LibriSpeech/test-clean > ./flist.txt
```
The list should look like the following:
```bash
head flist.txt
1089-134691-0000 /LibriSpeech/test-clean/1089/134691/1089-134691-0000.flac HE COULD WAIT NO LONGER
```
### 4.2. Run the transcription
[`transcribe_list`](./transcribe_list.cpp) processes the input file list, feeds the audio paths one by one to the pipeline, and generates a reference file and a hypothesis file.
```bash
../build/speech_recognition/transcribe_list ./pipeline-fairseq ./flist.txt <OUTPUT_DIR>
```
### 4.3. Score WER
You need `sclite` for this step. You can download the code from [SCTK repository](https://github.com/usnistgov/SCTK).
```bash
# in the output directory
sclite -r ref.trn -h hyp.trn -i wsj -o pralign -o sum
```
WER can be found in the resulting `hyp.trn.sys`. Check the row that starts with `Sum/Avg`; the first column of the third block is `100 - WER`.
In our test, we got the following results.
| model | Fine Tune | test-clean | test-other |
|:-----------------------------------------:|----------:|:----------:|:----------:|
| Base<br/>`wav2vec_small_960` | 960h | 3.1 | 7.7 |
| Large<br/>`wav2vec_big_960` | 960h | 2.6 | 5.9 |
| Large (LV-60)<br/>`wav2vec2_vox_960h_new` | 960h | 2.9 | 6.2 |
| Large (LV-60) + Self Training<br/>`wav2vec_vox_960h_pl` | 960h | 1.9 | 4.5 |
You can also check `hyp.trn.pra` file to see what errors were made.
```
id: (3528-168669-0005)
Scores: (#C #S #D #I) 7 1 0 0
REF: there is a stone to be RAISED heavy
HYP: there is a stone to be RACED heavy
Eval: S
```
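If you want to double-check a single reference/hypothesis pair without `sclite`, the word error rate can also be computed directly. A small sketch using `torchaudio.functional.edit_distance` (word-level Levenshtein distance divided by the number of reference words), applied to the pair above:
```python
import torchaudio.functional as F

# One substitution (RAISED -> RACED) over 8 reference words -> WER = 0.125
ref = "there is a stone to be raised heavy".split()
hyp = "there is a stone to be raced heavy".split()
print(F.edit_distance(ref, hyp) / len(ref))
```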
## 5. Evaluate the pipeline on VoxForge dataset
Now we use the pipeline we created in Step 1.2, this time with the German-language dataset from VoxForge.
### 5.1. Create a list of audio paths
Download an archive from http://www.repository.voxforge1.org/downloads/de/Trunk/Audio/Main/16kHz_16bit/, and extract it to your local file system, then run the following to generate the file list.
```bash
python parse_voxforge.py <PATH_TO_YOUR_DATASET> > ./flist-de.txt
```
The list should look like the following:
```bash
head flist-de.txt
de5-001 /datasets/voxforge/de/guenter-20140214-afn/wav/de5-001.wav ES SOLL ETWA FÜNFZIGTAUSEND VERSCHIEDENE SORTEN GEBEN
```
### 5.2. Run the application and score WER
This process is the same as in the Librispeech example; we just use the pipeline with the German model and the file list of the German dataset. Refer to the corresponding section of the Librispeech evaluation.
```bash
../build/speech_recognition/transcribe_list ./pipeline-hf ./flist-de.txt <OUTPUT_DIR>
```
Then score the result with `sclite`.
```bash
# in the output directory
sclite -r ref.trn -h hyp.trn -i wsj -o pralign -o sum
```
You can find the details of the evaluation result in the resulting `hyp.trn.pra` file.
```
id: (guenter-20140214-afn/mfc/de5-012)
Scores: (#C #S #D #I) 4 1 1 0
REF: die ausgaben kÖnnen gigantisch STEIGE N
HYP: die ausgaben kÖnnen gigantisch ****** STEIGEN
Eval: D S
```
#!/usr/bin/env python3
"""Build Speech Recognition pipeline based on fairseq's wav2vec2.0 and dump it to TorchScript file.
To use this script, you need `fairseq`.
"""
import os
import argparse
import logging
import torch
from torch.utils.mobile_optimizer import optimize_for_mobile
import torchaudio
from torchaudio.models.wav2vec2.utils.import_fairseq import import_fairseq_model
import fairseq
from greedy_decoder import Decoder
_LG = logging.getLogger(__name__)
def _parse_args():
parser = argparse.ArgumentParser(
description=__doc__,
)
parser.add_argument(
'--model-file',
required=True,
help='Path to the input pretrained weight file.'
)
parser.add_argument(
'--dict-dir',
help=(
'Path to the directory in which `dict.ltr.txt` file is found. '
'Required only when the model is finetuned.'
)
)
parser.add_argument(
'--output-path',
help='Path to the directory, where the TorchScript-ed pipelines are saved.',
)
parser.add_argument(
'--test-file',
help='Path to a test audio file.',
)
parser.add_argument(
'--debug',
action='store_true',
help=(
'When enabled, individual components are separately tested '
'for the numerical compatibility and TorchScript compatibility.'
)
)
parser.add_argument(
'--quantize',
action='store_true',
help='Apply quantization to model.'
)
parser.add_argument(
'--optimize-for-mobile',
action='store_true',
help='Apply optimization for mobile.'
)
return parser.parse_args()
class Loader(torch.nn.Module):
def forward(self, audio_path: str) -> torch.Tensor:
waveform, sample_rate = torchaudio.load(audio_path)
if sample_rate != 16000:
waveform = torchaudio.functional.resample(waveform, float(sample_rate), 16000.)
return waveform
class Encoder(torch.nn.Module):
def __init__(self, encoder: torch.nn.Module):
super().__init__()
self.encoder = encoder
def forward(self, waveform: torch.Tensor) -> torch.Tensor:
result, _ = self.encoder(waveform)
return result[0]
def _get_decoder():
labels = [
"<s>",
"<pad>",
"</s>",
"<unk>",
"|",
"E",
"T",
"A",
"O",
"N",
"I",
"H",
"S",
"R",
"D",
"L",
"U",
"M",
"W",
"C",
"F",
"G",
"Y",
"P",
"B",
"V",
"K",
"'",
"X",
"J",
"Q",
"Z",
]
return Decoder(labels)
def _load_fairseq_model(input_file, data_dir=None):
overrides = {}
if data_dir:
overrides['data'] = data_dir
model, _, _ = fairseq.checkpoint_utils.load_model_ensemble_and_task(
[input_file], arg_overrides=overrides
)
model = model[0]
return model
def _get_model(model_file, dict_dir):
original = _load_fairseq_model(model_file, dict_dir)
model = import_fairseq_model(original.w2v_encoder)
return model
def _main():
args = _parse_args()
_init_logging(args.debug)
loader = Loader()
model = _get_model(args.model_file, args.dict_dir).eval()
encoder = Encoder(model)
decoder = _get_decoder()
_LG.info(encoder)
if args.quantize:
_LG.info('Quantizing the model')
model.encoder.transformer.pos_conv_embed.__prepare_scriptable__()
encoder = torch.quantization.quantize_dynamic(
encoder, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8)
_LG.info(encoder)
# test
if args.test_file:
_LG.info('Testing with %s', args.test_file)
waveform = loader(args.test_file)
emission = encoder(waveform)
transcript = decoder(emission)
_LG.info(transcript)
torch.jit.script(loader).save(os.path.join(args.output_path, 'loader.zip'))
torch.jit.script(decoder).save(os.path.join(args.output_path, 'decoder.zip'))
scripted = torch.jit.script(encoder)
if args.optimize_for_mobile:
scripted = optimize_for_mobile(scripted)
scripted.save(os.path.join(args.output_path, 'encoder.zip'))
def _init_logging(debug=False):
level = logging.DEBUG if debug else logging.INFO
format_ = (
'%(message)s' if not debug else
'%(asctime)s: %(levelname)7s: %(funcName)10s: %(message)s'
)
logging.basicConfig(level=level, format=format_)
if __name__ == '__main__':
_main()
#!/usr/bin/env python3
import argparse
import logging
import os
import torch
import torchaudio
from torchaudio.models.wav2vec2.utils.import_huggingface import import_huggingface_model
from greedy_decoder import Decoder
_LG = logging.getLogger(__name__)
def _parse_args():
parser = argparse.ArgumentParser(
description=__doc__,
)
parser.add_argument(
'--model',
required=True,
help='Hugging Face model ID or local path of the pretrained model.'
)
parser.add_argument(
'--output-path',
help='Path to the directory, where the Torchscript-ed pipelines are saved.',
)
parser.add_argument(
'--test-file',
help='Path to a test audio file.',
)
parser.add_argument(
'--quantize',
action='store_true',
help='Quantize the model.',
)
parser.add_argument(
'--debug',
action='store_true',
help=(
'When enabled, individual components are separately tested '
'for the numerical compatibility and TorchScript compatibility.'
)
)
return parser.parse_args()
class Loader(torch.nn.Module):
def forward(self, audio_path: str) -> torch.Tensor:
waveform, sample_rate = torchaudio.load(audio_path)
if sample_rate != 16000:
waveform = torchaudio.functional.resample(waveform, float(sample_rate), 16000.)
return waveform
class Encoder(torch.nn.Module):
def __init__(self, encoder: torch.nn.Module):
super().__init__()
self.encoder = encoder
def forward(self, waveform: torch.Tensor) -> torch.Tensor:
result, _ = self.encoder(waveform)
return result[0]
def _get_model(model_id):
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
tokenizer = Wav2Vec2Processor.from_pretrained(model_id).tokenizer
labels = [k for k, v in sorted(tokenizer.get_vocab().items(), key=lambda kv: kv[1])]
original = Wav2Vec2ForCTC.from_pretrained(model_id)
model = import_huggingface_model(original)
return model.eval(), labels
def _get_decoder(labels):
return Decoder(labels)
def _main():
args = _parse_args()
_init_logging(args.debug)
_LG.info('Loading model: %s', args.model)
model, labels = _get_model(args.model)
_LG.info('Labels: %s', labels)
_LG.info('Building pipeline')
loader = Loader()
encoder = Encoder(model)
decoder = _get_decoder(labels)
_LG.info(encoder)
if args.quantize:
_LG.info('Quantizing the model')
model.encoder.transformer.pos_conv_embed.__prepare_scriptable__()
encoder = torch.quantization.quantize_dynamic(
encoder, qconfig_spec={torch.nn.Linear}, dtype=torch.qint8)
_LG.info(encoder)
# test
if args.test_file:
_LG.info('Testing with %s', args.test_file)
waveform = loader(args.test_file)
emission = encoder(waveform)
transcript = decoder(emission)
_LG.info(transcript)
torch.jit.script(loader).save(os.path.join(args.output_path, 'loader.zip'))
torch.jit.script(encoder).save(os.path.join(args.output_path, 'encoder.zip'))
torch.jit.script(decoder).save(os.path.join(args.output_path, 'decoder.zip'))
def _init_logging(debug=False):
level = logging.DEBUG if debug else logging.INFO
format_ = (
'%(message)s' if not debug else
'%(asctime)s: %(levelname)7s: %(funcName)10s: %(message)s'
)
logging.basicConfig(level=level, format=format_)
if __name__ == '__main__':
_main()
import torch
class Decoder(torch.nn.Module):
def __init__(self, labels):
super().__init__()
self.labels = labels
def forward(self, logits: torch.Tensor) -> str:
"""Given a sequence logits over labels, get the best path string
Args:
logits (Tensor): Logit tensors. Shape `[num_seq, num_label]`.
Returns:
str: The resulting transcript
"""
best_path = torch.argmax(logits, dim=-1) # [num_seq,]
best_path = torch.unique_consecutive(best_path, dim=-1)
hypothesis = ''
for i in best_path:
char = self.labels[i]
if char in ['<s>', '<pad>']:
continue
if char == '|':
char = ' '
hypothesis += char
return hypothesis
#!/usr/bin/env python3
"""Parse a directory contains Librispeech dataset.
Recursively search for "*.trans.txt" file in the given directory and print out
`<ID>\\t<AUDIO_PATH>\\t<TRANSCRIPTION>`
example: python parse_librispeech.py LibriSpeech/test-clean
1089-134691-0000\t/LibriSpeech/test-clean/1089/134691/1089-134691-0000.flac\tHE COULD WAIT NO LONGER
...
Dataset can be obtained from https://www.openslr.org/12
"""
import argparse
from pathlib import Path
def _parse_args():
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawTextHelpFormatter,
)
parser.add_argument(
'input_dir',
type=Path,
help='Directory where `*.trans.txt` files are searched.'
)
return parser.parse_args()
def _parse_transcript(path):
with open(path) as trans_fileobj:
for line in trans_fileobj:
line = line.strip()
if line:
yield line.split(' ', maxsplit=1)
def _parse_directory(root_dir: Path):
for trans_file in root_dir.glob('**/*.trans.txt'):
trans_dir = trans_file.parent
for id_, transcription in _parse_transcript(trans_file):
audio_path = trans_dir / f'{id_}.flac'
yield id_, audio_path, transcription
def _main():
args = _parse_args()
for id_, path, transcription in _parse_directory(args.input_dir):
print(f'{id_}\t{path}\t{transcription}')
if __name__ == '__main__':
_main()
#!/usr/bin/env python
"""Parse a directory contains VoxForge dataset.
Recursively search for "PROMPTS" file in the given directory and print out
`<ID>\\t<AUDIO_PATH>\\t<TRANSCRIPTION>`
example: python parse_voxforge.py voxforge/de/Helge-20150608-aku
de5-001\t/datasets/voxforge/de/guenter-20140214-afn/wav/de5-001.wav\tES SOLL ETWA FÜNFZIGTAUSEND VERSCHIEDENE SORTEN GEBEN
...
Dataset can be obtained from http://www.repository.voxforge1.org/downloads/de/Trunk/Audio/Main/16kHz_16bit/
""" # noqa: E501
import os
import argparse
from pathlib import Path
def _parse_args():
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawTextHelpFormatter,
)
parser.add_argument(
'input_dir',
type=Path,
help='Directory where `PROMPTS` files are searched.'
)
return parser.parse_args()
def _parse_prompts(path):
base_dir = path.parent.parent
with open(path) as trans_fileobj:
for line in trans_fileobj:
line = line.strip()
if not line:
continue
id_, transcript = line.split(' ', maxsplit=1)
if not transcript:
continue
transcript = transcript.upper()
filename = id_.split('/')[-1]
audio_path = base_dir / 'wav' / f'{filename}.wav'
if os.path.exists(audio_path):
yield id_, audio_path, transcript
def _parse_directory(root_dir: Path):
for prompt_file in root_dir.glob('**/PROMPTS'):
try:
yield from _parse_prompts(prompt_file)
except UnicodeDecodeError:
pass
def _main():
args = _parse_args()
for id_, path, transcription in _parse_directory(args.input_dir):
print(f'{id_}\t{path}\t{transcription}')
if __name__ == '__main__':
_main()
#include <torch/script.h>
int main(int argc, char* argv[]) {
if (argc != 3) {
std::cerr << "Usage: " << argv[0] << " <JIT_OBJECT_DIR> <INPUT_AUDIO_FILE>" << std::endl;
return -1;
}
torch::jit::script::Module loader, encoder, decoder;
std::cout << "Loading module from: " << argv[1] << std::endl;
try {
loader = torch::jit::load(std::string(argv[1]) + "/loader.zip");
} catch (const c10::Error &error) {
std::cerr << "Failed to load the module:" << error.what() << std::endl;
return -1;
}
try {
encoder = torch::jit::load(std::string(argv[1]) + "/encoder.zip");
} catch (const c10::Error &error) {
std::cerr << "Failed to load the module:" << error.what() << std::endl;
return -1;
}
try {
decoder = torch::jit::load(std::string(argv[1]) + "/decoder.zip");
} catch (const c10::Error &error) {
std::cerr << "Failed to load the module:" << error.what() << std::endl;
return -1;
}
std::cout << "Loading the audio" << std::endl;
auto waveform = loader.forward({c10::IValue(argv[2])});
std::cout << "Running inference" << std::endl;
auto emission = encoder.forward({waveform});
std::cout << "Generating the transcription" << std::endl;
auto result = decoder.forward({emission});
std::cout << result.toString()->string() << std::endl;
std::cout << "Done." << std::endl;
}
#include <chrono>
#include <fstream>
#include <iostream>
#include <sstream>
#include <torch/script.h>
int main(int argc, char* argv[]) {
if (argc != 4) {
std::cerr << "Usage: " << argv[0] << "<JIT_OBJECT_DIR> <FILE_LIST> <OUTPUT_DIR>\n" << std::endl;
std::cerr << "<FILE_LIST> is `<ID>\t<PATH>\t<TRANSCRIPTION>`" << std::endl;
return -1;
}
torch::jit::script::Module loader, encoder, decoder;
std::cout << "Loading module from: " << argv[1] << std::endl;
try {
loader = torch::jit::load(std::string(argv[1]) + "/loader.zip");
} catch (const c10::Error &error) {
std::cerr << "Failed to load the module:" << error.what() << std::endl;
return -1;
}
try {
encoder = torch::jit::load(std::string(argv[1]) + "/encoder.zip");
} catch (const c10::Error &error) {
std::cerr << "Failed to load the module:" << error.what() << std::endl;
return -1;
}
try {
decoder = torch::jit::load(std::string(argv[1]) + "/decoder.zip");
} catch (const c10::Error &error) {
std::cerr << "Failed to load the module:" << error.what() << std::endl;
return -1;
}
std::ifstream input_file(argv[2]);
std::string output_dir(argv[3]);
std::ofstream output_ref(output_dir + "/ref.trn");
std::ofstream output_hyp(output_dir + "/hyp.trn");
std::string line;
std::chrono::milliseconds t_encode(0);
std::chrono::milliseconds t_decode(0);
while(std::getline(input_file, line)) {
std::istringstream iline(line);
std::string id;
std::string path;
std::string reference;
std::getline(iline, id, '\t');
std::getline(iline, path, '\t');
std::getline(iline, reference, '\t');
auto waveform = loader.forward({c10::IValue(path)});
std::chrono::steady_clock::time_point t0 = std::chrono::steady_clock::now();
auto emission = encoder.forward({waveform});
std::chrono::steady_clock::time_point t1 = std::chrono::steady_clock::now();
auto result = decoder.forward({emission});
std::chrono::steady_clock::time_point t2 = std::chrono::steady_clock::now();
t_encode += std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0);
t_decode += std::chrono::duration_cast<std::chrono::milliseconds>(t2 - t1);
auto hypothesis = result.toString()->string();
output_hyp << hypothesis << " (" << id << ")" << std::endl;
output_ref << reference << " (" << id << ")" << std::endl;
std::cout << id << '\t' << hypothesis << std::endl;
}
std::cout << "Time (encode): " << t_encode.count() << " [ms]" << std::endl;
std::cout << "Time (decode): " << t_decode.count() << " [ms]" << std::endl;
}
This is an example pipeline for text-to-speech using Tacotron2.
Here is a [colab example](https://colab.research.google.com/drive/1MPcn1_G5lKozxZ7v8b9yucOD5X5cLK4j?usp=sharing)
that shows how the text-to-speech pipeline is used during inference with the built-in pretrained models.
## Install required packages
Required packages
```bash
pip install librosa tqdm inflect joblib
```
To use tensorboard
```bash
pip install tensorboard pillow
```
## Training Tacotron2 with character as input
The training of Tacotron2 can be invoked with the following command.
```bash
python train.py \
--learning-rate 1e-3 \
--epochs 1501 \
--anneal-steps 500 1000 1500 \
--anneal-factor 0.1 \
--batch-size 96 \
--weight-decay 1e-6 \
--grad-clip 1.0 \
--text-preprocessor english_characters \
--logging-dir ./logs \
--checkpoint-path ./ckpt.pth \
--dataset-path ./
```
The training script will use all available GPUs; set the environment variable
`CUDA_VISIBLE_DEVICES` if you don't want all of them to be used.
The newest checkpoint will be saved to `./ckpt.pth` and the checkpoint with the best validation
loss will be saved to `./best_ckpt.pth`.
The training log will be saved to `./logs/train.log` and the tensorboard results will also
be in `./logs`.
If `./ckpt.pth` already exists, this script will automatically load the file and try to continue
training from the checkpoint.
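As a purely illustrative, hedged sketch of that resume behavior (this is not the actual `train.py` code, and the checkpoint keys `"state_dict"`, `"optimizer"`, and `"epoch"` are assumptions made only for this example):
```python
# Hypothetical sketch of checkpoint resumption: restore model/optimizer state
# and the starting epoch if the checkpoint file already exists.
import os
import torch

def maybe_resume(checkpoint_path, model, optimizer):
    start_epoch = 0
    if os.path.exists(checkpoint_path):
        ckpt = torch.load(checkpoint_path, map_location="cpu")
        model.load_state_dict(ckpt["state_dict"])      # assumed key name
        optimizer.load_state_dict(ckpt["optimizer"])   # assumed key name
        start_epoch = ckpt["epoch"]                    # assumed key name
    return start_epoch
```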
This command takes around 36 hours to train on 8 NVIDIA Tesla V100 GPUs.
To train the Tacotron2 model to work with the [pretrained wavernn](https://pytorch.org/audio/main/models.html#id10)
with checkpoint_name `"wavernn_10k_epochs_8bits_ljspeech"`, please run the following command instead.
```bash
python train.py \
--learning-rate 1e-3 \
--epochs 1501 \
--anneal-steps 500 1000 1500 \
--anneal-factor 0.1 \
--sample-rate 22050 \
--n-fft 2048 \
--hop-length 275 \
--win-length 1100 \
--mel-fmin 40 \
--mel-fmax 11025 \
--batch-size 96 \
--weight-decay 1e-6 \
--grad-clip 1.0 \
--text-preprocessor english_characters \
--logging-dir ./wavernn_logs \
--checkpoint-path ./ckpt_wavernn.pth \
--dataset-path ./
```
## Training Tacotron2 with phoneme as input
#### Dependencies
This example uses [DeepPhonemizer](https://github.com/as-ideas/DeepPhonemizer) as
the phonemizer (the component that turns text into phonemes).
Please install it with the following command (the code is tested with version 0.0.15).
```bash
pip install deep-phonemizer==0.0.15
```
Then download the model weights from [their website](https://github.com/as-ideas/DeepPhonemizer).
The checkpoint tested with this example is
[https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/DeepPhonemizer/en_us_cmudict_forward.pt](https://public-asai-dl-models.s3.eu-central-1.amazonaws.com/DeepPhonemizer/en_us_cmudict_forward.pt).
#### Running training script
The training of Tacotron2 with English phonemes as input can be invoked with the following command.
```bash
python train.py \
--workers 12 \
--learning-rate 1e-3 \
--epochs 1501 \
--anneal-steps 500 1000 1500 \
--anneal-factor 0.1 \
--batch-size 96 \
--weight-decay 1e-6 \
--grad-clip 1.0 \
--text-preprocessor english_phonemes \
--phonemizer DeepPhonemizer \
--phonemizer-checkpoint ./en_us_cmudict_forward.pt \
--cmudict-root ./ \
--logging-dir ./english_phonemes_logs \
--checkpoint-path ./english_phonemes_ckpt.pth \
--dataset-path ./
```
Similar to the previous examples, this command will save the log in the directory `./english_phonemes_logs`
and the checkpoint to `./english_phonemes_ckpt.pth`.
To train the Tacotron2 model with English phonemes that works with the
[pretrained wavernn](https://pytorch.org/audio/main/models.html#id10)
with checkpoint_name `"wavernn_10k_epochs_8bits_ljspeech"`, please run the following command.
```bash
python train.py \
--workers 12 \
--learning-rate 1e-3 \
--epochs 1501 \
--anneal-steps 500 1000 1500 \
--anneal-factor 0.1 \
--sample-rate 22050 \
--n-fft 2048 \
--hop-length 275 \
--win-length 1100 \
--mel-fmin 40 \
--mel-fmax 11025 \
--batch-size 96 \
--weight-decay 1e-6 \
--grad-clip 1.0 \
--text-preprocessor english_phonemes \
--phonemizer DeepPhonemizer \
--phonemizer-checkpoint ./en_us_cmudict_forward.pt \
--cmudict-root ./ \
--logging-dir ./english_phonemes_wavernn_logs \
--checkpoint-path ./english_phonemes_wavernn_ckpt.pth \
--dataset-path ./
```
## Text-to-speech pipeline
Here we present an example of how to use Tacotron2 to generate audio from text.
The text-to-speech pipeline goes as follows:
1. Text preprocessing: encode the text into a list of symbols (the symbols can represent characters, phonemes, etc.).
2. Spectrogram generation: feed the list of symbols to a Tacotron2 model, which outputs a mel spectrogram.
3. Time-domain conversion: convert the generated mel spectrogram into audio with a vocoder.
Currently, three vocoders are supported in this script:
[WaveRNN](https://pytorch.org/audio/stable/models/wavernn.html),
[Griffin-Lim](https://pytorch.org/audio/stable/transforms.html#griffinlim), and
[Nvidia's WaveGlow](https://pytorch.org/hub/nvidia_deeplearningexamples_tacotron2/).
The spectrogram parameters, including `n-fft`, `mel-fmin`, and `mel-fmax`, should be set to the values
used during the training of Tacotron2.
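To make the three stages concrete, here is a minimal, hedged sketch built only from torchaudio's public building blocks. It uses a randomly initialized Tacotron2 and a toy character set (both are assumptions made purely for illustration), so the audio it writes is meaningless; the real `inference.py` loads a trained checkpoint and this example's text preprocessors instead. The `torch.exp` call assumes the log-mel normalization used during training (the inverse of `SpectralNormalization` in `datasets.py`).
```python
import torch
import torchaudio
from torchaudio.models import Tacotron2

# 1. Text preprocessing: encode text into symbol IDs (toy character mapping).
symbols = "_ abcdefghijklmnopqrstuvwxyz!'(),.:;?"
tokens = torch.tensor([[symbols.index(c) for c in "hello world!" if c in symbols]])
lengths = torch.tensor([tokens.shape[1]])

# 2. Spectrogram generation: Tacotron2 maps symbol IDs to a mel spectrogram.
tacotron2 = Tacotron2(n_symbol=len(symbols)).eval()
with torch.no_grad():
    mel, mel_lengths, _ = tacotron2.infer(tokens, lengths)

# 3. Time-domain conversion: a vocoder (here Griffin-Lim) maps the mel
#    spectrogram back to a waveform.
n_fft = 1024
inverse_mel = torchaudio.transforms.InverseMelScale(n_stft=n_fft // 2 + 1, n_mels=80, sample_rate=22050)
griffin_lim = torchaudio.transforms.GriffinLim(n_fft=n_fft)
waveform = griffin_lim(inverse_mel(torch.exp(mel)))  # undo log-mel normalization, then invert
torchaudio.save("outputs.wav", waveform, 22050)
```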
#### Pretrained WaveRNN as the Vocoder
The following command will generate a waveform to `./outputs.wav`
with the text "Hello world!" using WaveRNN as the vocoder.
```bash
python inference.py --checkpoint-path ${model_path} \
--vocoder wavernn \
--n-fft 2048 \
--mel-fmin 40 \
--mel-fmax 11025 \
--input-text "Hello world!" \
--text-preprocessor english_characters \
--output-path "./outputs.wav"
```
If you want to generate a waveform from a different text, or with phonemes
as the input to Tacotron2, please use `--text-preprocessor english_phonemes`.
The following is an example.
(Remember to install [DeepPhonemizer](https://github.com/as-ideas/DeepPhonemizer)
and download their pretrained weights.)
```bash
python inference.py --checkpoint-path ${model_path} \
--vocoder wavernn \
--n-fft 2048 \
--mel-fmin 40 \
--mel-fmax 11025 \
--input-text "Hello world!" \
--text-preprocessor english_phonemes \
--phonemizer DeepPhonemizer \
--phonemizer-checkpoint ./en_us_cmudict_forward.pt \
--cmudict-root ./ \
--output-path "./outputs.wav"
```
To use torchaudio pretrained models, please see the following example command.
For Tacotron2, we use the checkpoint named `"tacotron2_english_phonemes_1500_epochs_wavernn_ljspeech"`, and
for WaveRNN, we use the checkpoint named `"wavernn_10k_epochs_8bits_ljspeech"`.
See https://pytorch.org/audio/stable/models.html for more checkpoint options for Tacotron2 and WaveRNN.
```bash
python inference.py \
--checkpoint-path tacotron2_english_phonemes_1500_epochs_wavernn_ljspeech \
--wavernn-checkpoint-path wavernn_10k_epochs_8bits_ljspeech \
--vocoder wavernn \
--n-fft 2048 \
--mel-fmin 40 \
--mel-fmax 11025 \
--input-text "Hello world!" \
--text-preprocessor english_phonemes \
--phonemizer DeepPhonemizer \
--phonemizer-checkpoint ./en_us_cmudict_forward.pt \
--cmudict-root ./ \
--output-path "./outputs.wav"
```
#### Griffin-Lim's algorithm as the Vocoder
The following command will generate a waveform to `./outputs.wav`
with the text "Hello world!" using Griffin-Lim's algorithm as the vocoder.
```bash
python inference.py --checkpoint-path ${model_path} \
--vocoder griffin_lim \
--n-fft 1024 \
--mel-fmin 0 \
--mel-fmax 8000 \
--input-text "Hello world!" \
--text-preprocessor english_characters \
--output-path "./outputs.wav"
```
#### Nvidia's WaveGlow as the Vocoder
The following command will generate a waveform to `./outputs.wav`
with the text `"Hello world!"` using Nvidia's WaveGlow as the vocoder.
WaveGlow is loaded using the following `torch.hub` API.
```python
torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_waveglow', model_math='fp16')
```
```bash
python inference.py --checkpoint-path ${model_path} \
--vocoder nvidia_waveglow \
--n-fft 1024 \
--mel-fmin 0 \
--mel-fmax 8000 \
--input-text "Hello world!" \
--text-preprocessor english_characters \
--output-path "./outputs.wav"
```
# *****************************************************************************
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
from typing import Tuple, Callable, List
import torch
from torch import Tensor
from torch.utils.data.dataset import random_split
from torchaudio.datasets import LJSPEECH
class SpectralNormalization(torch.nn.Module):
def forward(self, input):
return torch.log(torch.clamp(input, min=1e-5))
class InverseSpectralNormalization(torch.nn.Module):
def forward(self, input):
return torch.exp(input)
class MapMemoryCache(torch.utils.data.Dataset):
r"""Wrap a dataset so that, whenever a new item is returned, it is saved to memory.
"""
def __init__(self, dataset):
self.dataset = dataset
self._cache = [None] * len(dataset)
def __getitem__(self, n):
if self._cache[n] is not None:
return self._cache[n]
item = self.dataset[n]
self._cache[n] = item
return item
def __len__(self):
return len(self.dataset)
class Processed(torch.utils.data.Dataset):
def __init__(self, dataset, transforms, text_preprocessor):
self.dataset = dataset
self.transforms = transforms
self.text_preprocessor = text_preprocessor
def __getitem__(self, key):
item = self.dataset[key]
return self.process_datapoint(item)
def __len__(self):
return len(self.dataset)
def process_datapoint(self, item):
melspec = self.transforms(item[0])
text_norm = torch.IntTensor(self.text_preprocessor(item[2]))
return text_norm, torch.squeeze(melspec, 0)
def split_process_dataset(dataset: str,
file_path: str,
val_ratio: float,
transforms: Callable,
text_preprocessor: Callable[[str], List[int]],
) -> Tuple[torch.utils.data.Dataset, torch.utils.data.Dataset]:
"""Returns the Training and validation datasets.
Args:
dataset (str): The dataset to use. Available options: [`'ljspeech'`]
file_path (str): Path to the data.
val_ratio (float): The proportion of the data to use for the validation split.
transforms (callable): A function/transform that takes in a waveform and
returns a transformed waveform (mel spectrogram in this example).
text_preprocessor (callable): A function that takes in a string and
returns a list of integers representing each of the symbols in the string.
Returns:
train_dataset (`torch.utils.data.Dataset`): The training set.
val_dataset (`torch.utils.data.Dataset`): The validation set.
"""
if dataset == 'ljspeech':
data = LJSPEECH(root=file_path, download=False)
val_length = int(len(data) * val_ratio)
lengths = [len(data) - val_length, val_length]
train_dataset, val_dataset = random_split(data, lengths)
else:
raise ValueError(f"Expected datasets: `ljspeech`, but found {dataset}")
train_dataset = Processed(train_dataset, transforms, text_preprocessor)
val_dataset = Processed(val_dataset, transforms, text_preprocessor)
train_dataset = MapMemoryCache(train_dataset)
val_dataset = MapMemoryCache(val_dataset)
return train_dataset, val_dataset
def text_mel_collate_fn(batch: Tuple[Tensor, Tensor],
n_frames_per_step: int = 1) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]:
"""The collate function padding and adjusting the data based on `n_frames_per_step`.
Modified from https://github.com/NVIDIA/DeepLearningExamples
Args:
batch (list of tuples of two tensors): each element is a tuple of (text, mel_specgram), where
text has shape (text_length, ) and mel_specgram has shape (n_mels, n_frames).
n_frames_per_step (int, optional): The number of frames to advance every step.
Returns:
text_padded (Tensor): The input text to Tacotron2 with shape (n_batch, max of ``text_lengths``).
text_lengths (Tensor): The length of each text with shape (n_batch).
mel_specgram_padded (Tensor): The target mel spectrogram
with shape (n_batch, n_mels, max of ``mel_specgram_lengths``)
mel_specgram_lengths (Tensor): The length of each mel spectrogram with shape (n_batch).
gate_padded (Tensor): The ground truth gate output
with shape (n_batch, max of ``mel_specgram_lengths``)
"""
text_lengths, ids_sorted_decreasing = torch.sort(
torch.LongTensor([len(x[0]) for x in batch]), dim=0, descending=True)
max_input_len = text_lengths[0]
text_padded = torch.zeros((len(batch), max_input_len), dtype=torch.int64)
for i in range(len(ids_sorted_decreasing)):
text = batch[ids_sorted_decreasing[i]][0]
text_padded[i, :text.size(0)] = text
# Right zero-pad mel-spec
num_mels = batch[0][1].size(0)
max_target_len = max([x[1].size(1) for x in batch])
if max_target_len % n_frames_per_step != 0:
max_target_len += n_frames_per_step - max_target_len % n_frames_per_step
assert max_target_len % n_frames_per_step == 0
# include mel padded and gate padded
mel_specgram_padded = torch.zeros((len(batch), num_mels, max_target_len), dtype=torch.float32)
gate_padded = torch.zeros((len(batch), max_target_len), dtype=torch.float32)
mel_specgram_lengths = torch.LongTensor(len(batch))
for i in range(len(ids_sorted_decreasing)):
mel = batch[ids_sorted_decreasing[i]][1]
mel_specgram_padded[i, :, :mel.size(1)] = mel
mel_specgram_lengths[i] = mel.size(1)
gate_padded[i, mel.size(1) - 1:] = 1
return text_padded, text_lengths, mel_specgram_padded, mel_specgram_lengths, gate_padded