Update files

cdab2875 · SWHL · cdab2875 · cdab2875 · cdab2875 · cdab2875
Commit cdab2875 authored Apr 07, 2023 by SWHL
20 changed files
--- a/.gitignore
+++ b/.gitignore
+*.pth
+# Created by .ignore support plugin (hsz.mobi)
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+.pytest_cache
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+# *.manifest
+# *.spec
+*.res
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+#idea
+.vs
+.vscode
+.idea
+#models
+*.ttf
+*.ttc
+*.bin
+*.mapping
+*.xml
+*.pdiparams
+*.pdiparams.info
+*.pdmodel
+.DS_Store
\ No newline at end of file
--- a/README.md
+++ b/README.md
+#### 基于PaddleSpeech训练所得模型的推理代码
+- 项目来源：[PaddleSpeech/s2t](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell/asr0)
+- 运行环境：Linux | Python 3.7 | CPU | 不依赖Paddle
+#### 使用方法
+1. 下载整个`python/base_paddlespeech`目录
+2. 安装依赖环境
+   - 批量安装
+    ```bash
+    pip install -r requirements.txt -i https://pypi.douban.com/simple/
+    # CentOS
+    sudo yum install libsndfile 
+    ```
+3. 下载`resources`模型相关文件到`base_paddlespeech`下：
+    - 下载`resources`链接：[Google Drive](https://drive.google.com/file/d/1MWmKxsfCNQyQ5CPlaYxJKnYfIIC5OO5L/view?usp=sharing)
+    - 下载语言模型文件→[下载链接](https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm)，放到`base_paddlespeech/resources/models/language_model`目录下
+    - 最终结构目录如下，请自行比对：
+        ```text
+        base_paddlespeech
+        ├── deepspeech2
+        │   ├── infer.py
+        │   ├── __init__.py
+        │   └── s2t
+        │       ├── decoders
+        │       ├── deepspeech2.py
+        │       ├── frontend
+        │       ├── io
+        │       ├── modules
+        │       ├── __pycache__
+        │       ├── transform
+        │       └── utils
+        ├── main.py
+        ├── requirements.txt
+        ├── resources
+        │   └── models
+        │       ├── asr0_deepspeech2_online_aishell_ckpt_0.2.0.onnx
+        │       ├── language_model
+        │       │   └── zh_giga.no_cna_cmn.prune01244.klm
+        │       └── model.yaml
+        └── test_wav
+            └── zh.wav
+        ```
+4. 运行`python main.py`
+5. 运行结果如下：
+   ```text
+    checking the audio file format......
+    The sample rate is 16000
+    The audio file format is right
+    Preprocess audio_file:/da2/SWHL/test_wav/zh.wav
+    audio feat shape: (1, 498, 161)
+    ASR Result:     我认为跑步最重要的就是给我们带来了身体健康
+   ```
+#### 模型转onnx代码
+```bash
+model_dir="pretrained_models/deepspeech2online_aishell-zh-16k/asr0_deepspeech2_online_aishell_ckpt_0.1.1.model.tar/exp/deepspeech2_online/checkpoints"
+pdmodel="avg_1.jit.pdmodel"
+params_file="avg_1.jit.pdiparams"
+save_onnx="pretrained_models/onnx/asr0_deepspeech2_online_aishell_ckpt_0.1.1.onnx"
+paddle2onnx --model_dir ${model_dir} \
+            --model_filename ${pdmodel} \
+            --params_filename ${params_file} \
+            --save_file ${save_onnx} \
+            --opset_version 12
+```
--- a/deepspeech2/__init__.py
+++ b/deepspeech2/__init__.py
+# !/usr/bin/env python
+# -*- encoding: utf-8 -*-
+# @File: __init__.py
+# @Author: SWHL
+# @Contact: liekkaskono@163.com
+from .infer import ASRExecutor
--- a/deepspeech2/infer.py
+++ b/deepspeech2/infer.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import sys
+from collections import OrderedDict
+from typing import Union
+import numpy as np
+import soundfile
+from yacs.config import CfgNode
+from .s2t.deepspeech2 import DeepSpeech2ModelOnline
+from .s2t.frontend.featurizer.text_featurizer import TextFeaturizer
+from .s2t.io.collator import SpeechCollator
+from .s2t.utils.utility import UpdateConfig
+class ASRExecutor(object):
+    def __init__(self,
+                 sample_rate: int = 16000,
+                 config_path: os.PathLike = None,
+                 onnx_path: os.PathLike = None,
+                 decode_method: str = 'attention_rescoring',
+                 lan_model_path=None):
+        self.sample_rate = sample_rate
+        self.config_path = config_path
+        self.onnx_path = onnx_path
+        self.decode_method = decode_method
+        self.lan_model_path = lan_model_path
+        self._inputs = OrderedDict()
+        self._outputs = OrderedDict()
+        self.config_path = os.path.abspath(self.config_path)
+        self.res_path = os.path.dirname(
+            os.path.dirname(os.path.abspath(self.config_path)))
+        self.config = CfgNode(new_allowed=True)
+        self.config.merge_from_file(self.config_path)
+        with UpdateConfig(self.config):
+            self.vocab = self.config.vocab_filepath
+            self.config.decode.lang_model_path = self.lan_model_path
+            self.collate_fn_test = SpeechCollator.from_config(self.config)
+            self.text_feature = TextFeaturizer(unit_type=self.config.unit_type,
+                                               vocab=self.vocab)
+        self.model = DeepSpeech2ModelOnline(encoder_onnx_path=self.onnx_path)
+    def __call__(self, audio_file, force_yes: bool = False):
+        audio_file = os.path.abspath(audio_file)
+        if not self._check(audio_file, self.sample_rate, force_yes):
+            sys.exit(-1)
+        self.preprocess(audio_file)
+        res = self.infer()
+        return res
+    def preprocess(self, input: Union[str, os.PathLike]):
+        audio_file = input
+        if isinstance(audio_file, (str, os.PathLike)):
+            print("Preprocess audio_file:" + audio_file)
+        # Get the object for feature extraction
+        audio, _ = self.collate_fn_test.process_utterance(
+            audio_file=audio_file, transcript=" ")
+        audio_len = audio.shape[0]
+        audio = audio[np.newaxis, ...]
+        self._inputs["audio"] = audio
+        self._inputs["audio_len"] = audio_len
+        print(f"audio feat shape: {audio.shape}")
+    def infer(self):
+        """
+        Model inference and result stored in self.output.
+        """
+        cfg = self.config.decode
+        audio = self._inputs["audio"]
+        audio_len = self._inputs["audio_len"]
+        decode_batch_size = audio.shape[0]
+        self.model.decoder.init_decoder(
+            decode_batch_size, self.text_feature.vocab_list,
+            cfg.decoding_method, cfg.lang_model_path, cfg.alpha, cfg.beta,
+            cfg.beam_size, cfg.cutoff_prob, cfg.cutoff_top_n,
+            cfg.num_proc_bsearch)
+        result_transcripts = self.model.decode(audio, audio_len)
+        self.model.decoder.del_decoder()
+        return result_transcripts[0]
+    def _check(self, audio_file: str, sample_rate: int, force_yes: bool):
+        self.sample_rate = sample_rate
+        if self.sample_rate != 16000 and self.sample_rate != 8000:
+            print(
+                "invalid sample rate, please input --sr 8000 or --sr 16000")
+            return False
+        if isinstance(audio_file, (str, os.PathLike)):
+            if not os.path.isfile(audio_file):
+                print("Please input the right audio file path")
+                return False
+        print("checking the audio file format......")
+        try:
+            audio, audio_sample_rate = soundfile.read(
+                audio_file, dtype="int16", always_2d=True)
+        except Exception as e:
+            print(
+                "can not open the audio file, please check the audio file format is 'wav'. \n \
+                 you can try to use sox to change the file format.\n \
+                 For example: \n \
+                 sample rate: 16k \n \
+                 sox input_audio.xx --rate 16k --bits 16 --channels 1 output_audio.wav \n \
+                 sample rate: 8k \n \
+                 sox input_audio.xx --rate 8k --bits 16 --channels 1 output_audio.wav \n \
+                 ")
+            return False
+        print("The sample rate is %d" % audio_sample_rate)
+        if audio_sample_rate != self.sample_rate:
+            print("The sample rate of the input file is not {}.\n \
+                            The program will resample the wav file to {}.\n \
+                            If the result does not meet your expectations，\n \
+                            Please input the 16k 16 bit 1 channel wav file. \
+                        ".format(self.sample_rate, self.sample_rate))
+            if force_yes is False:
+                while (True):
+                    print(
+                        "Whether to change the sample rate and the channel. Y: change the sample. N: exit the prgream."
+                    )
+                    content = input("Input(Y/N):")
+                    if content.strip() in ["Y", "y", "yes", "Yes"]:
+                        print("change the sampele rate, channel to 16k and 1 channel")
+                        break
+                    elif content.strip() in ["N", "n", "no", "No"]:
+                        print("Exit the program")
+                        exit(1)
+                    else:
+                        print("Not regular input, please input again")
+            self.change_format = True
+        else:
+            print("The audio file format is right")
+            self.change_format = False
+        return True
--- a/deepspeech2/s2t/decoders/__init__.py
+++ b/deepspeech2/s2t/decoders/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/deepspeech2/s2t/decoders/ctcdecoder/__init__.py
+++ b/deepspeech2/s2t/decoders/ctcdecoder/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .swig_wrapper import ctc_beam_search_decoding
+from .swig_wrapper import ctc_beam_search_decoding_batch
+from .swig_wrapper import ctc_greedy_decoding
+from .swig_wrapper import CTCBeamSearchDecoder
+from .swig_wrapper import Scorer
--- a/deepspeech2/s2t/decoders/ctcdecoder/decoders_deprecated.py
+++ b/deepspeech2/s2t/decoders/ctcdecoder/decoders_deprecated.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains various CTC decoders."""
+import multiprocessing
+from itertools import groupby
+from math import log
+import numpy as np
+def ctc_greedy_decoder(probs_seq, vocabulary):
+    """CTC greedy (best path) decoder.
+    Path consisting of the most probable tokens are further post-processed to
+    remove consecutive repetitions and all blanks.
+    :param probs_seq: 2-D list of probabilities over the vocabulary for each
+                      character. Each element is a list of float probabilities
+                      for one character.
+    :type probs_seq: list
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    :return: Decoding result string.
+    :rtype: baseline
+    """
+    # dimension verification
+    for probs in probs_seq:
+        if not len(probs) == len(vocabulary) + 1:
+            raise ValueError("probs_seq dimension mismatchedd with vocabulary")
+    # argmax to get the best index for each time step
+    max_index_list = list(np.array(probs_seq).argmax(axis=1))
+    # remove consecutive duplicate indexes
+    index_list = [index_group[0] for index_group in groupby(max_index_list)]
+    # remove blank indexes
+    blank_index = len(vocabulary)
+    index_list = [index for index in index_list if index != blank_index]
+    # convert index list to string
+    return ''.join([vocabulary[index] for index in index_list])
+def ctc_beam_search_decoder(probs_seq,
+                            beam_size,
+                            vocabulary,
+                            cutoff_prob=1.0,
+                            cutoff_top_n=40,
+                            ext_scoring_func=None,
+                            nproc=False):
+    """CTC beam search decoder.
+    It utilizes beam search to approximately select the best decoding
+    labels and returns the results in descending order of probability.
+    The implementation is based on Prefix Beam Search
+    (https://arxiv.org/abs/1408.2873), and the unclear part is
+    redesigned. Two important modifications: 1) in the iterative computation
+    of probabilities, the assignment operation is changed to accumulation,
+    because one prefix may come from different paths; 2) the if condition
+    "if l^+ not in A_prev then" after the probabilities' computation is
+    dropped, because it is hard to understand and seems unnecessary.
+    :param probs_seq: 2-D list of probability distributions over each time
+                      step, with each element being a list of normalized
+                      probabilities over vocabulary and blank.
+    :type probs_seq: 2-D list
+    :param beam_size: Width for beam search.
+    :type beam_size: int
+    :param vocabulary: Vocabulary list. The blank index is
+                       ``len(vocabulary)``.
+    :type vocabulary: list
+    :param cutoff_prob: Cutoff probability in pruning,
+                        default 1.0, no pruning.
+    :type cutoff_prob: float
+    :param cutoff_top_n: Maximum number of tokens kept per time step when
+                         pruning, default 40.
+    :type cutoff_top_n: int
+    :param ext_scoring_func: External scoring function for
+                            partially decoded sentence, e.g. word count
+                            or language model.
+    :type ext_scoring_func: callable
+    :param nproc: Whether the decoder is used in multiple processes. When
+                  True, the module-level ``ext_nproc_scorer`` replaces
+                  ``ext_scoring_func``.
+    :type nproc: bool
+    :return: List of tuples of log probability and sentence as decoding
+             results, in descending order of the probability.
+    :rtype: list
+    :raises ValueError: If a row of ``probs_seq`` does not have
+                        ``len(vocabulary) + 1`` entries.
+    """
+    # dimension check
+    for prob_list in probs_seq:
+        if not len(prob_list) == len(vocabulary) + 1:
+            raise ValueError("The shape of prob_seq does not match with the "
+                             "shape of the vocabulary.")
+    # blank_id assign
+    blank_id = len(vocabulary)
+    # If the decoder is called in multiple processes, use the global scorer
+    # set by ctc_beam_search_decoder_batch().
+    if nproc is True:
+        global ext_nproc_scorer
+        ext_scoring_func = ext_nproc_scorer
+    # initialize
+    # prefix_set_prev: the set containing selected prefixes
+    # probs_b_prev: prefixes' probability ending with blank in previous step
+    # probs_nb_prev: prefixes' probability ending with non-blank in previous
+    # step
+    # '\t' is the root prefix; it is stripped from every result at the end.
+    prefix_set_prev = {'\t': 1.0}
+    probs_b_prev, probs_nb_prev = {'\t': 1.0}, {'\t': 0.0}
+    # extend prefix in loop
+    for time_step in range(len(probs_seq)):
+        # prefix_set_next: the set containing candidate prefixes
+        # probs_b_cur: prefixes' probability ending with blank in current step
+        # probs_nb_cur: prefixes' probability ending with non-blank in current
+        # step
+        prefix_set_next, probs_b_cur, probs_nb_cur = {}, {}, {}
+        prob_idx = list(enumerate(probs_seq[time_step]))
+        cutoff_len = len(prob_idx)
+        # If pruning is enabled: keep the most probable tokens until their
+        # cumulative probability reaches cutoff_prob, capped at cutoff_top_n.
+        if cutoff_prob < 1.0 or cutoff_top_n < cutoff_len:
+            prob_idx = sorted(prob_idx, key=lambda asd: asd[1], reverse=True)
+            cutoff_len, cum_prob = 0, 0.0
+            for i in range(len(prob_idx)):
+                cum_prob += prob_idx[i][1]
+                cutoff_len += 1
+                if cum_prob >= cutoff_prob:
+                    break
+            cutoff_len = min(cutoff_len, cutoff_top_n)
+            prob_idx = prob_idx[0:cutoff_len]
+        for l in prefix_set_prev:
+            if l not in prefix_set_next:
+                probs_b_cur[l], probs_nb_cur[l] = 0.0, 0.0
+            # extend prefix by traversing prob_idx
+            for index in range(cutoff_len):
+                c, prob_c = prob_idx[index][0], prob_idx[index][1]
+                if c == blank_id:
+                    # a blank keeps the prefix unchanged
+                    probs_b_cur[l] += prob_c * (
+                        probs_b_prev[l] + probs_nb_prev[l])
+                else:
+                    last_char = l[-1]
+                    new_char = vocabulary[c]
+                    l_plus = l + new_char
+                    if l_plus not in prefix_set_next:
+                        probs_b_cur[l_plus], probs_nb_cur[l_plus] = 0.0, 0.0
+                    if new_char == last_char:
+                        # A repeated character extends the prefix only after
+                        # a blank; otherwise it collapses into the prefix.
+                        probs_nb_cur[l_plus] += prob_c * probs_b_prev[l]
+                        probs_nb_cur[l] += prob_c * probs_nb_prev[l]
+                    elif new_char == ' ':
+                        # A space triggers the external scorer on the prefix
+                        # without its leading '\t'.
+                        if (ext_scoring_func is None) or (len(l) == 1):
+                            score = 1.0
+                        else:
+                            prefix = l[1:]
+                            score = ext_scoring_func(prefix)
+                        probs_nb_cur[l_plus] += score * prob_c * (
+                            probs_b_prev[l] + probs_nb_prev[l])
+                    else:
+                        probs_nb_cur[l_plus] += prob_c * (
+                            probs_b_prev[l] + probs_nb_prev[l])
+                    # add l_plus into prefix_set_next
+                    prefix_set_next[l_plus] = probs_nb_cur[
+                        l_plus] + probs_b_cur[l_plus]
+            # add l into prefix_set_next
+            prefix_set_next[l] = probs_b_cur[l] + probs_nb_cur[l]
+        # update probs
+        probs_b_prev, probs_nb_prev = probs_b_cur, probs_nb_cur
+        # store top beam_size prefixes
+        prefix_set_prev = sorted(
+            prefix_set_next.items(), key=lambda asd: asd[1], reverse=True)
+        if beam_size < len(prefix_set_prev):
+            prefix_set_prev = prefix_set_prev[:beam_size]
+        prefix_set_prev = dict(prefix_set_prev)
+    beam_result = []
+    for seq, prob in prefix_set_prev.items():
+        if prob > 0.0 and len(seq) > 1:
+            # drop the leading '\t' root marker
+            result = seq[1:]
+            # score last word by external scorer
+            if (ext_scoring_func is not None) and (result[-1] != ' '):
+                prob = prob * ext_scoring_func(result)
+            log_prob = log(prob)
+            beam_result.append((log_prob, result))
+        else:
+            # empty or zero-probability prefixes become ('-inf', '') entries
+            beam_result.append((float('-inf'), ''))
+    # output top beam_size decoding results
+    beam_result = sorted(beam_result, key=lambda asd: asd[0], reverse=True)
+    return beam_result
+def ctc_beam_search_decoder_batch(probs_split,
+                                  beam_size,
+                                  vocabulary,
+                                  num_processes,
+                                  cutoff_prob=1.0,
+                                  cutoff_top_n=40,
+                                  ext_scoring_func=None):
+    """CTC beam search decoder using multiple processes.
+    :param probs_seq: 3-D list with each element as an instance of 2-D list
+                      of probabilities used by ctc_beam_search_decoder().
+    :type probs_seq: 3-D list
+    :param beam_size: Width for beam search.
+    :type beam_size: int
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    :param num_processes: Number of parallel processes.
+    :type num_processes: int
+    :param cutoff_prob: Cutoff probability in pruning,
+                        default 1.0, no pruning.
+    :type cutoff_prob: float
+    :param num_processes: Number of parallel processes.
+    :type num_processes: int
+    :param ext_scoring_func: External scoring function for
+                            partially decoded sentence, e.g. word count
+                            or language model.
+    :type external_scoring_function: callable
+    :return: List of tuples of log probability and sentence as decoding
+             results, in descending order of the probability.
+    :rtype: list
+    """
+    if not num_processes > 0:
+        raise ValueError("Number of processes must be positive!")
+    # use global variable to pass the externnal scorer to beam search decoder
+    global ext_nproc_scorer
+    ext_nproc_scorer = ext_scoring_func
+    nproc = True
+    pool = multiprocessing.Pool(processes=num_processes)
+    results = []
+    for i, probs_list in enumerate(probs_split):
+        args = (probs_list, beam_size, vocabulary, cutoff_prob, cutoff_top_n,
+                None, nproc)
+        results.append(pool.apply_async(ctc_beam_search_decoder, args))
+    pool.close()
+    pool.join()
+    beam_search_results = [result.get() for result in results]
+    return beam_search_results
--- a/deepspeech2/s2t/decoders/ctcdecoder/swig_wrapper.py
+++ b/deepspeech2/s2t/decoders/ctcdecoder/swig_wrapper.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Wrapper for various CTC decoders in SWIG."""
+import paddlespeech_ctcdecoders
+class Scorer(paddlespeech_ctcdecoders.Scorer):
+    """Wrapper for Scorer.
+    All four arguments are forwarded unchanged to the base-class constructor.
+    :param alpha: Parameter associated with language model. Don't use
+                  language model when alpha = 0.
+    :type alpha: float
+    :param beta: Parameter associated with word count. Don't use word
+                 count when beta = 0.
+    :type beta: float
+    :param model_path: Path to load language model.
+    :type model_path: str
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    """
+    def __init__(self, alpha, beta, model_path, vocabulary):
+        # Explicit base-class call, kept as-is for the SWIG-generated parent.
+        paddlespeech_ctcdecoders.Scorer.__init__(self, alpha, beta, model_path,
+                                                 vocabulary)
+def ctc_greedy_decoding(probs_seq, vocabulary, blank_id):
+    """Wrapper for ctc best path decodeing function in swig.
+    :param probs_seq: 2-D list of probability distributions over each time
+                      step, with each element being a list of normalized
+                      probabilities over vocabulary and blank.
+    :type probs_seq: 2-D list
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    :return: Decoding result string.
+    :rtype: str
+    """
+    result = paddlespeech_ctcdecoders.ctc_greedy_decoding(probs_seq.tolist(),
+                                                          vocabulary, blank_id)
+    return result
+def ctc_beam_search_decoding(probs_seq,
+                             vocabulary,
+                             beam_size,
+                             cutoff_prob=1.0,
+                             cutoff_top_n=40,
+                             ext_scoring_func=None,
+                             blank_id=0):
+    """Wrapper for the CTC Beam Search Decoding function.
+    :param probs_seq: 2-D list of probability distributions over each time
+                      step, with each element being a list of normalized
+                      probabilities over vocabulary and blank.
+    :type probs_seq: 2-D list
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    :param beam_size: Width for beam search.
+    :type beam_size: int
+    :param cutoff_prob: Cutoff probability in pruning,
+                        default 1.0, no pruning.
+    :type cutoff_prob: float
+    :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
+                         characters with highest probs in vocabulary will be
+                         used in beam search, default 40.
+    :type cutoff_top_n: int
+    :param ext_scoring_func: External scoring function for
+                             partially decoded sentence, e.g. word count
+                             or language model.
+    :type external_scoring_func: callable
+    :return: List of tuples of log probability and sentence as decoding
+             results, in descending order of the probability.
+    :rtype: list
+    """
+    beam_results = paddlespeech_ctcdecoders.ctc_beam_search_decoding(
+        probs_seq.tolist(), vocabulary, beam_size, cutoff_prob, cutoff_top_n,
+        ext_scoring_func, blank_id)
+    beam_results = [(res[0], res[1].decode('utf-8')) for res in beam_results]
+    return beam_results
+def ctc_beam_search_decoding_batch(probs_split,
+                                   vocabulary,
+                                   beam_size,
+                                   num_processes,
+                                   cutoff_prob=1.0,
+                                   cutoff_top_n=40,
+                                   ext_scoring_func=None,
+                                   blank_id=0):
+    """Wrapper for the batched CTC beam search decodeing batch function.
+    :param probs_seq: 3-D list with each element as an instance of 2-D list
+                      of probabilities used by ctc_beam_search_decoder().
+    :type probs_seq: 3-D list
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    :param beam_size: Width for beam search.
+    :type beam_size: int
+    :param num_processes: Number of parallel processes.
+    :type num_processes: int
+    :param cutoff_prob: Cutoff probability in vocabulary pruning,
+                        default 1.0, no pruning.
+    :type cutoff_prob: float
+    :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
+                         characters with highest probs in vocabulary will be
+                         used in beam search, default 40.
+    :type cutoff_top_n: int
+    :param num_processes: Number of parallel processes.
+    :type num_processes: int
+    :param ext_scoring_func: External scoring function for
+                             partially decoded sentence, e.g. word count
+                             or language model.
+    :type external_scoring_function: callable
+    :return: List of tuples of log probability and sentence as decoding
+             results, in descending order of the probability.
+    :rtype: list
+    """
+    probs_split = [probs_seq.tolist() for probs_seq in probs_split]
+    batch_beam_results = paddlespeech_ctcdecoders.ctc_beam_search_decoding_batch(
+        probs_split, vocabulary, beam_size, num_processes, cutoff_prob,
+        cutoff_top_n, ext_scoring_func, blank_id)
+    batch_beam_results = [[(res[0], res[1]) for res in beam_results]
+                          for beam_results in batch_beam_results]
+    return batch_beam_results
+class CTCBeamSearchDecoder(paddlespeech_ctcdecoders.CtcBeamSearchDecoderBatch):
+    """Wrapper for CtcBeamSearchDecoderBatch.
+    Every argument is forwarded unchanged, in order, to the base-class
+    constructor.
+    Args:
+        vocab_list (list): Vocabulary list.
+        batch_size (int): Forwarded to the base class; presumably the number
+                            of utterances decoded together - TODO confirm.
+        beam_size (int): Width for beam search.
+        num_processes (int): Number of parallel processes.
+        cutoff_prob (float): Cutoff probability in vocabulary pruning,
+                            default 1.0, no pruning.
+        cutoff_top_n (int): Cutoff number in pruning, only top cutoff_top_n
+                            characters with highest probs in vocabulary will be
+                            used in beam search, default 40.
+        _ext_scorer (Scorer): External scorer for partially decoded sentence, e.g. word count
+                                or language model.
+        blank_id (int): Forwarded to the base class; presumably the index of
+                            the CTC blank token - TODO confirm.
+    """
+    def __init__(self, vocab_list, batch_size, beam_size, num_processes,
+                 cutoff_prob, cutoff_top_n, _ext_scorer, blank_id):
+        # Explicit base-class call, kept as-is for the SWIG-generated parent.
+        paddlespeech_ctcdecoders.CtcBeamSearchDecoderBatch.__init__(
+            self, vocab_list, batch_size, beam_size, num_processes, cutoff_prob,
+            cutoff_top_n, _ext_scorer, blank_id)
--- a/deepspeech2/s2t/decoders/utils.py
+++ b/deepspeech2/s2t/decoders/utils.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from espnet(https://github.com/espnet/espnet)
+import numpy as np
+__all__ = ["end_detect", "parse_hypothesis", "add_results_to_json"]
+def end_detect(ended_hyps, i, M=3, D_end=np.log(1 * np.exp(-10))):
+    """Decide whether decoding can stop early.
+    Follows the end-detection rule of Eq. (50) in S. Watanabe et al.,
+    "Hybrid CTC/Attention Architecture for End-to-End Speech Recognition".
+    For each of the lengths ``i, i - 1, ..., i - M + 1`` the best ended
+    hypothesis with exactly that ``yseq`` length is compared with the best
+    ended hypothesis overall. A length counts when such a hypothesis exists
+    and its score minus the overall best score is below ``D_end``.
+    :param ended_hyps: ended hypotheses, each a dict with "yseq" and "score"
+    :param i: int, the longest hypothesis length to inspect
+    :param M: int, how many consecutive lengths must count
+    :param D_end: float, score-difference threshold
+    :return: bool, True only when all ``M`` lengths count
+    """
+    if len(ended_hyps) == 0:
+        return False
+    def score_of(hyp):
+        return hyp["score"]
+    overall_best = max(ended_hyps, key=score_of)
+    n_counted = 0
+    # walk the lengths i, i - 1, ..., i - M + 1
+    for length in range(i, i - M, -1):
+        candidates = [h for h in ended_hyps if len(h["yseq"]) == length]
+        if not candidates:
+            continue
+        best_of_length = max(candidates, key=score_of)
+        if score_of(best_of_length) - score_of(overall_best) < D_end:
+            n_counted += 1
+    return n_counted == M
+# * ------------------ recognition related ------------------ *
+def parse_hypothesis(hyp, char_list):
+    """Convert one recognition hypothesis into printable strings.
+    Args:
+        hyp (dict[str, Any]): Hypothesis with a "yseq" token-id sequence
+            (its first entry, the sos symbol, is dropped) and a "score".
+        char_list (list[str]): Token string for each token id.
+    Returns:
+        tuple(str, str, str, float): the text (tokens concatenated, with
+        "<space>" turned into a blank), the space-separated tokens, the
+        space-separated token ids, and the score as a float.
+    """
+    # skip the leading sos id
+    ids = [int(raw_id) for raw_id in hyp["yseq"][1:]]
+    tokens = [char_list[tok_id] for tok_id in ids]
+    score = float(hyp["score"])
+    tokenid = " ".join(map(str, ids))
+    token = " ".join(tokens)
+    text = "".join(tokens).replace("<space>", " ")
+    return text, token, tokenid, score
+def add_results_to_json(js, nbest_hyps, char_list):
+    """Build a new utterance dict that carries the N-best results.
+    Args:
+        js (dict[str, Any]): Groundtruth utterance dict; its "utt2spk" value
+            is copied and the first entry of its "output" list, when there
+            is one, is used as the template for every result entry.
+        nbest_hyps (list[dict[str, Any]]): Hypotheses, best first; each is
+            handed to ``parse_hypothesis``.
+        char_list (list[str]): List of characters.
+    Returns:
+        dict[str, Any]: New dict with "utt2spk" and an "output" list holding
+        one entry per hypothesis.
+    """
+    new_js = {"utt2spk": js["utt2spk"], "output": []}
+    for rank, hyp in enumerate(nbest_hyps, start=1):
+        rec_text, rec_token, rec_tokenid, score = parse_hypothesis(
+            hyp, char_list)
+        if len(js["output"]) == 0:
+            # no reference available (e.g., speech translation)
+            out_dic = {"name": ""}
+        else:
+            # shallow copy of the ground-truth entry
+            out_dic = dict(js["output"][0])
+        # tag the entry with its 1-based rank
+        out_dic["name"] += "[%d]" % rank
+        out_dic.update(
+            rec_text=rec_text,
+            rec_token=rec_token,
+            rec_tokenid=rec_tokenid,
+            score=score)
+        new_js["output"].append(out_dic)
+        # echo only the top hypothesis
+        if rank == 1:
+            if "text" in out_dic:
+                print("groundtruth: %s" % out_dic["text"])
+            print("prediction : %s" % out_dic["rec_text"])
+    return new_js
--- a/deepspeech2/s2t/deepspeech2.py
+++ b/deepspeech2/s2t/deepspeech2.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Deepspeech2 ASR Online Model"""
+import numpy as np
+import onnxruntime as ort
+from .modules.ctc import CTCDecoder
+class DeepSpeech2ModelOnline(object):
+    """Runs the encoder ONNX graph and CTC-decodes what it outputs."""
+    def __init__(self, encoder_onnx_path):
+        """Create the ONNX Runtime session and the CTC decoder.
+        Args:
+            encoder_onnx_path: Model path handed to ``ort.InferenceSession``.
+        """
+        self.encoder_sess = ort.InferenceSession(encoder_onnx_path)
+        self.decoder = CTCDecoder()
+    def decode(self, audio, audio_len):
+        """Run the encoder on ``audio`` and return the best transcription.
+        Args:
+            audio: Input features; converted to a float32 array.
+            audio_len: Length value; wrapped into a one-element int64 array.
+        Returns:
+            The first of the two values returned by ``self.decoder.decode()``.
+        """
+        input_names = [node.name for node in self.encoder_sess.get_inputs()]
+        # The third and fourth graph inputs are fed zeros of shape
+        # (5, 1, 1024); presumably the initial recurrent states -- TODO confirm
+        # against the exported model.
+        feeds = {
+            input_names[0]: np.array(audio).astype(np.float32),
+            input_names[1]: np.array([audio_len]).astype(np.int64),
+            input_names[2]: np.zeros([5, 1, 1024]).astype(np.float32),
+            input_names[3]: np.zeros([5, 1, 1024]).astype(np.float32)
+        }
+        # the graph yields four outputs; only the first two are used
+        probs, eouts_len, _, _ = self.encoder_sess.run(None, feeds)
+        self.decoder.reset_decoder(batch_size=probs.shape[0])
+        self.decoder.next(probs, eouts_len)
+        trans_best, _ = self.decoder.decode()
+        return trans_best
--- a/deepspeech2/s2t/frontend/__init__.py
+++ b/deepspeech2/s2t/frontend/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/deepspeech2/s2t/frontend/audio.py
+++ b/deepspeech2/s2t/frontend/audio.py
--- a/deepspeech2/s2t/frontend/augmentor/__init__.py
+++ b/deepspeech2/s2t/frontend/augmentor/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/deepspeech2/s2t/frontend/augmentor/augmentation.py
+++ b/deepspeech2/s2t/frontend/augmentor/augmentation.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the data augmentation pipeline."""
+import json
+import os
+from collections.abc import Sequence
+from inspect import signature
+import numpy as np
+__all__ = ["AugmentationPipeline"]
+class AugmentationPipeline():
+    """Build a pre-processing pipeline with various augmentation models. Such a
+    data augmentation pipeline is often leveraged to augment the training
+    samples to make the model invariant to certain types of perturbations in the
+    real world, improving model's generalization ability.
+    The pipeline is built according to the augmentation configuration in json
+    string, e.g.
+    .. code-block::
+        [ {
+                "type": "noise",
+                "params": {"min_snr_dB": 10,
+                           "max_snr_dB": 20,
+                           "noise_manifest_path": "datasets/manifest.noise"},
+                "prob": 0.0
+            },
+            {
+                "type": "speed",
+                "params": {"min_speed_rate": 0.9,
+                           "max_speed_rate": 1.1},
+                "prob": 1.0
+            },
+            {
+                "type": "shift",
+                "params": {"min_shift_ms": -5,
+                           "max_shift_ms": 5},
+                "prob": 1.0
+            },
+            {
+                "type": "volume",
+                "params": {"min_gain_dBFS": -10,
+                           "max_gain_dBFS": 10},
+                "prob": 0.0
+            },
+            {
+                "type": "bayesian_normal",
+                "params": {"target_db": -20,
+                           "prior_db": -20,
+                           "prior_samples": 100},
+                "prob": 0.0
+            }
+        ]
+    Every entry of the list adds one augmentor to the pipeline, in list
+    order. "type" and "params" are handed to ``_get_augmentor``; "prob"
+    indicates the probability of the current augmentor to take effect. If
+    "prob" is zero, the augmentor does not take effect.
+    NOTE(review): ``_get_augmentor`` is called by ``_parse_pipeline_from`` but
+    is not defined in this class, so a non-empty configuration only works
+    when a subclass (or other code) supplies it -- confirm.
+    Params:
+        preprocess_conf(str): Augmentation configuration in `json file` or `json string`.
+        random_seed(int): Random seed.
+    Raises:
+        ValueError: If the augmentation json config is in incorrect format.
+    """
+    # config "type" values treated as feature (spectrogram) augmentors;
+    # every other type is treated as an audio augmentor
+    SPEC_TYPES = {'specaug'}
+    def __init__(self, preprocess_conf: str, random_seed: int = 0):
+        self._rng = np.random.RandomState(random_seed)
+        self.conf = {'mode': 'sequential', 'process': []}
+        if preprocess_conf:
+            # an existing file path is read; anything else is parsed directly
+            if os.path.isfile(preprocess_conf):
+                with open(preprocess_conf, 'r') as fin:
+                    json_string = fin.read()
+            else:
+                json_string = preprocess_conf
+            process = json.loads(json_string)
+            self.conf['process'] += process
+        self._augmentors, self._rates = self._parse_pipeline_from('all')
+        self._audio_augmentors, self._audio_rates = self._parse_pipeline_from(
+            'audio')
+        self._spec_augmentors, self._spec_rates = self._parse_pipeline_from(
+            'feature')
+    def __call__(self, xs, uttid_list=None, **kwargs):
+        """Apply every configured augmentor, in order, to the input(s).
+        Args:
+            xs: One sample, or a ``Sequence`` of samples (treated as a batch).
+            uttid_list: Optional utterance ids, one per sample; a single
+                string is repeated for every sample. Passed as the second
+                positional argument to augmentors that accept ``uttid``.
+            **kwargs: Extra arguments; each augmentor only receives the ones
+                named in its signature.
+        Returns:
+            The transformed batch as a list, or the single transformed
+            sample when ``xs`` was not a ``Sequence``.
+        Raises:
+            NotImplementedError: If the configured mode is not "sequential".
+            Exception: Whatever an augmentor raises is logged and re-raised.
+        """
+        if not isinstance(xs, Sequence):
+            is_batch = False
+            xs = [xs]
+        else:
+            is_batch = True
+        if isinstance(uttid_list, str):
+            uttid_list = [uttid_list for _ in range(len(xs))]
+        if self.conf.get("mode", "sequential") == "sequential":
+            for idx, (func, rate) in enumerate(
+                    zip(self._augmentors, self._rates), 0):
+                # one draw per augmentor decides whether the whole batch
+                # goes through it
+                if self._rng.uniform(0., 1.) >= rate:
+                    continue
+                # Derive only the args which the func has
+                try:
+                    param = signature(func).parameters
+                except ValueError:
+                    # Some functions, e.g. built-in functions, cannot be
+                    # inspected
+                    param = {}
+                _kwargs = {k: v for k, v in kwargs.items() if k in param}
+                try:
+                    if uttid_list is not None and "uttid" in param:
+                        xs = [
+                            func(x, u, **_kwargs)
+                            for x, u in zip(xs, uttid_list)
+                        ]
+                    else:
+                        xs = [func(x, **_kwargs) for x in xs]
+                except Exception:
+                    # This module defines no ``logger``; the previous bare
+                    # reference raised NameError here and hid the real
+                    # error. Import locally so the original exception is
+                    # logged and then propagated.
+                    import logging
+                    logging.getLogger(__name__).critical(
+                        "Catch a exception from {}th func: {}".format(
+                            idx, func))
+                    raise
+        else:
+            raise NotImplementedError(
+                "Not supporting mode={}".format(self.conf["mode"]))
+        if is_batch:
+            return xs
+        else:
+            return xs[0]
+    def transform_audio(self, audio_segment):
+        """Run the pre-processing pipeline for data augmentation.
+        Note that this is an in-place transformation.
+        :param audio_segment: Audio segment to process.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        for augmentor, rate in zip(self._audio_augmentors, self._audio_rates):
+            if self._rng.uniform(0., 1.) < rate:
+                augmentor.transform_audio(audio_segment)
+    def transform_feature(self, spec_segment):
+        """Spectrogram augmentation.
+        Args:
+            spec_segment (np.ndarray): audio feature, (D, T).
+        Returns:
+            The feature after every selected feature augmentor was applied.
+        """
+        for augmentor, rate in zip(self._spec_augmentors, self._spec_rates):
+            if self._rng.uniform(0., 1.) < rate:
+                spec_segment = augmentor.transform_feature(spec_segment)
+        return spec_segment
+    def _parse_pipeline_from(self, aug_type='all'):
+        """Parse the config json to build an augmentation pipeline.
+        Args:
+            aug_type (str): 'audio', 'feature' or 'all'; selects which of
+                the configured entries are instantiated.
+        Returns:
+            tuple(list, list): the augmentors and their "prob" rates.
+        """
+        assert aug_type in ('audio', 'feature', 'all'), aug_type
+        audio_confs = []
+        feature_confs = []
+        all_confs = []
+        for config in self.conf['process']:
+            all_confs.append(config)
+            if config["type"] in self.SPEC_TYPES:
+                feature_confs.append(config)
+            else:
+                audio_confs.append(config)
+        confs_by_type = {
+            'audio': audio_confs,
+            'feature': feature_confs,
+            'all': all_confs,
+        }
+        # still reachable when asserts are stripped (python -O)
+        if aug_type not in confs_by_type:
+            raise ValueError(f"Not support: {aug_type}")
+        aug_confs = confs_by_type[aug_type]
+        augmentors = [
+            self._get_augmentor(config["type"], config["params"])
+            for config in aug_confs
+        ]
+        rates = [config["prob"] for config in aug_confs]
+        return augmentors, rates
--- a/deepspeech2/s2t/frontend/augmentor/base.py
+++ b/deepspeech2/s2t/frontend/augmentor/base.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the abstract base class for augmentation models."""
+from abc import ABCMeta
+from abc import abstractmethod
+class AugmentorBase:
+    """Base class for augmentation model (augmentor) classes.
+    All augmentor classes should inherit from this class and provide the
+    methods declared below.
+    NOTE(review): ``__metaclass__`` is the Python 2 spelling and is ignored
+    by Python 3, so the ``@abstractmethod`` markers are not enforced when a
+    subclass (or this class) is instantiated.
+    """
+    __metaclass__ = ABCMeta
+    @abstractmethod
+    def __init__(self):
+        """Nothing to set up in the base class."""
+    @abstractmethod
+    def __call__(self, xs):
+        """Apply the augmentor to ``xs``; not implemented in the base class."""
+        raise NotImplementedError("AugmentorBase: Not impl __call__")
+    @abstractmethod
+    def transform_audio(self, audio_segment):
+        """Add effects to the input audio segment.
+        Such effects augment the training data to make the model invariant
+        to certain types of perturbations in the real world, improving the
+        model's generalization ability.
+        Note that this is an in-place transformation.
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        raise NotImplementedError("AugmentorBase: Not impl transform_audio")
+    @abstractmethod
+    def transform_feature(self, spec_segment):
+        """Add effects to the input audio feature segment.
+        Such effects (e.g. time_mask or freq_mask) augment the training data
+        to improve the model's generalization ability.
+        Args:
+            spec_segment (Spectrogram): Spectrogram segment to add effects to.
+        """
+        raise NotImplementedError("AugmentorBase: Not impl transform_feature")
--- a/deepspeech2/s2t/frontend/featurizer/__init__.py
+++ b/deepspeech2/s2t/frontend/featurizer/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .audio_featurizer import AudioFeaturizer  # noqa: F401
+from .speech_featurizer import SpeechFeaturizer
+from .text_featurizer import TextFeaturizer
--- a/deepspeech2/s2t/frontend/featurizer/audio_featurizer.py
+++ b/deepspeech2/s2t/frontend/featurizer/audio_featurizer.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the audio featurizer class."""
+import numpy as np
+from python_speech_features import delta
+from python_speech_features import logfbank
+from python_speech_features import mfcc
+class AudioFeaturizer():
+    """Audio featurizer, for extracting features from audio contents of
+    AudioSegment or SpeechSegment.
+    Supported feature types are linear spectrogram, mfcc and fbank.
+    :param spectrum_type: Specgram feature type. Options: 'linear', 'mfcc',
+                          'fbank'.
+    :type spectrum_type: str
+    :param feat_dim: Feature dimension used by 'mfcc' and 'fbank'.
+    :type feat_dim: None|int
+    :param delta_delta: Whether 'mfcc'/'fbank' features get their delta and
+                        delta-delta appended.
+    :type delta_delta: bool
+    :param stride_ms: Striding size (in milliseconds) for generating frames.
+    :type stride_ms: float
+    :param window_ms: Window size (in milliseconds) for generating frames.
+    :type window_ms: float
+    :param n_fft: FFT point setting; it is only read by ``feature_size``.
+    :type n_fft: None|int
+    :param max_freq: When spectrum_type is 'linear', only FFT bins
+                     corresponding to frequencies between [0, max_freq] are
+                     returned; when spectrum_type is 'mfcc' or 'fbank',
+                     max_freq is the highest band edge of mel filters.
+    :type max_freq: None|float
+    :param target_sample_rate: Audio are resampled (if upsampling or
+                               downsampling is allowed) to this before
+                               extracting spectrogram features.
+    :type target_sample_rate: float
+    :param use_dB_normalization: Whether to normalize the audio to a certain
+                                 decibels before extracting the features.
+    :type use_dB_normalization: bool
+    :param target_dB: Target audio decibels for normalization.
+    :type target_dB: float
+    :param dither: Dither value forwarded to the mfcc/fbank extractors.
+    :type dither: float
+    """
+    def __init__(self,
+                 spectrum_type: str = 'linear',
+                 feat_dim: int = None,
+                 delta_delta: bool = False,
+                 stride_ms=10.0,
+                 window_ms=20.0,
+                 n_fft=None,
+                 max_freq=None,
+                 target_sample_rate=16000,
+                 use_dB_normalization=True,
+                 target_dB=-20,
+                 dither=1.0):
+        self._spectrum_type = spectrum_type
+        # mfcc and fbank using `feat_dim`
+        self._feat_dim = feat_dim
+        # mfcc and fbank using `delta-delta`
+        self._delta_delta = delta_delta
+        self._stride_ms = stride_ms
+        self._window_ms = window_ms
+        self._max_freq = max_freq
+        self._target_sample_rate = target_sample_rate
+        self._use_dB_normalization = use_dB_normalization
+        self._target_dB = target_dB
+        self._fft_point = n_fft
+        self._dither = dither
+    def featurize(self,
+                  audio_segment,
+                  allow_downsampling=True,
+                  allow_upsampling=True):
+        """Extract audio features from AudioSegment or SpeechSegment.
+        The segment is resampled and normalized through its own methods
+        before the features are computed.
+        :param audio_segment: Audio/speech segment to extract features from.
+        :type audio_segment: AudioSegment|SpeechSegment
+        :param allow_downsampling: Whether to allow audio downsampling before
+                                   featurizing.
+        :type allow_downsampling: bool
+        :param allow_upsampling: Whether to allow audio upsampling before
+                                 featurizing.
+        :type allow_upsampling: bool
+        :return: Spectrogram audio feature in 2darray.
+        :rtype: ndarray
+        :raises ValueError: If audio sample rate is not supported.
+        """
+        # upsampling or downsampling
+        if ((audio_segment.sample_rate > self._target_sample_rate and
+             allow_downsampling) or
+            (audio_segment.sample_rate < self._target_sample_rate and
+             allow_upsampling)):
+            audio_segment.resample(self._target_sample_rate)
+        if audio_segment.sample_rate != self._target_sample_rate:
+            raise ValueError("Audio sample rate is not supported. "
+                             "Turn allow_downsampling or allow up_sampling on.")
+        # decibel normalization
+        if self._use_dB_normalization:
+            audio_segment.normalize(target_db=self._target_dB)
+        # extract spectrogram
+        return self._compute_specgram(audio_segment)
+    @property
+    def stride_ms(self):
+        """Striding size (in milliseconds) given at construction."""
+        return self._stride_ms
+    @property
+    def feature_size(self):
+        """Audio feature size (number of values per frame).
+        :raises ValueError: If the spectrum type is unknown.
+        """
+        if self._spectrum_type == 'linear':
+            # NOTE(review): an explicit n_fft is scaled by sample_rate / 1000
+            # exactly like a window length in ms -- confirm that is intended.
+            fft_point = self._window_ms if self._fft_point is None else self._fft_point
+            return int(fft_point * (self._target_sample_rate / 1000) / 2 + 1)
+        if self._spectrum_type in ('mfcc', 'fbank'):
+            # base feature, delta, delta-delta
+            if self._delta_delta:
+                return int(self._feat_dim * 3)
+            return int(self._feat_dim)
+        raise ValueError("Unknown spectrum_type %s. "
+                         "Supported values: linear, mfcc, fbank." %
+                         self._spectrum_type)
+    @staticmethod
+    def _check_framing(sample_rate, max_freq, stride_ms, window_ms):
+        """Validate the framing arguments and return the effective max_freq."""
+        if max_freq is None:
+            max_freq = sample_rate / 2
+        if max_freq > sample_rate / 2:
+            raise ValueError("max_freq must not be greater than half of "
+                             "sample rate.")
+        if stride_ms > window_ms:
+            raise ValueError("Stride size must not be greater than "
+                             "window size.")
+        return max_freq
+    def _compute_specgram(self, audio_segment):
+        """Extract the feature selected by ``spectrum_type`` from a segment.
+        :raises ValueError: If the spectrum type is unknown.
+        """
+        sample_rate = audio_segment.sample_rate
+        if self._spectrum_type == 'linear':
+            samples = audio_segment.samples
+            return self._compute_linear_specgram(
+                samples,
+                sample_rate,
+                stride_ms=self._stride_ms,
+                window_ms=self._window_ms,
+                max_freq=self._max_freq)
+        elif self._spectrum_type == 'mfcc':
+            samples = audio_segment.to('int16')
+            return self._compute_mfcc(
+                samples,
+                sample_rate,
+                feat_dim=self._feat_dim,
+                stride_ms=self._stride_ms,
+                window_ms=self._window_ms,
+                max_freq=self._max_freq,
+                dither=self._dither,
+                delta_delta=self._delta_delta)
+        elif self._spectrum_type == 'fbank':
+            samples = audio_segment.to('int16')
+            return self._compute_fbank(
+                samples,
+                sample_rate,
+                feat_dim=self._feat_dim,
+                stride_ms=self._stride_ms,
+                window_ms=self._window_ms,
+                max_freq=self._max_freq,
+                dither=self._dither,
+                delta_delta=self._delta_delta)
+        else:
+            raise ValueError("Unknown spectrum_type %s. "
+                             "Supported values: linear, mfcc, fbank." %
+                             self._spectrum_type)
+    def _specgram_real(self, samples, window_size, stride_size, sample_rate):
+        """Compute the spectrogram for samples from a real signal.
+        Args:
+            samples (np.ndarray): 1-D signal.
+            window_size (int): Frame length in samples.
+            stride_size (int): Frame shift in samples.
+            sample_rate: Sample rate in Hz.
+        Raises:
+            ValueError: If the signal is shorter than one window.
+        Returns:
+            tuple: the scaled squared-magnitude spectrum with one column per
+            frame, and the frequency (Hz) of every spectrum row.
+        """
+        if len(samples) < window_size:
+            raise ValueError("Audio is shorter than one analysis window.")
+        # extract strided windows
+        truncate_size = (len(samples) - window_size) % stride_size
+        samples = samples[:len(samples) - truncate_size]
+        nshape = (window_size, (len(samples) - window_size) // stride_size + 1)
+        nstrides = (samples.strides[0], samples.strides[0] * stride_size)
+        windows = np.lib.stride_tricks.as_strided(
+            samples, shape=nshape, strides=nstrides)
+        # sanity check of the strided view; it needs a second frame, which
+        # does not exist when the signal yields a single window
+        if nshape[1] > 1:
+            assert np.all(
+                windows[:, 1] == samples[stride_size:(stride_size + window_size)])
+        # window weighting, squared Fast Fourier Transform (fft), scaling
+        weighting = np.hanning(window_size)[:, None]
+        # https://numpy.org/doc/stable/reference/generated/numpy.fft.rfft.html
+        fft = np.fft.rfft(windows * weighting, n=None, axis=0)
+        fft = np.absolute(fft)
+        fft = fft**2
+        scale = np.sum(weighting**2) * sample_rate
+        # interior bins are doubled, the first and last bins are not
+        fft[1:-1, :] *= (2.0 / scale)
+        fft[(0, -1), :] /= scale
+        # prepare fft frequency list
+        freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
+        return fft, freqs
+    def _compute_linear_specgram(self,
+                                 samples,
+                                 sample_rate,
+                                 stride_ms=10.0,
+                                 window_ms=20.0,
+                                 max_freq=None,
+                                 eps=1e-14):
+        """Compute the linear spectrogram from FFT energy.
+        Args:
+            samples (np.ndarray): 1-D audio signal.
+            sample_rate: Sample rate of the signal, in Hz.
+            stride_ms (float, optional): Stride length in ms. Defaults to 10.0.
+            window_ms (float, optional): Window length in ms. Defaults to 20.0.
+            max_freq (float, optional): Highest frequency kept, in Hz; half
+                of the sample rate when None. Defaults to None.
+            eps (float, optional): Added before the log. Defaults to 1e-14.
+        Raises:
+            ValueError: max_freq > sample_rate / 2
+            ValueError: stride_ms > window_ms
+        Returns:
+            np.ndarray: log spectrogram, (time, freq)
+        """
+        max_freq = self._check_framing(sample_rate, max_freq, stride_ms,
+                                       window_ms)
+        stride_size = int(0.001 * sample_rate * stride_ms)
+        window_size = int(0.001 * sample_rate * window_ms)
+        specgram, freqs = self._specgram_real(
+            samples,
+            window_size=window_size,
+            stride_size=stride_size,
+            sample_rate=sample_rate)
+        # number of leading bins whose frequency does not exceed max_freq
+        ind = np.where(freqs <= max_freq)[0][-1] + 1
+        # (freq, time)
+        spec = np.log(specgram[:ind, :] + eps)
+        return np.transpose(spec)
+    def _concat_delta_delta(self, feat):
+        """Append delta and delta-delta features.
+        Args:
+            feat (np.ndarray): (T, D)
+        Returns:
+            np.ndarray: feat with delta and delta-delta, (T, 3*D)
+        """
+        # Deltas
+        d_feat = delta(feat, 2)
+        # Deltas-Deltas: the delta of the deltas (previously computed from
+        # `feat`, which only duplicated the first-order deltas)
+        dd_feat = delta(d_feat, 2)
+        # concat above three features
+        concat_feat = np.concatenate((feat, d_feat, dd_feat), axis=1)
+        return concat_feat
+    def _compute_mfcc(self,
+                      samples,
+                      sample_rate,
+                      feat_dim=13,
+                      stride_ms=10.0,
+                      window_ms=25.0,
+                      max_freq=None,
+                      dither=1.0,
+                      delta_delta=True):
+        """Compute mfcc from samples.
+        Args:
+            samples (np.ndarray, np.int16): the audio signal from which to compute features.
+            sample_rate (float): the sample rate of the signal we are working with, in Hz.
+            feat_dim (int): the number of cepstrum to return, default 13.
+            stride_ms (float, optional): stride length in ms. Defaults to 10.0.
+            window_ms (float, optional): window length in ms. Defaults to 25.0.
+            max_freq ([type], optional): highest band edge of mel filters. In Hz, default is samplerate/2. Defaults to None.
+            dither (float, optional): forwarded to the extractor. Defaults to 1.0.
+            delta_delta (bool, optional): Whether with delta delta. Defaults to True.
+        Raises:
+            ValueError: max_freq > samplerate/2
+            ValueError: stride_ms > window_ms
+        Returns:
+            np.ndarray: mfcc feature, (T, D), or (T, 3*D) with delta_delta.
+        """
+        max_freq = self._check_framing(sample_rate, max_freq, stride_ms,
+                                       window_ms)
+        # compute the cepstral coefficients, and the first one is replaced
+        # by log(frame energy), (T, D)
+        # NOTE(review): the window is passed as `winfunc` here but as
+        # `wintype` in _compute_fbank -- confirm which keyword the installed
+        # python_speech_features build expects.
+        mfcc_feat = mfcc(
+            signal=samples,
+            samplerate=sample_rate,
+            winlen=0.001 * window_ms,
+            winstep=0.001 * stride_ms,
+            numcep=feat_dim,
+            nfilt=23,
+            nfft=512,
+            lowfreq=20,
+            highfreq=max_freq,
+            dither=dither,
+            remove_dc_offset=True,
+            preemph=0.97,
+            ceplifter=22,
+            useEnergy=True,
+            winfunc='povey')
+        if delta_delta:
+            mfcc_feat = self._concat_delta_delta(mfcc_feat)
+        return mfcc_feat
+    def _compute_fbank(self,
+                       samples,
+                       sample_rate,
+                       feat_dim=40,
+                       stride_ms=10.0,
+                       window_ms=25.0,
+                       max_freq=None,
+                       dither=1.0,
+                       delta_delta=False):
+        """Compute logfbank from samples.
+        Args:
+            samples (np.ndarray, np.int16): the audio signal from which to compute features. Should be an N*1 array
+            sample_rate (float): the sample rate of the signal we are working with, in Hz.
+            feat_dim (int): the number of mel filters, default 40.
+            stride_ms (float, optional): stride length in ms. Defaults to 10.0.
+            window_ms (float, optional): window length in ms. Defaults to 25.0.
+            max_freq (float, optional): highest band edge of mel filters. In Hz, default is samplerate/2. Defaults to None.
+            dither (float, optional): forwarded to the extractor. Defaults to 1.0.
+            delta_delta (bool, optional): Whether with delta delta. Defaults to False.
+        Raises:
+            ValueError: max_freq > samplerate/2
+            ValueError: stride_ms > window_ms
+        Returns:
+            np.ndarray: log filterbank feature, (T, D), or (T, 3*D) with delta_delta.
+        """
+        max_freq = self._check_framing(sample_rate, max_freq, stride_ms,
+                                       window_ms)
+        # (T, D)
+        fbank_feat = logfbank(
+            signal=samples,
+            samplerate=sample_rate,
+            winlen=0.001 * window_ms,
+            winstep=0.001 * stride_ms,
+            nfilt=feat_dim,
+            nfft=512,
+            lowfreq=20,
+            highfreq=max_freq,
+            dither=dither,
+            remove_dc_offset=True,
+            preemph=0.97,
+            wintype='povey')
+        if delta_delta:
+            fbank_feat = self._concat_delta_delta(fbank_feat)
+        return fbank_feat
--- a/deepspeech2/s2t/frontend/featurizer/speech_featurizer.py
+++ b/deepspeech2/s2t/frontend/featurizer/speech_featurizer.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the speech featurizer class."""
+from .audio_featurizer import AudioFeaturizer
+from .text_featurizer import TextFeaturizer
+class SpeechFeaturizer():
+    def __init__(self,
+                 unit_type,
+                 vocab_filepath,
+                 spm_model_prefix=None,
+                 spectrum_type='linear',
+                 feat_dim=None,
+                 delta_delta=False,
+                 stride_ms=10.0,
+                 window_ms=20.0,
+                 n_fft=None,
+                 max_freq=None,
+                 target_sample_rate=16000,
+                 use_dB_normalization=True,
+                 target_dB=-20,
+                 dither=1.0,
+                 maskctc=False):
+        self.stride_ms = stride_ms
+        self.window_ms = window_ms
+        self.audio_feature = AudioFeaturizer(
+            spectrum_type=spectrum_type,
+            feat_dim=feat_dim,
+            delta_delta=delta_delta,
+            stride_ms=stride_ms,
+            window_ms=window_ms,
+            n_fft=n_fft,
+            max_freq=max_freq,
+            target_sample_rate=target_sample_rate,
+            use_dB_normalization=use_dB_normalization,
+            target_dB=target_dB,
+            dither=dither)
+        self.feature_size = self.audio_feature.feature_size
+        self.text_feature = TextFeaturizer(
+            unit_type=unit_type,
+            vocab=vocab_filepath,
+            spm_model_prefix=spm_model_prefix,
+            maskctc=maskctc)
+        self.vocab_size = self.text_feature.vocab_size
+    def featurize(self, speech_segment, keep_transcription_text):
+        """Extract features for speech segment.
+        1. For audio parts, extract the audio features.
+        2. For transcript parts, keep the original text or convert text string
+           to a list of token indices in char-level.
+        Args:
+            speech_segment (SpeechSegment): Speech segment to extract features from.
+            keep_transcription_text (bool): True, keep transcript text, False, token ids
+        Returns:
+            tuple: 1) spectrogram audio feature in 2darray, 2) list oftoken indices.
+        """
+        spec_feature = self.audio_feature.featurize(speech_segment)
+        if keep_transcription_text:
+            return spec_feature, speech_segment.transcript
+        if speech_segment.has_token:
+            text_ids = speech_segment.token_ids
+        else:
+            text_ids = self.text_feature.featurize(speech_segment.transcript)
+        return spec_feature, text_ids
+    def text_featurize(self, text, keep_transcription_text):
+        """Extract features for speech segment.
+        1. For audio parts, extract the audio features.
+        2. For transcript parts, keep the original text or convert text string
+           to a list of token indices in char-level.
+        Args:
+            text (str): text.
+            keep_transcription_text (bool): True, keep transcript text, False, token ids
+        Returns:
+            (str|List[int]): text, or list of token indices.
+        """
+        if keep_transcription_text:
+            return text
+        text_ids = self.text_feature.featurize(text)
+        return text_ids
--- a/deepspeech2/s2t/frontend/featurizer/text_featurizer.py
+++ b/deepspeech2/s2t/frontend/featurizer/text_featurizer.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the text featurizer class."""
+from typing import Union
+import sentencepiece as spm
+from ..utility import BLANK, EOS, MASKCTC, SOS, SPACE, UNK, load_dict
+__all__ = ["TextFeaturizer"]
+class TextFeaturizer():
+    def __init__(self, unit_type, vocab, spm_model_prefix=None, maskctc=False):
+        """Text featurizer, for processing or extracting features from text.
+        Currently, it supports char/word/sentence-piece level tokenizing and conversion into
+        a list of token indices. Note that the token indexing order follows the
+        given vocabulary file.
+        Args:
+            unit_type (str): unit type, e.g. char, word, spm
+            vocab Option[str, list]: Filepath to load vocabulary for token indices conversion, or vocab list.
+            spm_model_prefix (str, optional): spm model prefix. Defaults to None.
+        """
+        assert unit_type in ('char', 'spm', 'word')
+        self.unit_type = unit_type
+        self.unk = UNK
+        self.maskctc = maskctc
+        if vocab:
+            self.vocab_dict, self._id2token, self.vocab_list, \
+                self.unk_id, self.eos_id, \
+                self.blank_id = self._load_vocabulary_from_file(vocab, maskctc)
+            self.vocab_size = len(self.vocab_list)
+        else:
+            print("TextFeaturizer: not have vocab file or vocab list.")
+        if unit_type == 'spm':
+            spm_model = spm_model_prefix + '.model'
+            self.sp = spm.SentencePieceProcessor()
+            self.sp.Load(spm_model)
+    def tokenize(self, text, replace_space=True):
+        if self.unit_type == 'char':
+            tokens = self.char_tokenize(text, replace_space)
+        elif self.unit_type == 'word':
+            tokens = self.word_tokenize(text)
+        else:  # spm
+            tokens = self.spm_tokenize(text)
+        return tokens
+    def detokenize(self, tokens):
+        if self.unit_type == 'char':
+            text = self.char_detokenize(tokens)
+        elif self.unit_type == 'word':
+            text = self.word_detokenize(tokens)
+        else:  # spm
+            text = self.spm_detokenize(tokens)
+        return text
+    def featurize(self, text):
+        """Convert text string to a list of token indices.
+        Args:
+            text (str): Text to process.
+        Returns:
+            List[int]: List of token indices.
+        """
+        tokens = self.tokenize(text)
+        ids = []
+        for token in tokens:
+            if token not in self.vocab_dict:
+                token = self.unk
+            ids.append(self.vocab_dict[token])
+        return ids
+    def defeaturize(self, idxs):
+        """Convert a list of token indices to text string,
+        ignore index after eos_id.
+        Args:
+            idxs (List[int]): List of token indices.
+        Returns:
+            str: Text.
+        """
+        tokens = []
+        for idx in idxs:
+            if idx == self.eos_id:
+                break
+            tokens.append(self._id2token[idx])
+        text = self.detokenize(tokens)
+        return text
+    def char_tokenize(self, text, replace_space=True):
+        """Character tokenizer.
+        Args:
+            text (str): text string.
+            replace_space (bool): False only used by build_vocab.py.
+        Returns:
+            List[str]: tokens.
+        """
+        text = text.strip()
+        if replace_space:
+            text_list = [SPACE if item == " " else item for item in list(text)]
+        else:
+            text_list = list(text)
+        return text_list
+    def char_detokenize(self, tokens):
+        """Character detokenizer.
+        Args:
+            tokens (List[str]): tokens.
+        Returns:
+           str: text string.
+        """
+        tokens = [t.replace(SPACE, " ") for t in tokens]
+        return "".join(tokens)
+    def word_tokenize(self, text):
+        """Word tokenizer, separate by <space>."""
+        return text.strip().split()
+    def word_detokenize(self, tokens):
+        """Word detokenizer, separate by <space>."""
+        return " ".join(tokens)
+    def spm_tokenize(self, text):
+        """spm tokenize.
+        Args:
+            text (str): text string.
+        Returns:
+            List[str]: sentence pieces str code
+        """
+        stats = {"num_empty": 0, "num_filtered": 0}
+        def valid(line):
+            return True
+        def encode(l):
+            return self.sp.EncodeAsPieces(l)
+        def encode_line(line):
+            line = line.strip()
+            if len(line) > 0:
+                line = encode(line)
+                if valid(line):
+                    return line
+                else:
+                    stats["num_filtered"] += 1
+            else:
+                stats["num_empty"] += 1
+            return None
+        enc_line = encode_line(text)
+        return enc_line
+    def spm_detokenize(self, tokens, input_format='piece'):
+        """spm detokenize.
+        Args:
+            ids (List[str]): tokens.
+        Returns:
+            str: text
+        """
+        if input_format == "piece":
+            def decode(l):
+                return "".join(self.sp.DecodePieces(l))
+        elif input_format == "id":
+            def decode(l):
+                return "".join(self.sp.DecodeIds(l))
+        return decode(tokens)
+    def _load_vocabulary_from_file(self, vocab: Union[str, list],
+                                   maskctc: bool):
+        """Load vocabulary from file."""
+        if isinstance(vocab, list):
+            vocab_list = vocab
+        else:
+            vocab_list = load_dict(vocab, maskctc)
+        assert vocab_list is not None
+        id2token = dict(
+            [(idx, token) for (idx, token) in enumerate(vocab_list)])
+        token2id = dict(
+            [(token, idx) for (idx, token) in enumerate(vocab_list)])
+        blank_id = vocab_list.index(BLANK) if BLANK in vocab_list else -1
+        unk_id = vocab_list.index(UNK) if UNK in vocab_list else -1
+        eos_id = vocab_list.index(EOS) if EOS in vocab_list else -1
+        return token2id, id2token, vocab_list, unk_id, eos_id, blank_id
--- a/deepspeech2/s2t/frontend/normalizer.py
+++ b/deepspeech2/s2t/frontend/normalizer.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains feature normalizers."""
+import numpy as np
+from .utility import load_cmvn
+__all__ = ["FeatureNormalizer"]
+class FeatureNormalizer(object):
+    def __init__(self, mean_std_filepath):
+        mean_std = mean_std_filepath
+        self._read_mean_std_from_file(mean_std)
+    def apply(self, features):
+        """Normalize features to be of zero mean and unit stddev.
+        :param features: Input features to be normalized.
+        :type features: ndarray, shape (T, D)
+        :param eps:  added to stddev to provide numerical stablibity.
+        :type eps: float
+        :return: Normalized features.
+        :rtype: ndarray
+        """
+        return (features - self._mean) * self._istd
+    def _read_mean_std_from_file(self, mean_std, eps=1e-20):
+        """Load mean and std from file."""
+        if isinstance(mean_std, list):
+            mean = mean_std[0]['cmvn_stats']['mean']
+            istd = mean_std[0]['cmvn_stats']['istd']
+        else:
+            filetype = mean_std.split(".")[-1]
+            mean, istd = load_cmvn(mean_std, filetype=filetype)
+        self._mean = np.expand_dims(mean, axis=0)
+        self._istd = np.expand_dims(istd, axis=0)