#!/usr/bin/env python3
"""The demo script for testing the pre-trained Emformer RNNT pipelines.

Example:
python pipeline_demo.py --model-type librispeech --dataset-path ./datasets/librispeech
"""
7
8
import logging
import pathlib
9
10
11
12
from argparse import ArgumentParser, RawTextHelpFormatter
from dataclasses import dataclass
from functools import partial
from typing import Callable
13
14
15

import torch
import torchaudio
16
17
from common import MODEL_TYPE_LIBRISPEECH, MODEL_TYPE_MUSTC, MODEL_TYPE_TEDLIUM3
from mustc.dataset import MUSTC
18
from torchaudio.pipelines import EMFORMER_RNNT_BASE_LIBRISPEECH
19
from torchaudio.pipelines import RNNTBundle
20
from torchaudio.prototype.pipelines import EMFORMER_RNNT_BASE_MUSTC, EMFORMER_RNNT_BASE_TEDLIUM3
21

22
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
23
24


25
26
27
28
@dataclass
class Config:
    """Pairing of a dataset constructor with the pre-trained RNN-T bundle used to decode it."""

    # Callable that builds the evaluation dataset; run_eval_streaming invokes it
    # with the dataset path given on the command line.
    dataset: Callable
    # Pre-trained pipeline bundle; run_eval_streaming obtains the decoder,
    # feature extractors and token processor from it.
    bundle: RNNTBundle
29
30


31
32
33
34
35
# Maps each supported model type (the values accepted by --model-type) to the
# dataset constructor and pre-trained bundle used to evaluate it.
_CONFIGS = {
    MODEL_TYPE_LIBRISPEECH: Config(
        partial(torchaudio.datasets.LIBRISPEECH, url="test-clean"),
        EMFORMER_RNNT_BASE_LIBRISPEECH,
    ),
    MODEL_TYPE_MUSTC: Config(
        partial(MUSTC, subset="tst-COMMON"),
        EMFORMER_RNNT_BASE_MUSTC,
    ),
    MODEL_TYPE_TEDLIUM3: Config(
        partial(torchaudio.datasets.TEDLIUM, release="release3", subset="test"),
        EMFORMER_RNNT_BASE_TEDLIUM3,
    ),
}
45
46
47


def run_eval_streaming(args):
    """Transcribe the first few dataset samples in streaming and non-streaming mode.

    For every sample the waveform is first decoded segment by segment with the
    bundle's streaming feature extractor, printing each partial transcript as
    soon as it is produced. The same waveform is then decoded in a single pass
    with the non-streaming feature extractor and that transcript is printed on
    its own line, so the two outputs can be compared.

    Args:
        args: Parsed command-line arguments. ``args.model_type`` selects the
            entry in ``_CONFIGS`` and ``args.dataset_path`` is passed to that
            entry's dataset constructor.
    """
    num_samples = 10  # number of leading dataset samples to transcribe
    beam_width = 10  # beam width handed to the decoder in both modes

    dataset = _CONFIGS[args.model_type].dataset(args.dataset_path)
    bundle = _CONFIGS[args.model_type].bundle
    decoder = bundle.get_decoder()
    token_processor = bundle.get_token_processor()
    feature_extractor = bundle.get_feature_extractor()
    streaming_feature_extractor = bundle.get_streaming_feature_extractor()

    # Segment sizes are expressed in feature frames by the bundle; multiply by
    # the hop length to convert them to numbers of waveform samples.
    hop_length = bundle.hop_length
    num_samples_segment = bundle.segment_length * hop_length
    num_samples_segment_right_context = num_samples_segment + bundle.right_context_length * hop_length

    for sample_idx in range(num_samples):
        try:
            sample = dataset[sample_idx]
        except IndexError:
            # The dataset holds fewer than num_samples items; nothing left to decode.
            break
        waveform = sample[0].squeeze()

        # Streaming decode: advance by one segment at a time, feeding each
        # segment together with its right context, and carry the decoder state
        # and best hypothesis over to the next step.
        state, hypothesis = None, None
        for start in range(0, len(waveform), num_samples_segment):
            segment = waveform[start : start + num_samples_segment_right_context]
            # Pad trailing segments that run past the end of the waveform so
            # every segment has the same fixed length.
            segment = torch.nn.functional.pad(segment, (0, num_samples_segment_right_context - len(segment)))
            with torch.no_grad():
                features, length = streaming_feature_extractor(segment)
                hypos, state = decoder.infer(features, length, beam_width, state=state, hypothesis=hypothesis)
            hypothesis = hypos[0]
            transcript = token_processor(hypothesis[0], lstrip=False)
            print(transcript, end="", flush=True)
        print()

        # Non-streaming decode of the whole waveform in one pass.
        with torch.no_grad():
            features, length = feature_extractor(waveform)
            hypos = decoder(features, length, beam_width)
        print(token_processor(hypos[0][0]))
        print()


def parse_args():
    """Parse the demo's command-line arguments.

    Returns:
        argparse.Namespace with ``model_type`` (one of the keys of
        ``_CONFIGS``), ``dataset_path`` (a ``pathlib.Path``) and ``debug``
        (bool, ``False`` unless ``--debug`` is given).
    """
    # RawTextHelpFormatter keeps the line breaks of the module docstring, which
    # is reused as the help description.
    parser = ArgumentParser(description=__doc__, formatter_class=RawTextHelpFormatter)
    parser.add_argument("--model-type", type=str, choices=_CONFIGS.keys(), required=True)
    parser.add_argument(
        "--dataset-path",
        type=pathlib.Path,
        help="Path to dataset.",
        required=True,
    )
    parser.add_argument("--debug", action="store_true", help="whether to use debug level for logging")
    return parser.parse_args()


def init_logger(debug):
    """Set up root logging for the demo.

    Args:
        debug: When truthy, log at DEBUG level and prefix each message with a
            timestamp; otherwise log at INFO level with the bare message.
    """
    if debug:
        message_format, verbosity = "%(asctime)s %(message)s", logging.DEBUG
    else:
        message_format, verbosity = "%(message)s", logging.INFO
    logging.basicConfig(
        level=verbosity,
        format=message_format,
        datefmt="%Y-%m-%d %H:%M:%S",
    )


def cli_main():
    """Command-line entry point: parse arguments, set up logging, run the demo."""
    cli_options = parse_args()
    init_logger(cli_options.debug)
    run_eval_streaming(cli_options)


# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    cli_main()