"docs/how_to_download_models_zh_cn.md" did not exist on "45e7fbd2d8000f132324eb98ce5b2cc5a1ad6218"
generate_manifests.py 2.51 KB
Newer Older
huaerkl's avatar
v1.0  
huaerkl committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import argparse
from pathlib import Path
import soundfile

def get_insl_frame(parse):
    """Flatten a bracketed parse string into a space-separated target line.

    The parse is split on whitespace. Tokens that start with ``[`` or ``]``
    are treated as ontology tokens and emitted upper-cased as single units.
    The plain tokens between two ontology tokens are upper-cased, joined with
    ``_`` and emitted one character at a time.

    Plain tokens that come after the last ontology token are not emitted.

    Args:
        parse: whitespace-separated parse string.

    Returns:
        The flattened tokens joined by single spaces, followed by ``' | '``.
    """
    res = []
    pending = []  # plain (non-ontology) tokens seen since the last bracket token
    for tok in parse.split():
        # str.split() never yields empty tokens, so startswith() is equivalent
        # to inspecting tok[0].
        if tok.startswith(("[", "]")):
            # Extending a list with a str adds one element per character, so
            # the joined span is written out letter by letter.
            res.extend('_'.join(pending))
            pending = []
            res.append(tok.upper())
        else:
            pending.append(tok.upper())

    return " ".join(res) + ' | '

def sequencify_utterance(utterance):
    """Spell an utterance out as space-separated upper-case characters.

    Every space in the upper-cased utterance becomes ``|`` and one more ``|``
    is appended at the end; the resulting characters are then joined with
    single spaces.
    """
    letters = utterance.upper().replace(' ', '|') + '|'
    # Joining a str iterates over its characters.
    return ' '.join(letters)


def generate_fairseq_manifests(manifest, output_path, audio_root=None):
    """Write the ``.parse``, ``.ltr`` and ``.tsv`` files for one manifest.

    ``manifest`` is a tab-separated file whose first line names the columns.
    Each following line must provide ``decoupled_normalized_seqlogical``,
    ``normalized_utterance`` and ``file_id``.

    Three files are written next to ``output_path`` (its suffix replaced):

    * ``.parse`` - one ``get_insl_frame`` result per row
    * ``.ltr``   - one ``sequencify_utterance`` result per row
    * ``.tsv``   - ``audio_root`` on the first line, then
      ``file_id<TAB>frame count`` per row, the frame count coming from
      ``soundfile.info``

    Args:
        manifest: path of the tab-separated input manifest.
        output_path: output path stem (``str`` or ``Path``).
        audio_root: directory that ``file_id`` entries are relative to
            (``str`` or ``Path``). Required despite the ``None`` default.

    Raises:
        TypeError: if ``audio_root`` is ``None``.
        FileNotFoundError: if an audio file listed in the manifest is missing.
    """
    if audio_root is None:
        # Fail before anything is written: a None root would otherwise put
        # "None" in the .tsv header and then crash on the first path join.
        raise TypeError('audio_root is required to resolve file_id entries')
    audio_root = Path(audio_root)
    output_path = Path(output_path)

    parses = []
    utterances = []
    filepaths = []
    keys = None
    with open(manifest, 'r') as i:
        for (idx, line) in enumerate(i):
            if idx == 0:
                keys = line.strip().split('\t')
                continue
            # Drop the line terminator so the last column does not carry a
            # trailing newline into the outputs.
            row = line.rstrip('\n')
            if not row:
                # Tolerate blank lines (e.g. at the end of the file).
                continue
            data = dict(zip(keys, row.split('\t')))
            parses.append(get_insl_frame(data['decoupled_normalized_seqlogical']))
            utterances.append(sequencify_utterance(data['normalized_utterance']))
            filepaths.append(data['file_id'])

    parses_fp = output_path.with_suffix('.parse')
    with open(str(parses_fp), 'w') as o:
        o.writelines(p + '\n' for p in parses)

    utterances_fp = output_path.with_suffix('.ltr')
    with open(str(utterances_fp), 'w') as o:
        o.writelines(u + '\n' for u in utterances)

    filepaths_fp = output_path.with_suffix('.tsv')
    with open(str(filepaths_fp), 'w') as o:
        o.write(str(audio_root) + '\n')
        for f in filepaths:
            fullpath = audio_root / f
            # Explicit check instead of assert so it survives `python -O`.
            if not fullpath.exists():
                raise FileNotFoundError(f'{fullpath}')
            frames = soundfile.info(fullpath).frames
            o.write(f'{f}\t{frames}\n')

def main(args):
    """Generate the manifests for the train, eval and test splits.

    For each split, reads ``<stop_root>/manifests/<split>.tsv`` and writes the
    outputs under ``<output>/<split>.*``, using ``stop_root`` as the audio
    root.

    Args:
        args: namespace with ``stop_root`` and ``output`` attributes.
    """
    splits = ['train', 'eval', 'test']
    root = Path(args.stop_root)
    output_root = Path(args.output)

    # The output files are opened for writing directly, which fails if the
    # directory is missing, so make sure it exists first.
    output_root.mkdir(parents=True, exist_ok=True)

    for split in splits:
        stop_manifest_path = root / 'manifests' / (split + '.tsv')
        output_path = output_root / split

        generate_fairseq_manifests(stop_manifest_path, output_path, root)

if __name__ == '__main__':
    # Command-line entry point. Both options are mandatory: main() builds
    # Path objects from them and cannot proceed when either is missing.
    parser = argparse.ArgumentParser(
        description='Generate .parse/.ltr/.tsv manifests from the manifests '
                    'under a stop root directory.')
    parser.add_argument('--stop_root', type=str, required=True,
                    help='path to stop root directory')
    parser.add_argument('--output', type=str, required=True,
                    help='output directory')
    args = parser.parse_args()
    main(args)