merge_scp2txt.py 4.39 KB
Newer Older
Sugon_ldc's avatar
Sugon_ldc committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env python3
# encoding: utf-8

from __future__ import print_function
from __future__ import unicode_literals

import argparse
import codecs
from distutils.util import strtobool
from io import open
import logging
import sys

PY2 = sys.version_info[0] == 2
sys.stdin = codecs.getreader('utf-8')(sys.stdin if PY2 else sys.stdin.buffer)
sys.stdout = codecs.getwriter('utf-8')(
    sys.stdout if PY2 else sys.stdout.buffer)


# Special types:
def shape(x):
    """Change str to List[int]

    Surrounding whitespace is ignored, and one leading ``[`` and one
    trailing ``]`` are dropped if present.

    >>> shape('3,5')
    [3, 5]
    >>> shape(' [3, 5] ')
    [3, 5]

    Args:
        x (str): Comma-separated integers, optionally wrapped in brackets.

    Returns:
        list of int: The parsed values, in input order.

    Raises:
        ValueError: If ``x`` is empty (after stripping whitespace and
            brackets) or any element is not an integer.

    """

    # x: ' [3, 5] ' -> '3, 5'
    text = x.strip()
    # startswith/endswith are safe on an empty string, unlike text[0] and
    # text[-1], so empty input reaches int() below and fails with ValueError
    # like any other malformed element.
    if text.startswith('['):
        text = text[1:]
    if text.endswith(']'):
        text = text[:-1]

    # int() tolerates the whitespace left around each element.
    return [int(dim) for dim in text.split(',')]


def _strtobool(value):
    """Convert a truth-value string to ``bool``.

    Local equivalent of ``distutils.util.strtobool`` so the parser does not
    rely on ``distutils``, which is deprecated (PEP 632) and absent from
    Python 3.12 onwards. Matching is case-insensitive.

    Args:
        value (str): One of ``y, yes, t, true, on, 1`` or
            ``n, no, f, false, off, 0``.

    Returns:
        bool: ``True`` for the first group, ``False`` for the second.

    Raises:
        ValueError: If ``value`` is none of the accepted spellings.
    """
    lowered = value.lower()
    if lowered in ('y', 'yes', 't', 'true', 'on', '1'):
        return True
    if lowered in ('n', 'no', 'f', 'false', 'off', '0'):
        return False
    raise ValueError('invalid truth value {!r}'.format(value))


def get_parser():
    """Build the command-line parser for this script.

    Returns:
        argparse.ArgumentParser: Parser defining ``--input-scps``,
        ``--output-scps``, ``--scps``, ``--verbose``/``-V``,
        ``--allow-one-column`` and ``--out``/``-O``.
    """
    parser = argparse.ArgumentParser(
        description='Given each file paths with such format as '
        '<key>:<file>:<type>. <type> can be omitted and the default '
        'is "str". e.g. {} '
        '--input-scps feat:data/feats.scp shape:data/utt2feat_shape:shape '
        '--output-scps text:data/text token:data/token '
        'tokenid:data/tokenid shape:data/utt2text_shape:shape '
        '--scps utt2spk:data/utt2spk'.format(sys.argv[0]),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    # nargs='*' combined with action='append' yields a list of lists: one
    # inner list per occurrence of the option on the command line.
    parser.add_argument('--input-scps',
                        type=str,
                        nargs='*',
                        action='append',
                        default=[],
                        help='files for the inputs')
    parser.add_argument('--output-scps',
                        type=str,
                        nargs='*',
                        action='append',
                        default=[],
                        help='files for the outputs')
    parser.add_argument('--scps',
                        type=str,
                        nargs='+',
                        default=[],
                        help='The files except for the input and outputs')
    parser.add_argument('--verbose',
                        '-V',
                        default=1,
                        type=int,
                        help='Verbose option')
    parser.add_argument('--allow-one-column',
                        type=_strtobool,
                        default=False,
                        help='Allow one column in input scp files. '
                        'In this case, the value will be empty string.')
    parser.add_argument('--out',
                        '-O',
                        type=str,
                        help='The output filename. '
                        'If omitted, then output to sys.stdout')
    return parser


def parse_scp_specs(specs):
    """Map ``<key>:<file>[:<type>]`` specs to a ``{key: file}`` dict.

    Only the first two colon-separated fields are used; any further field
    (such as the optional ``<type>``) is ignored.

    Args:
        specs: Iterable of spec strings.

    Returns:
        dict: Key to file path. A later spec with the same key replaces an
        earlier one.

    Raises:
        ValueError: If a spec contains no ``:`` separator.
    """
    table = {}
    for spec in specs:
        parts = spec.strip().split(':')
        if len(parts) < 2:
            raise ValueError(
                'expected <key>:<file>[:<type>], got "{}"'.format(spec))
        table[parts[0]] = parts[1]
    return table


def merge_records(fids, fields, out):
    """Merge line-aligned files into one tab-separated record per line.

    The n-th line of every file in ``fids`` is combined into the n-th
    record. The first whitespace-separated token of the line from
    ``fids[0]`` becomes the ``utt:<id>`` column; the first token of the
    other files' lines is dropped without being compared against it. The
    remaining tokens of each line, joined by single spaces, become the
    ``<field>:<value>`` column for that file. Columns are separated by a
    TAB and each record ends with a newline.

    Merging stops as soon as any file is exhausted. Only complete records
    are written, so no blank or truncated trailing line is produced. A
    warning is logged if the files do not all end on the same line.

    Args:
        fids: Sequence of readable text files, read line by line.
        fields: Column names, one per entry in ``fids``.
        out: Writable text file receiving the records.

    Returns:
        int: Number of records written.

    Raises:
        ValueError: If a line of the first file contains no tokens.
    """
    if not fids:
        # Nothing to read: without this guard the loop below never ends.
        return 0

    count = 0
    while True:
        pieces = []
        for i, fid in enumerate(fids):
            line = fid.readline()
            if line == '':
                # EOF. It is a clean end only if the first file ended and
                # every other file has nothing left either.
                clean_end = i == 0 and all(
                    rest.readline() == '' for rest in fids[1:])
                if not clean_end:
                    logging.warning(
                        'input files have different numbers of lines; '
                        'stopped after %d records', count)
                return count
            arr = line.strip().split()
            if i == 0:
                if not arr:
                    raise ValueError(
                        'blank line in the first input file '
                        '(record {})'.format(count + 1))
                pieces.append('utt:{}'.format(arr[0]))
            pieces.append('{}:{}'.format(fields[i], ' '.join(arr[1:])))
        out.write('\t'.join(pieces) + '\n')
        count += 1


if __name__ == '__main__':
    parser = get_parser()
    args = parser.parse_args()
    # --scps is parsed as a flat list; wrap it so it has the same
    # list-of-lists shape as --input-scps / --output-scps.
    args.scps = [args.scps]

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    level = logging.INFO if args.verbose > 0 else logging.WARN
    logging.basicConfig(level=level, format=logfmt)

    # Validate with parser.error rather than assert: assertions are removed
    # under ``python -O`` and give the user no hint about what went wrong.
    if len(args.input_scps) != 1:
        parser.error('--input-scps must be given exactly once')
    if len(args.output_scps) != 1:
        parser.error('--output-scps must be given exactly once')
    try:
        inputs = parse_scp_specs(args.input_scps[0])
        outputs = parse_scp_specs(args.output_scps[0])
    except ValueError as err:
        parser.error(str(err))

    requirements = (
        ('--input-scps', inputs, ('feat', 'shape')),
        ('--output-scps', outputs, ('shape', 'text', 'token', 'tokenid')),
    )
    for option, table, required in requirements:
        missing = [key for key in required if key not in table]
        if missing:
            parser.error('{} is missing required key(s): {}'.format(
                option, ', '.join(missing)))

    # files[i] provides the column named fields[i].
    files = [
        inputs['feat'], inputs['shape'], outputs['text'], outputs['token'],
        outputs['tokenid'], outputs['shape']
    ]
    fields = ['feat', 'feat_shape', 'text', 'token', 'tokenid', 'token_shape']

    fids = []
    out = None
    try:
        for path in files:
            fids.append(open(path, 'r', encoding='utf-8'))
        if args.out is None:
            out = sys.stdout
        else:
            out = open(args.out, 'w', encoding='utf-8')
        merge_records(fids, fields, out)
    finally:
        # Close whatever was opened, even if a later open() or the merge
        # itself failed. sys.stdout is left open.
        for fid in fids:
            fid.close()
        if out is not None and args.out is not None:
            out.close()