#!/usr/bin/env python3
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
"""
Data pre-processing: build vocabularies and binarize training data.
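
Example usage (paths are illustrative):

    python preprocess.py --source-lang de --target-lang en \
        --trainpref data/train --validpref data/valid --testpref data/test \
        --destdir data-bin/de-en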
"""

import argparse
from itertools import zip_longest
import os
import shutil

from fairseq.data import indexed_dataset, dictionary
from fairseq.tokenizer import Tokenizer, tokenize_line


def get_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--source-lang', default=None, metavar='SRC', help='source language')
    parser.add_argument('-t', '--target-lang', default=None, metavar='TARGET', help='target language')
    parser.add_argument('--trainpref', metavar='FP', default=None, help='train file prefix')
    parser.add_argument('--validpref', metavar='FP', default=None, help='comma-separated valid file prefixes')
    parser.add_argument('--testpref', metavar='FP', default=None, help='comma-separated test file prefixes')
    parser.add_argument('--destdir', metavar='DIR', default='data-bin', help='destination dir')
    parser.add_argument('--thresholdtgt', metavar='N', default=0, type=int,
                        help='map words appearing less than threshold times to unknown')
    parser.add_argument('--thresholdsrc', metavar='N', default=0, type=int,
                        help='map words appearing less than threshold times to unknown')
    parser.add_argument('--tgtdict', metavar='FP', help='reuse given target dictionary')
    parser.add_argument('--srcdict', metavar='FP', help='reuse given source dictionary')
    parser.add_argument('--nwordstgt', metavar='N', default=-1, type=int, help='number of target words to retain')
    parser.add_argument('--nwordssrc', metavar='N', default=-1, type=int, help='number of source words to retain')
    parser.add_argument('--alignfile', metavar='ALIGN', default=None, help='an alignment file (optional)')
    parser.add_argument('--output-format', metavar='FORMAT', default='binary', choices=['binary', 'raw'],
                        help='output format (optional)')
    parser.add_argument('--joined-dictionary', action='store_true', help='Generate joined dictionary')
    parser.add_argument('--only-source', action='store_true', help='Only process the source language')
    parser.add_argument('--padding-factor', metavar='N', default=8, type=int,
                        help='Pad dictionary size to be multiple of N')
    return parser


def main(args):
    print(args)
    os.makedirs(args.destdir, exist_ok=True)
    target = not args.only_source

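    # Count token frequencies over the given files into a fairseq Dictionary;
    # tokenize_line splits each line on whitespace.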
    def build_dictionary(filenames):
        d = dictionary.Dictionary()
        for filename in filenames:
            Tokenizer.add_file_to_dictionary(filename, d, tokenize_line)
        return d

    def train_path(lang):
        return '{}{}'.format(args.trainpref, ('.' + lang) if lang else '')

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += f'.{lang}'
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args.destdir, file_name(prefix, lang))

    def dict_path(lang):
        return dest_path('dict', lang) + '.txt'

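    # Binarized outputs are named '<prefix>.<src>-<tgt>.<lang>.<ext>',
    # e.g. 'train.de-en.de.bin' for a (hypothetical) de-en pair.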
    def dataset_dest_path(output_prefix, lang, extension):
        base = f'{args.destdir}/{output_prefix}'
        lang_part = f'.{args.source_lang}-{args.target_lang}.{lang}' if lang is not None else ''
        return f'{base}{lang_part}.{extension}'

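    # A joined dictionary is a single vocabulary built over the training data
    # of both languages and shared by source and target (useful, for example,
    # when the two languages share an alphabet or subword inventory).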
    if args.joined_dictionary:
        assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary'
        assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary'
        src_dict = build_dictionary(set([
            train_path(lang)
            for lang in [args.source_lang, args.target_lang]
        ]))
        tgt_dict = src_dict
    else:
        if args.srcdict:
            src_dict = dictionary.Dictionary.load(args.srcdict)
        else:
            assert args.trainpref, "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary([train_path(args.source_lang)])
        if target:
            if args.tgtdict:
                tgt_dict = dictionary.Dictionary.load(args.tgtdict)
            else:
                assert args.trainpref, "--trainpref must be set if --tgtdict is not specified"
                tgt_dict = build_dictionary([train_path(args.target_lang)])

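    # finalize() sorts entries by frequency, drops words seen fewer than
    # `threshold` times, truncates to the `nwords` most frequent entries, and
    # pads the vocabulary size to a multiple of `padding_factor` (which can
    # help hardware efficiency).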
    src_dict.finalize(
        threshold=args.thresholdsrc,
        nwords=args.nwordssrc,
        padding_factor=args.padding_factor,
    )
    src_dict.save(dict_path(args.source_lang))
    if target:
        if not args.joined_dictionary:
            tgt_dict.finalize(
                threshold=args.thresholdtgt,
                nwords=args.nwordstgt,
                padding_factor=args.padding_factor,
            )
        tgt_dict.save(dict_path(args.target_lang))

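    # Binarize one split: encode every line of the input as a tensor of token
    # indices and write it as an indexed dataset (.bin data plus .idx offsets).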
    def make_binary_dataset(input_prefix, output_prefix, lang):
        vocab = dictionary.Dictionary.load(dict_path(lang))
        print('| [{}] Dictionary: {} types'.format(lang, len(vocab) - 1))

        ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_path(output_prefix, lang, 'bin'))

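        # Called by Tokenizer.binarize once per binarized line.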
        def consumer(tensor):
            ds.add_item(tensor)

        input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
        res = Tokenizer.binarize(input_file, vocab, consumer)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], res['ntok'],
            100 * res['nunk'] / res['ntok'], vocab.unk_word))
        ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))

    def make_dataset(input_prefix, output_prefix, lang):
        if args.output_format == 'binary':
            make_binary_dataset(input_prefix, output_prefix, lang)
        elif args.output_format == 'raw':
            # Copy original text file to destination folder
            output_text_file = dest_path(
                output_prefix + '.{}-{}'.format(args.source_lang, args.target_lang),
                lang,
            )
            shutil.copyfile(file_name(input_prefix, lang), output_text_file)

    def make_all(lang):
        if args.trainpref:
            make_dataset(args.trainpref, 'train', lang)
        if args.validpref:
            for k, validpref in enumerate(args.validpref.split(',')):
                outprefix = 'valid{}'.format(k) if k > 0 else 'valid'
                make_dataset(validpref, outprefix, lang)
        if args.testpref:
            for k, testpref in enumerate(args.testpref.split(',')):
                outprefix = 'test{}'.format(k) if k > 0 else 'test'
                make_dataset(testpref, outprefix, lang)

    make_all(args.source_lang)
    if target:
        make_all(args.target_lang)

    print('| Wrote preprocessed data to {}'.format(args.destdir))

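    # If an alignment file is given, record, for every known source word, the
    # target word it is most frequently aligned to; the resulting dictionary
    # can later be used to replace <unk> tokens in generated output.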
    if args.alignfile:
        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = train_path(args.source_lang)
        tgt_file_name = train_path(args.target_lang)
        src_dict = dictionary.Dictionary.load(dict_path(args.source_lang))
        tgt_dict = dictionary.Dictionary.load(dict_path(args.target_lang))
        freq_map = {}
        # zip_longest (rather than zip) makes a length mismatch between the
        # alignment, source, and target files fail loudly instead of silently
        # truncating the pairing.
        with open(args.alignfile, 'r') as align_file, \
                open(src_file_name, 'r') as src_file, \
                open(tgt_file_name, 'r') as tgt_file:
            for a, s, t in zip_longest(align_file, src_file, tgt_file):
                si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False)
                ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False)
                # Each alignment entry is a 'srcpos-tgtpos' pair of 0-based
                # token indices.
                ai = [tuple(pair.split('-')) for pair in a.split()]
                for sai, tai in ai:
                    srcidx = si[int(sai)]
                    tgtidx = ti[int(tai)]
                    # Only count pairs where both words are in-vocabulary.
                    if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                        assert srcidx != src_dict.pad()
                        assert srcidx != src_dict.eos()
                        assert tgtidx != tgt_dict.pad()
                        assert tgtidx != tgt_dict.eos()
                        freq_map.setdefault(srcidx, {})
                        freq_map[srcidx][tgtidx] = freq_map[srcidx].get(tgtidx, 0) + 1

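        # Keep only the most frequently co-occurring target word per source word.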
        align_dict = {}
        for srcidx in freq_map.keys():
            align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get)

        with open(os.path.join(args.destdir, 'alignment.{}-{}.txt'.format(
                args.source_lang, args.target_lang)), 'w') as f:
            for k, v in align_dict.items():
                print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)


if __name__ == '__main__':
    parser = get_parser()
    args = parser.parse_args()
    main(args)