preprocess.py 11.1 KB
Newer Older
Louis Martin's avatar
Louis Martin committed
1
#!/usr/bin/env python3
Sergey Edunov's avatar
Sergey Edunov committed
2
3
4
5
6
7
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
Myle Ott's avatar
Myle Ott committed
8
9
10
"""
Data pre-processing: build vocabularies and binarize training data.
"""
Sergey Edunov's avatar
Sergey Edunov committed
11
12

import argparse
Sergey Edunov's avatar
Sergey Edunov committed
13
from collections import Counter
Sergey Edunov's avatar
Sergey Edunov committed
14
from itertools import zip_longest
15
16
import os
import shutil
Sergey Edunov's avatar
Sergey Edunov committed
17

Sergey Edunov's avatar
Sergey Edunov committed
18

alexeib's avatar
alexeib committed
19
from fairseq.data import indexed_dataset, dictionary
Myle Ott's avatar
Myle Ott committed
20
from fairseq.tokenizer import Tokenizer, tokenize_line
Sergey Edunov's avatar
Sergey Edunov committed
21
22
from multiprocessing import Pool, Manager, Process

Sergey Edunov's avatar
Sergey Edunov committed
23
24


Myle Ott's avatar
Myle Ott committed
25
def get_parser():
    """Build the command-line argument parser for data preprocessing."""
    parser = argparse.ArgumentParser()
    add_arg = parser.add_argument

    # Language pair
    add_arg('-s', '--source-lang', default=None, metavar='SRC',
            help='source language')
    add_arg('-t', '--target-lang', default=None, metavar='TARGET',
            help='target language')

    # Input file prefixes (valid/test accept comma-separated lists)
    add_arg('--trainpref', metavar='FP', default=None,
            help='train file prefix')
    add_arg('--validpref', metavar='FP', default=None,
            help='comma separated, valid file prefixes')
    add_arg('--testpref', metavar='FP', default=None,
            help='comma separated, test file prefixes')

    # Output location
    add_arg('--destdir', metavar='DIR', default='data-bin',
            help='destination dir')

    # Vocabulary construction / reuse
    add_arg('--thresholdtgt', metavar='N', default=0, type=int,
            help='map words appearing less than threshold times to unknown')
    add_arg('--thresholdsrc', metavar='N', default=0, type=int,
            help='map words appearing less than threshold times to unknown')
    add_arg('--tgtdict', metavar='FP', help='reuse given target dictionary')
    add_arg('--srcdict', metavar='FP', help='reuse given source dictionary')
    add_arg('--nwordstgt', metavar='N', default=-1, type=int,
            help='number of target words to retain')
    add_arg('--nwordssrc', metavar='N', default=-1, type=int,
            help='number of source words to retain')
    add_arg('--joined-dictionary', action='store_true',
            help='Generate joined dictionary')

    # Misc processing options
    add_arg('--alignfile', metavar='ALIGN', default=None,
            help='an alignment file (optional)')
    add_arg('--output-format', metavar='FORMAT', default='binary',
            choices=['binary', 'raw'], help='output format (optional)')
    add_arg('--only-source', action='store_true',
            help='Only process the source language')
    add_arg('--padding-factor', metavar='N', default=8, type=int,
            help='Pad dictionary size to be multiple of N')
    add_arg('--workers', metavar='N', default=1, type=int,
            help='number of parallel workers')

    return parser
Sergey Edunov's avatar
Sergey Edunov committed
50

Myle Ott's avatar
Myle Ott committed
51

Myle Ott's avatar
Myle Ott committed
52
def main(args):
    """Build vocabularies and binarize train/valid/test data.

    Writes dictionaries, binarized (or raw-copied) datasets, and an
    optional word-alignment table into ``args.destdir``.
    """
    print(args)
    os.makedirs(args.destdir, exist_ok=True)
    # Unless --only-source is set, the target side is processed too.
    target = not args.only_source

    def build_dictionary(filenames):
        # Accumulate token counts from all files into one Dictionary.
        d = dictionary.Dictionary()
        for filename in filenames:
            Tokenizer.add_file_to_dictionary(filename, d, tokenize_line, args.workers)
        return d

    def train_path(lang):
        # Training file for `lang`, e.g. <trainpref>.en (no suffix if lang is falsy).
        return '{}{}'.format(args.trainpref, ('.' + lang) if lang else '')

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += f'.{lang}'
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args.destdir, file_name(prefix, lang))

    def dict_path(lang):
        return dest_path('dict', lang) + '.txt'

    if args.joined_dictionary:
        assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary'
        assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary'
        # One shared dictionary built over the union of source and target
        # training text (set literal instead of set([...])).
        src_dict = build_dictionary({
            train_path(lang) for lang in [args.source_lang, args.target_lang]
        })
        tgt_dict = src_dict
    else:
        if args.srcdict:
            src_dict = dictionary.Dictionary.load(args.srcdict)
        else:
            assert args.trainpref, "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary([train_path(args.source_lang)])
        if target:
            if args.tgtdict:
                tgt_dict = dictionary.Dictionary.load(args.tgtdict)
            else:
                assert args.trainpref, "--trainpref must be set if --tgtdict is not specified"
                tgt_dict = build_dictionary([train_path(args.target_lang)])

    src_dict.finalize(
        threshold=args.thresholdsrc,
        nwords=args.nwordssrc,
        padding_factor=args.padding_factor,
    )
    src_dict.save(dict_path(args.source_lang))
    if target:
        # A joined dictionary is the same object as src_dict and was
        # already finalized above; finalizing twice would be wrong.
        if not args.joined_dictionary:
            tgt_dict.finalize(
                threshold=args.thresholdtgt,
                nwords=args.nwordstgt,
                padding_factor=args.padding_factor,
            )
        tgt_dict.save(dict_path(args.target_lang))

    def make_binary_dataset(input_prefix, output_prefix, lang, num_workers):
        # `vocab` (was `dict`): avoid shadowing the builtin.
        vocab = dictionary.Dictionary.load(dict_path(lang))
        print('| [{}] Dictionary: {} types'.format(lang, len(vocab) - 1))
        # Mutable cell so the nested callback can update [nseq, ntok].
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result['replaced'])
            n_seq_tok[0] += worker_result['nseq']
            n_seq_tok[1] += worker_result['ntok']

        input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
        offsets = Tokenizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # Workers 1..N-1 binarize their byte ranges into temp datasets;
            # this process handles worker 0's share inline below.
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(
                    binarize,
                    (args, input_file, vocab, prefix, lang,
                     offsets[worker_id], offsets[worker_id + 1]),
                    callback=merge_result,
                )
            pool.close()

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, lang, 'bin'))
        # Bound method instead of `lambda t: ds.add_item(t)`.
        merge_result(Tokenizer.binarize(input_file, vocab, ds.add_item,
                                        offset=0, end=offsets[1]))
        if num_workers > 1:
            pool.join()
            # Append each worker's output in order, then drop the temp files.
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))

        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, n_seq_tok[0], n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1], vocab.unk_word))

    def make_dataset(input_prefix, output_prefix, lang, num_workers=1):
        if args.output_format == 'binary':
            make_binary_dataset(input_prefix, output_prefix, lang, num_workers)
        elif args.output_format == 'raw':
            # Copy original text file to destination folder
            output_text_file = dest_path(
                output_prefix + '.{}-{}'.format(args.source_lang, args.target_lang),
                lang,
            )
            shutil.copyfile(file_name(input_prefix, lang), output_text_file)

    def make_all(lang):
        if args.trainpref:
            make_dataset(args.trainpref, 'train', lang, num_workers=args.workers)
        if args.validpref:
            for k, validpref in enumerate(args.validpref.split(',')):
                outprefix = 'valid{}'.format(k) if k > 0 else 'valid'
                make_dataset(validpref, outprefix, lang)
        if args.testpref:
            for k, testpref in enumerate(args.testpref.split(',')):
                outprefix = 'test{}'.format(k) if k > 0 else 'test'
                make_dataset(testpref, outprefix, lang)

    make_all(args.source_lang)
    if target:
        make_all(args.target_lang)

    print('| Wrote preprocessed data to {}'.format(args.destdir))

    if args.alignfile:
        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = train_path(args.source_lang)
        tgt_file_name = train_path(args.target_lang)
        src_dict = dictionary.Dictionary.load(dict_path(args.source_lang))
        tgt_dict = dictionary.Dictionary.load(dict_path(args.target_lang))
        # freq_map[src_index] counts how often each target index is aligned
        # to that source index across the training corpus.
        freq_map = {}
        with open(args.alignfile, 'r') as align_file, \
                open(src_file_name, 'r') as src_file, \
                open(tgt_file_name, 'r') as tgt_file:
            # NOTE(review): zip_longest yields None once the shortest file
            # ends, which would crash in tokenize below — this assumes all
            # three files have the same number of lines; confirm upstream.
            for a, s, t in zip_longest(align_file, src_file, tgt_file):
                si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False)
                ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False)
                # Alignment line format: "srcpos-tgtpos srcpos-tgtpos ...".
                ai = list(map(lambda x: tuple(x.split('-')), a.split()))
                for sai, tai in ai:
                    srcidx = si[int(sai)]
                    tgtidx = ti[int(tai)]
                    # Skip pairs involving <unk>; special symbols must never
                    # appear in real alignments.
                    if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                        assert srcidx != src_dict.pad()
                        assert srcidx != src_dict.eos()
                        assert tgtidx != tgt_dict.pad()
                        assert tgtidx != tgt_dict.eos()
                        if srcidx not in freq_map:
                            freq_map[srcidx] = Counter()
                        freq_map[srcidx][tgtidx] += 1

        # Keep only the most frequently aligned target word per source word.
        align_dict = {
            srcidx: max(counts, key=counts.get)
            for srcidx, counts in freq_map.items()
        }

        with open(os.path.join(args.destdir, 'alignment.{}-{}.txt'.format(
                args.source_lang, args.target_lang)), 'w') as f:
            for k, v in align_dict.items():
                print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)


Sergey Edunov's avatar
Sergey Edunov committed
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249

def binarize(args, filename, dict, output_prefix, lang, offset, end):
    """Binarize one byte range of *filename* into its own dataset shard.

    Runs in a worker process; returns the stats dict produced by
    ``Tokenizer.binarize`` so the parent can aggregate it.
    """
    shard = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, 'bin'))
    # The builder's bound add_item is the consumer for each tensor.
    stats = Tokenizer.binarize(filename, dict, shard.add_item,
                               offset=offset, end=end)
    shard.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))
    return stats

def dataset_dest_prefix(args, output_prefix, lang):
    """Return the destination path prefix (without extension) for a dataset."""
    if lang is None:
        return '{}/{}'.format(args.destdir, output_prefix)
    return '{}/{}.{}-{}.{}'.format(
        args.destdir, output_prefix, args.source_lang, args.target_lang, lang)


def dataset_dest_file(args, output_prefix, lang, extension):
    """Return the full destination path with *extension* ('bin' or 'idx')."""
    return '{}.{}'.format(dataset_dest_prefix(args, output_prefix, lang), extension)


Sergey Edunov's avatar
Sergey Edunov committed
250
if __name__ == '__main__':
    # CLI entry point: parse arguments and run preprocessing.
    main(get_parser().parse_args())