# build_sym_alignment.py
# Copyright (c) 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the LICENSE file in
# the root directory of this source tree. An additional grant of patent rights
# can be found in the PATENTS file in the same directory.
"""
Use this script in order to build symmetric alignments for your translation
dataset.
This script depends on fast_align and mosesdecoder tools. You will need to
build those before running the script.
fast_align:
    github: http://github.com/clab/fast_align
    instructions: follow the instructions in README.md
mosesdecoder:
    github: http://github.com/moses-smt/mosesdecoder
    instructions: http://www.statmt.org/moses/?n=Development.GetStarted
The script produces the following files under --output_dir:
    text.joined - one line per sentence pair: the source_file line and the
    corresponding target_file line, joined as "source ||| target".
    align.forward - forward pass of fast_align.
    align.backward - backward pass of fast_align.
    aligned.sym_heuristic - symmetrized alignment.
"""

import argparse
import os
import subprocess
from itertools import zip_longest


def _write_joined_file(source_file, target_file, joined_file):
    """Write ``source ||| target`` sentence pairs to *joined_file*, one per line.

    Both inputs are read as UTF-8 and each line is stripped of surrounding
    whitespace before being joined.

    Raises:
        ValueError: if the two input files have a different number of lines.
    """
    with open(source_file, 'r', encoding='utf-8') as src, \
            open(target_file, 'r', encoding='utf-8') as tgt, \
            open(joined_file, 'w', encoding='utf-8') as joined:
        # zip_longest pads the shorter file with None; that is what lets us
        # detect a length mismatch instead of silently truncating the corpus.
        for line_no, (s, t) in enumerate(zip_longest(src, tgt), start=1):
            if s is None or t is None:
                raise ValueError(
                    'source and target files have a different number of '
                    'lines (mismatch at line {}): {} vs {}'.format(
                        line_no, source_file, target_file))
            print('{} ||| {}'.format(s.strip(), t.strip()), file=joined)


def _run_fast_align(fast_align_bin, joined_file, align_file, reverse=False):
    """Run fast_align on *joined_file*, saving its stdout to *align_file*.

    With ``reverse=True`` the ``-r`` flag is added (the backward pass).

    Raises:
        subprocess.CalledProcessError: if fast_align exits with a non-zero
            status.
    """
    cmd = [fast_align_bin, '-i', joined_file, '-d', '-o', '-v']
    if reverse:
        cmd.append('-r')
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact; stdout is redirected by handing over the file.
    with open(align_file, 'wb') as out:
        subprocess.run(cmd, stdout=out, check=True)


def main():
    """Build symmetric alignments for a parallel corpus.

    Parses the command line, writes the joined ``source ||| target`` file,
    runs the forward and backward fast_align passes and finally calls the
    mosesdecoder symmetrization script. All outputs go under ``--output_dir``,
    which is created if it does not exist.

    Raises:
        ValueError: if the source and target files differ in length.
        subprocess.CalledProcessError: if any external tool fails.
    """
    parser = argparse.ArgumentParser(description='symmetric alignment builder')
    # fmt: off
    parser.add_argument('--fast_align_dir', required=True,
                        help='path to fast_align build directory')
    parser.add_argument('--mosesdecoder_dir', required=True,
                        help='path to mosesdecoder root directory')
    parser.add_argument('--sym_heuristic',
                        help='heuristic to use for symmetrization',
                        default='grow-diag-final-and')
    parser.add_argument('--source_file', required=True,
                        help='path to a file with sentences '
                             'in the source language')
    parser.add_argument('--target_file', required=True,
                        help='path to a file with sentences '
                             'in the target language')
    parser.add_argument('--output_dir', required=True,
                        help='output directory')
    # fmt: on
    args = parser.parse_args()

    fast_align_bin = os.path.join(args.fast_align_dir, 'fast_align')
    symal_bin = os.path.join(args.mosesdecoder_dir, 'bin', 'symal')
    sym_fast_align_bin = os.path.join(
        args.mosesdecoder_dir, 'scripts', 'ems',
        'support', 'symmetrize-fast-align.perl')

    os.makedirs(args.output_dir, exist_ok=True)

    # create joined file
    joined_file = os.path.join(args.output_dir, 'text.joined')
    _write_joined_file(args.source_file, args.target_file, joined_file)

    # NOTE: the external commands are deliberately not wrapped in ``assert``:
    # under ``python -O`` assert statements are stripped, which would skip
    # running the tools altogether.

    # run forward alignment
    fwd_align_file = os.path.join(args.output_dir, 'align.forward')
    _run_fast_align(fast_align_bin, joined_file, fwd_align_file)

    # run backward alignment
    bwd_align_file = os.path.join(args.output_dir, 'align.backward')
    _run_fast_align(fast_align_bin, joined_file, bwd_align_file, reverse=True)

    # run symmetrization
    sym_out_file = os.path.join(args.output_dir, 'aligned')
    subprocess.run(
        [
            sym_fast_align_bin,
            fwd_align_file,
            bwd_align_file,
            args.source_file,
            args.target_file,
            sym_out_file,
            args.sym_heuristic,
            symal_bin,
        ],
        check=True,
    )


# Run the alignment pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()