Support configurable BPE symbol

7333d04d · Myle Ott · 59d599a2 · 7333d04d · 7333d04d
Commit 7333d04d authored Sep 25, 2017 by Myle Ott
Hide whitespace changes
Inline Side-by-side

Showing with 6 additions and 7 deletions

fairseq/options.py +1 -1

generate.py +5 -6

No files found.
--- a/fairseq/options.py
+++ b/fairseq/options.py
@@ -91,7 +91,7 @@ def add_generation_args(parser):
    group.add_argument('--max-len-b', default=200, type=int, metavar='N',
                       help=('generate sequence of maximum length ax + b, '
                             'where x is the source length'))
-    group.add_argument('--remove-bpe', action='store_true',
+    group.add_argument('--remove-bpe', nargs='?', const='@@ ', default=None,
                       help='remove BPE tokens before scoring')
    group.add_argument('--no-early-stop', action='store_true',
                       help=('continue searching even after finalizing k=beam '

--- a/generate.py
+++ b/generate.py
@@ -84,19 +84,18 @@ def main():
                    hypo_tokens[i] = src_token
        return ' '.join(hypo_tokens)

-    bpe_symbol = '@@ ' if args.remove_bpe else None
    def display_hypotheses(id, src, orig, ref, hypos):
        if args.quiet:
            return
        id_str = '' if id is None else '-{}'.format(id)
-        src_str = dataset.src_dict.string(src, bpe_symbol)
+        src_str = dataset.src_dict.string(src, args.remove_bpe)
        print('S{}\t{}'.format(id_str, src_str))
        if orig is not None:
            print('O{}\t{}'.format(id_str, orig.strip()))
        if ref is not None:
-            print('T{}\t{}'.format(id_str, dataset.dst_dict.string(ref, bpe_symbol, escape_unk=True)))
+            print('T{}\t{}'.format(id_str, dataset.dst_dict.string(ref, args.remove_bpe, escape_unk=True)))
        for hypo in hypos:
-            hypo_str = dataset.dst_dict.string(hypo['tokens'], bpe_symbol)
+            hypo_str = dataset.dst_dict.string(hypo['tokens'], args.remove_bpe)
            align_str = ' '.join(map(str, hypo['alignment']))
            if args.unk_replace_dict != '':
                hypo_str = replace_unk(hypo_str, align_str, orig, dataset.dst_dict.unk_string())
@@ -118,10 +117,10 @@ def main():
    else:
        def maybe_remove_bpe(tokens):
            """Helper for removing BPE symbols from a hypothesis."""
-            if not args.remove_bpe:
+            if args.remove_bpe is None:
                return tokens
            assert (tokens == dataset.dst_dict.pad()).sum() == 0
-            hypo_minus_bpe = dataset.dst_dict.string(tokens, bpe_symbol)
+            hypo_minus_bpe = dataset.dst_dict.string(tokens, args.remove_bpe)
            return tokenizer.Tokenizer.tokenize(hypo_minus_bpe, dataset.dst_dict, add_if_not_exist=True)

        # Generate and compute BLEU score