Commit 7333d04d authored by Myle Ott
Browse files

Support configurable BPE symbol

parent 59d599a2
......@@ -91,7 +91,7 @@ def add_generation_args(parser):
group.add_argument('--max-len-b', default=200, type=int, metavar='N',
help=('generate sequence of maximum length ax + b, '
'where x is the source length'))
group.add_argument('--remove-bpe', action='store_true',
group.add_argument('--remove-bpe', nargs='?', const='@@ ', default=None,
help='remove BPE tokens before scoring')
group.add_argument('--no-early-stop', action='store_true',
help=('continue searching even after finalizing k=beam '
......
......@@ -84,19 +84,18 @@ def main():
hypo_tokens[i] = src_token
return ' '.join(hypo_tokens)
bpe_symbol = '@@ ' if args.remove_bpe else None
def display_hypotheses(id, src, orig, ref, hypos):
if args.quiet:
return
id_str = '' if id is None else '-{}'.format(id)
src_str = dataset.src_dict.string(src, bpe_symbol)
src_str = dataset.src_dict.string(src, args.remove_bpe)
print('S{}\t{}'.format(id_str, src_str))
if orig is not None:
print('O{}\t{}'.format(id_str, orig.strip()))
if ref is not None:
print('T{}\t{}'.format(id_str, dataset.dst_dict.string(ref, bpe_symbol, escape_unk=True)))
print('T{}\t{}'.format(id_str, dataset.dst_dict.string(ref, args.remove_bpe, escape_unk=True)))
for hypo in hypos:
hypo_str = dataset.dst_dict.string(hypo['tokens'], bpe_symbol)
hypo_str = dataset.dst_dict.string(hypo['tokens'], args.remove_bpe)
align_str = ' '.join(map(str, hypo['alignment']))
if args.unk_replace_dict != '':
hypo_str = replace_unk(hypo_str, align_str, orig, dataset.dst_dict.unk_string())
......@@ -118,10 +117,10 @@ def main():
else:
def maybe_remove_bpe(tokens):
"""Helper for removing BPE symbols from a hypothesis."""
if not args.remove_bpe:
if args.remove_bpe is None:
return tokens
assert (tokens == dataset.dst_dict.pad()).sum() == 0
hypo_minus_bpe = dataset.dst_dict.string(tokens, bpe_symbol)
hypo_minus_bpe = dataset.dst_dict.string(tokens, args.remove_bpe)
return tokenizer.Tokenizer.tokenize(hypo_minus_bpe, dataset.dst_dict, add_if_not_exist=True)
# Generate and compute BLEU score
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment