Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Fairseq
Commits
7333d04d
Commit
7333d04d
authored
Sep 25, 2017
by
Myle Ott
Browse files
Support configurable BPE symbol
parent
59d599a2
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
6 additions
and
7 deletions
+6
-7
fairseq/options.py
fairseq/options.py
+1
-1
generate.py
generate.py
+5
-6
No files found.
fairseq/options.py
View file @
7333d04d
...
...
@@ -91,7 +91,7 @@ def add_generation_args(parser):
group
.
add_argument
(
'--max-len-b'
,
default
=
200
,
type
=
int
,
metavar
=
'N'
,
help
=
(
'generate sequence of maximum length ax + b, '
'where x is the source length'
))
group
.
add_argument
(
'--remove-bpe'
,
action
=
'store_true'
,
group
.
add_argument
(
'--remove-bpe'
,
nargs
=
'?'
,
const
=
'@@ '
,
default
=
None
,
help
=
'remove BPE tokens before scoring'
)
group
.
add_argument
(
'--no-early-stop'
,
action
=
'store_true'
,
help
=
(
'continue searching even after finalizing k=beam '
...
...
generate.py
View file @
7333d04d
...
...
@@ -84,19 +84,18 @@ def main():
hypo_tokens
[
i
]
=
src_token
return
' '
.
join
(
hypo_tokens
)
bpe_symbol
=
'@@ '
if
args
.
remove_bpe
else
None
def
display_hypotheses
(
id
,
src
,
orig
,
ref
,
hypos
):
if
args
.
quiet
:
return
id_str
=
''
if
id
is
None
else
'-{}'
.
format
(
id
)
src_str
=
dataset
.
src_dict
.
string
(
src
,
bpe_symbol
)
src_str
=
dataset
.
src_dict
.
string
(
src
,
args
.
remove_bpe
)
print
(
'S{}
\t
{}'
.
format
(
id_str
,
src_str
))
if
orig
is
not
None
:
print
(
'O{}
\t
{}'
.
format
(
id_str
,
orig
.
strip
()))
if
ref
is
not
None
:
print
(
'T{}
\t
{}'
.
format
(
id_str
,
dataset
.
dst_dict
.
string
(
ref
,
bpe_symbol
,
escape_unk
=
True
)))
print
(
'T{}
\t
{}'
.
format
(
id_str
,
dataset
.
dst_dict
.
string
(
ref
,
args
.
remove_bpe
,
escape_unk
=
True
)))
for
hypo
in
hypos
:
hypo_str
=
dataset
.
dst_dict
.
string
(
hypo
[
'tokens'
],
bpe_symbol
)
hypo_str
=
dataset
.
dst_dict
.
string
(
hypo
[
'tokens'
],
args
.
remove_bpe
)
align_str
=
' '
.
join
(
map
(
str
,
hypo
[
'alignment'
]))
if
args
.
unk_replace_dict
!=
''
:
hypo_str
=
replace_unk
(
hypo_str
,
align_str
,
orig
,
dataset
.
dst_dict
.
unk_string
())
...
...
@@ -118,10 +117,10 @@ def main():
else
:
def
maybe_remove_bpe
(
tokens
):
"""Helper for removing BPE symbols from a hypothesis."""
if
not
args
.
remove_bpe
:
if
args
.
remove_bpe
is
None
:
return
tokens
assert
(
tokens
==
dataset
.
dst_dict
.
pad
()).
sum
()
==
0
hypo_minus_bpe
=
dataset
.
dst_dict
.
string
(
tokens
,
bpe_symbol
)
hypo_minus_bpe
=
dataset
.
dst_dict
.
string
(
tokens
,
args
.
remove_bpe
)
return
tokenizer
.
Tokenizer
.
tokenize
(
hypo_minus_bpe
,
dataset
.
dst_dict
,
add_if_not_exist
=
True
)
# Generate and compute BLEU score
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment