chenpangpang / transformers · Commits

Commit 8bd6b235, authored Nov 03, 2018 by VictorSanh
Parent: 2c55568c

typo on tokenization
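The commit message understates the change slightly: the script imported the TensorFlow-style module name tokenization, while this PyTorch port ships the module as tokenization_pytorch, so every reference is renamed. As a minimal sketch (assuming tokenization_pytorch.py sits on sys.path next to the script), the rename can be sanity-checked by confirming the module exposes the four helpers the diff below relies on:

    # Sketch of a sanity check, not part of the commit: verify that
    # tokenization_pytorch exposes the helpers run_squad_pytorch.py calls.
    import importlib

    mod = importlib.import_module("tokenization_pytorch")
    for name in ("printable_text", "whitespace_tokenize",
                 "BasicTokenizer", "FullTokenizer"):
        assert hasattr(mod, name), "missing helper: %s" % name
    print("tokenization_pytorch exposes all helpers used by the script")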
Showing 1 changed file with 8 additions and 8 deletions.

run_squad_pytorch.py (+8 / -8)
run_squad_pytorch.py (view file @ 8bd6b235)

@@ -23,7 +23,7 @@ import logging
 import json
 import math
 import os
-import tokenization
+import tokenization_pytorch
 import six
 import argparse
@@ -62,9 +62,9 @@ class SquadExample(object):
     def __repr__(self):
         s = ""
-        s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
+        s += "qas_id: %s" % (tokenization_pytorch.printable_text(self.qas_id))
         s += ", question_text: %s" % (
-            tokenization.printable_text(self.question_text))
+            tokenization_pytorch.printable_text(self.question_text))
         s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
         if self.start_position:
             s += ", start_position: %d" % (self.start_position)
@@ -153,7 +153,7 @@ def read_squad_examples(input_file, is_training):
                 # guaranteed to be preserved.
                 actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
                 cleaned_answer_text = " ".join(
-                    tokenization.whitespace_tokenize(orig_answer_text))
+                    tokenization_pytorch.whitespace_tokenize(orig_answer_text))
                 if actual_text.find(cleaned_answer_text) == -1:
                     logger.warning("Could not find answer: '%s' vs. '%s'",
                                    actual_text, cleaned_answer_text)
@@ -287,7 +287,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
            logger.info("example_index: %s" % (example_index))
            logger.info("doc_span_index: %s" % (doc_span_index))
            logger.info("tokens: %s" % " ".join(
-                [tokenization.printable_text(x) for x in tokens]))
+                [tokenization_pytorch.printable_text(x) for x in tokens]))
            logger.info("token_to_orig_map: %s" % " ".join(
                ["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)]))
            logger.info("token_is_max_context: %s" % " ".join([
@@ -303,7 +303,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                logger.info("start_position: %d" % (start_position))
                logger.info("end_position: %d" % (end_position))
                logger.info(
-                    "answer: %s" % (tokenization.printable_text(answer_text)))
+                    "answer: %s" % (tokenization_pytorch.printable_text(answer_text)))
        features.append(
            InputFeatures(
@@ -579,7 +579,7 @@ def get_final_text(pred_text, orig_text, do_lower_case):
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.

-    tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
+    tokenizer = tokenization_pytorch.BasicTokenizer(do_lower_case=do_lower_case)

    tok_text = " ".join(tokenizer.tokenize(orig_text))
@@ -780,7 +780,7 @@ def main():
        raise ValueError("Output directory () already exists and is not empty.")
    os.makedirs(args.output_dir, exist_ok=True)

-    tokenizer = tokenization.FullTokenizer(
+    tokenizer = tokenization_pytorch.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    train_examples = None
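For context, a short sketch of how the renamed module is used end to end after this commit. The FullTokenizer signature (vocab_file, do_lower_case) and the tokenize() call are exactly the patterns that appear in the diff above; the vocabulary path here is a placeholder, not something from the commit:

    # Sketch of tokenizer usage after the rename; vocab.txt stands in for
    # a real BERT vocabulary file.
    import tokenization_pytorch

    tokenizer = tokenization_pytorch.FullTokenizer(
        vocab_file="vocab.txt", do_lower_case=True)

    # Same call pattern get_final_text() applies to orig_text.
    print(" ".join(tokenizer.tokenize("Who authored this commit?")))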