chenpangpang / transformers / Commits / 72e506b2

Commit 72e506b2 ("wip")
Authored Nov 19, 2019 by Lysandre; committed Nov 22, 2019 by LysandreJik
Parent: ea52f824

Showing 6 changed files with 157 additions and 5 deletions (+157 / -5)
examples/run_squad.py                      +27   -2
transformers/__init__.py                    +2   -1
transformers/data/__init__.py               +1   -1
transformers/data/processors/__init__.py    +1   -1
transformers/data/processors/squad.py      +122   -0
transformers/tokenization_utils.py          +4   -0
examples/run_squad.py

@@ -23,7 +23,6 @@ import os
 import random
 import glob
 import timeit

 import numpy as np
 import torch
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,

@@ -45,7 +44,7 @@ from transformers import (WEIGHTS_NAME, BertConfig,
                           XLNetTokenizer,
                           DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)

-from transformers import AdamW, get_linear_schedule_with_warmup
+from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features, read_squad_examples as sread_squad_examples

 from utils_squad import (read_squad_examples, convert_examples_to_features,
                          RawResult, write_predictions,

@@ -309,6 +308,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
         examples = read_squad_examples(input_file=input_file,
                                        is_training=not evaluate,
                                        version_2_with_negative=args.version_2_with_negative)
+        examples = examples[:10]
+
         features = convert_examples_to_features(examples=examples,
                                                 tokenizer=tokenizer,
                                                 max_seq_length=args.max_seq_length,

@@ -319,6 +320,30 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
                                                 pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0,
                                                 cls_token_at_end=True if args.model_type in ['xlnet'] else False,
                                                 sequence_a_is_doc=True if args.model_type in ['xlnet'] else False)
+
+        exampless = sread_squad_examples(input_file=input_file,
+                                         is_training=not evaluate,
+                                         version_2_with_negative=args.version_2_with_negative)
+        exampless = exampless[:10]
+        features2 = squad_convert_examples_to_features(examples=exampless,
+                                                       tokenizer=tokenizer,
+                                                       max_seq_length=args.max_seq_length,
+                                                       doc_stride=args.doc_stride,
+                                                       max_query_length=args.max_query_length,
+                                                       is_training=not evaluate,
+                                                       cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
+                                                       pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0,
+                                                       cls_token_at_end=True if args.model_type in ['xlnet'] else False,
+                                                       sequence_a_is_doc=True if args.model_type in ['xlnet'] else False)
+
+        print(features2)
+
+        for i in range(len(features)):
+            assert features[i] == features2[i]
+            print("Equal")
+
+        print("DONE")
+
     if args.local_rank in [-1, 0]:
         logger.info("Saving features into cached file %s", cached_features_file)
         torch.save(features, cached_features_file)
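The added block in load_and_cache_examples is a temporary parity check: the first ten SQuAD examples are run through both the legacy utils_squad pipeline and the new transformers processors, and the resulting feature lists are asserted equal element by element. That assertion only holds if the feature objects implement value-based equality; below is a minimal sketch of what such a field-by-field check could look like. The helper name and the selected field names are illustrative assumptions, not part of this commit.

# Illustrative only: a field-by-field comparison in the spirit of the
# `assert features[i] == features2[i]` check added above. The field names
# (`input_ids`, `start_position`, `end_position`) are assumptions about
# the feature objects, not taken from this diff.
def features_match(old_feature, new_feature,
                   fields=("input_ids", "start_position", "end_position")):
    """Return True if the two feature objects agree on the given fields."""
    return all(getattr(old_feature, f, None) == getattr(new_feature, f, None)
               for f in fields)

# Hypothetical usage, mirroring the loop added in run_squad.py:
# for old, new in zip(features, features2):
#     assert features_match(old, new)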
transformers/__init__.py

@@ -26,7 +26,8 @@ from .data import (is_sklearn_available,
                    InputExample, InputFeatures, DataProcessor,
                    glue_output_modes, glue_convert_examples_to_features,
                    glue_processors, glue_tasks_num_labels,
-                   squad_convert_examples_to_features, SquadFeatures)
+                   squad_convert_examples_to_features, SquadFeatures,
+                   SquadExample, read_squad_examples)

 if is_sklearn_available():
     from .data import glue_compute_metrics
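After this re-export, the same names that run_squad.py pulls in above resolve from the package root. A minimal import check, using nothing beyond what the diff exports:

# Minimal sketch: verify the names re-exported by this commit resolve
# from the package root; no data is processed here.
from transformers import (squad_convert_examples_to_features,
                          SquadExample,
                          SquadFeatures,
                          read_squad_examples)

# First line of the SquadExample docstring added in processors/squad.py
print(SquadExample.__doc__.strip().splitlines()[0])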
transformers/data/__init__.py

 from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures
 from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
-from .processors import squad_convert_examples_to_features
+from .processors import squad_convert_examples_to_features, SquadExample, read_squad_examples

 from .metrics import is_sklearn_available

 if is_sklearn_available():
transformers/data/processors/__init__.py

 from .utils import InputExample, InputFeatures, DataProcessor
 from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
-from .squad import squad_convert_examples_to_features, SquadFeatures
+from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, read_squad_examples
transformers/data/processors/squad.py

@@ -2,7 +2,9 @@ from tqdm import tqdm
 import collections
 import logging
 import os
+import json

+from ...tokenization_bert import BasicTokenizer, whitespace_tokenize
 from .utils import DataProcessor, InputExample, InputFeatures
 from ...file_utils import is_tf_available

@@ -11,6 +13,7 @@ if is_tf_available():
 logger = logging.getLogger(__name__)

 def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                                        doc_stride, max_query_length, is_training,
                                        cls_token_at_end=False,

@@ -265,6 +268,125 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
     return features
+
+
+def read_squad_examples(input_file, is_training, version_2_with_negative):
+    """Read a SQuAD json file into a list of SquadExample."""
+    with open(input_file, "r", encoding='utf-8') as reader:
+        input_data = json.load(reader)["data"]
+
+    def is_whitespace(c):
+        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
+            return True
+        return False
+
+    examples = []
+    for entry in input_data:
+        for paragraph in entry["paragraphs"]:
+            paragraph_text = paragraph["context"]
+            doc_tokens = []
+            char_to_word_offset = []
+            prev_is_whitespace = True
+            for c in paragraph_text:
+                if is_whitespace(c):
+                    prev_is_whitespace = True
+                else:
+                    if prev_is_whitespace:
+                        doc_tokens.append(c)
+                    else:
+                        doc_tokens[-1] += c
+                    prev_is_whitespace = False
+                char_to_word_offset.append(len(doc_tokens) - 1)
+
+            for qa in paragraph["qas"]:
+                qas_id = qa["id"]
+                question_text = qa["question"]
+                start_position = None
+                end_position = None
+                orig_answer_text = None
+                is_impossible = False
+                if is_training:
+                    if version_2_with_negative:
+                        is_impossible = qa["is_impossible"]
+                    if (len(qa["answers"]) != 1) and (not is_impossible):
+                        raise ValueError(
+                            "For training, each question should have exactly 1 answer.")
+                    if not is_impossible:
+                        answer = qa["answers"][0]
+                        orig_answer_text = answer["text"]
+                        answer_offset = answer["answer_start"]
+                        answer_length = len(orig_answer_text)
+                        start_position = char_to_word_offset[answer_offset]
+                        end_position = char_to_word_offset[answer_offset + answer_length - 1]
+                        # Only add answers where the text can be exactly recovered from the
+                        # document. If this CAN'T happen it's likely due to weird Unicode
+                        # stuff so we will just skip the example.
+                        #
+                        # Note that this means for training mode, every example is NOT
+                        # guaranteed to be preserved.
+                        actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
+                        cleaned_answer_text = " ".join(
+                            whitespace_tokenize(orig_answer_text))
+                        if actual_text.find(cleaned_answer_text) == -1:
+                            logger.warning("Could not find answer: '%s' vs. '%s'",
+                                           actual_text, cleaned_answer_text)
+                            continue
+                    else:
+                        start_position = -1
+                        end_position = -1
+                        orig_answer_text = ""
+
+                example = SquadExample(
+                    qas_id=qas_id,
+                    question_text=question_text,
+                    doc_tokens=doc_tokens,
+                    orig_answer_text=orig_answer_text,
+                    start_position=start_position,
+                    end_position=end_position,
+                    is_impossible=is_impossible)
+                examples.append(example)
+    return examples
+
+
+class SquadExample(object):
+    """
+    A single training/test example for the Squad dataset.
+    For examples without an answer, the start and end position are -1.
+    """
+
+    def __init__(self,
+                 qas_id,
+                 question_text,
+                 doc_tokens,
+                 orig_answer_text=None,
+                 start_position=None,
+                 end_position=None,
+                 is_impossible=None):
+        self.qas_id = qas_id
+        self.question_text = question_text
+        self.doc_tokens = doc_tokens
+        self.orig_answer_text = orig_answer_text
+        self.start_position = start_position
+        self.end_position = end_position
+        self.is_impossible = is_impossible
+
+    def __str__(self):
+        return self.__repr__()
+
+    def __repr__(self):
+        s = ""
+        s += "qas_id: %s" % (self.qas_id)
+        s += ", question_text: %s" % (self.question_text)
+        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
+        if self.start_position:
+            s += ", start_position: %d" % (self.start_position)
+        if self.end_position:
+            s += ", end_position: %d" % (self.end_position)
+        if self.is_impossible:
+            s += ", is_impossible: %r" % (self.is_impossible)
+        return s
+
+
 class SquadFeatures(object):
     """A single set of features of data."""
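The read_squad_examples function added here expects the standard SQuAD JSON layout ({"data": [{"paragraphs": [{"context": ..., "qas": [...]}]}]}) and turns each question into a SquadExample whose start_position and end_position index into the whitespace-derived doc_tokens. A self-contained usage sketch with a tiny made-up dataset (the paragraph text, question, and temporary file are illustrative only):

import json
import tempfile

from transformers import read_squad_examples  # re-exported by this commit

# Hypothetical one-paragraph, one-question SQuAD v2-style dataset.
tiny_squad = {
    "data": [{
        "paragraphs": [{
            "context": "The library was released in 2019.",
            "qas": [{
                "id": "q1",
                "question": "When was the library released?",
                "is_impossible": False,
                "answers": [{"text": "2019", "answer_start": 28}],
            }],
        }],
    }],
}

# Write the toy dataset to a temporary file, since the reader takes a path.
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(tiny_squad, f)
    path = f.name

examples = read_squad_examples(input_file=path, is_training=True,
                               version_2_with_negative=True)
# SquadExample.__repr__ prints qas_id, question_text, doc_tokens and the
# answer's start/end positions (here the answer "2019" maps to token 5).
print(examples[0])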
transformers/tokenization_utils.py

@@ -605,6 +605,10 @@ class PreTrainedTokenizer(object):
             vocabularies (BPE/SentencePieces/WordPieces).
             Take care of added tokens.
+
+            text: The sequence to be encoded.
+            return_tokens_mapped_to_origin: (optional) Set to True to return the index of each token in the initial whitespace tokenization. (default False).
+            **kwargs: passed to the child `self.tokenize()` method
         """
         def split_on_token(tok, text):
             result = []
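This hunk only documents a return_tokens_mapped_to_origin flag in the PreTrainedTokenizer.tokenize docstring; the implementation is not visible in this wip diff. The sketch below therefore only shows how the documented behaviour might be exercised, assuming the flag is eventually accepted as a keyword argument.

# Sketch only: `return_tokens_mapped_to_origin` is documented but not
# shown implemented in this commit, so the call using it is left commented.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Plain tokenization, unaffected by this change:
tokens = tokenizer.tokenize("New York City")

# Hypothetical call returning, per the new docstring, the index of each
# token in the initial whitespace tokenization:
# tokens, origin_index = tokenizer.tokenize("New York City",
#                                           return_tokens_mapped_to_origin=True)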