chenpangpang / transformers
Commit 9ecd83da, authored Dec 05, 2019 by LysandreJik

Patch evaluation for impossible values + cleanup

parent ce158a07
Showing 4 changed files with 11 additions and 26 deletions (+11, -26)
docs/source/main_classes/processors.rst    +2   -2
examples/run_squad.py                      +5  -20
transformers/data/processors/squad.py      +3   -3
transformers/tokenization_utils.py         +1   -1
docs/source/main_classes/processors.rst

@@ -55,7 +55,7 @@ Example usage
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
 An example using these processors is given in the
-`run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_glue.py>`__ script.
+`run_glue.py <https://github.com/huggingface/transformers/blob/master/examples/run_glue.py>`__ script.
@@ -132,4 +132,4 @@ Example::
 
 Another example using these processors is given in the
-`run_squad.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/run_squad.py>`__ script.
\ No newline at end of file
+`run_squad.py <https://github.com/huggingface/transformers/blob/master/examples/run_squad.py>`__ script.
\ No newline at end of file
examples/run_squad.py

@@ -311,7 +311,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
         str(args.max_seq_length)))
 
     if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
         logger.info("Loading features from cached file %s", cached_features_file)
-        features = torch.load(cached_features_file)
+        features_and_dataset = torch.load(cached_features_file)
+        features, dataset = features_and_dataset["features"], features_and_dataset["dataset"]
     else:
         logger.info("Creating features from dataset file at %s", input_dir)
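After this change, the cache file holds a dict carrying both the feature objects and the prebuilt dataset, rather than a bare features list. As a minimal sketch, reading either cache generation could look like this (the file name is a hypothetical example of the script's cached_features_file naming scheme):

    import torch

    # Hypothetical cache file name in the style of run_squad.py's cached_features_file.
    cached_features_file = "cached_dev_bert-base-uncased_384"

    blob = torch.load(cached_features_file)
    if isinstance(blob, dict) and "features" in blob:
        # New format written by this commit: features and dataset in one dict.
        features, dataset = blob["features"], blob["dataset"]
    else:
        # Old format: a bare features list; the dataset had to be rebuilt by hand.
        features, dataset = blob, None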
@@ -330,40 +331,24 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
         processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
         examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
 
-        features = squad_convert_examples_to_features(
+        features, dataset = squad_convert_examples_to_features(
             examples=examples,
             tokenizer=tokenizer,
             max_seq_length=args.max_seq_length,
             doc_stride=args.doc_stride,
             max_query_length=args.max_query_length,
             is_training=not evaluate,
+            return_dataset='pt'
         )
 
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
-            torch.save(features, cached_features_file)
+            torch.save({"features": features, "dataset": dataset}, cached_features_file)
 
     if args.local_rank == 0 and not evaluate:
         torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
 
-    # Convert to Tensors and build dataset
-    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
-    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
-    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
-
-    if evaluate:
-        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                all_example_index, all_cls_index, all_p_mask)
-    else:
-        all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
-        all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
-        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                all_start_positions, all_end_positions,
-                                all_cls_index, all_p_mask)
 
     if output_examples:
         return dataset, examples, features
     return dataset
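The cleanup above relies on squad_convert_examples_to_features returning the ready-made PyTorch TensorDataset alongside the features when return_dataset='pt' is passed, which is what makes the manual torch.tensor(...)/TensorDataset(...) assembly removable. A minimal standalone sketch of the new call shape, with import paths as of this era of the library and placeholder model name, data directory, and hyperparameter values:

    from transformers import (BertTokenizer, SquadV2Processor,
                              squad_convert_examples_to_features)

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    examples = SquadV2Processor().get_dev_examples("/path/to/squad")  # placeholder path

    # One call now yields both the feature objects and a ready TensorDataset.
    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=384,   # placeholder values echoing common script defaults
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset='pt',
    )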
transformers/data/processors/squad.py

@@ -312,7 +312,7 @@ class SquadProcessor(DataProcessor):
         if not evaluate:
             answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8')
             answer_start = tensor_dict['answers']['answer_start'][0].numpy()
-            answers = None
+            answers = []
         else:
             answers = [{"answer_start": start.numpy(),
@@ -408,7 +408,7 @@ class SquadProcessor(DataProcessor):
                     question_text = qa["question"]
                     start_position_character = None
                     answer_text = None
-                    answers = None
+                    answers = []
 
                     if "is_impossible" in qa:
                         is_impossible = qa["is_impossible"]
@@ -469,7 +469,7 @@ class SquadExample(object):
                  answer_text,
                  start_position_character,
                  title,
-                 answers=None,
+                 answers=[],
                  is_impossible=False):
         self.qas_id = qas_id
         self.question_text = question_text
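The evaluation patch for impossible values is the switch from answers = None to answers = []: unanswerable SQuAD v2 questions now carry an empty answer list, so downstream code can iterate over answers unconditionally instead of special-casing None. A standalone illustration of the failure mode this avoids, assuming SQuAD-style answer dicts with "text" keys (not library code):

    def gold_answer_texts(answers):
        # Evaluation-style consumers loop over the answer dicts unconditionally.
        return [a["text"] for a in answers]

    print(gold_answer_texts([]))   # [] -- impossible question under the new convention
    try:
        gold_answer_texts(None)    # the old convention
    except TypeError as err:
        print(err)                 # 'NoneType' object is not iterable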
transformers/tokenization_utils.py

@@ -194,7 +194,7 @@ class PreTrainedTokenizer(object):
     @property
     def pad_token_type_id(self):
-        """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """
+        """ Id of the padding token type in the vocabulary."""
         return self._pad_token_type_id
 
     @property
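For reference, pad_token_type_id is the segment id written into token_type_ids at padded positions, distinct from pad_token_id (the padding token's vocabulary id); the old docstring appears to have been copied from the latter. A minimal sketch of the corrected property pattern on a stub class (not the library's implementation):

    class TokenizerStub:
        def __init__(self):
            self._pad_token_type_id = 0  # segment id used for padded positions

        @property
        def pad_token_type_id(self):
            """ Id of the padding token type in the vocabulary."""
            return self._pad_token_type_id

    print(TokenizerStub().pad_token_type_id)  # 0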