chenpangpang / transformers / Commits

Commit 8e9526b4
authored Dec 14, 2019 by erenup

    add multiple processing

parent 9b312f9d
Showing 2 changed files with 187 additions and 160 deletions (+187 -160):

    examples/run_squad.py                    +4   -1
    transformers/data/processors/squad.py    +183 -159
examples/run_squad.py

@@ -360,7 +360,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
             doc_stride=args.doc_stride,
             max_query_length=args.max_query_length,
             is_training=not evaluate,
-            return_dataset='pt'
+            return_dataset='pt',
+            threads=args.threads,
         )

     if args.local_rank in [-1, 0]:
@@ -478,6 +479,8 @@ def main():
                              "See details at https://nvidia.github.io/apex/amp.html")
     parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
     parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--threads', type=int, default=1,
+                        help='multiple threads for converting example to features')
     args = parser.parse_args()

     if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
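The new `--threads` flag defaults to 1, so single-process behavior is unchanged unless the user opts in. Below is a minimal sketch (not part of the commit) of how the flag flows from the command line into feature conversion; the final clamp mirrors the `min(threads, cpu_count())` logic added to `squad_convert_examples_to_features` in the second file.

import argparse
from multiprocessing import cpu_count

parser = argparse.ArgumentParser()
parser.add_argument('--threads', type=int, default=1,
                    help='multiple threads for converting example to features')
args = parser.parse_args(['--threads', '4'])  # simulates `python run_squad.py --threads 4`

# load_and_cache_examples forwards threads=args.threads to
# squad_convert_examples_to_features, which clamps the worker count:
print(min(args.threads, cpu_count()))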
transformers/data/processors/squad.py

@@ -4,6 +4,9 @@ import logging
 import os
 import json
 import numpy as np
+from multiprocessing import Pool
+from multiprocessing import cpu_count
+from functools import partial

 from ...tokenization_bert import BasicTokenizer, whitespace_tokenize
 from .utils import DataProcessor, InputExample, InputFeatures
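These three imports carry the whole mechanism of the commit: a `Pool` whose workers are primed once by an initializer, `cpu_count()` to cap the worker count, and `partial` to freeze the per-example keyword arguments. A self-contained sketch of that pattern follows (illustrative names only, not the commit's code):

from functools import partial
from multiprocessing import Pool, cpu_count

def _init(shared):
    # Runs once in each worker process; installs the expensive shared object
    # (the tokenizer, in this commit) as a module-level global.
    global _shared
    _shared = shared

def _work(item, scale=1):
    # Stand-in worker; reads the global installed by _init.
    return item * scale, _shared

if __name__ == '__main__':
    threads = min(4, cpu_count())
    with Pool(threads, initializer=_init, initargs=('tok',)) as p:
        fn = partial(_work, scale=10)  # freeze keyword arguments
        print(list(p.imap(fn, range(5), chunksize=2)))

Passing the shared object through `initargs` means it is pickled once per worker rather than once per task, which matters when the tokenizer is large relative to a single example.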
@@ -76,47 +79,9 @@ def _is_whitespace(c):
         return True
     return False

-def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
-                                       doc_stride, max_query_length, is_training,
-                                       return_dataset=False):
-    """
-    Converts a list of examples into a list of features that can be directly given as input to a model.
-    It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.
-
-    Args:
-        examples: list of :class:`~transformers.data.processors.squad.SquadExample`
-        tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
-        max_seq_length: The maximum sequence length of the inputs.
-        doc_stride: The stride used when the context is too large and is split across several features.
-        max_query_length: The maximum length of the query.
-        is_training: whether to create features for model evaluation or model training.
-        return_dataset: Default False. Either 'pt' or 'tf'.
-            if 'pt': returns a torch.data.TensorDataset,
-            if 'tf': returns a tf.data.Dataset
-
-    Returns:
-        list of :class:`~transformers.data.processors.squad.SquadFeatures`
-
-    Example::
-
-        processor = SquadV2Processor()
-        examples = processor.get_dev_examples(data_dir)
-
-        features = squad_convert_examples_to_features(
-            examples=examples,
-            tokenizer=tokenizer,
-            max_seq_length=args.max_seq_length,
-            doc_stride=args.doc_stride,
-            max_query_length=args.max_query_length,
-            is_training=not evaluate,
-        )
-    """
-
-    # Defining helper methods
-    unique_id = 1000000000
+def squad_convert_example_to_features(example, max_seq_length,
+                                      doc_stride, max_query_length, is_training):
     features = []
-    for (example_index, example) in enumerate(tqdm(examples)):
     if is_training and not example.is_impossible:
         # Get start and end position
         start_position = example.start_position
@@ -127,8 +92,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
         if actual_text.find(cleaned_answer_text) == -1:
             logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
-            continue
+            return []

     tok_to_orig_index = []
     orig_to_tok_index = []
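The `continue` has to become `return []` because the loop body is now a standalone function handed to the pool: a worker cannot `continue` the caller's loop, so it signals "skip this example" with an empty result that the parent filters out afterwards. A toy illustration with hypothetical names:

def convert_one(example):
    # Was `continue` when this code lived inside a loop over all examples.
    if example.get('bad'):
        return []
    return [example['text'].upper()]

examples = [{'text': 'a'}, {'bad': True, 'text': 'b'}, {'text': 'c'}]
features = [f for ex in examples for f in convert_one(ex)]
print(features)  # ['A', 'C']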
@@ -171,7 +135,8 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
             truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first'
         )

-        paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens)
+        paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride,
+                            max_seq_length - len(truncated_query) - sequence_pair_added_tokens)

         if tokenizer.pad_token_id in encoded_dict['input_ids']:
             non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)]
@@ -202,7 +167,8 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
     for doc_span_index in range(len(spans)):
         for j in range(spans[doc_span_index]["paragraph_len"]):
             is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
-            index = j if tokenizer.padding_side == "left" else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
+            index = j if tokenizer.padding_side == "left" \
+                else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
             spans[doc_span_index]["token_is_max_context"][index] = is_max_context

     for span in spans:
@@ -224,7 +190,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         # Set the CLS index to '0'
         p_mask[cls_index] = 0

-        span_is_impossible = example.is_impossible
         start_position = 0
         end_position = 0
@@ -251,16 +216,14 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                 start_position = tok_start_position - doc_start + doc_offset
                 end_position = tok_end_position - doc_start + doc_offset

         features.append(SquadFeatures(
             span['input_ids'],
             span['attention_mask'],
             span['token_type_ids'],
             cls_index,
             p_mask.tolist(),
-            example_index=example_index,
-            unique_id=unique_id,
+            example_index=0,
+            unique_id=0,
             paragraph_len=span['paragraph_len'],
             token_is_max_context=span["token_is_max_context"],
             tokens=span["tokens"],
@@ -269,9 +232,71 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
             start_position=start_position,
             end_position=end_position
         ))
+    return features
-        unique_id += 1
+
+
+def squad_convert_example_to_features_init(tokenizer_for_convert):
+    global tokenizer
+    tokenizer = tokenizer_for_convert
+
+
+def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
+                                       doc_stride, max_query_length, is_training,
+                                       return_dataset=False, threads=1):
+    """
+    Converts a list of examples into a list of features that can be directly given as input to a model.
+    It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.
+
+    Args:
+        examples: list of :class:`~transformers.data.processors.squad.SquadExample`
+        tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
+        max_seq_length: The maximum sequence length of the inputs.
+        doc_stride: The stride used when the context is too large and is split across several features.
+        max_query_length: The maximum length of the query.
+        is_training: whether to create features for model evaluation or model training.
+        return_dataset: Default False. Either 'pt' or 'tf'.
+            if 'pt': returns a torch.data.TensorDataset,
+            if 'tf': returns a tf.data.Dataset
+        threads: multiple processing threads
+
+    Returns:
+        list of :class:`~transformers.data.processors.squad.SquadFeatures`
+
+    Example::
+
+        processor = SquadV2Processor()
+        examples = processor.get_dev_examples(data_dir)
+
+        features = squad_convert_examples_to_features(
+            examples=examples,
+            tokenizer=tokenizer,
+            max_seq_length=args.max_seq_length,
+            doc_stride=args.doc_stride,
+            max_query_length=args.max_query_length,
+            is_training=not evaluate,
+        )
+    """
+
+    # Defining helper methods
+    features = []
+    threads = min(threads, cpu_count())
+    with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
+        annotate_ = partial(squad_convert_example_to_features, max_seq_length=max_seq_length,
+                            doc_stride=doc_stride, max_query_length=max_query_length,
+                            is_training=is_training)
+        features = list(tqdm(p.imap(annotate_, examples, chunksize=32), total=len(examples),
+                             desc='convert squad examples to features'))
+
+    new_features = []
+    unique_id = 1000000000
+    example_index = 0
+    for example_features in tqdm(features, total=len(features), desc='add example index and unique id'):
+        if not example_features:
+            continue
+        for example_feature in example_features:
+            example_feature.example_index = example_index
+            example_feature.unique_id = unique_id
+            new_features.append(example_feature)
+            unique_id += 1
+        example_index += 1
+    features = new_features
+    del new_features
+
     if return_dataset == 'pt':
         if not is_torch_available():
             raise ImportError("Pytorch must be installed to return a pytorch dataset.")
@@ -296,7 +321,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         return features, dataset

     return features
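The renumbering pass after the pool closes is safe because `Pool.imap` yields results in input order, so assigning `unique_id` and `example_index` afterwards is deterministic regardless of which worker handled which example. A small runnable demonstration of that bookkeeping (toy worker in place of `squad_convert_example_to_features`):

from multiprocessing import Pool

def explode(n):
    # Stand-in: one example yields zero or more features.
    return [f"ex{n}-span{i}" for i in range(n % 3)]

if __name__ == '__main__':
    with Pool(2) as p:
        per_example = list(p.imap(explode, range(6), chunksize=2))

    flat, unique_id, example_index = [], 1000000000, 0
    for feats in per_example:
        if not feats:  # mirrors `if not example_features: continue`
            continue
        for f in feats:
            flat.append((unique_id, example_index, f))
            unique_id += 1
        example_index += 1
    print(flat)

Note that, as in the hunk above, examples producing no features consume neither a `unique_id` nor an `example_index`.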