Commit 8e9526b4 authored by erenup

add multiple processing

parent 9b312f9d
@@ -360,7 +360,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
             doc_stride=args.doc_stride,
             max_query_length=args.max_query_length,
             is_training=not evaluate,
-            return_dataset='pt'
+            return_dataset='pt',
+            threads=args.threads,
         )

     if args.local_rank in [-1, 0]:
@@ -478,6 +479,8 @@ def main():
                              "See details at https://nvidia.github.io/apex/amp.html")
     parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
     parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--threads', type=int, default=1, help='multiple threads for converting example to features')
     args = parser.parse_args()

     if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
...
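The hunks above wire a new --threads command-line option into the feature-conversion call. Below is a minimal, hypothetical caller-side sketch, not the actual example script (which appears to be run_squad.py): the demo SquadExample, the bert-base-uncased tokenizer, and the sequence-length values are illustrative stand-ins; only the threads= keyword and the return_dataset='pt' behaviour come from this commit.

import argparse

from transformers import BertTokenizer
from transformers.data.processors.squad import SquadExample, squad_convert_examples_to_features

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--threads', type=int, default=1,
                        help='multiple threads for converting example to features')
    args, _ = parser.parse_known_args()

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # One toy training example; the real script builds these with SquadV1Processor/SquadV2Processor.
    examples = [SquadExample(
        qas_id='demo-0',
        question_text='Who wrote the report?',
        context_text='The report was written by Jane Doe in 2018.',
        answer_text='Jane Doe',
        start_position_character=26,
        title='demo',
    )]

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=128,
        doc_stride=64,
        max_query_length=32,
        is_training=True,
        return_dataset='pt',
        threads=args.threads,  # new keyword introduced by this commit
    )
    print(len(features), len(dataset))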
@@ -4,6 +4,9 @@ import logging
 import os
 import json
 import numpy as np
+from multiprocessing import Pool
+from multiprocessing import cpu_count
+from functools import partial

 from ...tokenization_bert import BasicTokenizer, whitespace_tokenize
 from .utils import DataProcessor, InputExample, InputFeatures
@@ -76,47 +79,9 @@ def _is_whitespace(c):
         return True
     return False

-def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
-                                        doc_stride, max_query_length, is_training,
-                                        return_dataset=False):
-    """
-    Converts a list of examples into a list of features that can be directly given as input to a model.
-    It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.
-
-    Args:
-        examples: list of :class:`~transformers.data.processors.squad.SquadExample`
-        tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
-        max_seq_length: The maximum sequence length of the inputs.
-        doc_stride: The stride used when the context is too large and is split across several features.
-        max_query_length: The maximum length of the query.
-        is_training: whether to create features for model evaluation or model training.
-        return_dataset: Default False. Either 'pt' or 'tf'.
-            if 'pt': returns a torch.data.TensorDataset,
-            if 'tf': returns a tf.data.Dataset
-
-    Returns:
-        list of :class:`~transformers.data.processors.squad.SquadFeatures`
-
-    Example::
-
-        processor = SquadV2Processor()
-        examples = processor.get_dev_examples(data_dir)
-
-        features = squad_convert_examples_to_features(
-            examples=examples,
-            tokenizer=tokenizer,
-            max_seq_length=args.max_seq_length,
-            doc_stride=args.doc_stride,
-            max_query_length=args.max_query_length,
-            is_training=not evaluate,
-        )
-    """
-
-    # Defining helper methods
-    unique_id = 1000000000
-
+def squad_convert_example_to_features(example, max_seq_length,
+                                      doc_stride, max_query_length, is_training):
     features = []
-    for (example_index, example) in enumerate(tqdm(examples)):
     if is_training and not example.is_impossible:
         # Get start and end position
         start_position = example.start_position
@@ -127,8 +92,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
         if actual_text.find(cleaned_answer_text) == -1:
             logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
-            continue
+            return []

     tok_to_orig_index = []
     orig_to_tok_index = []
@@ -171,7 +135,8 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
             truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first'
         )

-        paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens)
+        paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride,
+                            max_seq_length - len(truncated_query) - sequence_pair_added_tokens)

         if tokenizer.pad_token_id in encoded_dict['input_ids']:
             non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)]
@@ -202,7 +167,8 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
     for doc_span_index in range(len(spans)):
         for j in range(spans[doc_span_index]["paragraph_len"]):
             is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
-            index = j if tokenizer.padding_side == "left" else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
+            index = j if tokenizer.padding_side == "left" else spans[doc_span_index][
+                "truncated_query_with_special_tokens_length"] + j
             spans[doc_span_index]["token_is_max_context"][index] = is_max_context

     for span in spans:
@@ -224,7 +190,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         # Set the CLS index to '0'
         p_mask[cls_index] = 0

         span_is_impossible = example.is_impossible
         start_position = 0
         end_position = 0
@@ -251,16 +216,14 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                 start_position = tok_start_position - doc_start + doc_offset
                 end_position = tok_end_position - doc_start + doc_offset

         features.append(SquadFeatures(
             span['input_ids'],
             span['attention_mask'],
             span['token_type_ids'],
             cls_index,
             p_mask.tolist(),
-            example_index=example_index,
-            unique_id=unique_id,
+            example_index=0,
+            unique_id=0,
             paragraph_len=span['paragraph_len'],
             token_is_max_context=span["token_is_max_context"],
             tokens=span["tokens"],
@@ -269,9 +232,71 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
             start_position=start_position,
             end_position=end_position
         ))
-
-            unique_id += 1
+    return features
+
+
+def squad_convert_example_to_features_init(tokenizer_for_convert):
+    global tokenizer
+    tokenizer = tokenizer_for_convert
+
+
+def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
+                                       doc_stride, max_query_length, is_training,
+                                       return_dataset=False, threads=1):
+    """
+    Converts a list of examples into a list of features that can be directly given as input to a model.
+    It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.
+
+    Args:
+        examples: list of :class:`~transformers.data.processors.squad.SquadExample`
+        tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer`
+        max_seq_length: The maximum sequence length of the inputs.
+        doc_stride: The stride used when the context is too large and is split across several features.
+        max_query_length: The maximum length of the query.
+        is_training: whether to create features for model evaluation or model training.
+        return_dataset: Default False. Either 'pt' or 'tf'.
+            if 'pt': returns a torch.data.TensorDataset,
+            if 'tf': returns a tf.data.Dataset
+        threads: multiple processing threads
+
+    Returns:
+        list of :class:`~transformers.data.processors.squad.SquadFeatures`
+
+    Example::
+
+        processor = SquadV2Processor()
+        examples = processor.get_dev_examples(data_dir)
+
+        features = squad_convert_examples_to_features(
+            examples=examples,
+            tokenizer=tokenizer,
+            max_seq_length=args.max_seq_length,
+            doc_stride=args.doc_stride,
+            max_query_length=args.max_query_length,
+            is_training=not evaluate,
+        )
+    """
+
+    # Defining helper methods
+    features = []
+    threads = min(threads, cpu_count())
+    with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p:
+        annotate_ = partial(squad_convert_example_to_features, max_seq_length=max_seq_length,
+                            doc_stride=doc_stride, max_query_length=max_query_length, is_training=is_training)
+        features = list(tqdm(p.imap(annotate_, examples, chunksize=32), total=len(examples), desc='convert squad examples to features'))
+
+    new_features = []
+    unique_id = 1000000000
+    example_index = 0
+    for example_features in tqdm(features, total=len(features), desc='add example index and unique id'):
+        if not example_features:
+            continue
+        for example_feature in example_features:
+            example_feature.example_index = example_index
+            example_feature.unique_id = unique_id
+            new_features.append(example_feature)
+            unique_id += 1
+        example_index += 1
+    features = new_features
+    del new_features
+
     if return_dataset == 'pt':
         if not is_torch_available():
             raise ImportError("Pytorch must be installed to return a pytorch dataset.")
@@ -296,7 +321,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         return features, dataset

     return features
...
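The core of the new squad_convert_examples_to_features is a standard multiprocessing fan-out: a Pool whose initializer stashes the tokenizer in a module-level global (so it is shipped once per worker process instead of being pickled with every task), functools.partial to freeze the constant keyword arguments of the per-example worker, imap with a chunksize so results stream through tqdm in order, and a final sequential pass that flattens the per-example lists and assigns example_index and unique_id. The sketch below reproduces that pattern with a toy tokenizer and a fake per-example conversion; every name in it (init_worker, convert_one, toy_tokenize) is hypothetical and only illustrates the structure used in the diff, not the real SQuAD conversion.

from functools import partial
from multiprocessing import Pool, cpu_count


def toy_tokenize(text):
    # Stand-in for a real tokenizer; a module-level callable so it can be sent
    # to worker processes via initargs.
    return text.split()


def init_worker(tokenizer_for_convert):
    # Runs once in each worker: store the tokenizer in a global so subsequent
    # tasks reuse it instead of re-pickling it for every example.
    global tokenizer
    tokenizer = tokenizer_for_convert


def convert_one(example, max_seq_length):
    # Stand-in for squad_convert_example_to_features: returns a *list* of
    # features for one example, possibly empty (mirrors the 'return []' path).
    tokens = tokenizer(example)[:max_seq_length]
    if not tokens:
        return []
    return [{"tokens": tokens, "example_index": 0, "unique_id": 0}]


if __name__ == "__main__":
    examples = ["what is squad ?", "", "how many spans ?"]
    threads = min(4, cpu_count())

    with Pool(threads, initializer=init_worker, initargs=(toy_tokenize,)) as p:
        annotate = partial(convert_one, max_seq_length=8)
        per_example = list(p.imap(annotate, examples, chunksize=2))

    # Sequential second pass, as in the diff: skip empty results, flatten, and
    # hand out globally consistent example_index / unique_id values.
    features, unique_id, example_index = [], 1000000000, 0
    for example_features in per_example:
        if not example_features:
            continue
        for feature in example_features:
            feature["example_index"] = example_index
            feature["unique_id"] = unique_id
            features.append(feature)
            unique_id += 1
        example_index += 1
    print(len(features), features[0]["unique_id"], features[-1]["example_index"])

Because imap preserves input order, the ids assigned in the second pass are deterministic no matter how many worker processes actually ran.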