Commit 3b7153f0 authored by Chen Chen's avatar Chen Chen Committed by A. Unique TensorFlower
Browse files

Internal change

PiperOrigin-RevId: 326935344
parent 52a017db
...@@ -38,6 +38,8 @@ class QADataConfig(cfg.DataConfig): ...@@ -38,6 +38,8 @@ class QADataConfig(cfg.DataConfig):
input_preprocessed_data_path: str = '' input_preprocessed_data_path: str = ''
doc_stride: int = 128 doc_stride: int = 128
query_length: int = 64 query_length: int = 64
# The path to the vocab file of word piece tokenizer or the
# model of the sentence piece tokenizer.
vocab_file: str = '' vocab_file: str = ''
tokenization: str = 'WordPiece' # WordPiece or SentencePiece tokenization: str = 'WordPiece' # WordPiece or SentencePiece
do_lower_case: bool = True do_lower_case: bool = True
......
...@@ -139,17 +139,23 @@ class QuestionAnsweringTask(base_task.Task): ...@@ -139,17 +139,23 @@ class QuestionAnsweringTask(base_task.Task):
kwargs = dict( kwargs = dict(
examples=eval_examples, examples=eval_examples,
tokenizer=tokenization.FullTokenizer(
vocab_file=params.vocab_file, do_lower_case=params.do_lower_case),
max_seq_length=params.seq_length, max_seq_length=params.seq_length,
doc_stride=params.doc_stride, doc_stride=params.doc_stride,
max_query_length=params.query_length, max_query_length=params.query_length,
is_training=False, is_training=False,
output_fn=_append_feature, output_fn=_append_feature,
batch_size=params.global_batch_size) batch_size=params.global_batch_size)
if params.tokenization == 'SentencePiece': if params.tokenization == 'SentencePiece':
# squad_lib_sp requires one more argument 'do_lower_case'. # squad_lib_sp requires one more argument 'do_lower_case'.
kwargs['do_lower_case'] = params.do_lower_case kwargs['do_lower_case'] = params.do_lower_case
kwargs['tokenizer'] = tokenization.FullSentencePieceTokenizer(
sp_model_file=params.vocab_file)
elif params.tokenization == 'WordPiece':
kwargs['tokenizer'] = tokenization.FullTokenizer(
vocab_file=params.vocab_file, do_lower_case=params.do_lower_case)
else:
raise ValueError('Unexpected tokenization: %s' % params.tokenization)
eval_dataset_size = self.squad_lib.convert_examples_to_features(**kwargs) eval_dataset_size = self.squad_lib.convert_examples_to_features(**kwargs)
eval_writer.close() eval_writer.close()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment