Commit 3ea89d24 authored by Chen Chen, committed by A. Unique TensorFlower
Browse files

Internal change

PiperOrigin-RevId: 326935344
parent 6496bcf8
......@@ -38,6 +38,8 @@ class QADataConfig(cfg.DataConfig):
input_preprocessed_data_path: str = ''
doc_stride: int = 128
query_length: int = 64
# Path to the vocab file for the WordPiece tokenizer, or to the
# model file for the SentencePiece tokenizer.
vocab_file: str = ''
tokenization: str = 'WordPiece' # WordPiece or SentencePiece
do_lower_case: bool = True
......
......@@ -139,17 +139,23 @@ class QuestionAnsweringTask(base_task.Task):
kwargs = dict(
examples=eval_examples,
tokenizer=tokenization.FullTokenizer(
vocab_file=params.vocab_file, do_lower_case=params.do_lower_case),
max_seq_length=params.seq_length,
doc_stride=params.doc_stride,
max_query_length=params.query_length,
is_training=False,
output_fn=_append_feature,
batch_size=params.global_batch_size)
if params.tokenization == 'SentencePiece':
# squad_lib_sp requires one more argument 'do_lower_case'.
kwargs['do_lower_case'] = params.do_lower_case
kwargs['tokenizer'] = tokenization.FullSentencePieceTokenizer(
sp_model_file=params.vocab_file)
elif params.tokenization == 'WordPiece':
kwargs['tokenizer'] = tokenization.FullTokenizer(
vocab_file=params.vocab_file, do_lower_case=params.do_lower_case)
else:
raise ValueError('Unexpected tokenization: %s' % params.tokenization)
eval_dataset_size = self.squad_lib.convert_examples_to_features(**kwargs)
eval_writer.close()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment