Commit 8bd6b235 authored by VictorSanh

typo in tokenization

parent 2c55568c
@@ -23,7 +23,7 @@ import logging
 import json
 import math
 import os
-import tokenization
+import tokenization_pytorch
 import six
 import argparse
@@ -62,9 +62,9 @@ class SquadExample(object):
   def __repr__(self):
     s = ""
-    s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
+    s += "qas_id: %s" % (tokenization_pytorch.printable_text(self.qas_id))
     s += ", question_text: %s" % (
-        tokenization.printable_text(self.question_text))
+        tokenization_pytorch.printable_text(self.question_text))
     s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
     if self.start_position:
       s += ", start_position: %d" % (self.start_position)
@@ -153,7 +153,7 @@ def read_squad_examples(input_file, is_training):
          # guaranteed to be preserved.
          actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
          cleaned_answer_text = " ".join(
-              tokenization.whitespace_tokenize(orig_answer_text))
+              tokenization_pytorch.whitespace_tokenize(orig_answer_text))
          if actual_text.find(cleaned_answer_text) == -1:
            logger.warning("Could not find answer: '%s' vs. '%s'",
                           actual_text, cleaned_answer_text)
@@ -287,7 +287,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
        logger.info("example_index: %s" % (example_index))
        logger.info("doc_span_index: %s" % (doc_span_index))
        logger.info("tokens: %s" % " ".join(
-            [tokenization.printable_text(x) for x in tokens]))
+            [tokenization_pytorch.printable_text(x) for x in tokens]))
        logger.info("token_to_orig_map: %s" % " ".join(
            ["%d:%d" % (x, y) for (x, y) in six.iteritems(token_to_orig_map)]))
        logger.info("token_is_max_context: %s" % " ".join([
@@ -303,7 +303,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
          logger.info("start_position: %d" % (start_position))
          logger.info("end_position: %d" % (end_position))
          logger.info(
-              "answer: %s" % (tokenization.printable_text(answer_text)))
+              "answer: %s" % (tokenization_pytorch.printable_text(answer_text)))
      features.append(
          InputFeatures(
@@ -579,7 +579,7 @@ def get_final_text(pred_text, orig_text, do_lower_case):
   # and `pred_text`, and check if they are the same length. If they are
   # NOT the same length, the heuristic has failed. If they are the same
   # length, we assume the characters are one-to-one aligned.
-  tokenizer = tokenization.BasicTokenizer(do_lower_case=do_lower_case)
+  tokenizer = tokenization_pytorch.BasicTokenizer(do_lower_case=do_lower_case)
   tok_text = " ".join(tokenizer.tokenize(orig_text))
@@ -780,7 +780,7 @@ def main():
     raise ValueError("Output directory () already exists and is not empty.")
   os.makedirs(args.output_dir, exist_ok=True)
-  tokenizer = tokenization.FullTokenizer(
+  tokenizer = tokenization_pytorch.FullTokenizer(
       vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)
   train_examples = None
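The change is a pure import rename; the renamed module keeps the same helpers this diff already calls (FullTokenizer, BasicTokenizer, whitespace_tokenize, printable_text). A minimal usage sketch of the renamed module, assuming it sits on the Python path next to this script and that a WordPiece vocabulary file is available locally ("vocab.txt" and do_lower_case=True below are illustrative placeholders, not values from this commit):

import tokenization_pytorch

# Placeholder arguments for illustration; the real script reads these from argparse.
tokenizer = tokenization_pytorch.FullTokenizer(
    vocab_file="vocab.txt", do_lower_case=True)

# Same helpers the diff touches: whitespace split, WordPiece tokenization,
# and printable_text for logging.
words = tokenization_pytorch.whitespace_tokenize("Who wrote the SQuAD paper?")
tokens = tokenizer.tokenize(" ".join(words))
print(tokenization_pytorch.printable_text(" ".join(tokens)))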