"...git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "5603f78fc46bd117e3f25cc8842eb08046bbff4e"
Commit 32167cdf authored by thomwolf

remove convert_to_unicode and printable_text from examples

parent ce37b8e4
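
Context for the change, as a minimal sketch (not part of the commit): on Python 3, lines read in text mode and TSV fields are already `str` (unicode), so `convert_to_unicode` returns its input unchanged and `printable_text` is a no-op for `str`. The helper body below mirrors the one deleted from the tokenization module further down; the sample line and assert are illustrative only.

def convert_to_unicode(text):
    # Mirrors the removed helper: on Python 3 a `str` is already unicode.
    if isinstance(text, str):
        return text
    elif isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    else:
        raise ValueError("Unsupported string type: %s" % (type(text)))

line = "BERT \u00e9xample\n"             # what open(..., "r").readline() yields on Python 3
assert convert_to_unicode(line) is line  # identity: the wrapper adds nothing
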
@@ -28,7 +28,7 @@ import torch
 from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
-from pytorch_pretrained_bert.tokenization import convert_to_unicode, BertTokenizer
+from pytorch_pretrained_bert.tokenization import BertTokenizer
 from pytorch_pretrained_bert.modeling import BertModel
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
@@ -170,7 +170,7 @@ def read_examples(input_file):
     unique_id = 0
     with open(input_file, "r") as reader:
         while True:
-            line = convert_to_unicode(reader.readline())
+            line = reader.readline()
             if not line:
                 break
             line = line.strip()
...
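
For reference, a self-contained sketch of the simplified read loop in the hunk above; the `Example` container and the surrounding function shape are assumptions standing in for the script's own example class, which is defined outside the shown context.

import collections

Example = collections.namedtuple("Example", ["unique_id", "text"])

def read_examples(input_file):
    examples = []
    unique_id = 0
    with open(input_file, "r", encoding="utf-8") as reader:
        while True:
            line = reader.readline()  # already str on Python 3; no conversion needed
            if not line:
                break
            line = line.strip()
            examples.append(Example(unique_id=unique_id, text=line))
            unique_id += 1
    return examples
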
@@ -30,7 +30,7 @@ import torch
 from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
-from pytorch_pretrained_bert.tokenization import printable_text, convert_to_unicode, BertTokenizer
+from pytorch_pretrained_bert.tokenization import BertTokenizer
 from pytorch_pretrained_bert.modeling import BertForSequenceClassification
 from pytorch_pretrained_bert.optimization import BertAdam
@@ -122,9 +122,9 @@ class MrpcProcessor(DataProcessor):
             if i == 0:
                 continue
             guid = "%s-%s" % (set_type, i)
-            text_a = convert_to_unicode(line[3])
-            text_b = convert_to_unicode(line[4])
-            label = convert_to_unicode(line[0])
+            text_a = line[3]
+            text_b = line[4]
+            label = line[0]
             examples.append(
                 InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
         return examples
@@ -154,10 +154,10 @@ class MnliProcessor(DataProcessor):
         for (i, line) in enumerate(lines):
             if i == 0:
                 continue
-            guid = "%s-%s" % (set_type, convert_to_unicode(line[0]))
-            text_a = convert_to_unicode(line[8])
-            text_b = convert_to_unicode(line[9])
-            label = convert_to_unicode(line[-1])
+            guid = "%s-%s" % (set_type, line[0])
+            text_a = line[8]
+            text_b = line[9]
+            label = line[-1]
             examples.append(
                 InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
         return examples
@@ -185,8 +185,8 @@ class ColaProcessor(DataProcessor):
         examples = []
         for (i, line) in enumerate(lines):
             guid = "%s-%s" % (set_type, i)
-            text_a = convert_to_unicode(line[3])
-            label = convert_to_unicode(line[1])
+            text_a = line[3]
+            label = line[1]
             examples.append(
                 InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
         return examples
@@ -273,7 +273,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
             logger.info("*** Example ***")
             logger.info("guid: %s" % (example.guid))
             logger.info("tokens: %s" % " ".join(
-                [printable_text(x) for x in tokens]))
+                [str(x) for x in tokens]))
             logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
             logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
             logger.info(
...
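
A hedged sketch of the simplified MRPC example creation shown above, with a minimal stand-in for the script's InputExample container; the field indices (line[3], line[4], line[0]) come from the hunk, everything else here is an assumption for illustration.

class InputExample(object):
    """Minimal stand-in for the example container used by the classifier script."""
    def __init__(self, guid, text_a, text_b=None, label=None):
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

def create_mrpc_examples(lines, set_type):
    examples = []
    for (i, line) in enumerate(lines):
        if i == 0:
            continue  # skip the TSV header row
        guid = "%s-%s" % (set_type, i)
        text_a = line[3]  # plain str on Python 3; no convert_to_unicode wrapper
        text_b = line[4]
        label = line[0]
        examples.append(
            InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
    return examples

# Illustrative rows in the MRPC column layout assumed above:
rows = [["Quality", "#1 ID", "#2 ID", "#1 String", "#2 String"],
        ["1", "a1", "b1", "He ran fast.", "He sprinted."]]
print(create_mrpc_examples(rows, "train")[0].text_a)
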
@@ -32,7 +32,7 @@ import torch
 from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
-from pytorch_pretrained_bert.tokenization import printable_text, whitespace_tokenize, BasicTokenizer, BertTokenizer
+from pytorch_pretrained_bert.tokenization import whitespace_tokenize, BasicTokenizer, BertTokenizer
 from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
 from pytorch_pretrained_bert.optimization import BertAdam
@@ -64,9 +64,9 @@ class SquadExample(object):
     def __repr__(self):
         s = ""
-        s += "qas_id: %s" % (printable_text(self.qas_id))
+        s += "qas_id: %s" % (self.qas_id)
         s += ", question_text: %s" % (
-            printable_text(self.question_text))
+            self.question_text)
         s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
         if self.start_position:
             s += ", start_position: %d" % (self.start_position)
@@ -288,8 +288,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
             logger.info("unique_id: %s" % (unique_id))
             logger.info("example_index: %s" % (example_index))
             logger.info("doc_span_index: %s" % (doc_span_index))
-            logger.info("tokens: %s" % " ".join(
-                [printable_text(x) for x in tokens]))
+            logger.info("tokens: %s" % " ".join(tokens))
             logger.info("token_to_orig_map: %s" % " ".join([
                 "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]))
             logger.info("token_is_max_context: %s" % " ".join([
@@ -305,7 +304,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                 logger.info("start_position: %d" % (start_position))
                 logger.info("end_position: %d" % (end_position))
                 logger.info(
-                    "answer: %s" % (printable_text(answer_text)))
+                    "answer: %s" % (answer_text))
         features.append(
             InputFeatures(
...
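
The __repr__ change above in isolation, as a runnable sketch; the constructor fields beyond what the hunk shows are assumptions.

class SquadExample(object):
    def __init__(self, qas_id, question_text, doc_tokens, start_position=None):
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.start_position = start_position

    def __repr__(self):
        s = ""
        s += "qas_id: %s" % (self.qas_id)  # direct formatting, no printable_text
        s += ", question_text: %s" % (self.question_text)
        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
        if self.start_position:
            s += ", start_position: %d" % (self.start_position)
        return s

print(SquadExample("q1", "Who wrote BERT?", ["BERT", "paper"]))
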
@@ -133,7 +133,7 @@
 "    unique_id = 0\n",
 "    with tf.gfile.GFile(input_file, \"r\") as reader:\n",
 "        while True:\n",
-"            line = reader.readline()#tokenization.convert_to_unicode(reader.readline())\n",
+"            line = reader.readline()\n",
 "            if not line:\n",
 "                break\n",
 "            line = line.strip()\n",
...
@@ -38,18 +38,6 @@ PRETRAINED_VOCAB_ARCHIVE_MAP = {
     'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
 }
-def printable_text(text):
-    """Returns text encoded in a way suitable for print or `tf.logging`."""
-    # These functions want `str` for both Python2 and Python3, but in one case
-    # it's a Unicode string and in the other it's a byte string.
-    if isinstance(text, str):
-        return text
-    elif isinstance(text, bytes):
-        return text.decode("utf-8", "ignore")
-    else:
-        raise ValueError("Unsupported string type: %s" % (type(text)))
 def load_vocab(vocab_file):
     """Loads a vocabulary file into a dictionary."""
...
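
With the helper deleted, callers format values directly; a short sketch of the equivalent Python 3 patterns (the token list and byte string are illustrative only):

tokens = ["[CLS]", "hello", "[SEP]"]
print("tokens: %s" % " ".join(str(x) for x in tokens))  # replaces [printable_text(x) for x in tokens]

raw = b"bytes from a legacy source"
print(raw.decode("utf-8", "ignore"))  # the one case the helper actually handled
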