remove convert_to_unicode and printable_text from examples

32167cdf · thomwolf · ce37b8e4 · 32167cdf · 32167cdf · 32167cdf
Commit 32167cdf authored Nov 26, 2018 by thomwolf
5 changed files
--- a/examples/extract_features.py
+++ b/examples/extract_features.py
@@ -28,7 +28,7 @@ import torch
 from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler

-from pytorch_pretrained_bert.tokenization import convert_to_unicode, BertTokenizer
+from pytorch_pretrained_bert.tokenization import BertTokenizer
 from pytorch_pretrained_bert.modeling import BertModel

 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s', 
@@ -170,7 +170,7 @@ def read_examples(input_file):
    unique_id = 0
    with open(input_file, "r") as reader:
        while True:
-            line = convert_to_unicode(reader.readline())
+            line = reader.readline()
            if not line:
                break
            line = line.strip()

--- a/examples/run_classifier.py
+++ b/examples/run_classifier.py
@@ -30,7 +30,7 @@ import torch
 from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler

-from pytorch_pretrained_bert.tokenization import printable_text, convert_to_unicode, BertTokenizer
+from pytorch_pretrained_bert.tokenization import BertTokenizer
 from pytorch_pretrained_bert.modeling import BertForSequenceClassification
 from pytorch_pretrained_bert.optimization import BertAdam

@@ -122,9 +122,9 @@ class MrpcProcessor(DataProcessor):
            if i == 0:
                continue
            guid = "%s-%s" % (set_type, i)
-            text_a = convert_to_unicode(line[3])
-            text_b = convert_to_unicode(line[4])
-            label = convert_to_unicode(line[0])
+            text_a = line[3]
+            text_b = line[4]
+            label = line[0]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples
@@ -154,10 +154,10 @@ class MnliProcessor(DataProcessor):
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
-            guid = "%s-%s" % (set_type, convert_to_unicode(line[0]))
-            text_a = convert_to_unicode(line[8])
-            text_b = convert_to_unicode(line[9])
-            label = convert_to_unicode(line[-1])
+            guid = "%s-%s" % (set_type, line[0])
+            text_a = line[8]
+            text_b = line[9]
+            label = line[-1]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples
@@ -185,8 +185,8 @@ class ColaProcessor(DataProcessor):
        examples = []
        for (i, line) in enumerate(lines):
            guid = "%s-%s" % (set_type, i)
-            text_a = convert_to_unicode(line[3])
-            label = convert_to_unicode(line[1])
+            text_a = line[3]
+            label = line[1]
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples
@@ -273,7 +273,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("tokens: %s" % " ".join(
-                    [printable_text(x) for x in tokens]))
+                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info(

--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -32,7 +32,7 @@ import torch
 from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler

-from pytorch_pretrained_bert.tokenization import printable_text, whitespace_tokenize, BasicTokenizer, BertTokenizer
+from pytorch_pretrained_bert.tokenization import whitespace_tokenize, BasicTokenizer, BertTokenizer
 from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
 from pytorch_pretrained_bert.optimization import BertAdam

@@ -64,9 +64,9 @@ class SquadExample(object):

    def __repr__(self):
        s = ""
-        s += "qas_id: %s" % (printable_text(self.qas_id))
+        s += "qas_id: %s" % (self.qas_id)
        s += ", question_text: %s" % (
-            printable_text(self.question_text))
+            self.question_text)
        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
        if self.start_position:
            s += ", start_position: %d" % (self.start_position)
@@ -288,8 +288,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                logger.info("unique_id: %s" % (unique_id))
                logger.info("example_index: %s" % (example_index))
                logger.info("doc_span_index: %s" % (doc_span_index))
-                logger.info("tokens: %s" % " ".join(
-                    [printable_text(x) for x in tokens]))
+                logger.info("tokens: %s" % " ".join(tokens))
                logger.info("token_to_orig_map: %s" % " ".join([
                    "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]))
                logger.info("token_is_max_context: %s" % " ".join([
@@ -305,7 +304,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                    logger.info("start_position: %d" % (start_position))
                    logger.info("end_position: %d" % (end_position))
                    logger.info(
-                        "answer: %s" % (printable_text(answer_text)))
+                        "answer: %s" % (answer_text))

            features.append(
                InputFeatures(

--- a/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
+++ b/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
@@ -133,7 +133,7 @@
    "    unique_id = 0\n",
    "    with tf.gfile.GFile(input_file, \"r\") as reader:\n",
    "        while True:\n",
-    "            line = reader.readline()#tokenization.convert_to_unicode(reader.readline())\n",
+    "            line = reader.readline()\n",
    "            if not line:\n",
    "                break\n",
    "            line = line.strip()\n",

--- a/pytorch_pretrained_bert/tokenization.py
+++ b/pytorch_pretrained_bert/tokenization.py
@@ -38,18 +38,6 @@ PRETRAINED_VOCAB_ARCHIVE_MAP = {
    'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
 }

-def printable_text(text):
-    """Returns text encoded in a way suitable for print or `tf.logging`."""
-
-    # These functions want `str` for both Python2 and Python3, but in one case
-    # it's a Unicode string and in the other it's a byte string.
-    if isinstance(text, str):
-        return text
-    elif isinstance(text, bytes):
-        return text.decode("utf-8", "ignore")
-    else:
-        raise ValueError("Unsupported string type: %s" % (type(text)))
-

 def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""