Unverified Commit 48930a4c authored by Thomas Wolf, committed by GitHub

Merge pull request #2 from elyase/patch-1

Port tokenization for the multilingual model
parents a81a1ef8 4d124baf
@@ -43,6 +43,13 @@ class TokenizationTest(unittest.TestCase):
    self.assertListEqual(
        tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
def test_chinese(self):
tokenizer = tokenization.BasicTokenizer()
self.assertListEqual(
tokenizer.tokenize(u"ah\u535A\u63A8zz"),
[u"ah", u"\u535A", u"\u63A8", u"zz"])
  def test_basic_tokenizer_lower(self):
    tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
...
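The new `test_chinese` case pins down the expected behavior: each CJK ideograph becomes its own token, while the Latin runs around it stay whole. A minimal sketch of the same check, assuming the repository's `tokenization` module is importable:

```python
import tokenization

tokenizer = tokenization.BasicTokenizer()
# U+535A and U+63A8 are CJK ideographs: each is split out as a
# single-character token; the ASCII runs "ah" and "zz" are untouched.
tokens = tokenizer.tokenize(u"ah\u535A\u63A8zz")
assert tokens == [u"ah", u"\u535A", u"\u63A8", u"zz"]
```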
@@ -133,6 +133,13 @@ class BasicTokenizer(object):
"""Tokenizes a piece of text.""" """Tokenizes a piece of text."""
text = convert_to_unicode(text) text = convert_to_unicode(text)
text = self._clean_text(text) text = self._clean_text(text)
# This was added on November 1st, 2018 for the multilingual and Chinese
# models. This is also applied to the English models now, but it doesn't
# matter since the English models were not trained on any Chinese data
# and generally don't have any Chinese data in them (there are Chinese
# characters in the vocabulary because Wikipedia does have some Chinese
    # words in the English Wikipedia).
text = self._tokenize_chinese_chars(text)
    orig_tokens = whitespace_tokenize(text)
    split_tokens = []
    for token in orig_tokens:
@@ -174,7 +181,42 @@ class BasicTokenizer(object):
      i += 1
    return ["".join(x) for x in output]
def _tokenize_chinese_chars(self, text):
"""Adds whitespace around any CJK character."""
output = []
for char in text:
cp = ord(char)
if self._is_chinese_char(cp):
output.append(" ")
output.append(char)
output.append(" ")
else:
output.append(char)
return "".join(output)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
    # space-separated words, so they are not treated specially and are
    # handled like all of the other languages.
    if ((cp >= 0x4E00 and cp <= 0x9FFF) or    # CJK Unified Ideographs
        (cp >= 0x3400 and cp <= 0x4DBF) or    # CJK Unified Ideographs Extension A
        (cp >= 0x20000 and cp <= 0x2A6DF) or  # Extension B
        (cp >= 0x2A700 and cp <= 0x2B73F) or  # Extension C
        (cp >= 0x2B740 and cp <= 0x2B81F) or  # Extension D
        (cp >= 0x2B820 and cp <= 0x2CEAF) or  # Extension E
        (cp >= 0xF900 and cp <= 0xFAFF) or    # CJK Compatibility Ideographs
        (cp >= 0x2F800 and cp <= 0x2FA1F)):   # CJK Compatibility Supplement
return True
return False
  def _clean_text(self, text):
    """Performs invalid character removal and whitespace cleanup on text."""
    output = []
...
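Taken together, the two new helpers form a pre-pass over the input: every codepoint in the listed CJK blocks is padded with spaces, so the existing `whitespace_tokenize` call then isolates each ideograph as its own token. A condensed, standalone sketch of that step (the names `_CJK_RANGES`, `is_cjk`, and `pad_cjk` are illustrative, not part of the library):

```python
# Illustrative re-statement of the pre-pass added above; the ranges
# mirror the eight Unicode blocks checked in _is_chinese_char.
_CJK_RANGES = [
    (0x4E00, 0x9FFF), (0x3400, 0x4DBF), (0x20000, 0x2A6DF),
    (0x2A700, 0x2B73F), (0x2B740, 0x2B81F), (0x2B820, 0x2CEAF),
    (0xF900, 0xFAFF), (0x2F800, 0x2FA1F),
]

def is_cjk(cp):
    """True if codepoint cp falls in any of the CJK blocks above."""
    return any(lo <= cp <= hi for lo, hi in _CJK_RANGES)

def pad_cjk(text):
    """Surround every CJK character with spaces; leave the rest alone."""
    return u"".join(
        u" %s " % ch if is_cjk(ord(ch)) else ch for ch in text)

# A plain whitespace split now isolates each ideograph:
print(pad_cjk(u"ah\u535A\u63A8zz").split())
# -> ['ah', u'\u535a', u'\u63a8', 'zz']
```

Because the padding runs before `whitespace_tokenize` inside `BasicTokenizer.tokenize`, the step is a no-op for inputs without CJK characters, which is why the comment above notes it is harmless for the English models.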