ModelZoo / ResNet50_tensorflow · Commit 558bab5d

Authored Dec 09, 2019 by Chen Chen; committed by A. Unique TensorFlower, Dec 09, 2019
Parent: 9cae3c4f

Add sentence piece tokenizer in tokenization.py

PiperOrigin-RevId: 284624714
Showing 1 changed file with 131 additions and 1 deletion.

official/nlp/bert/tokenization.py  (+131, -1)
# coding=utf-8
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
...
...
@@ -29,6 +30,10 @@ import unicodedata

import six
import tensorflow as tf

import sentencepiece as spm

SPIECE_UNDERLINE = u"▁".encode("utf-8")


def validate_case_matches_checkpoint(do_lower_case, init_checkpoint):
  """Checks whether the casing config is consistent with the checkpoint name."""
...
...
@@ -366,7 +371,7 @@ class WordpieceTokenizer(object):


def _is_whitespace(char):
  """Checks whether `chars` is a whitespace character."""
-  # \t, \n, and \r are technically contorl characters but we treat them
+  # \t, \n, and \r are technically control characters but we treat them
  # as whitespace since they are generally considered as such.
  if char == " " or char == "\t" or char == "\n" or char == "\r":
    return True
...
...
@@ -402,3 +407,128 @@ def _is_punctuation(char):
  if cat.startswith("P"):
    return True
  return False


def preprocess_text(inputs, remove_space=True, lower=False):
  """Preprocesses data by removing extra spaces and normalizing it.

  This method is used together with the sentence piece tokenizer and is forked from:
  https://github.com/google-research/google-research/blob/master/albert/tokenization.py

  Args:
    inputs: The input text.
    remove_space: Whether to remove the extra space.
    lower: Whether to lowercase the text.

  Returns:
    The preprocessed text.
  """
  outputs = inputs
  if remove_space:
    outputs = " ".join(inputs.strip().split())

  if six.PY2 and isinstance(outputs, str):
    try:
      outputs = six.ensure_text(outputs, "utf-8")
    except UnicodeDecodeError:
      outputs = six.ensure_text(outputs, "latin-1")

  outputs = unicodedata.normalize("NFKD", outputs)
  outputs = "".join([c for c in outputs if not unicodedata.combining(c)])
  if lower:
    outputs = outputs.lower()

  return outputs
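A minimal usage sketch for preprocess_text, not part of the commit itself. It assumes the TF Model Garden `official` package (which contains this file) is importable; the sample string and expected output are illustrative only:

# Illustrative only: assumes `official.nlp.bert.tokenization` is on PYTHONPATH.
from official.nlp.bert import tokenization

raw = "  Héllo,   Wörld!  "
# Collapses runs of whitespace, applies NFKD normalization, strips combining
# marks (accents), and optionally lowercases.
print(tokenization.preprocess_text(raw, remove_space=True, lower=True))
# Expected: "hello, world!"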
def encode_pieces(sp_model, text, sample=False):
  """Segments text into pieces.

  This method is used together with the sentence piece tokenizer and is forked from:
  https://github.com/google-research/google-research/blob/master/albert/tokenization.py

  Args:
    sp_model: A spm.SentencePieceProcessor object.
    text: The input text to be segmented.
    sample: Whether to randomly sample a segmentation output or return a
      deterministic one.

  Returns:
    A list of token pieces.
  """
  if not sample:
    pieces = sp_model.EncodeAsPieces(text)
  else:
    pieces = sp_model.SampleEncodeAsPieces(text, 64, 0.1)
  new_pieces = []
  for piece in pieces:
    piece = printable_text(piece)
    if len(piece) > 1 and piece[-1] == "," and piece[-2].isdigit():
      cur_pieces = sp_model.EncodeAsPieces(
          six.ensure_binary(piece[:-1]).replace(SPIECE_UNDERLINE, b""))
      if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE:
        if len(cur_pieces[0]) == 1:
          cur_pieces = cur_pieces[1:]
        else:
          cur_pieces[0] = cur_pieces[0][1:]
      cur_pieces.append(piece[-1])
      new_pieces.extend(cur_pieces)
    else:
      new_pieces.append(piece)

  return new_pieces
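A sketch of calling encode_pieces directly, not part of the commit. It assumes a trained SentencePiece model at the hypothetical path "sp.model":

# Illustrative only: "sp.model" is a placeholder for a trained SentencePiece model file.
import sentencepiece as spm
from official.nlp.bert import tokenization

sp_model = spm.SentencePieceProcessor()
sp_model.Load("sp.model")

# Deterministic segmentation (the best-scoring piece sequence).
print(tokenization.encode_pieces(sp_model, "hello world", sample=False))

# Stochastic segmentation: samples from the 64-best candidates with
# smoothing parameter alpha=0.1, as used for subword regularization.
print(tokenization.encode_pieces(sp_model, "hello world", sample=True))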
def encode_ids(sp_model, text, sample=False):
  """Segments text and returns token ids.

  This method is used together with the sentence piece tokenizer and is forked from:
  https://github.com/google-research/google-research/blob/master/albert/tokenization.py

  Args:
    sp_model: A spm.SentencePieceProcessor object.
    text: The input text to be segmented.
    sample: Whether to randomly sample a segmentation output or return a
      deterministic one.

  Returns:
    A list of token ids.
  """
  pieces = encode_pieces(sp_model, text, sample=sample)
  ids = [sp_model.PieceToId(piece) for piece in pieces]
  return ids
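For completeness, a matching sketch for encode_ids under the same assumptions (hypothetical "sp.model" file):

import sentencepiece as spm
from official.nlp.bert import tokenization

sp_model = spm.SentencePieceProcessor()
sp_model.Load("sp.model")  # hypothetical model file path

# encode_ids simply maps the pieces from encode_pieces to vocabulary ids.
ids = tokenization.encode_ids(sp_model, "hello world")
assert ids == [sp_model.PieceToId(p)
               for p in tokenization.encode_pieces(sp_model, "hello world")]
print(ids)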
class FullSentencePieceTokenizer(object):
  """Runs end-to-end sentence piece tokenization.

  The interface of this class is intended to be the same as that of the
  `FullTokenizer` class above for easier usage.
  """

  def __init__(self, sp_model_file):
    """Inits FullSentencePieceTokenizer.

    Args:
      sp_model_file: The path to the sentence piece model file.
    """
    self._sp_model = spm.SentencePieceProcessor()
    self._sp_model.Load(sp_model_file)
    self.vocab = {
        self._sp_model.IdToPiece(i): i
        for i in six.moves.range(self._sp_model.GetPieceSize())
    }

  def tokenize(self, text):
    """Tokenizes text into pieces."""
    return encode_pieces(self._sp_model, text)

  def convert_tokens_to_ids(self, tokens):
    """Converts a list of tokens to a list of ids."""
    return [self._sp_model.PieceToId(printable_text(token)) for token in tokens]

  def convert_ids_to_tokens(self, ids):
    """Converts a list of ids to a list of tokens."""
    return [self._sp_model.IdToPiece(id_) for id_ in ids]
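A usage sketch for the new class, again assuming a hypothetical "sp.model" file. As the docstring notes, the interface mirrors FullTokenizer:

from official.nlp.bert import tokenization

# "sp.model" is a placeholder for a trained SentencePiece model file.
tokenizer = tokenization.FullSentencePieceTokenizer("sp.model")

tokens = tokenizer.tokenize("hello world")        # list of sentence pieces
ids = tokenizer.convert_tokens_to_ids(tokens)     # vocabulary ids
# Round trip holds for in-vocabulary pieces.
assert tokenizer.convert_ids_to_tokens(ids) == tokens
print(tokens, ids)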