Unverified Commit f91b59c6 authored by thunderfyc's avatar thunderfyc Committed by GitHub
Browse files

Initial checkin of sequence_projection (#9153)



* Initial checkin of sequence_projection

* Fix the path

* Fix paths and deps

* Fix path and deps
Co-authored-by: default avatarLearn2Compress <expander-robot@google.com>
parent 67efd3ab
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "sgnn/sgnn_projection_op_resolver.h" // sequence_projection
#include "tensorflow/lite/mutable_op_resolver.h"
#include "sgnn/sgnn_projection.h" // sequence_projection
namespace tflite {
namespace ops {
namespace custom {

// Registers the SgnnProjection custom op with |resolver| so that models
// containing the "tftext:custom:SgnnProjection" op can be resolved.
void AddSgnnProjectionCustomOp(MutableOpResolver* resolver) {
  static constexpr char kSgnnProjectionOpName[] =
      "tftext:custom:SgnnProjection";
  resolver->AddCustom(kSgnnProjectionOpName,
                      Register_tftext_SGNN_PROJECTION());
}

}  // namespace custom
}  // namespace ops
}  // namespace tflite
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_MODELS_SEQUENCE_PROJECTION_SGNN_SGNN_PROJECTION_OP_RESOLVER_H_
#define TENSORFLOW_MODELS_SEQUENCE_PROJECTION_SGNN_SGNN_PROJECTION_OP_RESOLVER_H_

#include "tensorflow/lite/mutable_op_resolver.h"

namespace tflite {
namespace ops {
namespace custom {

// Adds the SgnnProjection custom op to an op resolver, under the op name
// "tftext:custom:SgnnProjection".
// This function can be loaded using dlopen. Since C++ function names get
// mangled, declare this function as extern C, so its name is unchanged.
extern "C" void AddSgnnProjectionCustomOp(MutableOpResolver* resolver);

}  // namespace custom
}  // namespace ops
}  // namespace tflite

#endif  // TENSORFLOW_MODELS_SEQUENCE_PROJECTION_SGNN_SGNN_PROJECTION_OP_RESOLVER_H_
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "sgnn/sgnn_projection.h" // sequence_projection
#include <string>
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "flatbuffers/flexbuffers.h" // flatbuffer
#include "tensorflow/lite/kernels/test_util.h"
#include "tensorflow/lite/schema/schema_generated.h"
#include "tensorflow/lite/string_util.h"
namespace tflite {
namespace ops {
namespace custom {
namespace sgnn_projection {
namespace test {
namespace {
using ::testing::ElementsAre;
using ::testing::ElementsAreArray;
} // namespace
// Test harness that wraps the SgnnProjection custom op in a single-op
// TFLite model, feeds it ragged string input, and invokes the interpreter.
class SgnnProjectionModel : public SingleOpModel {
 public:
  // Constructor for testing the op with a tf.Tensor.
  //
  // `input_values` holds the ngram strings, `input_row_splits` the ragged
  // row boundaries; `hash_seed` and `buckets` are the op's options. The
  // interpreter is built, populated, and invoked immediately.
  SgnnProjectionModel(const std::vector<std::string>& input_values,
                      const std::vector<int64_t>& input_row_splits,
                      const std::vector<int64_t>& hash_seed, int64_t buckets) {
    input_values_index_ = AddInput(TensorType_STRING);
    input_row_splits_index_ = AddInput(TensorType_INT64);
    output_values_index_ = AddOutput(TensorType_FLOAT32);
    BuildCustomOp(hash_seed, buckets);
    BuildInterpreter({{static_cast<int>(input_values.size())},
                      {static_cast<int>(input_row_splits.size())}});
    PopulateStringTensor(input_values_index_, input_values);
    PopulateTensor(input_row_splits_index_, input_row_splits);
    Invoke();
  }

  std::vector<int> GetOutputShape() {
    return GetTensorShape(output_values_index_);
  }

  std::vector<float> ExtractOutputValue() {
    return ExtractVector<float>(output_values_index_);
  }

 private:
  // Serializes {hash_seed, buckets} as the flexbuffer map the custom op
  // expects for its options, then registers the op under its custom name.
  void BuildCustomOp(const std::vector<int64_t>& hash_seed, int64_t buckets) {
    flexbuffers::Builder fbb;
    size_t start_map = fbb.StartMap();
    auto vector_start = fbb.StartVector("hash_seed");
    // Range-for avoids the signed/unsigned comparison of the original
    // `int i < hash_seed.size()` index loop.
    for (const int64_t seed : hash_seed) {
      fbb.Add(seed);
    }
    fbb.EndVector(vector_start, /*typed=*/true, /*fixed=*/false);
    fbb.Int("buckets", buckets);
    fbb.EndMap(start_map);
    fbb.Finish();
    SetCustomOp("tftext:custom:SgnnProjection", fbb.GetBuffer(),
                Register_tftext_SGNN_PROJECTION);
  }

  int input_values_index_;
  int input_row_splits_index_;
  int output_values_index_;
};
// Keep same result of test_projection in sgnn_test.py.
TEST(SgnnProjectionTest, TensorSgnnProjection) {
  const std::vector<std::string> ngrams = {"^h", "he", "el", "ll", "lo",
                                           "o$", "^h", "hi", "i$"};
  SgnnProjectionModel model(ngrams,
                            /*input_row_splits=*/{0, 6, 9},
                            /*hash_seed=*/{5, 7},
                            /*buckets=*/0x7FFFFFFF);
  // Two ragged rows projected onto two seeds -> a 2x2 output.
  EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 2));
  EXPECT_THAT(model.ExtractOutputValue(),
              ElementsAreArray(ArrayFloatNear(
                  {0.448691, -0.238499, -0.037561, 0.080748})));
}
} // namespace test
} // namespace sgnn_projection
} // namespace custom
} // namespace ops
} // namespace tflite
# Copyright 2020 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Lint as: python3
"""Tests for sequence_projection.sgnn."""
import tensorflow as tf
from tensorflow.python.framework import test_util # pylint: disable=g-direct-tensorflow-import
import sgnn # import sequence_projection module
@test_util.run_all_in_graph_and_eager_modes
class SgnnTest(tf.test.TestCase):
  """Unit tests for the sgnn preprocessing, ngram and projection helpers."""

  def test_preprocess(self):
    # preprocess lower-cases and splits input strings; expected values are
    # byte strings because TF string tensors hold bytes.
    self.assertAllEqual(
        sgnn.preprocess(
            tf.constant([['Hello World!'], [u'你好'],
                         [u'مرحبا بالعالم']])),
        [['hello'.encode(), 'world!'.encode()], [u'你好'.encode()],
         [u'مرحبا'.encode(), u'بالعالم'.encode()]])

  def test_get_ngram(self):
    # Character trigrams, with '^' / '$' marking token start and end.
    tokens = tf.ragged.constant([['hello', 'world'], [u'你好'],
                                 [u'مرحبا', u'بالعالم']])
    self.assertAllEqual(
        sgnn.get_ngrams(tokens, 3),
        [[
            b'^he', b'hel', b'ell', b'llo', b'lo$', b'^wo', b'wor', b'orl',
            b'rld', b'ld$'
        ], [u'^你好'.encode(), u'你好$'.encode()],
         [
             u'^مر'.encode(), u'مرح'.encode(), u'رحب'.encode(),
             u'حبا'.encode(), u'با$'.encode(), u'^با'.encode(),
             u'بال'.encode(), u'الع'.encode(), u'لعا'.encode(),
             u'عال'.encode(), u'الم'.encode(), u'لم$'.encode()
         ]])

  def test_project(self):
    # Golden values are shared with the C++ kernel test
    # (sgnn_projection_test.cc); fused_project and project must agree.
    ngrams = tf.ragged.constant([[b'^h', b'he', b'el', b'll', b'lo', b'o$'],
                                 [b'^h', b'hi', b'i$']])
    self.assertAllClose(
        sgnn.fused_project(ngrams, [5, 7], 0x7FFFFFFF),
        [[0.448691, -0.238499], [-0.037561, 0.080748]])
    self.assertAllClose(
        sgnn.fused_project(ngrams, [5, 7], 0x7FFFFFFF),
        sgnn.project(ngrams, [5, 7], 0x7FFFFFFF))

  def test_sgnn(self):
    # End-to-end sgnn() on raw strings; argument semantics (seeds, ngram
    # size) are defined by the sgnn module -- verify against sgnn.py.
    self.assertAllClose(
        sgnn.sgnn(tf.constant([['hello'], ['hi']]), [3, 5, 7], 2),
        [[0.268503, 0.448691, -0.238499], [0.093143, -0.037561, 0.080748]])

  def test_keras_model(self):
    # Smoke test: model construction succeeds with a small configuration.
    hparams = sgnn.Hparams(learning_rate=2e-4)
    model = sgnn.keras_model([1, 2, 3, 4], 2, [100, 50], hparams)
    self.assertIsNotNone(model)
# Copyright 2020 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Script to train langid model.
The script builds language detection from wikipedia dataset,
builds SGNN model to train an on-device model to
predict the language of the given text.
"""
import os
from absl import app
from absl import flags
import numpy as np
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds
import sgnn # import sequence_projection module
# Command-line configuration for the langid training script.
FLAGS = flags.FLAGS

flags.DEFINE_string('output_dir', '/tmp/langid',
                    'Path for the output directory.')
flags.DEFINE_integer('projection_size', 600, 'Size of projection layer.')
flags.DEFINE_integer('ngram_size', 3, 'Max size of ngram to project features.')
flags.DEFINE_string('fc_layer', '256,128',
                    'Size of fully connected layer, separated by comma.')
flags.DEFINE_integer('batch_size', 160, 'Batch size for training.')
flags.DEFINE_integer('epochs', 10, 'Num of epochs for training.')
flags.DEFINE_float('learning_rate', 2e-4, 'learning rate for optimizer.')

# Wikipedia language codes the classifier is trained on; an extra output
# class is appended in train_and_evaluate (len(LANGIDS) + 1).
LANGIDS = ['ar', 'en', 'es', 'fr', 'ru', 'zh']
def dataset_fn(batch_size, is_training, split, try_gcs, max_input_len):
  """Creates dataset to train and evaluate.

  Args:
    batch_size: Batch size for training or evaluation.
    is_training: True if the dataset is for training.
    split: Split of dataset, follow the pattern defined in
      https://www.tensorflow.org/datasets/splits
    try_gcs: True if loading the data from gcs.
    max_input_len: Max length of input string.

  Returns:
    Dataset object yielding (text, label_index) batches.
  """

  def _get_text(item):
    # Truncate each wikipedia article to the first max_input_len bytes.
    return tf.strings.substr(item['text'], 0, max_input_len)

  all_data = []
  for idx, langid in enumerate(LANGIDS):
    dataset = tfds.load(
        'wikipedia/20190301.%s' % langid, try_gcs=try_gcs, split=split)
    # Bind the current label index through a default argument. A plain
    # closure would capture `idx` by reference, making the labels depend
    # on when tf.data traces the function rather than on this iteration.
    map_fn = lambda item, label=idx: (_get_text(item), label)
    dataset = dataset.map(map_fn)
    all_data.append(dataset)
  # Sample uniformly across the per-language datasets.
  datasets = tf.data.experimental.sample_from_datasets(
      all_data, [1. / len(all_data)] * len(all_data))
  # Training repeats indefinitely; evaluation makes a single pass.
  repeat_count = None if is_training else 1
  return datasets.cache().shuffle(100000).batch(batch_size).repeat(repeat_count)
def save_and_convert(model, output_dir):
  """Save keras model and convert to tflite."""
  # Export in SavedModel format first: the TFLite converter consumes the
  # exported directory, not the in-memory keras model.
  saved_model_path = os.path.join(output_dir, 'saved_model')
  tf.saved_model.save(model, saved_model_path)

  converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_path)
  # The SGNN projection op is a custom op, and some ops fall back to the
  # TF runtime via SELECT_TF_OPS.
  converter.allow_custom_ops = True
  converter.target_spec.supported_ops = [
      tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS
  ]

  tflite_model = converter.convert()
  tflite_path = os.path.join(output_dir, 'model.tflite')
  with open(tflite_path, 'wb') as tflite_file:
    tflite_file.write(tflite_model)
def train_and_evaluate():
  """Train and evaluate the model, then export it under FLAGS.output_dir."""
  # Random projection seeds scaled to the int32 range used by the SGNN op.
  hash_seed = np.random.uniform(-1, 1, FLAGS.projection_size) * 0x7FFFFFFF
  fc_layer = [int(fc) for fc in FLAGS.fc_layer.split(',')]
  # Output layer: one class per language plus one extra class.
  fc_layer.append(len(LANGIDS) + 1)
  hparams = sgnn.Hparams(learning_rate=FLAGS.learning_rate)
  model = sgnn.keras_model(hash_seed, FLAGS.ngram_size, fc_layer, hparams)
  model.fit(
      dataset_fn(FLAGS.batch_size, True, 'train[:10%]', True, 100),
      epochs=FLAGS.epochs,
      steps_per_epoch=1000,
      validation_steps=100,
      # Validate on the 10%-11% slice. The original 'train[10:11%]' mixed an
      # absolute boundary with a percent boundary, which tfds rejects.
      validation_data=dataset_fn(FLAGS.batch_size, False, 'train[10%:11%]',
                                 True, 100),
  )
  save_and_convert(model, FLAGS.output_dir)
def main(_):
  """Entry point: ensures the output directory exists, then trains."""
  # makedirs(exist_ok=True) also creates missing parent directories and
  # avoids the check-then-create race of `if not exists: os.mkdir(...)`.
  os.makedirs(FLAGS.output_dir, exist_ok=True)
  train_and_evaluate()


if __name__ == '__main__':
  app.run(main)
# Tensorflow ops for sequence string projection.

load("//tf_ops:build_def.bzl", "gen_op_wrapper_py")

licenses(["notice"])

package(
    default_visibility = [
        "//:__subpackages__",
    ],
)

# TF op kernel: projects sequences of string tokens into features.
cc_library(
    name = "sequence_string_projection_op",
    srcs = [
        "sequence_string_projection.cc",
    ],
    deps = [
        ":projection_normalizer_util",
        ":projection_tokenizer_util",
        ":projection_util",
        ":text_distorter",
        "@com_google_absl//absl/container:flat_hash_map",
        "@com_google_absl//absl/random",
        "@tensorflow_includes//:includes",
        "@tensorflow_solib//:framework_lib",
    ],
    # Kernels register themselves via static initializers; keep them linked.
    alwayslink = 1,
)

# Unicode-aware string helpers shared by the projection ops.
cc_library(
    name = "projection_util",
    srcs = ["projection_util.cc"],
    hdrs = ["projection_util.h"],
    deps = [
        "@utf_archive//:utf",
    ],
)

# Separator-based tokenization helpers.
cc_library(
    name = "projection_tokenizer_util",
    srcs = ["projection_tokenizer_util.cc"],
    hdrs = ["projection_tokenizer_util.h"],
    deps = [
        ":projection_util",
        "@utf_archive//:utf",
    ],
)

# Text normalization (repetition contraction, separator spacing).
cc_library(
    name = "projection_normalizer_util",
    srcs = ["projection_normalizer_util.cc"],
    hdrs = ["projection_normalizer_util.h"],
    deps = [
        ":projection_util",
        "@utf_archive//:utf",
    ],
)

cc_library(
    name = "text_distorter",
    srcs = ["text_distorter.cc"],
    hdrs = ["text_distorter.h"],
    deps = [
        "@com_google_absl//absl/strings",
        "@icu4c",
        "@tensorflow_includes//:includes",
        "@tensorflow_solib//:framework_lib",
    ],
)

cc_test(
    name = "sequence_string_projection_test",
    size = "small",
    srcs = ["sequence_string_projection_test.cc"],
    deps = [
        ":sequence_string_projection_op",
        "@tensorflow_includes//:includes",
        "@tensorflow_solib//:framework_lib",
    ],
)

# V2 variant of the projection op (no tokenizer dependency).
cc_library(
    name = "sequence_string_projection_op_v2",
    srcs = [
        "sequence_string_projection_op_v2.cc",
    ],
    deps = [
        ":projection_normalizer_util",
        ":projection_util",
        ":text_distorter",
        "@tensorflow_includes//:includes",
        "@tensorflow_solib//:framework_lib",
    ],
    alwayslink = 1,
)

cc_test(
    name = "sequence_string_projection_op_v2_test",
    size = "small",
    srcs = ["sequence_string_projection_op_v2_test.cc"],
    deps = [
        ":sequence_string_projection_op_v2",
        "@tensorflow_includes//:includes",
        "@tensorflow_solib//:framework_lib",
    ],
)

# Python wrappers that load the op kernels via tf.load_op_library.
gen_op_wrapper_py(
    name = "sequence_string_projection_op_v2_py",
    out = "sequence_string_projection_op_v2.py",
    kernel_lib = ":sequence_string_projection_op_v2",
)

gen_op_wrapper_py(
    name = "sequence_string_projection_op_py",
    out = "sequence_string_projection_op.py",
    kernel_lib = ":sequence_string_projection_op",
)
def tf_deps():
    """Returns the TensorFlow headers/solib deps for custom-op targets."""
    return [
        "@tensorflow_includes//:includes",
        "@tensorflow_solib//:framework_lib",
    ]
def tf_copts():
    """Returns compiler options used when building TensorFlow custom ops."""
    return ["-Wno-sign-compare"]
def _make_search_paths(prefix, levels_to_root):
    """Builds a comma-joined list of -rpath entries.

    One entry is produced per directory level, from `prefix` itself up to
    `levels_to_root` parent levels (e.g. prefix/, prefix/.., prefix/../..).
    """
    search_paths = []
    for search_level in range(levels_to_root + 1):
        suffix = "/".join([".."] * search_level)
        search_paths.append("-rpath,%s/%s" % (prefix, suffix))
    return ",".join(search_paths)
def _rpath_linkopts(name):
    """Returns -Wl,-rpath linkopts so the DSO can find libtensorflow at runtime.

    Search parent directories up to the TensorFlow root directory for shared
    object dependencies, even if this op shared object is deeply nested
    (e.g. tensorflow/contrib/package:python/ops/_op_lib.so). tensorflow/ is then
    the root and tensorflow/libtensorflow_framework.so should exist when
    deployed. Other shared object dependencies (e.g. shared between contrib/
    ops) are picked up as long as they are in either the same or a parent
    directory in the tensorflow/ tree.
    """
    levels_to_root = native.package_name().count("/") + name.count("/")
    return ["-Wl,%s" % (_make_search_paths("$$ORIGIN", levels_to_root),)]
def gen_op_wrapper_py(name, out, kernel_lib, linkopts = [], **kwargs):
    """Generates the py_library `name` with a data dep on the ops in kernel_lib.

    The resulting py_library creates file `$out`, and has a dependency on a
    symbolic library called lib{$name}_gen_op.so, which contains the kernels
    and ops and can be loaded via `tf.load_op_library`.

    Args:
      name: The name of the py_library.
      out: The name of the python file. Use "gen_{name}_ops.py".
      kernel_lib: A cc_kernel_library target to generate for.
      linkopts: Extra linkopts passed to the generated cc_binary.
      **kwargs: Any args to the `cc_binary` and `py_library` internal rules.
    """
    if not out.endswith(".py"):
        fail("Argument out must end with '.py', but saw: {}".format(out))

    module_name = "lib{}_gen_op".format(name)
    version_script_file = "%s-version-script.lds" % module_name

    # Version script restricting the DSO's exported symbols.
    native.genrule(
        name = module_name + "_version_script",
        outs = [version_script_file],
        cmd = "echo '{global:\n *tensorflow*;\n *deepmind*;\n local: *;};' >$@",
        output_licenses = ["unencumbered"],
        visibility = ["//visibility:private"],
    )

    # The shared object holding the kernels/ops, loaded at runtime via
    # tf.load_op_library from the generated python stub below.
    native.cc_binary(
        name = "{}.so".format(module_name),
        deps = [kernel_lib] + tf_deps() + [version_script_file],
        copts = tf_copts() + [
            "-fno-strict-aliasing",  # allow a wider range of code [aliasing] to compile.
            "-fvisibility=hidden",  # avoid symbol clashes between DSOs.
        ],
        linkshared = 1,
        linkopts = linkopts + _rpath_linkopts(module_name) + [
            "-Wl,--version-script",
            "$(location %s)" % version_script_file,
        ],
        **kwargs
    )

    # Writes the python stub that loads the DSO and re-exports its symbols.
    # NOTE(review): the `_reverb_gen_op` variable name in the emitted file
    # appears copied from another project; it is local to the stub.
    native.genrule(
        name = "{}_genrule".format(out),
        outs = [out],
        cmd = """
        echo 'import tensorflow as tf
_reverb_gen_op = tf.load_op_library(
    tf.compat.v1.resource_loader.get_path_to_datafile(
        "lib{}_gen_op.so"))
_locals = locals()
for k in dir(_reverb_gen_op):
  _locals[k] = getattr(_reverb_gen_op, k)
del _locals' > $@""".format(name),
    )

    native.py_library(
        name = name,
        srcs = [out],
        data = [":lib{}_gen_op.so".format(name)],
        **kwargs
    )
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tf_ops/projection_normalizer_util.h" // sequence_projection
#include <algorithm>
#include <cstddef>
#include <memory>
#include <sstream>
#include <utility>
#include "tf_ops/projection_util.h" // sequence_projection
// Returns true if the given UTF-8 text contains an ASCII digit.
bool IsDigit(const std::string& text) {
  Rune rune;
  for (size_t i = 0; i < text.length();) {
    // Decode the rune at offset i with the remaining byte count. The
    // original call used text.data() with a fixed length of 1, so it never
    // advanced past the first character and could not decode multi-byte
    // runes (same decode pattern as the other tf_ops utilities).
    const int bytes_read =
        charntorune(&rune, text.data() + i, text.length() - i);
    if (rune == Runeerror || bytes_read == 0) break;
    if (rune >= static_cast<Rune>('0') && rune <= static_cast<Rune>('9')) {
      return true;
    }
    i += bytes_read;
  }
  return false;
}
// Gets the string containing |num_chars| characters from |start| position.
// Returns the empty string when the requested span runs past the end of
// |char_tokens|.
std::string GetCharToken(const std::vector<std::string>& char_tokens,
                         size_t start, size_t num_chars) {
  if (start + num_chars > char_tokens.size()) {
    return "";
  }
  std::string result;
  for (size_t offset = 0; offset < num_chars; ++offset) {
    result += char_tokens[start + offset];
  }
  return result;
}
// Counts how many consecutive times |pattern| appears in |char_tokens|,
// starting at |start| and stepping |num_chars| characters at a time.
int GetNumPattern(const std::vector<std::string>& char_tokens, size_t start,
                  size_t num_chars, const std::string& pattern) {
  int repeats = 0;
  size_t pos = start;
  while (pos < char_tokens.size() &&
         GetCharToken(char_tokens, pos, num_chars) == pattern) {
    ++repeats;
    pos += num_chars;
  }
  return repeats;
}
// Contracts runs of a repeated |num_chars|-long pattern down to exactly two
// occurrences (e.g. num_chars == 2: "hahaha" -> "haha"). Patterns containing
// a space or a digit are never contracted.
std::string ContractToken(const char* input_ptr, size_t len, size_t num_chars) {
  // This function contracts patterns whose length is |num_chars| and appeared
  // more than twice. So if the input is shorter than 3 * |num_chars|, do not
  // apply any contraction.
  if (len < 3 * num_chars) {
    // Construct from the explicit length. The original `return input_ptr;`
    // relied on a terminating NUL, which would truncate or over-read inputs
    // that are not exactly NUL-terminated at |len|.
    return std::string(input_ptr, len);
  }
  std::vector<std::string> char_tokens = SplitByChar(input_ptr, len, len);
  std::string token;
  token.reserve(len);
  for (size_t i = 0; i < char_tokens.size();) {
    std::string cur_pattern = GetCharToken(char_tokens, i, num_chars);
    // Count how many additional times this pattern appears right after i.
    int num_cur_patterns = 0;
    if (cur_pattern.find(" ") == std::string::npos && !IsDigit(cur_pattern)) {
      num_cur_patterns =
          GetNumPattern(char_tokens, i + num_chars, num_chars, cur_pattern);
    }
    if (num_cur_patterns >= 2) {
      // If this pattern is repeated, store it only twice.
      token.append(cur_pattern);
      token.append(cur_pattern);
      i += (num_cur_patterns + 1) * num_chars;
    } else {
      token.append(char_tokens[i]);
      ++i;
    }
  }
  return token;
}
// Records every non-space character of |separators| as a separator.
void ProjectionNormalizer::InitializeSeparators(const std::string& separators) {
  for (const char separator : separators) {
    if (separator != ' ') {
      separators_.insert(separator);
    }
  }
}
// Inserts a space before and after every separator character so separators
// become standalone tokens. A space is only inserted when the neighboring
// character is not already a space, and no space is added after an
// apostrophe — presumably so contractions stay attached to the following
// token, matching ProjectionTokenizer's apostrophe handling (confirm).
std::string ProjectionNormalizer::NormalizeInternal(const char* input_ptr,
                                                    size_t len) {
  std::string normalized;
  // Worst case every input character gains a surrounding space.
  normalized.reserve(len * 2);
  for (size_t i = 0; i < len; ++i) {
    char c = input_ptr[i];
    bool matched_separator = separators_.find(c) != separators_.end();
    if (matched_separator) {
      // Space before the separator. i > 0 guarantees a prior iteration
      // appended at least one char, so normalized.back() is safe.
      if (i > 0 && input_ptr[i - 1] != ' ' && normalized.back() != ' ') {
        normalized.append(" ");
      }
    }
    normalized.append(1, c);
    if (matched_separator) {
      // Space after the separator, except after an apostrophe.
      if (i + 1 < len && input_ptr[i + 1] != ' ' && c != '\'') {
        normalized.append(" ");
      }
    }
  }
  return normalized;
}
// Convenience overload: normalizes a std::string by delegating to the
// pointer/length overload.
std::string ProjectionNormalizer::Normalize(const std::string& input,
                                            size_t max_input) {
  return Normalize(input.data(), input.length(), max_input);
}
// Normalizes up to |max_input| bytes of |input_ptr|: optionally contracts
// character repetitions, then pads separators with spaces.
std::string ProjectionNormalizer::Normalize(const char* input_ptr, size_t len,
                                            size_t max_input) {
  std::string normalized(input_ptr, std::min(len, max_input));
  if (normalize_repetition_) {
    // Contract repeated patterns of length 1, 2 and 3, in that order:
    //   1: soooo => soo
    //   2: hahaha => haha, xhahaha => xhaha, xyhahaha => xyhaha
    //   3: wowwowwow => wowwow, abcdbcdbcd => abcdbcd
    for (const size_t pattern_length : {1, 2, 3}) {
      normalized = ContractToken(normalized.data(), normalized.length(),
                                 pattern_length);
    }
  }
  if (!separators_.empty()) {
    // Add space around separators_.
    normalized = NormalizeInternal(normalized.data(), normalized.length());
  }
  return normalized;
}
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_PROJECTION_NORMALIZER_UTIL_H_
#define TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_PROJECTION_NORMALIZER_UTIL_H_

#include <string>
#include <unordered_set>
#include <vector>

#include "libutf/utf.h"

// Normalizes the input with the given |separators| by adding a space before and
// after each separator. When |normalize_repetition| is true, it removes the
// repeated characters (except numbers) which consecutively appeared more than
// twice in a word.
// Examples: arwwwww -> arww, good!!!!! -> good!!, hahaha => haha.
class ProjectionNormalizer {
 public:
  explicit ProjectionNormalizer(const std::string& separators,
                                bool normalize_repetition = false) {
    InitializeSeparators(separators);
    normalize_repetition_ = normalize_repetition;
  }

  // Normalizes the repeated characters (except numbers) which consecutively
  // appeared more than twice in a word, then spaces out separators. Only the
  // first |max_input| bytes of the input are processed.
  std::string Normalize(const std::string& input, size_t max_input = 300);
  std::string Normalize(const char* input_ptr, size_t len,
                        size_t max_input = 300);

 private:
  // Parses and extracts supported separators (spaces are ignored).
  void InitializeSeparators(const std::string& separators);

  // Adds a space before/after each separator character in the input.
  std::string NormalizeInternal(const char* input_ptr, size_t len);

  // Characters treated as separators.
  std::unordered_set<char> separators_;
  // Whether Normalize() contracts repeated patterns before separating.
  bool normalize_repetition_;
};

#endif  // TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_PROJECTION_NORMALIZER_UTIL_H_
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tf_ops/projection_tokenizer_util.h" // sequence_projection
#include <cstddef>
#include <iostream>
#include <memory>
#include <sstream>
#include <utility>
#include "tf_ops/projection_util.h" // sequence_projection
namespace {
// Characters with special tokenization behavior.
constexpr char kApostrophe = '\'';
constexpr char kSpace = ' ';
constexpr char kComma = ',';
constexpr char kDot = '.';
// Sentinel index meaning "not found" (size_t(-1) is the maximum value).
constexpr size_t kInvalid = -1;
}  // namespace

// Returns true if the input |c| is ascii number.
bool is_numeric(char c) { return c >= '0' && c <= '9'; }

// Returns true if we want to prepend the separator to the next token.
// Only the apostrophe is prepended, so "I'm" becomes "I", "'m".
bool prepend_separator(char separator) { return separator == kApostrophe; }
// Records every character of |separators| as a separator (unlike the
// normalizer, spaces are not filtered out here).
void ProjectionTokenizer::InitializeSeparators(const std::string& separators) {
  for (const char separator : separators) {
    separators_.insert(separator);
  }
}
// Starting from input_ptr[from], returns the index of the next separator
// character, or kInvalid when none occurs before input_ptr[length]
// (non-inclusive).
size_t ProjectionTokenizer::FindNextSeparator(const char* input_ptr,
                                              size_t from,
                                              size_t length) const {
  auto index = from;
  while (index < length) {
    char c = input_ptr[index];
    // Do not break a number (e.g. "10,000", "0.23"): a comma or dot that is
    // immediately followed by a digit is skipped along with that digit
    // (note ++index advances past the punctuation here).
    if (c == kComma || c == kDot) {
      if (index + 1 < length && is_numeric(input_ptr[index + 1])) {
        c = input_ptr[++index];
      }
    }
    if (separators_.find(c) != separators_.end()) {
      break;
    }
    ++index;
  }
  return index == length ? kInvalid : index;
}
// Tokenizes input_ptr[0, min(len, max_input)) on separators_. Separator
// characters other than space/apostrophe are emitted as single-character
// tokens of their own; an apostrophe is kept attached to the token that
// follows it. At most max_tokens tokens are produced when max_tokens is
// not kAllTokens.
std::vector<std::string> ProjectionTokenizer::Tokenize(
    const char* input_ptr, size_t len, size_t max_input,
    size_t max_tokens) const {
  // If separators_ is not given, tokenize the input with a space.
  if (separators_.empty()) {
    return SplitBySpace(input_ptr, len, max_input, max_tokens);
  }
  std::vector<std::string> tokens;
  // kEntireString disables truncation at max_input.
  size_t last_index =
      max_input == kEntireString ? len : (len < max_input ? len : max_input);
  size_t start = 0;
  // Skip leading spaces.
  while (start < last_index && input_ptr[start] == kSpace) {
    start++;
  }
  auto end = FindNextSeparator(input_ptr, start, last_index);
  while (end != kInvalid &&
         (max_tokens == kAllTokens || tokens.size() < max_tokens - 1)) {
    auto length = end - start;
    if (length > 0) tokens.emplace_back(input_ptr + start, length);
    // Add the separator (except space and apostrophe) as a token.
    char separator = input_ptr[end];
    if (separator != kSpace && separator != kApostrophe) {
      tokens.emplace_back(input_ptr + end, 1);
    }
    // An apostrophe stays at the start of the next token (offset 0).
    start = end + (prepend_separator(separator) ? 0 : 1);
    end = FindNextSeparator(input_ptr, end + 1, last_index);
  }
  // Emit the final token: the tail of the input when no further separator
  // was found, otherwise the span up to the next separator.
  auto length = end == kInvalid ? (last_index - start) : (end - start);
  if (length > 0) tokens.emplace_back(input_ptr + start, length);
  return tokens;
}
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_PROJECTION_TOKENIZER_UTIL_H_
#define TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_PROJECTION_TOKENIZER_UTIL_H_

#include <string>
#include <unordered_set>
#include <vector>

#include "libutf/utf.h"

// Tokenizes the input with the given separators. To properly tokenize a text
// containing contractions in English (e.g. I'm), it combines the apostrophe
// with the token coming after it. For example, the text "I'm happy" is
// tokenized into three tokens: "I", "'m", "happy". When |separators| is not
// given, use the space to tokenize the input.
// Note) This tokenization supports only English.
class ProjectionTokenizer {
 public:
  explicit ProjectionTokenizer(const std::string& separators) {
    InitializeSeparators(separators);
  }

  // Tokenizes the input by separators_. Limit to max_tokens, when it is not -1.
  // Only the first |max_input| bytes of the input are considered.
  std::vector<std::string> Tokenize(const std::string& input, size_t max_input,
                                    size_t max_tokens) const {
    return Tokenize(input.c_str(), input.size(), max_input, max_tokens);
  }

  std::vector<std::string> Tokenize(const char* input_ptr, size_t len,
                                    size_t max_input, size_t max_tokens) const;

 private:
  // Parses and extracts supported separators.
  void InitializeSeparators(const std::string& separators);

  // Starting from input_ptr[from], search for the next occurrence of
  // separators_. Don't search beyond input_ptr[length](non-inclusive). Return
  // -1 if not found.
  size_t FindNextSeparator(const char* input_ptr, size_t from,
                           size_t length) const;

  // Characters treated as token boundaries.
  std::unordered_set<char> separators_;
};

#endif  // TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_PROJECTION_TOKENIZER_UTIL_H_
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tf_ops/projection_util.h" // sequence_projection
#include <cstddef>
#include <iostream>
#include <memory>
#include <sstream>
namespace {
// Sentinel index meaning "not found" (size_t(-1) is the maximum value).
constexpr size_t kInvalid = -1;
constexpr char kSpace = ' ';
}  // namespace
// Lower-cases the UTF-8 text given as a (pointer, byte-length) pair, keeping
// only runes accepted by the vocabulary (all runes when the vocabulary is
// unrestricted). When exclude_nonalphaspace_unicodes_ is set, runes that are
// neither alphabetic nor the space character are dropped as well.
std::string ProjectionUnicodeHandler::LowerCaseUTF8WithSupportedUnicodes(
    const std::pair<const char*, size_t>& source) const {
  // Ideally the size of target should be less than or equal to source. But
  // when we do to_lower the number of bytes needed to encode a unicode
  // character could increase. To account for this 4 times the source length
  // is allocated for target.
  const char* csource = source.first;
  int len = source.second;
  auto target = std::unique_ptr<char[]>(new char[len * 4]);
  auto target_ptr = target.get();
  int i = 0;
  while (i < len) {
    Rune rune;
    const int bytes_read = charntorune(&rune, csource + i, len - i);
    if (bytes_read == 0) {
      break;
    }
    i += bytes_read;
    // Decoding errors are skipped rather than aborting the whole string.
    if (rune != Runeerror) {
      Rune lower = tolowerrune(rune);
      // Skip processing the unicode if exclude_nonalphaspace_unicodes_ is true
      // and the unicode is not alpha and not space.
      const Rune kSpaceRune = ' ';
      if (exclude_nonalphaspace_unicodes_ && !isalpharune(lower) &&
          lower != kSpaceRune) {
        continue;
      }
      if (IsUnrestrictedVocabulary() || IsValidUnicode(lower)) {
        const int bytes_written = runetochar(target_ptr, &lower);
        target_ptr += bytes_written;
      }
    }
  }
  // Construct from the [begin, end) range of bytes actually written.
  return std::string(target.get(), target_ptr);
}
// Parses |vocabulary| as UTF-8 and assigns each new, lower-case rune a
// consecutive index in valid_chars_. Malformed, duplicate, and upper-case
// runes are logged and skipped (note: the log messages have no trailing
// newline, so consecutive messages run together on one line).
void ProjectionUnicodeHandler::InitializeVocabulary(
    const std::string& vocabulary) {
  for (size_t i = 0, index = 0; i < vocabulary.length();) {
    Rune rune;
    const int bytes_read =
        charntorune(&rune, vocabulary.c_str() + i, vocabulary.length() - i);
    if (!bytes_read) {
      break;
    }
    i += bytes_read;
    // Include novel lower case unicode segments as part of valid chars.
    if (rune == Runeerror) {
      std::clog << "Invalid rune in vocabulary.";
    } else if (IsValidUnicode(rune)) {
      // Already registered earlier in the vocabulary string.
      std::clog << "Duplicate rune " << rune << " found in vocabulary.";
    } else if (rune != tolowerrune(rune)) {
      std::clog << "Upper case rune " << rune << " found in vocabulary.";
    } else {
      valid_chars_[rune] = index++;
    }
  }
}
// Scans input_ptr[from, length) for the next ' ' character and returns its
// index; returns size_t(-1) (the file's kInvalid sentinel) when the range
// contains no space.
inline size_t FindNextSpace(const char* input_ptr, size_t from, size_t length) {
  constexpr size_t kNotFound = static_cast<size_t>(-1);
  for (size_t pos = from; pos < length; ++pos) {
    if (input_ptr[pos] == ' ') {
      return pos;
    }
  }
  return kNotFound;
}
// Splits input_ptr[0, min(len, max_input)) on single spaces, appending each
// non-empty piece to *tokens as a T (std::string or (ptr, len) pair —
// whichever T's (const char*, size_t) constructor builds).
// Emits at most max_tokens pieces (kAllTokens == unlimited). Note: when the
// cap is hit, the final piece is the next space-delimited word, not the
// remainder of the input window.
template <typename T>
void SplitBySpaceInternal(std::vector<T>* tokens, const char* input_ptr,
                          size_t len, size_t max_input, size_t max_tokens) {
  // Effective scan window: the whole buffer, or its max_input-byte prefix.
  size_t last_index =
      max_input == kEntireString ? len : (len < max_input ? len : max_input);
  size_t start = 0;
  // skip leading spaces
  while (start < last_index && input_ptr[start] == kSpace) {
    start++;
  }
  auto end = FindNextSpace(input_ptr, start, last_index);
  // Emit full tokens while another space exists and the cap leaves room for
  // one more token after this one (the last slot is filled below).
  while (end != kInvalid &&
         (max_tokens == kAllTokens || tokens->size() < max_tokens - 1)) {
    auto length = end - start;
    if (length > 0) {  // Consecutive spaces produce empty spans; skip them.
      tokens->emplace_back(input_ptr + start, length);
    }
    start = end + 1;
    end = FindNextSpace(input_ptr, start, last_index);
  }
  // Final token: up to the next space if one remains, else to the window end.
  auto length = end == kInvalid ? (last_index - start) : (end - start);
  if (length > 0) {
    tokens->emplace_back(input_ptr + start, length);
  }
}
// Space-tokenizes the first `len` bytes of input_ptr into non-owning
// (pointer, length) pieces, emitting at most max_tokens of them.
std::vector<std::pair<const char*, size_t>> SplitBySpaceAsPairs(
    const char* input_ptr, size_t len, size_t max_tokens) {
  std::vector<std::pair<const char*, size_t>> pieces;
  SplitBySpaceInternal(&pieces, input_ptr, len, kEntireString, max_tokens);
  return pieces;
}
// Space-tokenizes input_ptr into owned std::string tokens, honoring both the
// max_input byte window and the max_tokens cap.
std::vector<std::string> SplitBySpace(const char* input_ptr, size_t len,
                                      size_t max_input, size_t max_tokens) {
  std::vector<std::string> pieces;
  SplitBySpaceInternal(&pieces, input_ptr, len, max_input, max_tokens);
  return pieces;
}
// Splits the first `len` bytes of input_ptr into one piece per decoded
// unicode character, stopping at max_tokens pieces (kInvalid == no cap) or
// at the first undecodable/truncated rune.
template <typename T>
void SplitByCharInternal(std::vector<T>* tokens, const char* input_ptr,
                         size_t len, size_t max_tokens) {
  size_t offset = 0;
  while (offset < len) {
    Rune rune;
    const int consumed = charntorune(&rune, input_ptr + offset, len - offset);
    if (consumed == 0) {
      break;  // Truncated rune at the end of the buffer.
    }
    tokens->emplace_back(input_ptr + offset, consumed);
    if (max_tokens != kInvalid && tokens->size() == max_tokens) {
      return;
    }
    offset += consumed;
  }
}
// Character-tokenizes the first `len` bytes of input_ptr into non-owning
// (pointer, length) pieces, one per unicode character, capped at max_tokens.
std::vector<std::pair<const char*, size_t>> SplitByCharAsPairs(
    const char* input_ptr, size_t len, size_t max_tokens) {
  std::vector<std::pair<const char*, size_t>> pieces;
  SplitByCharInternal(&pieces, input_ptr, len, max_tokens);
  return pieces;
}
// Character-tokenizes the first `len` bytes of input_ptr into owned
// std::string pieces, one per unicode character, capped at max_tokens.
std::vector<std::string> SplitByChar(const char* input_ptr, size_t len,
                                     size_t max_tokens) {
  std::vector<std::string> pieces;
  SplitByCharInternal(&pieces, input_ptr, len, max_tokens);
  return pieces;
}
// Concatenates the given (pointer, length) string pieces, separated by a
// single space, and returns the joined string.
std::string JoinPairsBySpace(
    std::vector<std::pair<const char*, size_t>> words) {
  std::string joined;
  const char* separator = "";
  for (const auto& piece : words) {
    joined.append(separator);
    joined.append(piece.first, piece.second);
    separator = " ";
  }
  return joined;
}
// Tokenizes `str` either on spaces or per unicode character, returning
// non-owning (pointer, length) pieces capped at max_tokens.
std::vector<std::pair<const char*, size_t>> ProjectionUnicodeHandler::Tokenize(
    const char* str, size_t len, bool by_space, int max_tokens) const {
  if (by_space) {
    return SplitBySpaceAsPairs(str, len, max_tokens);
  }
  return SplitByCharAsPairs(str, len, max_tokens);
}
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_PROJECTION_UTIL_H_
#define TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_PROJECTION_UTIL_H_
#include <cassert>
#include <string>
#include <unordered_map>
#include <vector>
#include "libutf/utf.h"
// Decodes a single rune from the first `n` bytes of `s`. Returns the number
// of bytes consumed, or 0 (with *r set to Runeerror) when `n` is empty or the
// buffer ends mid-rune.
// NOTE(review): chartorune itself may still examine bytes past s + n while
// decoding a multi-byte rune before the bytes_read > n check rejects it;
// callers in this file always pass NUL-terminated storage, which makes this
// safe in practice — verify before using with non-terminated buffers.
inline int charntorune(Rune* r, const char* s, int n) {
  // chartorune unconditionally reads at least one byte, so reject an empty
  // window before decoding.
  if (n <= 0) {
    *r = Runeerror;
    return 0;
  }
  const int bytes_read = chartorune(r, const_cast<char *>(s));
  if (bytes_read > n) {
    *r = Runeerror;
    return 0;
  }
  return bytes_read;
}
// A hashing wrapper class that can hash a string and generate a hash code with
// requested number of features (two bit values). Some of the implementations
// are copied from murmurhash.
// A hashing wrapper class that can hash a string and generate a hash code
// with the requested number of features (two-bit values). Some of the
// implementations are copied from murmurhash.
class Hasher {
 public:
  // Precomputes the hash codes for the empty-word sentinel ("<null>") so
  // GetHashCodes can return them without rehashing.
  explicit Hasher(int feature_size) : feature_size_(feature_size) {
    GetHashCodesInternal(empty_string_, &null_hash_codes_);
  }
  // Fills *hash_codes with ceil(feature_size / 64) pairs of 64-bit codes for
  // `word`; an empty word maps to the precomputed "<null>" codes. The vector
  // is overwritten, not appended to.
  void GetHashCodes(const std::string& word,
                    std::vector<uint64_t>* hash_codes) {
    if (word.empty()) {
      *hash_codes = null_hash_codes_;
    } else {
      hash_codes->clear();
      GetHashCodesInternal(word, hash_codes);
    }
  }

 private:
  // Murmur multipliers; kMul2 is an alternative mixer (prime just above the
  // golden ratio) used for the second 64-bit lane.
  static constexpr uint64_t kMul = 0xc6a4a7935bd1e995ULL;
  static constexpr uint64_t kMul2 = 0x9e3779b97f4a7835ULL;
  // Folds the high bits of `val` into its low bits.
  inline uint64_t ShiftMix(uint64_t val) { return val ^ (val >> 47); }
  // One murmur round: mix one 64-bit word of input into the running hash.
  inline uint64_t MurmurStep(uint64_t hash, uint64_t data) {
    hash ^= ShiftMix(data * kMul) * kMul;
    hash *= kMul;
    return hash;
  }
  // Loads the first `len` (1..8) bytes of p into a uint64, last byte in the
  // lowest position read order-wise.
  // NOTE(review): buf is plain char, which is signed on most platforms; for
  // bytes >= 0x80 the `| buf[len]` int promotion sign-extends and ORs 1-bits
  // into the high bits of val. Deterministic, but likely not the intended
  // murmur tail load — fixing it would change every hash code (and break any
  // trained model that depends on them), so it is flagged rather than fixed.
  inline uint64_t Load64VariableLength(const void* p, int len) {
    assert(len >= 1 && len <= 8);
    const char* buf = static_cast<const char*>(p);
    uint64_t val = 0;
    --len;
    do {
      val = (val << 8) | buf[len];
      // (--len >= 0) is about 10 % faster than (len--) in some benchmarks.
    } while (--len >= 0);
    // No ToHost64(...) needed. The bytes are accessed in little-endian manner
    // on every architecture.
    return val;
  }
  // Derives the next 128 bits (two 64-bit codes) from the previous pair, used
  // when feature_size_ needs more bits than one MurmurHash128 call yields.
  void GetMoreBits(uint64_t hash, uint64_t hash2, uint64_t* rlow,
                   uint64_t* rhigh) {
    hash = ShiftMix(hash) * kMul;
    hash2 ^= hash;
    *rhigh = ShiftMix(hash);
    *rlow = ShiftMix(hash2 * kMul2) * kMul2;
  }
  // 128-bit murmur-style hash of buf[0, len), returned as (low, high).
  std::pair<uint64_t, uint64_t> MurmurHash128(const char* buf,
                                              const size_t len) {
    // Initialize the hashing value.
    uint64_t hash = len * kMul;
    // hash2 will be xored by hash during the hash computation iterations.
    // In the end we use an alternative mixture multiplier for mixing
    // the bits in hash2.
    uint64_t hash2 = 0;
    // Let's remove the bytes not divisible by the sizeof(uint64_t).
    // This allows the inner loop to process the data as 64 bit integers.
    const size_t len_aligned = len & ~0x7;
    const char* end = buf + len_aligned;
    for (const char* p = buf; p != end; p += 8) {
      // Manually unrolling this loop 2x did not help on Intel Core 2.
      hash = MurmurStep(hash, Load64VariableLength(p, 8));
      hash2 ^= hash;
    }
    // Mix in the 1..7 trailing bytes, if any.
    if ((len & 0x7) != 0) {
      const uint64_t data = Load64VariableLength(end, len & 0x7);
      hash ^= data;
      hash *= kMul;
      hash2 ^= hash;
    }
    hash = ShiftMix(hash) * kMul;
    hash2 ^= hash;
    hash = ShiftMix(hash);
    // mul2 is a prime just above golden ratio. mul2 is used to ensure that the
    // impact of the last few bytes is different to the upper and lower 64 bits.
    hash2 = ShiftMix(hash2 * kMul2) * kMul2;
    return std::make_pair(hash, hash2);
  }
  // Appends 128 bits (two codes) per 64 requested features: one murmur call
  // for the first pair, then GetMoreBits for each subsequent pair.
  void GetHashCodesInternal(const std::string& word,
                            std::vector<uint64_t>* hash_codes) {
    uint64_t hash_low = 0;
    uint64_t hash_high = 0;
    for (int i = 0; i < feature_size_; i += 64) {
      if (i == 0) {
        auto hash = MurmurHash128(word.c_str(), word.size());
        hash_low = hash.first;
        hash_high = hash.second;
      } else {
        GetMoreBits(hash_low, hash_high, &hash_low, &hash_high);
      }
      hash_codes->push_back(hash_low);
      hash_codes->push_back(hash_high);
    }
  }
  // Sentinel hashed in place of an empty word.
  const std::string empty_string_ = "<null>";
  // Number of two-bit features requested per word.
  const int feature_size_;
  // Cached codes for empty_string_, computed once in the constructor.
  std::vector<uint64_t> null_hash_codes_;
};
// Unicode processor for tensorflow and tflite string projection ops.
// Unicode processor for tensorflow and tflite string projection ops.
class ProjectionUnicodeHandler {
 public:
  // Takes an utf8 string which lists the unicodes that are supported and are
  // part of the vocabulary of this instance. When the utf8 string is empty,
  // all unicode segments are supported by this instance. The boolean
  // flag exclude_nonalphaspace_unicodes is used to indicate if nonalpha and
  // space unicode segments from the input should be stripped out.
  // Another way to analyse the filtering logic is as below.
  // Vocabulary acts as a allowlist when provided and all unicode set when
  // empty. The flag exclude_nonalphaspace_unicodes when true acts as a
  // allowlist on all alpha characters and space. It includes the entire unicode
  // set when false. Valid unicode segments are the intersection of these 2
  // sets.
  explicit ProjectionUnicodeHandler(const std::string& vocabulary,
                                    bool exclude_nonalphaspace_unicodes = false)
      : exclude_nonalphaspace_unicodes_(exclude_nonalphaspace_unicodes) {
    InitializeVocabulary(vocabulary);
  }
  // Performs language independent lower case and returns a string with
  // supported unicode segments.
  std::string LowerCaseUTF8WithSupportedUnicodes(
      const std::pair<const char*, size_t>& source) const;
  // Returns a boolean flag indicating if the unicode segment is part of the
  // vocabulary.
  bool IsValidUnicode(Rune rune) const {
    return valid_chars_.find(rune) != valid_chars_.end();
  }
  // Returns an index in [0, |vocabulary|), if the unicode is part of the
  // vocabulary and -1 if it's not.
  int UnicodeIndex(Rune rune) const {
    return IsValidUnicode(rune) ? valid_chars_.at(rune) : -1;
  }
  // Returns |vocabulary|.
  size_t NumberOfValidUnicodes() const { return valid_chars_.size(); }
  // Returns true if the vocabulary is empty which means all unicode segments
  // are supported.
  bool IsUnrestrictedVocabulary() const { return valid_chars_.empty(); }
  // Tokenizes input by space or unicode point segmentation. Limit to
  // max_tokens, when it is not -1.
  // Convenience overload: forwards to the (ptr, len) overload below.
  std::vector<std::pair<const char*, size_t>> Tokenize(const std::string& input,
                                                       bool by_space,
                                                       int max_tokens) const {
    return Tokenize(input.c_str(), input.size(), by_space, max_tokens);
  }
  std::vector<std::pair<const char*, size_t>> Tokenize(const char* str,
                                                       size_t len,
                                                       bool by_space,
                                                       int max_tokens) const;

 private:
  // Parses and extracts supported unicode segments from a utf8 string.
  void InitializeVocabulary(const std::string& vocabulary);
  // Maps each vocabulary rune to its dense index in [0, |vocabulary|).
  std::unordered_map<Rune, int> valid_chars_;
  // When true, non-alpha, non-space runes are dropped during lower-casing.
  bool exclude_nonalphaspace_unicodes_;
};
// Sentinel limits for the split helpers: scan the whole input string / put no
// cap on the number of emitted tokens.
// NOTE(review): SIZE_MAX comes from <cstdint>/<cstddef>, which this header
// does not include directly — it currently relies on a transitive include.
static constexpr size_t kEntireString = SIZE_MAX;
static constexpr size_t kAllTokens = SIZE_MAX;
// Splits input_ptr[0, min(len, max_input)) on ASCII spaces into at most
// max_tokens owned strings; runs of spaces yield no empty tokens.
std::vector<std::string> SplitBySpace(const char* input_ptr, size_t len,
                                      size_t max_input, size_t max_tokens);
// Splits the first len bytes of input_ptr into one owned string per unicode
// character, at most max_tokens of them.
std::vector<std::string> SplitByChar(const char* input_ptr, size_t len,
                                     size_t max_tokens);
// Joins the given (pointer, length) pieces with single spaces.
std::string JoinPairsBySpace(std::vector<std::pair<const char*, size_t>> words);
#endif  // TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_PROJECTION_UTIL_H_
"""Reverb custom external dependencies."""
# Sanitize a dependency so that it works correctly from code that includes
# reverb as a submodule.
def clean_dep(dep):
    """Returns the dependency as an absolute label string."""
    return str(Label(dep))
def get_python_path(ctx):
    """Returns the python interpreter path from $PYTHON_BIN_PATH, or fails."""
    path = ctx.os.environ.get("PYTHON_BIN_PATH")
    if not path:
        fail(
            "Could not get environment variable PYTHON_BIN_PATH. " +
            "Check your .bazelrc file.",
        )
    return path
def _find_tf_include_path(repo_ctx):
    """Returns the header (include) directory of the installed tensorflow pip package."""
    exec_result = repo_ctx.execute(
        [
            get_python_path(repo_ctx),
            "-c",
            "import tensorflow as tf; import sys; " +
            "sys.stdout.write(tf.sysconfig.get_include())",
        ],
        quiet = True,
    )
    if exec_result.return_code != 0:
        fail("Could not locate tensorflow installation path:\n{}"
            .format(exec_result.stderr))
    # Last stdout line, in case the interpreter printed banners first.
    return exec_result.stdout.splitlines()[-1]
def _find_tf_lib_path(repo_ctx):
    """Returns the shared-library directory of the installed tensorflow pip package."""
    exec_result = repo_ctx.execute(
        [
            get_python_path(repo_ctx),
            "-c",
            "import tensorflow as tf; import sys; " +
            "sys.stdout.write(tf.sysconfig.get_lib())",
        ],
        quiet = True,
    )
    if exec_result.return_code != 0:
        fail("Could not locate tensorflow installation path:\n{}"
            .format(exec_result.stderr))
    # Last stdout line, in case the interpreter printed banners first.
    return exec_result.stdout.splitlines()[-1]
def _find_numpy_include_path(repo_ctx):
    """Returns the C header directory of the installed numpy package."""
    exec_result = repo_ctx.execute(
        [
            get_python_path(repo_ctx),
            "-c",
            "import numpy; import sys; " +
            "sys.stdout.write(numpy.get_include())",
        ],
        quiet = True,
    )
    if exec_result.return_code != 0:
        fail("Could not locate numpy includes path:\n{}"
            .format(exec_result.stderr))
    return exec_result.stdout.splitlines()[-1]
def _find_python_include_path(repo_ctx):
    """Returns the CPython C-API header directory of the interpreter in use.

    NOTE(review): the probe uses distutils.sysconfig, which is deprecated in
    newer pythons; switching to the stdlib `sysconfig` module would change
    the command string, so it is only flagged here.
    """
    exec_result = repo_ctx.execute(
        [
            get_python_path(repo_ctx),
            "-c",
            "from distutils import sysconfig; import sys; " +
            "sys.stdout.write(sysconfig.get_python_inc())",
        ],
        quiet = True,
    )
    if exec_result.return_code != 0:
        fail("Could not locate python includes path:\n{}"
            .format(exec_result.stderr))
    return exec_result.stdout.splitlines()[-1]
def _find_python_solib_path(repo_ctx):
    """Returns the directory and basename of the libpythonX.Y.so in use.

    Probes the interpreter for its major.minor version, asks
    `pythonX.Y-config --configdir` for the directory holding the shared
    library, and fails if either probe fails or the file does not exist.
    """
    exec_result = repo_ctx.execute(
        [
            get_python_path(repo_ctx),
            "-c",
            "import sys; vi = sys.version_info; " +
            "sys.stdout.write('python{}.{}'.format(vi.major, vi.minor))",
        ],
        # Suppress subprocess output, consistent with every other probe in
        # this file (this call was the only one missing `quiet`).
        quiet = True,
    )
    if exec_result.return_code != 0:
        fail("Could not locate python shared library path:\n{}"
            .format(exec_result.stderr))
    version = exec_result.stdout.splitlines()[-1]
    basename = "lib{}.so".format(version)
    exec_result = repo_ctx.execute(
        ["{}-config".format(version), "--configdir"],
        quiet = True,
    )
    if exec_result.return_code != 0:
        fail("Could not locate python shared library path:\n{}"
            .format(exec_result.stderr))
    solib_dir = exec_result.stdout.splitlines()[-1]
    full_path = repo_ctx.path("{}/{}".format(solib_dir, basename))
    if not full_path.exists:
        fail("Unable to find python shared library file:\n{}/{}"
            .format(solib_dir, basename))
    return struct(dir = solib_dir, basename = basename)
def _eigen_archive_repo_impl(repo_ctx):
    """Exposes the Eigen headers bundled inside the tensorflow pip package.

    NOTE(review): this impl defines a target named ":includes", while the
    tensorflow_includes rule below depends on "@eigen_archive//:eigen", and
    cc_tf_configure() never registers this rule — @eigen_archive presumably
    comes from the WORKSPACE. Verify before wiring this rule in.
    """
    tf_include_path = _find_tf_include_path(repo_ctx)
    repo_ctx.symlink(tf_include_path, "tf_includes")
    repo_ctx.file(
        "BUILD",
        content = """
cc_library(
    name = "includes",
    hdrs = glob(["tf_includes/Eigen/**/*.h",
                 "tf_includes/Eigen/**",
                 "tf_includes/unsupported/Eigen/**/*.h",
                 "tf_includes/unsupported/Eigen/**"]),
    # https://groups.google.com/forum/#!topic/bazel-discuss/HyyuuqTxKok
    includes = ["tf_includes"],
    visibility = ["//visibility:public"],
)
""",
        executable = False,
    )
def _nsync_includes_repo_impl(repo_ctx):
    """Exposes the nsync public headers shipped under tensorflow's external/ dir."""
    tf_include_path = _find_tf_include_path(repo_ctx)
    repo_ctx.symlink(tf_include_path + "/external", "nsync_includes")
    repo_ctx.file(
        "BUILD",
        content = """
cc_library(
    name = "includes",
    hdrs = glob(["nsync_includes/nsync/public/*.h"]),
    includes = ["nsync_includes"],
    visibility = ["//visibility:public"],
)
""",
        executable = False,
    )
def _zlib_includes_repo_impl(repo_ctx):
    """Exposes the zlib headers shipped under tensorflow's external/ dir."""
    tf_include_path = _find_tf_include_path(repo_ctx)
    repo_ctx.symlink(
        tf_include_path + "/external/zlib",
        "zlib",
    )
    repo_ctx.file(
        "BUILD",
        content = """
cc_library(
    name = "includes",
    hdrs = glob(["zlib/**/*.h"]),
    includes = ["zlib"],
    visibility = ["//visibility:public"],
)
""",
        executable = False,
    )
def _snappy_includes_repo_impl(repo_ctx):
    """Exposes the snappy headers shipped under tensorflow's external/ dir."""
    tf_include_path = _find_tf_include_path(repo_ctx)
    repo_ctx.symlink(
        tf_include_path + "/external/snappy",
        "snappy",
    )
    repo_ctx.file(
        "BUILD",
        content = """
cc_library(
    name = "includes",
    hdrs = glob(["snappy/*.h"]),
    includes = ["snappy"],
    visibility = ["//visibility:public"],
)
""",
        executable = False,
    )
def _protobuf_includes_repo_impl(repo_ctx):
    """Exposes tensorflow's bundled protobuf headers via //third_party:protobuf.BUILD."""
    tf_include_path = _find_tf_include_path(repo_ctx)
    repo_ctx.symlink(tf_include_path, "tf_includes")
    # Build-file contents live in third_party/protobuf.BUILD, not inline here.
    repo_ctx.symlink(Label("//third_party:protobuf.BUILD"), "BUILD")
def _tensorflow_includes_repo_impl(repo_ctx):
    """Exposes all tensorflow pip-package headers plus the protos they use.

    NOTE(review): the dep "@eigen_archive//:eigen" does not match the
    ":includes" target defined by _eigen_archive_repo_impl in this file;
    @eigen_archive is presumably declared elsewhere (WORKSPACE) — verify.
    """
    tf_include_path = _find_tf_include_path(repo_ctx)
    repo_ctx.symlink(tf_include_path, "tensorflow_includes")
    repo_ctx.file(
        "BUILD",
        content = """
cc_library(
    name = "includes",
    hdrs = glob(
        [
            "tensorflow_includes/**/*.h",
            "tensorflow_includes/third_party/eigen3/**",
        ],
        exclude = ["tensorflow_includes/absl/**/*.h"],
    ),
    includes = ["tensorflow_includes"],
    deps = [
        "@eigen_archive//:eigen",
        "@protobuf_archive//:includes",
        "@zlib_includes//:includes",
        "@snappy_includes//:includes",
    ],
    visibility = ["//visibility:public"],
)
filegroup(
    name = "protos",
    srcs = glob(["tensorflow_includes/**/*.proto"]),
    visibility = ["//visibility:public"],
)
""",
        executable = False,
    )
def _tensorflow_solib_repo_impl(repo_ctx):
    """Exposes the installed libtensorflow_framework shared library.

    NOTE(review): the ".so.2" suffix pins this to tensorflow 2.x pip
    packages — confirm against the supported TF versions.
    """
    tf_lib_path = _find_tf_lib_path(repo_ctx)
    repo_ctx.symlink(tf_lib_path, "tensorflow_solib")
    repo_ctx.file(
        "BUILD",
        content = """
cc_library(
    name = "framework_lib",
    srcs = ["tensorflow_solib/libtensorflow_framework.so.2"],
    deps = ["@python_includes", "@python_includes//:numpy_includes"],
    visibility = ["//visibility:public"],
)
""",
    )
def _python_includes_repo_impl(repo_ctx):
    """Exposes CPython + numpy headers and the libpythonX.Y.so of the interpreter."""
    python_include_path = _find_python_include_path(repo_ctx)
    python_solib = _find_python_solib_path(repo_ctx)
    repo_ctx.symlink(python_include_path, "python_includes")
    numpy_include_path = _find_numpy_include_path(repo_ctx)
    repo_ctx.symlink(numpy_include_path, "numpy_includes")
    repo_ctx.symlink(
        "{}/{}".format(python_solib.dir, python_solib.basename),
        python_solib.basename,
    )

    # Note, "@python_includes" is a misnomer since we include the
    # libpythonX.Y.so in the srcs, so we can get access to python's various
    # symbols at link time.
    repo_ctx.file(
        "BUILD",
        content = """
cc_library(
    name = "python_includes",
    hdrs = glob(["python_includes/**/*.h"]),
    srcs = ["{}"],
    includes = ["python_includes"],
    visibility = ["//visibility:public"],
)
cc_library(
    name = "numpy_includes",
    hdrs = glob(["numpy_includes/**/*.h"]),
    includes = ["numpy_includes"],
    visibility = ["//visibility:public"],
)
""".format(python_solib.basename),
        executable = False,
    )
def cc_tf_configure():
    """Autoconf pre-installed tensorflow repo.

    Declares the repositories @nsync_includes, @zlib_includes,
    @snappy_includes, @protobuf_archive, @tensorflow_includes,
    @tensorflow_solib and @python_includes, all derived from the locally
    installed tensorflow pip package and python interpreter.
    """
    make_nsync_repo = repository_rule(
        implementation = _nsync_includes_repo_impl,
    )
    make_nsync_repo(name = "nsync_includes")
    make_zlib_repo = repository_rule(
        implementation = _zlib_includes_repo_impl,
    )
    make_zlib_repo(name = "zlib_includes")
    make_snappy_repo = repository_rule(
        implementation = _snappy_includes_repo_impl,
    )
    make_snappy_repo(name = "snappy_includes")
    make_protobuf_repo = repository_rule(
        implementation = _protobuf_includes_repo_impl,
    )
    make_protobuf_repo(name = "protobuf_archive")
    make_tfinc_repo = repository_rule(
        implementation = _tensorflow_includes_repo_impl,
    )
    make_tfinc_repo(name = "tensorflow_includes")
    make_tflib_repo = repository_rule(
        implementation = _tensorflow_solib_repo_impl,
    )
    make_tflib_repo(name = "tensorflow_solib")
    make_python_inc_repo = repository_rule(
        implementation = _python_includes_repo_impl,
    )
    make_python_inc_repo(name = "python_includes")
def _reverb_protoc_archive(ctx):
    """Downloads a pinned protoc release and exposes its binary as :protoc_bin.

    NOTE(review): setting $REVERB_PROTOC_VERSION clears sha256, which
    disables download checksum verification for the override — intentional
    escape hatch, but worth knowing.
    """
    version = ctx.attr.version
    sha256 = ctx.attr.sha256

    # Environment override: swap in a different protoc version, unverified.
    override_version = ctx.os.environ.get("REVERB_PROTOC_VERSION")
    if override_version:
        sha256 = ""
        version = override_version
    urls = [
        "https://github.com/protocolbuffers/protobuf/releases/download/v%s/protoc-%s-linux-x86_64.zip" % (version, version),
    ]
    ctx.download_and_extract(
        url = urls,
        sha256 = sha256,
    )
    ctx.file(
        "BUILD",
        content = """
filegroup(
    name = "protoc_bin",
    srcs = ["bin/protoc"],
    visibility = ["//visibility:public"],
)
""",
        executable = False,
    )
# Repository rule wrapping _reverb_protoc_archive; both attributes are
# mandatory (sha256 may still be bypassed via $REVERB_PROTOC_VERSION).
reverb_protoc_archive = repository_rule(
    implementation = _reverb_protoc_archive,
    attrs = {
        "version": attr.string(mandatory = True),
        "sha256": attr.string(mandatory = True),
    },
)
def reverb_protoc_deps(version, sha256):
    """Declares the @protobuf_protoc repository pinned to version/sha256."""
    reverb_protoc_archive(name = "protobuf_protoc", version = version, sha256 = sha256)
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tf_ops/projection_normalizer_util.h" // sequence_projection
#include "tf_ops/projection_tokenizer_util.h" // sequence_projection
#include "tf_ops/projection_util.h" // sequence_projection
#include "tf_ops/text_distorter.h" // sequence_projection
#include "absl/container/flat_hash_map.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
using ::tensorflow::int32;
using ::tensorflow::int64;
using ::tensorflow::uint64;
using ::tensorflow::OpKernel;
using ::tensorflow::OpKernelConstruction;
using ::tensorflow::OpKernelContext;
using ::tensorflow::Tensor;
using ::tensorflow::TensorShape;
using ::tensorflow::TensorShapeUtils;
using ::tensorflow::errors::InvalidArgument;
using tensorflow::shape_inference::DimensionHandle;
using tensorflow::shape_inference::InferenceContext;
// Sentinel tokens emitted when add_bos_tag / add_eos_tag are enabled.
constexpr char kBeginTokenTSP[] = "<BOS>";
constexpr char kEndTokenTSP[] = "<EOS>";
// Allocates the named float output tensor of the given shape and returns a
// pointer to its first element. On allocation failure, records the failure on
// ctx (with file/line for attribution) and returns nullptr.
float* AllocateTensor(OpKernelContext* ctx, const std::string& tensor_name,
                      const TensorShape& tensor_shape) {
  Tensor* tensor = nullptr;
  auto status = ctx->allocate_output(tensor_name, tensor_shape, &tensor);
  if (!TF_PREDICT_TRUE(status.ok())) {
    ctx->CtxFailureWithWarning(__FILE__, __LINE__, status);
    return nullptr;
  }
  return &tensor->flat<float>()(0);
}
// CPU kernel for the SequenceStringProjection op: tokenizes each input
// string, hashes each token, and maps every 2 hash bits to a ternary feature
// in {-1, 0, 1}, producing a [batch, max_seq_len, feature_size] float tensor.
// See the REGISTER_OP .Doc below for attribute semantics.
class SequenceStringProjectionOp : public OpKernel {
 public:
  // Reads and validates all attributes, and builds the hasher, distorter,
  // unicode handler and (optional) normalizer used by Compute.
  explicit SequenceStringProjectionOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("feature_size", &feature_size_));
    hasher_ = absl::make_unique<Hasher>(feature_size_);
    float distortion_probability = 0.0;
    OP_REQUIRES_OK(context, context->GetAttr("distortion_probability",
                                             &distortion_probability));
    text_distorter_ = absl::make_unique<TextDistorter>(distortion_probability);
    OP_REQUIRES_OK(context,
                   context->GetAttr("split_on_space", &split_on_space_));
    OP_REQUIRES_OK(context, context->GetAttr("max_splits", &max_splits_));
    OP_REQUIRES_OK(context, context->GetAttr("vocabulary", &vocabulary_));
    // bos/eos flags are stored as 0/1 counts so they can be summed directly
    // into sequence lengths and loop bounds below.
    bool add_bos_tag;
    OP_REQUIRES_OK(context, context->GetAttr("add_bos_tag", &add_bos_tag));
    bos_tag_ = add_bos_tag ? 1 : 0;
    bool add_eos_tag;
    OP_REQUIRES_OK(context, context->GetAttr("add_eos_tag", &add_eos_tag));
    eos_tag_ = add_eos_tag ? 1 : 0;
    // When word_novelty_bits is set to a positive integer, the last feature
    // generated by the op captures the token frequency.
    OP_REQUIRES_OK(context,
                   context->GetAttr("word_novelty_bits", &word_novelty_bits_));
    CHECK_GE(word_novelty_bits_, 0);
    CHECK_LE(word_novelty_bits_, 7);
    if (word_novelty_bits_ != 0) {
      // The novelty feature occupies the last feature slot.
      CHECK_GE(feature_size_, 1);
    }
    // When doc_size_levels is set to a positive integer, the second to last
    // feature generated by the op is derived from the log of the document
    // size.
    OP_REQUIRES_OK(context,
                   context->GetAttr("doc_size_levels", &doc_size_levels_));
    CHECK_GE(doc_size_levels_, 0);
    CHECK_LE(doc_size_levels_, 16);
    if (doc_size_levels_ != 0) {
      // The doc-size feature occupies the second-to-last feature slot.
      CHECK_GE(feature_size_, 2);
    }
    // Per-repetition increment of the novelty feature: 1 / 2^bits.
    word_novelty_offset_ = 1.0f / (1 << word_novelty_bits_);
    bool exclude_nonalphaspace_unicodes;
    OP_REQUIRES_OK(context, context->GetAttr("exclude_nonalphaspace_unicodes",
                                             &exclude_nonalphaspace_unicodes));
    if (!vocabulary_.empty()) {
      // The two filters are mutually exclusive by contract.
      CHECK(!exclude_nonalphaspace_unicodes);
    }
    unicode_handler_ = absl::make_unique<ProjectionUnicodeHandler>(
        vocabulary_, exclude_nonalphaspace_unicodes);
    vocabulary_size_ = unicode_handler_->NumberOfValidUnicodes();
    bool normalize_repetition;
    OP_REQUIRES_OK(context, context->GetAttr("normalize_repetition",
                                             &normalize_repetition));
    std::string separators;
    OP_REQUIRES_OK(context, context->GetAttr("token_separators", &separators));
    // The normalizer is only constructed when it would do any work.
    if (!separators.empty() || normalize_repetition) {
      projection_normalizer_ = absl::make_unique<ProjectionNormalizer>(
          separators, normalize_repetition);
    }
  }
  // Pass 1 tokenizes every batch entry to find max_seq_len; pass 2 hashes
  // each token into ternary features and zero-pads the tail of each row.
  void Compute(OpKernelContext* ctx) override {
    const Tensor* input_tensor;
    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(input_tensor->shape()),
                InvalidArgument("input must be a vector, got shape: ",
                                input_tensor->shape().DebugString()));
    auto input_vec = input_tensor->vec<::tensorflow::tstring>();
    const int64 batch_size = input_vec.dimension(0);
    std::vector<std::vector<std::pair<const char*, size_t>>> words_batches;
    int64 max_seq_len = 0;
    words_batches.reserve(batch_size);
    // Normalized strings must outlive the (ptr, len) tokens that point into
    // them, hence the batch-sized backing vector.
    std::vector<std::string> normalized_input_vec(batch_size);
    for (int64 i = 0; i < batch_size; ++i) {
      std::vector<std::pair<const char*, size_t>> words;
      if (projection_normalizer_ == nullptr) {
        words =
            unicode_handler_->Tokenize(input_vec(i).data(), input_vec(i).size(),
                                       split_on_space_, max_splits_);
      } else {
        normalized_input_vec[i] = projection_normalizer_->Normalize(
            input_vec(i).data(), input_vec(i).size(), SIZE_MAX);
        words = unicode_handler_->Tokenize(normalized_input_vec[i],
                                           split_on_space_, max_splits_);
      }
      const int64 seq_len =
          static_cast<int64>(bos_tag_ + words.size() + eos_tag_);
      CHECK_GT(seq_len, 0);
      max_seq_len = std::max(max_seq_len, seq_len);
      words_batches.emplace_back(std::move(words));
    }
    auto projection =
        AllocateTensor(ctx, "projection",
                       TensorShape({batch_size, max_seq_len, feature_size_}));
    AllocateTensor(ctx, "dummy_output", TensorShape({1}));
    auto sequence_length =
        AllocateTensor(ctx, "sequence_length", TensorShape({batch_size}));
    if (!projection || !sequence_length) {
      // AllocateTensor already recorded the ctx failure.
      LOG(ERROR) << "Unable to create buffer!";
      return;
    }
    // Maps each 2-bit hash chunk to a ternary value: 00->0, 01->1, 10->-1,
    // 11->0.
    const float mapping_table[4] = {0, 1, -1, 0};
    // Features consumed per 64-bit hash code (2 bits per feature).
    const int increment = 32;
    std::vector<uint64_t> hash_codes;
    // Per-row token-frequency counter for the novelty feature.
    absl::flat_hash_map<uint64, int> word_counter;
    for (int64 i = 0; i < batch_size; ++i) {
      word_counter.clear();
      const int64 num_tokens = words_batches[i].size();
      // NOTE: sequence_length is a float tensor by op contract.
      sequence_length[i] = bos_tag_ + num_tokens + eos_tag_;
      // Flat offset of the current timestep within the projection buffer.
      int64 offset0 = i * max_seq_len * feature_size_;
      // Calculate doc_size_feature in [0, infinity)
      float doc_size_feature =
          (doc_size_levels_ != 0)
              ? std::log2(static_cast<float>(num_tokens)) / doc_size_levels_
              : 0.0f;
      // Rescale doc_size_feature to [-1, 1].
      doc_size_feature = std::min(doc_size_feature, 1.0f) * 2.0f - 1.0f;
      // j == -1 is the BOS slot (when enabled); j == num_tokens is EOS.
      for (int64 j = -bos_tag_; j < num_tokens + eos_tag_; ++j) {
        std::string word;
        if (j < 0) {
          // Use a special tag for begin of sentence.
          word = kBeginTokenTSP;
        } else if (j < num_tokens) {
          auto uword = icu::UnicodeString::fromUTF8(
              unicode_handler_->LowerCaseUTF8WithSupportedUnicodes(
                  words_batches[i][j]));
          word = text_distorter_->DistortText(&uword);
        } else {
          // Use a special tag for end of sentence.
          CHECK_EQ(eos_tag_, 1);
          word = kEndTokenTSP;
        }
        hasher_->GetHashCodes(word, &hash_codes);
        // Unpack 2 bits per feature from the hash codes, capped at
        // feature_size_ total features for this timestep.
        for (int hindex = 0, k = 0; hindex < hash_codes.size(); hindex++) {
          auto hash = hash_codes[hindex];
          for (int kmax = std::min(k + increment, feature_size_); k < kmax;) {
            projection[offset0 + k++] = mapping_table[hash & 0x3];
            hash >>= 2;
          }
        }
        if (word_novelty_bits_ != 0 && !hash_codes.empty()) {
          // Overwrite the last feature with a repetition count, rescaled to
          // [-1, 1] and saturating at 1.
          const auto word_hash = hash_codes[0];
          projection[offset0 + feature_size_ - 1] =
              std::min((word_counter[word_hash]++ * word_novelty_offset_),
                       1.0f) *
                  2.0f -
              1.0f;
        }
        if (doc_size_levels_ != 0) {
          // Overwrite the second-to-last feature with the doc-size signal.
          projection[offset0 + feature_size_ - 2] = doc_size_feature;
        }
        offset0 += feature_size_;
      }
      // Zero-fill the timesteps this (shorter) row does not use.
      const int pending = (max_seq_len - (bos_tag_ + num_tokens + eos_tag_));
      memset(projection + offset0, 0, pending * feature_size_ * sizeof(float));
    }
  }

 private:
  int32 feature_size_;        // Ternary features per token.
  std::unique_ptr<Hasher> hasher_;
  std::unique_ptr<TextDistorter> text_distorter_;
  std::unique_ptr<ProjectionUnicodeHandler> unicode_handler_;
  std::unique_ptr<ProjectionNormalizer> projection_normalizer_;  // May be null.
  std::string vocabulary_;
  int vocabulary_size_;
  int32 max_splits_;          // -1 == unlimited tokens.
  bool split_on_space_;       // Space vs unicode-character tokenization.
  int eos_tag_;               // 1 when an <EOS> slot is appended, else 0.
  int bos_tag_;               // 1 when a <BOS> slot is prepended, else 0.
  int word_novelty_bits_;
  int doc_size_levels_;
  float word_novelty_offset_;  // 1 / 2^word_novelty_bits_.
};
// Register the CPU kernel implementation above.
REGISTER_KERNEL_BUILDER(
    Name("SequenceStringProjection").Device(::tensorflow::DEVICE_CPU),
    SequenceStringProjectionOp);
// Op interface, attribute defaults, shape function, and documentation.
REGISTER_OP("SequenceStringProjection")
    .Input("input: string")
    .Output("projection: float32")
    .Output("dummy_output: float32")
    // NOTE: sequence lengths are emitted as float32, matching the kernel's
    // float AllocateTensor outputs.
    .Output("sequence_length: float32")
    .Attr("feature_size: int")
    .Attr("distortion_probability: float = 0.0")
    .Attr("vocabulary: string = ''")
    .Attr("max_splits: int = -1")
    .Attr("exclude_nonalphaspace_unicodes: bool = False")
    .Attr("add_bos_tag: bool = False")
    .Attr("add_eos_tag: bool = True")
    .Attr("word_novelty_bits: int = 0")
    .Attr("doc_size_levels: int = 0")
    .Attr("split_on_space: bool = True")
    .Attr("token_separators: string = ''")
    .Attr("normalize_repetition: bool = false")
    // Output shapes: [batch, unknown seq len, feature_size], [1], [batch].
    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
      DimensionHandle size;
      int32 feature_size;
      TF_RETURN_IF_ERROR(c->GetAttr("feature_size", &feature_size));
      const int kMaxFeatureSize = 4096;
      CHECK_GE(feature_size, 0);
      CHECK_LE(feature_size, kMaxFeatureSize);
      auto batch_size = c->Dim(c->input(0), 0);
      c->set_output(0, c->MakeShape({batch_size, InferenceContext::kUnknownDim,
                                     feature_size}));
      c->set_output(1, c->MakeShape({1}));
      c->set_output(2, c->MakeShape({batch_size}));
      return tensorflow::Status::OK();
    })
    .Doc(R"doc(
This op referred to as Ternary Sequence String Projection op (TSP), tokenizes
input text either on space or unicode boundary. Fingerprint for each token is
computed using murmur hash and bit features are extracted from the fingerprint
that maps every 2 bits to the ternary output {-1, 0, 1}. This effectively turns
a batch of text input into a ternary rank 3 tensor (in float format) of shape
[batch size, max token length, requested number of features].
Input(s):
- input: A string tensor with batch size number of elements.
Attribute(s):
- feature_size: Length of the ternary vector generated for each token.
- distortion_probability: When non zero distort the input text with this
    probability. Helps as a regularization method when training data set is
    small.
- vocabulary: When not empty provides a list of unique unicode characters that
    will be allowed in the input text before fingerprinting. Another way to
    say it is that the vocabulary is an optional character allowlist for the
    input text. It helps normalize the text.
- max_splits: Maximum number of tokens that are allowed. It helps restrict the
    max token length of the projection output. When the value is -1 the op
    does not restrict the number of tokens in the output.
- exclude_nonalphaspace_unicodes: When set to true excludes unicodes that are
    not alphabets or space character. This is multilingual. Though the effect
    of this flag can be achieved using vocabulary, the vocabulary will have to
    be very large for multilingual input.
- add_bos_tag: When true inserts a begin of sentence tag.
- add_eos_tag: When true inserts a end of sentence tag.
- word_novelty_bits: When true adds a special feature to the ternary output
    that captures the frequency of occurrence of a particular token. This is an
    experimental feature.
- doc_size_levels: When true adds a special feature to the ternary projection
    output the document size in log scale. This is an experimental feature.
- split_on_space: When true tokenization is done on space segmentation.
    Otherwise tokenization is done by segmenting on unicode boundary.
Output(s):
- projection: Floating point tensor with ternary values of shape
    [batch size, max token length, requested number of features].
- dummy_output: Ignore this output, will be eliminated in a subsequent version.
- sequence_length: Batch size length vector containing the number of tokens for
    each input text entry.
)doc");
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tf_ops/projection_normalizer_util.h" // sequence_projection
#include "tf_ops/projection_util.h" // sequence_projection
#include "tf_ops/text_distorter.h" // sequence_projection
using ::tensorflow::int32;
using ::tensorflow::int64;
using ::tensorflow::OpKernel;
using ::tensorflow::OpKernelConstruction;
using ::tensorflow::OpKernelContext;
using ::tensorflow::Tensor;
using ::tensorflow::TensorShapeUtils;
using ::tensorflow::uint64;
using ::tensorflow::errors::InvalidArgument;
using ::tensorflow::shape_inference::DimensionHandle;
using ::tensorflow::shape_inference::InferenceContext;
using ::tensorflow::shape_inference::ShapeHandle;
// Sentinel tokens emitted for the optional begin/end-of-sentence tags.
constexpr char kBeginTokenTSP[] = "<BOS>";
constexpr char kEndTokenTSP[] = "<EOS>";
// Maps each 2-bit slice of a hash code to a ternary feature value:
// 0b00 -> 0, 0b01 -> 1, 0b10 -> -1, 0b11 -> 0.
constexpr float kMappingTable[4] = {0, 1, -1, 0};
// Number of features extracted per 64-bit hash code (2 bits per feature).
constexpr int kIncrement = 32;
// CPU kernel for the SequenceStringProjectionV2 op (TSPV2).
//
// Consumes a presegmented [batch, max_seq_len] string matrix plus a
// per-row `sequence_length` vector, hashes every token, and expands each
// hash into `feature_size` ternary features in {-1, 0, 1}, producing a
// float tensor of shape [batch, bos_tag + max_seq_len + eos_tag,
// feature_size].
class SequenceStringProjectionOpV2 : public OpKernel {
 public:
  explicit SequenceStringProjectionOpV2(OpKernelConstruction* context)
      : OpKernel(context) {
    // Number of ternary features emitted per token.
    OP_REQUIRES_OK(context, context->GetAttr("feature_size", &feature_size_));
    hasher_ = absl::make_unique<Hasher>(feature_size_);
    // Training-time regularization: tokens are randomly distorted with this
    // probability (0.0 disables distortion).
    float distortion_probability = 0.0;
    OP_REQUIRES_OK(context, context->GetAttr("distortion_probability",
                                             &distortion_probability));
    text_distorter_ = absl::make_unique<TextDistorter>(distortion_probability);
    // Optional character allowlist used to normalize the input text.
    OP_REQUIRES_OK(context, context->GetAttr("vocabulary", &vocabulary_));
    unicode_handler_ = absl::make_unique<ProjectionUnicodeHandler>(vocabulary_);
    // The bos/eos flags are stored as 0/1 counts so they can be added
    // directly into the sequence-length arithmetic in Compute().
    bool add_bos_tag;
    OP_REQUIRES_OK(context, context->GetAttr("add_bos_tag", &add_bos_tag));
    bos_tag_ = add_bos_tag ? 1 : 0;
    bool add_eos_tag;
    OP_REQUIRES_OK(context, context->GetAttr("add_eos_tag", &add_eos_tag));
    eos_tag_ = add_eos_tag ? 1 : 0;
    bool normalize_repetition;
    OP_REQUIRES_OK(context, context->GetAttr("normalize_repetition",
                                             &normalize_repetition));
    if (normalize_repetition) {
      // Only constructed when requested; Compute() tests for nullptr.
      projection_normalizer_ = absl::make_unique<ProjectionNormalizer>(
          std::string(), normalize_repetition);
    }
  }

  void Compute(OpKernelContext* ctx) override {
    // Validate `input`: must be a [batch, max_seq_len] string matrix.
    const Tensor* input_tensor;
    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(input_tensor->shape()),
                InvalidArgument("`input` must be a matrix, got shape: ",
                                input_tensor->shape().DebugString()));
    auto input_matrix = input_tensor->matrix<::tensorflow::tstring>();
    const int64 batch_size = input_matrix.dimension(0);
    const int64 max_seq_len = input_matrix.dimension(1);
    // Validate `sequence_length`: a batch-size vector of valid token counts.
    const Tensor* seq_len;
    OP_REQUIRES_OK(ctx, ctx->input("sequence_length", &seq_len));
    OP_REQUIRES(
        ctx, TensorShapeUtils::IsVector(seq_len->shape()),
        InvalidArgument("`sequence_length` must be a vector, got shape: ",
                        seq_len->shape().DebugString()));
    auto seq_len_vector = seq_len->vec<int32>();
    OP_REQUIRES(
        ctx, seq_len_vector.size() == batch_size,
        InvalidArgument("`sequence_length` should have batch size number "
                        "of elements, got size ",
                        seq_len_vector.size(), ", batch size is ", batch_size));
    // The output reserves bos_tag_ + eos_tag_ extra positions per row;
    // positions past each row's own token count are zero-filled below.
    Tensor* output_tensor = nullptr;
    OP_REQUIRES_OK(
        ctx, ctx->allocate_output(
                 "projection",
                 {batch_size, bos_tag_ + max_seq_len + eos_tag_, feature_size_},
                 &output_tensor));
    float* projection = &output_tensor->flat<float>()(0);
    std::vector<uint64_t> hash_codes;
    for (int64 i = 0; i < batch_size; ++i) {
      const int64 num_tokens = seq_len_vector(i);
      OP_REQUIRES(ctx, num_tokens > 0,
                  InvalidArgument(
                      "`sequence_length` should have values greater than 0"));
      OP_REQUIRES(ctx, num_tokens <= max_seq_len,
                  InvalidArgument("`sequence_length` should have values less "
                                  "than or equal to max_seq_len"));
      // Flat offset of the first feature value of row i.
      int64 offset0 = i * (bos_tag_ + max_seq_len + eos_tag_) * feature_size_;
      // j spans [-bos_tag_, num_tokens + eos_tag_): j < 0 emits <BOS>,
      // j >= num_tokens emits <EOS>, everything in between is a real token.
      for (int64 j = -bos_tag_; j < num_tokens + eos_tag_; ++j) {
        std::string word;
        if (j < 0) {
          word = kBeginTokenTSP;
        } else if (j < num_tokens) {
          // Lowercase / vocabulary-filter the raw token, then (optionally)
          // distort and repetition-normalize it before hashing.
          auto token = std::pair<const char*, int32>(input_matrix(i, j).data(),
                                                     input_matrix(i, j).size());
          auto uword = icu::UnicodeString::fromUTF8(
              unicode_handler_->LowerCaseUTF8WithSupportedUnicodes(token));
          word = text_distorter_->DistortText(&uword);
          if (projection_normalizer_) {
            word = projection_normalizer_->Normalize(word.data(), word.size(),
                                                     SIZE_MAX);
          }
        } else {
          word = kEndTokenTSP;
        }
        // Expand the token's hash codes into ternary features: each 64-bit
        // code yields up to kIncrement features, 2 bits apiece, mapped
        // through kMappingTable.  NOTE(review): assumes Hasher produces
        // enough codes to cover feature_size_ values -- TODO confirm; any
        // shortfall would leave trailing features unwritten.
        hasher_->GetHashCodes(word, &hash_codes);
        for (int hindex = 0, k = 0; hindex < hash_codes.size(); hindex++) {
          auto hash = hash_codes[hindex];
          for (int kmax = std::min(k + kIncrement, feature_size_); k < kmax;) {
            projection[offset0 + k++] = kMappingTable[hash & 0x3];
            hash >>= 2;
          }
        }
        offset0 += feature_size_;
      }
      // Zero-fill the feature slots of the padding positions that follow the
      // last emitted token of this row.
      const int fill_length = (max_seq_len - num_tokens) * feature_size_;
      float* fill_start = projection + offset0;
      std::fill(fill_start, fill_start + fill_length, 0.0f);
    }
  }

 private:
  int32 feature_size_;  // Features emitted per token.
  std::unique_ptr<Hasher> hasher_;
  std::unique_ptr<TextDistorter> text_distorter_;
  std::unique_ptr<ProjectionUnicodeHandler> unicode_handler_;
  // Null unless the normalize_repetition attribute is true.
  std::unique_ptr<ProjectionNormalizer> projection_normalizer_;
  std::string vocabulary_;
  int eos_tag_;  // 1 if an <EOS> token is appended to each row, else 0.
  int bos_tag_;  // 1 if a <BOS> token is prepended to each row, else 0.
};
// Binds the SequenceStringProjectionV2 op to its CPU kernel implementation.
REGISTER_KERNEL_BUILDER(
    Name("SequenceStringProjectionV2").Device(::tensorflow::DEVICE_CPU),
    SequenceStringProjectionOpV2);
// Registers the SequenceStringProjectionV2 op interface: inputs, attribute
// defaults, shape inference and documentation.
REGISTER_OP("SequenceStringProjectionV2")
    .Input("input: string")
    .Input("sequence_length: int32")
    .Output("projection: float32")
    .Attr("feature_size: int")
    .Attr("distortion_probability: float = 0.0")
    .Attr("vocabulary: string = ''")
    .Attr("add_bos_tag: bool = False")
    .Attr("add_eos_tag: bool = False")
    .Attr("normalize_repetition: bool = False")
    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
      int32 feature_size;
      TF_RETURN_IF_ERROR(c->GetAttr("feature_size", &feature_size));
      // Report an invalid attribute as a graph-construction error instead of
      // CHECK-crashing the whole process.
      const int kMaxFeatureSize = 4096;
      if (feature_size <= 0 || feature_size > kMaxFeatureSize) {
        return InvalidArgument("feature_size must be in (0, ", kMaxFeatureSize,
                               "], got ", feature_size);
      }
      // Inferred shape: input shape with feature_size appended, i.e.
      // [batch size, max sequence length, feature_size].
      // NOTE(review): the kernel allocates a second dimension of
      // bos_tag + max_seq_len + eos_tag, so this inference understates the
      // length when add_bos_tag/add_eos_tag are set -- confirm whether that
      // is intentional.
      ShapeHandle output_shape;
      TF_RETURN_IF_ERROR(c->Concatenate(
          c->input(0), c->MakeShape({feature_size}), &output_shape));
      c->set_output(0, output_shape);
      return tensorflow::Status::OK();
    })
    .Doc(R"doc(
This op referred to as Ternary Sequence String Projection Op V2 (TSPV2),
works with presegmented string `input`. It fingerprints each token using murmur
hash and extracts bit features from the fingerprint that maps every 2 bits to
the ternary output {-1, 0, 1}. This effectively turns a batch of text segments
into a ternary rank 3 tensor (in float format) of shape
[batch size, max sequence length, requested number of features].
Input(s):
- input: A string tensor with [batch size, max sequence length] tokens.
- sequence_length: A vector with batch size number of integers, where each
    integer is in (0, max sequence length], and represents the number of valid
    text segments in each batch entry.
Attribute(s):
- feature_size: Length of the ternary vector generated for each token.
- distortion_probability: When non zero distort the input tokens with this
    probability. Helps as a regularization method when training data set is
    small.
- vocabulary: When not empty provides a list of unique unicode characters that
    will be allowed in the input text before fingerprinting. Expressed another
    way the vocabulary is an optional character allowlist for the
    input tokens. It helps normalize the text.
- add_bos_tag: When true inserts a begin of sentence tag.
- add_eos_tag: When true inserts a end of sentence tag.
- normalize_repetition: When true normalizes repetition in text tokens before
    fingerprinting.
Output(s):
- projection: Floating point tensor with ternary values of shape
    [batch size, max sequence length, requested number of features].
)doc");
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/framework/node_def_builder.h"
#include "tensorflow/core/framework/shape_inference_testutil.h"
#include "tensorflow/core/kernels/ops_testutil.h"
#include "tensorflow/core/kernels/ops_util.h"
namespace {
using ::tensorflow::DT_INT32;
using ::tensorflow::DT_STRING;
using ::tensorflow::int32;
using ::tensorflow::NodeDefBuilder;
using ::tensorflow::OpsTestBase;
using ::tensorflow::Tensor;
using ::tensorflow::TensorShape;
// Test fixture for SequenceStringProjectionV2 with helpers that compare
// whole feature vectors inside the rank-3 projection output.
class SequenceStringProjectionOpV2Test : public OpsTestBase {
 protected:
  // Returns true iff the feature vectors at positions (i1, j1) and (i2, j2)
  // of `output` are element-wise equal.
  bool FeatureMatches(const Tensor& output, int i1, int j1, int i2, int j2) {
    auto values = output.tensor<float, 3>();
    const int feature_size = output.dim_size(2);
    for (int k = 0; k < feature_size; ++k) {
      if (values(i1, j1, k) != values(i2, j2, k)) return false;
    }
    return true;
  }
  // Returns true iff every feature value at position (i, j) of `output` is
  // exactly zero (i.e. the position is padding).
  bool FeatureIsZero(const Tensor& output, int i, int j) {
    auto values = output.tensor<float, 3>();
    const int feature_size = output.dim_size(2);
    for (int k = 0; k < feature_size; ++k) {
      if (values(i, j, k) != 0.0f) return false;
    }
    return true;
  }
};
// Exercises the op's input-validation errors, then verifies the projection
// features for a plain (no bos/eos tags) configuration.
TEST_F(SequenceStringProjectionOpV2Test, TestOutput) {
  TF_ASSERT_OK(NodeDefBuilder("test_op", "SequenceStringProjectionV2")
                   .Input({"input", 1, DT_STRING})
                   .Input({"sequence_length", 1, DT_INT32})
                   .Attr("vocabulary", "abcdefghijklmnopqrstuvwxyz")
                   .Attr("feature_size", 16)
                   .Finalize(node_def()));
  TF_ASSERT_OK(InitOp());
  // Deliberately rank-3 input to trigger the "must be a matrix" error.
  AddInputFromArray<::tensorflow::tstring>(
      TensorShape({2, 8, 1}),
      {"hello", "world", "147", "dog", "xyz", "abc", "efg", "hij", "quick",
       "hel1lo", "123", "jumped", "over", "the", "lazy", "dog"});
  AddInputFromArray<int32>(TensorShape({3, 1}), {9, 0, 9});
  EXPECT_EQ(RunOpKernel().error_message(),
            "`input` must be a matrix, got shape: [2,8,1]");
  // Reshape the input to a valid [2, 8] matrix, keeping the same values;
  // sequence_length is still a matrix, so the next error fires.
  auto old = *mutable_input(0).tensor;
  *mutable_input(0).tensor = Tensor(DT_STRING, TensorShape({2, 8}));
  (*mutable_input(0).tensor).flat<::tensorflow::tstring>() =
      old.flat<::tensorflow::tstring>();
  EXPECT_EQ(RunOpKernel().error_message(),
            "`sequence_length` must be a vector, got shape: [3,1]");
  // A vector of the wrong length: 3 entries for batch size 2.
  *mutable_input(1).tensor = Tensor(DT_INT32, TensorShape({3}));
  EXPECT_EQ(RunOpKernel().error_message(),
            "`sequence_length` should have batch size number of elements, got "
            "size 3, batch size is 2");
  *mutable_input(1).tensor = Tensor(DT_INT32, TensorShape({2}));
  (*mutable_input(1).tensor).flat<int32>()(0) = 9;  // 9 > max_seq_len of 8.
  (*mutable_input(1).tensor).flat<int32>()(1) = 0;  // 0 is not a valid count.
  EXPECT_EQ(
      RunOpKernel().error_message(),
      "`sequence_length` should have values less than or equal to max_seq_len");
  (*mutable_input(1).tensor).flat<int32>()(0) = 4;
  EXPECT_EQ(RunOpKernel().error_message(),
            "`sequence_length` should have values greater than 0");
  (*mutable_input(1).tensor).flat<int32>()(1) = 8;
  TF_EXPECT_OK(RunOpKernel());
  const Tensor& output = *GetOutput(0);
  // First checks dimensions.
  ASSERT_EQ(output.dims(), 3);
  EXPECT_EQ(output.dim_size(0), 2);   // Batch size
  EXPECT_EQ(output.dim_size(1), 8);   // Max sequence length
  EXPECT_EQ(output.dim_size(2), 16);  // Feature size
  // Tokens that are identical after vocabulary filtering (digits are OOV,
  // so they are stripped) project identically; distinct tokens do not.
  EXPECT_FALSE(FeatureMatches(output, 0, 0, 1, 0));  // hello != quick.
  EXPECT_FALSE(FeatureMatches(output, 0, 1, 1, 1));  // world != hello.
  EXPECT_TRUE(FeatureMatches(output, 0, 0, 1, 1));   // hello == hel1lo.
  EXPECT_TRUE(FeatureMatches(output, 0, 2, 1, 2));   // 147 == 123 (oov values).
  EXPECT_TRUE(FeatureMatches(output, 0, 3, 1, 7));   // dog == dog.
  // Check zero padding for first sentence.
  for (int i = 4; i < 8; ++i) {
    EXPECT_TRUE(FeatureIsZero(output, 0, i));
  }
}
// With add_bos_tag the output grows to max_seq_len + 1 positions and every
// row starts with the shared <BOS> feature vector.
TEST_F(SequenceStringProjectionOpV2Test, TestOutputBoS) {
  TF_ASSERT_OK(NodeDefBuilder("test_op", "SequenceStringProjectionV2")
                   .Input({"input", 1, DT_STRING})
                   .Input({"sequence_length", 1, DT_INT32})
                   .Attr("add_bos_tag", true)
                   .Attr("vocabulary", "abcdefghijklmnopqrstuvwxyz")
                   .Attr("feature_size", 16)
                   .Finalize(node_def()));
  TF_ASSERT_OK(InitOp());
  AddInputFromArray<::tensorflow::tstring>(
      TensorShape({2, 8}),
      {"hello", "world", "147", "dog", "", "", "", "", "quick", "hel1lo", "123",
       "jumped", "over", "the", "lazy", "dog"});
  AddInputFromArray<int32>(TensorShape({2}), {4, 8});
  TF_ASSERT_OK(RunOpKernel());
  const Tensor& output = *GetOutput(0);
  // First checks dimensions.
  ASSERT_EQ(output.dims(), 3);
  EXPECT_EQ(output.dim_size(0), 2);   // Batch size
  EXPECT_EQ(output.dim_size(1), 9);   // Max sequence length (8 + bos tag)
  EXPECT_EQ(output.dim_size(2), 16);  // Feature size
  // Token positions are shifted right by one relative to the no-tag test.
  EXPECT_TRUE(FeatureMatches(output, 0, 0, 1, 0));   // <bos> == <bos>.
  EXPECT_FALSE(FeatureMatches(output, 0, 1, 1, 1));  // hello != quick.
  EXPECT_FALSE(FeatureMatches(output, 0, 2, 1, 2));  // world != hello.
  EXPECT_TRUE(FeatureMatches(output, 0, 1, 1, 2));   // hello == hel1lo.
  EXPECT_TRUE(FeatureMatches(output, 0, 3, 1, 3));   // 147 == 123 (oov values).
  EXPECT_TRUE(FeatureMatches(output, 0, 4, 1, 8));   // dog == dog.
  // Check zero padding for first sentence.
  for (int i = 5; i < 9; ++i) {
    EXPECT_TRUE(FeatureIsZero(output, 0, i));
  }
}
// With add_eos_tag the output grows to max_seq_len + 1 positions and an
// <EOS> feature vector is emitted right after each row's last valid token.
TEST_F(SequenceStringProjectionOpV2Test, TestOutputEoS) {
  TF_ASSERT_OK(NodeDefBuilder("test_op", "SequenceStringProjectionV2")
                   .Input({"input", 1, DT_STRING})
                   .Input({"sequence_length", 1, DT_INT32})
                   .Attr("add_eos_tag", true)
                   .Attr("vocabulary", "abcdefghijklmnopqrstuvwxyz")
                   .Attr("feature_size", 16)
                   .Finalize(node_def()));
  TF_ASSERT_OK(InitOp());
  AddInputFromArray<::tensorflow::tstring>(
      TensorShape({2, 8}),
      {"hello", "world", "147", "dog", "", "", "", "", "quick", "hel1lo", "123",
       "jumped", "over", "the", "lazy", "dog"});
  AddInputFromArray<int32>(TensorShape({2}), {4, 8});
  TF_ASSERT_OK(RunOpKernel());
  const Tensor& output = *GetOutput(0);
  // First checks dimensions.
  ASSERT_EQ(output.dims(), 3);
  EXPECT_EQ(output.dim_size(0), 2);   // Batch size
  EXPECT_EQ(output.dim_size(1), 9);   // Max sequence length (8 + eos tag)
  EXPECT_EQ(output.dim_size(2), 16);  // Feature size
  EXPECT_FALSE(FeatureMatches(output, 0, 0, 1, 0));  // hello != quick.
  EXPECT_FALSE(FeatureMatches(output, 0, 1, 1, 1));  // world != hello.
  EXPECT_TRUE(FeatureMatches(output, 0, 0, 1, 1));   // hello == hel1lo.
  EXPECT_TRUE(FeatureMatches(output, 0, 2, 1, 2));   // 147 == 123 (oov values).
  EXPECT_TRUE(FeatureMatches(output, 0, 3, 1, 7));   // dog == dog.
  // Row 0 has 4 tokens, so its <eos> sits at position 4; row 1's at 8.
  EXPECT_TRUE(FeatureMatches(output, 0, 4, 1, 8));   // <eos> == <eos>.
  // Check zero padding for first sentence.
  for (int i = 5; i < 9; ++i) {
    EXPECT_TRUE(FeatureIsZero(output, 0, i));
  }
}
// With both tags the output grows to max_seq_len + 2 positions: <BOS> at
// position 0 and <EOS> right after each row's last valid token.
TEST_F(SequenceStringProjectionOpV2Test, TestOutputBoSEoS) {
  TF_ASSERT_OK(NodeDefBuilder("test_op", "SequenceStringProjectionV2")
                   .Input({"input", 1, DT_STRING})
                   .Input({"sequence_length", 1, DT_INT32})
                   .Attr("add_bos_tag", true)
                   .Attr("add_eos_tag", true)
                   .Attr("vocabulary", "abcdefghijklmnopqrstuvwxyz.")
                   .Attr("feature_size", 16)
                   .Finalize(node_def()));
  TF_ASSERT_OK(InitOp());
  AddInputFromArray<::tensorflow::tstring>(
      TensorShape({2, 8}),
      {"hello", "world", "147", "dog", "...", "..", "", "", "quick", "hel1lo",
       "123", "jumped", "over", "the", "lazy", "dog"});
  AddInputFromArray<int32>(TensorShape({2}), {6, 8});
  TF_ASSERT_OK(RunOpKernel());
  const Tensor& output = *GetOutput(0);
  // First checks dimensions.
  ASSERT_EQ(output.dims(), 3);
  EXPECT_EQ(output.dim_size(0), 2);   // Batch size
  EXPECT_EQ(output.dim_size(1), 10);  // Max sequence length (8 + both tags)
  EXPECT_EQ(output.dim_size(2), 16);  // Feature size
  EXPECT_TRUE(FeatureMatches(output, 0, 0, 1, 0));   // <bos> == <bos>.
  EXPECT_FALSE(FeatureMatches(output, 0, 1, 1, 1));  // hello != quick.
  EXPECT_FALSE(FeatureMatches(output, 0, 2, 1, 2));  // world != hello.
  EXPECT_TRUE(FeatureMatches(output, 0, 1, 1, 2));   // hello == hel1lo.
  EXPECT_TRUE(FeatureMatches(output, 0, 3, 1, 3));   // 147 == 123 (oov values).
  EXPECT_TRUE(FeatureMatches(output, 0, 4, 1, 8));   // dog == dog.
  EXPECT_TRUE(FeatureMatches(output, 0, 7, 1, 9));   // <eos> == <eos>.
  // Check for default normalize_repetition=false: repeated-punctuation
  // tokens of different lengths must NOT collapse to the same projection.
  EXPECT_FALSE(FeatureMatches(output, 0, 4, 0, 5));  // ... != ..
  // Check zero padding for first sentence.
  for (int i = 8; i < 10; ++i) {
    EXPECT_TRUE(FeatureIsZero(output, 0, i));
  }
}
// With normalize_repetition=true, tokens that are repetitions of the same
// character(s) (e.g. "..", "....") are normalized to a canonical form before
// hashing, so they all project to the same feature vector.
TEST_F(SequenceStringProjectionOpV2Test, TestOutputNormalize) {
  TF_ASSERT_OK(NodeDefBuilder("test_op", "SequenceStringProjectionV2")
                   .Input({"input", 1, DT_STRING})
                   .Input({"sequence_length", 1, DT_INT32})
                   .Attr("normalize_repetition", true)
                   .Attr("vocabulary", "abcdefghijklmnopqrstuvwxyz.")
                   .Attr("feature_size", 16)
                   .Finalize(node_def()));
  TF_ASSERT_OK(InitOp());
  AddInputFromArray<::tensorflow::tstring>(
      TensorShape({2, 8}),
      {"hello", "world", "..", "....", "", "", "", "", "quick", "hel1lo", "123",
       "jumped", "over", "...", ".....", "dog"});
  AddInputFromArray<int32>(TensorShape({2}), {4, 8});
  TF_ASSERT_OK(RunOpKernel());
  const Tensor& output = *GetOutput(0);
  // First checks dimensions.
  ASSERT_EQ(output.dims(), 3);
  EXPECT_EQ(output.dim_size(0), 2);   // Batch size
  EXPECT_EQ(output.dim_size(1), 8);   // Max sequence length
  EXPECT_EQ(output.dim_size(2), 16);  // Feature size
  // All dot-run tokens collapse to the same normalized form.
  EXPECT_TRUE(FeatureMatches(output, 0, 2, 0, 3));  // .. == ....
  EXPECT_TRUE(FeatureMatches(output, 1, 5, 1, 6));  // ... == ..
  EXPECT_TRUE(FeatureMatches(output, 0, 3, 1, 6));  // .... == ...
  // Check zero padding for first sentence.
  for (int i = 4; i < 8; ++i) {
    EXPECT_TRUE(FeatureIsZero(output, 0, i));
  }
}
} // namespace
// Test driver: discovers and runs all gtest cases defined above.
int main(int argc, char** argv) {
  // On Linux, add: absl::SetFlag(&FLAGS_logtostderr, true);
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tf_ops/text_distorter.h" // sequence_projection
using tensorflow::uint32;
// Distorts the words in the text by inserting, deleting and swapping
// unicodes randomly with probability one third of distortion_probability.
// Randomly perturbs `uword` in place -- deleting, swapping or inserting a
// single code point, each branch taken with one third of
// distortion_probability_ -- and returns the (possibly modified) word
// re-encoded as UTF-8.  The RNG draws happen in a fixed order (gate, action,
// position, optional swap position) so results are deterministic per seed.
std::string TextDistorter::DistortText(icu::UnicodeString* uword) {
  const bool distort = distortion_probability_ > 0.0 &&
                       generator_.RandFloat() < distortion_probability_ &&
                       uword->length();
  if (distort) {
    // Choose which of the three distortions to apply, then a target position.
    const float action = generator_.RandFloat();
    const uint32 pos = generator_.Rand32() % uword->length();
    if (action < 0.33f) {
      // Delete: remember the removed code point so a later insert can reuse it.
      random_char_ = (*uword)[pos];
      uword->remove(pos, 1);
    } else if (action < 0.66f) {
      // Swap: move one code point to a random new position, but only when the
      // word has more than 2 characters.
      if (uword->length() > 2) {
        random_char_ = (*uword)[pos];
        uword->remove(pos, 1);
        uword->insert(generator_.Rand32() % uword->length(), random_char_);
      }
    } else if (random_char_) {
      // Insert: reuse the most recently removed code point, if any.
      uword->insert(pos, random_char_);
    }
  }
  // Re-encode the unicode sequence as a UTF-8 std::string.
  std::string result;
  icu::StringByteSink<std::string> sink(&result);
  uword->toUTF8(sink);
  return result;
}
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_TEXT_DISTORTER_H_
#define TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_TEXT_DISTORTER_H_
#include <assert.h>
#include "icu4c/source/common/unicode/unistr.h"
#include "tensorflow/core/lib/random/simple_philox.h"
// A class that can be used to distort text randomly: each word passed to
// DistortText() may have one code point deleted, swapped or inserted.
class TextDistorter {
 public:
  // `distortion_probability` is the chance a given word is distorted at all;
  // must lie in [0.0, 1.0].
  // TODO: add a random seed for the PhiloxRandom constructor -- 171 is a
  // fixed seed, so the distortion sequence is deterministic across runs.
  // NOTE: generator_ holds a pointer to philox_, so philox_ must be declared
  // (and therefore initialized) first.
  explicit TextDistorter(float distortion_probability)
      : philox_(171),
        generator_(&philox_),
        distortion_probability_(distortion_probability) {
    assert(distortion_probability_ >= 0.0);
    assert(distortion_probability_ <= 1.0);
  }
  // Distorts `uword` in place with probability distortion_probability_ and
  // returns it re-encoded as UTF-8.
  std::string DistortText(icu::UnicodeString* uword);

 private:
  tensorflow::random::PhiloxRandom philox_;
  tensorflow::random::SimplePhilox generator_;
  float distortion_probability_;
  // Most recently removed code point; reused by the insert distortion.
  // 0 means no character has been removed yet.
  UChar32 random_char_ = 0;
};
#endif // TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_TEXT_DISTORTER_H_
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment