Adding SyntaxNet to tensorflow/models (#63)

32ab5a58 · calberti · Martin Wicke · 148a15fb · 32ab5a58 · 32ab5a58
Commit 32ab5a58 authored May 12, 2016 by calberti Committed by Martin Wicke May 12, 2016
11 changed files
--- a/syntaxnet/third_party/utf/utfutf.c
+++ b/syntaxnet/third_party/utf/utfutf.c
+/*
+ * The authors of this software are Rob Pike and Ken Thompson.
+ *              Copyright (c) 2002 by Lucent Technologies.
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose without fee is hereby granted, provided that this entire notice
+ * is included in all copies of any software which is or includes a copy
+ * or modification of this software and in all copies of the supporting
+ * documentation for such software.
+ * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
+ * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
+ * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
+ * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
+ */
+#include <stdarg.h>
+#include <string.h>
+#include "third_party/utf/utf.h"
+#include "third_party/utf/utfdef.h"
+
+
+/*
+ * Find the first occurrence of the UTF-8 string s2 within s1.
+ * Returns a pointer into s1 where the match begins, or 0 if
+ * s2 does not occur.
+ */
+const
+char*
+utfutf(const char *s1, const char *s2)
+{
+	Rune lead;
+	long leadval, leadlen, needlelen;
+	const char *hit;
+
+	/* decode the first rune of the pattern; it anchors the scan */
+	leadlen = chartorune(&lead, s2);
+	leadval = lead;
+	if(leadval <= Runesync)		/* represents self */
+		return strstr(s1, s2);
+
+	needlelen = strlen(s2);
+	hit = utfrune(s1, leadval);
+	while(hit != 0) {
+		/* candidate starts with the right rune; compare the whole pattern */
+		if(strncmp(hit, s2, needlelen) == 0)
+			return hit;
+		hit = utfrune(hit + leadlen, leadval);
+	}
+	return 0;
+}
--- a/syntaxnet/tools/bazel.rc
+++ b/syntaxnet/tools/bazel.rc
+# Options attached to the "cuda" config of the build command: select the
+# crosstool under //third_party/gpus.
+build:cuda --crosstool_top=//third_party/gpus/crosstool
+
+# Defines and compiler options applied to every build.
+build --define=use_fast_cpp_protos=true
+build --define=allow_oversize_protos=true
+# syntaxnet/util/utf8 requires plain char to be unsigned (see the note in
+# its BUILD file).
+build --copt -funsigned-char
+build -c opt
+
+# Use the "standalone" spawn strategy for build, test and run alike.
+build --spawn_strategy=standalone
+test --spawn_strategy=standalone
+run --spawn_strategy=standalone
--- a/syntaxnet/util/utf8/BUILD
+++ b/syntaxnet/util/utf8/BUILD
+licenses(["notice"])
+
+# Requires --copt -funsigned-char when compiling (unsigned chars).
+
+# Library bundling the UnicodeText sources (unicodetext.*) with the UniLib
+# sources (unilib.*). Publicly visible; depends on //syntaxnet:base and
+# //third_party/utf.
+cc_library(
+    name = "unicodetext",
+    srcs = [
+        "unicodetext.cc",
+        "unilib.cc",
+    ],
+    hdrs = [
+        "unicodetext.h",
+        "unilib.h",
+        "unilib_utf8_utils.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        "//syntaxnet:base",
+        "//third_party/utf",
+    ],
+)
+
+# Unit test for :unicodetext. gtest_main.cc supplies the main() that runs
+# the tests defined in unicodetext_unittest.cc.
+cc_test(
+    name = "unicodetext_unittest",
+    srcs = [
+        "gtest_main.cc",
+        "unicodetext_unittest.cc",
+    ],
+    deps = [
+        "@tf//tensorflow/core:testlib",
+        ":unicodetext",
+    ],
+)
+
+# Binary built from unicodetext_main.cc and linked against :unicodetext.
+cc_binary(
+    name = "unicodetext_main",
+    srcs = ["unicodetext_main.cc"],
+    deps = [":unicodetext"],
+)
--- a/syntaxnet/util/utf8/gtest_main.cc
+++ b/syntaxnet/util/utf8/gtest_main.cc
+/**
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: sligocki@google.com (Shawn Ligocki)
+//
+// Build all tests with this main to run all tests.
+
+#include "gtest/gtest.h"
+
+// Test entry point: hands the command line to googletest, then runs every
+// registered test and reports the outcome as the process exit status.
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  const int exit_status = RUN_ALL_TESTS();
+  return exit_status;
+}
--- a/syntaxnet/util/utf8/unicodetext.cc
+++ b/syntaxnet/util/utf8/unicodetext.cc
+/**
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util/utf8/unicodetext.h"
+
+#include <string.h>                     // for memcpy, NULL, memcmp, etc
+#include <algorithm>                    // for max
+
+//#include "base/logging.h"               // for operator<<, CHECK, etc
+//#include "base/stringprintf.h"          // for StringPrintf, StringAppendF
+//#include "strings/stringpiece.h"        // for StringPiece, etc
+
+#include "third_party/utf/utf.h"        // for isvalidcharntorune, etc
+#include "util/utf8/unilib.h"    // for IsInterchangeValid, etc
+#include "util/utf8/unilib_utf8_utils.h"    // for OneCharLen
+
+// Counts the codepoints encoded in the UTF-8 bytes [start, end).
+// Every byte that is not a trail byte starts a codepoint. Read as a signed
+// char, trail bytes (0x80..0xBF) are the values below -0x40.
+static int CodepointDistance(const char* start, const char* end) {
+  int codepoints = 0;
+  const char* cursor = start;
+  while (cursor < end) {
+    const signed char byte = *reinterpret_cast<const signed char*>(cursor);
+    if (byte >= -0x40) {
+      ++codepoints;
+    }
+    ++cursor;
+  }
+  return codepoints;
+}
+
+// Counts the codepoints in the first `len` bytes of `utf8`.
+static int CodepointCount(const char* utf8, int len) {
+  const char* const limit = utf8 + len;
+  return CodepointDistance(utf8, limit);
+}
+
+// Number of codepoints between two iterators, computed by scanning the
+// UTF-8 bytes that lie between their positions.
+UnicodeText::const_iterator::difference_type
+distance(const UnicodeText::const_iterator& first,
+         const UnicodeText::const_iterator& last) {
+  const char* const from = first.it_;
+  const char* const to = last.it_;
+  return CodepointDistance(from, to);
+}
+
+// ---------- Utility ----------
+
+static int ConvertToInterchangeValid(char* start, int len) {
+  // Repairs, in place, a UTF-8 buffer that CopyUTF8, TakeOwnershipOfUTF8,
+  // or PointToUTF8 found not to be interchange valid. Reaching this code
+  // indicates a bug in the caller, which is reported via LOG(WARNING)
+  // before this routine runs.
+  //
+  // Like CoerceToInterchangeValid, except that every structurally
+  // invalid byte becomes one space and every non-interchange character
+  // becomes one space, even if that character occupies several bytes.
+  // E.g., "\xEF\xB7\x90" (U+FDD0) is structurally valid UTF8, but
+  // U+FDD0 is not an interchange-valid code point, so the result holds
+  // one space rather than three.
+  //
+  // The output is never longer than the input, which is what makes the
+  // in-place rewrite safe. Returns the number of bytes written.
+  char* const buffer_begin = start;
+  char* const buffer_end = start + len;
+  char* write_pos = start;
+  for (char* read_pos = start; read_pos < buffer_end;) {
+    // Keep the longest interchange-valid prefix, compacting it leftwards.
+    const int valid_len =
+        UniLib::SpanInterchangeValid(read_pos, buffer_end - read_pos);
+    if (valid_len > 0) {
+      if (write_pos != read_pos) {
+        memmove(write_pos, read_pos, valid_len);
+      }
+      write_pos += valid_len;
+      read_pos += valid_len;
+      if (read_pos == buffer_end) {
+        break;
+      }
+    }
+    // read_pos now sits on something bad: either a well-formed character
+    // that is not interchange valid (skip all of it) or a malformed byte
+    // (skip just that byte). Either way emit a single space.
+    char32 rune;
+    int char_len;
+    const bool well_formed = isvalidcharntorune(
+        read_pos, buffer_end - read_pos, &rune, &char_len);
+    read_pos += well_formed ? char_len : 1;
+    *write_pos++ = ' ';
+  }
+  return write_pos - buffer_begin;
+}
+
+
+// *************** Data representation **********
+
+// Note: the copy constructor is undefined.
+
+// After reserve(), resize(), or clear(), we're an owner, not an alias.
+
+// Ensures this Repr owns a buffer of at least `new_capacity` bytes.
+// size_ and the stored bytes are preserved; an alias becomes an owner.
+void UnicodeText::Repr::reserve(int new_capacity) {
+  // Nothing to do when we already own a large enough buffer.
+  const bool already_fits = ours_ && capacity_ >= new_capacity;
+  if (already_fits) return;
+
+  // Grow by at least 1.5x (+20) so repeated appends stay amortized.
+  const int grown = (3 * capacity_) / 2 + 20;
+  capacity_ = std::max(new_capacity, grown);
+  char* const fresh = new char[capacity_];
+
+  // Carry over the existing contents; free the old buffer only if we own it.
+  if (data_ != nullptr) {
+    memcpy(fresh, data_, size_);
+    if (ours_) delete[] data_;
+  }
+  data_ = fresh;
+  ours_ = true;
+}
+
+// Sets the byte length to `new_size`. Resizing to zero is a clear();
+// otherwise the Repr ends up owning its buffer and any newly exposed
+// bytes are zero-filled.
+void UnicodeText::Repr::resize(int new_size) {
+  if (new_size == 0) {
+    clear();
+    return;
+  }
+  const bool need_own_buffer = !ours_ || new_size > capacity_;
+  if (need_own_buffer) reserve(new_size);
+  if (size_ < new_size) {
+    // Zero the tail that just became part of the text.
+    memset(data_ + size_, 0, new_size - size_);
+  }
+  size_ = new_size;
+  ours_ = true;
+}
+
+// Resets to the empty, owning state. An owned buffer is released here;
+// that is a choice rather than a necessity, since setting size_ to 0
+// would also work.
+void UnicodeText::Repr::clear() {
+  if (ours_) {
+    delete[] data_;
+  }
+  data_ = nullptr;
+  size_ = 0;
+  capacity_ = 0;
+  ours_ = true;
+}
+
+// Replaces the contents with a private copy of the `size` bytes at `data`.
+// Afterwards this Repr is an owner.
+void UnicodeText::Repr::Copy(const char* data, int size) {
+  resize(size);
+  // resize(0) leaves data_ null, and `data` may be null for an empty
+  // source; passing null to the mem* functions is undefined even for a
+  // zero count, so skip the copy entirely in that case. memmove (rather
+  // than memcpy) keeps a copy from our own buffer well-defined.
+  if (size > 0) {
+    memmove(data_, data, size);
+  }
+}
+
+// Adopts `data` as the owned buffer; it will later be released with
+// delete[]. `size` and `capacity` are in bytes.
+void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) {
+  // Only a no-op when we already *own* this exact buffer. If we merely
+  // alias it (ours_ == false), fall through so ownership is really taken;
+  // returning early there would leave the buffer unowned and leak it.
+  if (data == data_ && ours_) return;
+  // Release a previously owned, different buffer.
+  if (ours_ && data_) delete[] data_;
+  data_ = data;
+  size_ = size;
+  capacity_ = capacity;
+  ours_ = true;
+}
+
+// Turns this Repr into an alias of the `size` bytes at `data`. A buffer
+// owned beforehand is released; the aliased bytes are never freed here.
+void UnicodeText::Repr::PointTo(const char* data, int size) {
+  const bool must_free = ours_ && data_ != nullptr;
+  if (must_free) {
+    delete[] data_;
+  }
+  ours_ = false;
+  capacity_ = size;
+  size_ = size;
+  data_ = const_cast<char*>(data);
+}
+
+// Appends `byte_length` bytes from `bytes`. Afterwards this Repr is an
+// owner. `bytes` may point into this Repr's own buffer (e.g. x.append(x)).
+void UnicodeText::Repr::append(const char* bytes, int byte_length) {
+  const int needed = size_ + byte_length;
+  char* const old_data = data_;
+  const bool owned_old = ours_;
+  // reserve() allocates a new buffer exactly when this is true.
+  const bool will_reallocate = !(ours_ && capacity_ >= needed);
+  // When reserve() reallocates it frees an owned old buffer, but `bytes`
+  // may point into that buffer. Temporarily drop ownership so reserve()
+  // leaves the old buffer alive; it is freed below, after the copy.
+  if (will_reallocate) ours_ = false;
+  reserve(needed);  // Sets ours_ back to true.
+  if (byte_length > 0) {  // Also avoids memcpy on a null, empty buffer.
+    memcpy(data_ + size_, bytes, byte_length);
+    size_ += byte_length;
+  }
+  if (will_reallocate && owned_old) delete[] old_data;
+}
+
+// Describes this Repr for debugging: its address, buffer pointer, size,
+// capacity, and whether it owns or aliases the buffer.
+string UnicodeText::Repr::DebugString() const {
+  // %p requires a void* argument, so the pointers are cast explicitly.
+  return tensorflow::strings::Printf("{Repr %p data=%p size=%d capacity=%d %s}",
+                      static_cast<const void*>(this),
+                      static_cast<const void*>(data_), size_, capacity_,
+                      ours_ ? "Owned" : "Alias");
+}
+
+
+
+// *************** UnicodeText ******************
+
+// ----- Constructors -----
+
+// Default constructor: creates an empty text. repr_ sets itself up.
+UnicodeText::UnicodeText() {}
+
+// Copy constructor: the new object holds its own copy of src's bytes.
+UnicodeText::UnicodeText(const UnicodeText& src) {
+  const Repr& other = src.repr_;
+  repr_.Copy(other.data_, other.size_);
+}
+
+// Substring constructor: copies the bytes in [first, last). The CHECK
+// fails when first is positioned after last.
+UnicodeText::UnicodeText(const UnicodeText::const_iterator& first,
+                         const UnicodeText::const_iterator& last) {
+  CHECK(first <= last) << " Incompatible iterators";
+  const char* const from = first.it_;
+  repr_.append(from, last.it_ - from);
+}
+
+// Returns the UTF-8 bytes in [first, last) as a string. The CHECK fails
+// when first is positioned after last.
+string UnicodeText::UTF8Substring(const const_iterator& first,
+                                  const const_iterator& last) {
+  CHECK(first <= last) << " Incompatible iterators";
+  const char* const from = first.it_;
+  return string(from, last.it_ - from);
+}
+
+
+// ----- Copy -----
+
+// Copy assignment. Self-assignment is a no-op; otherwise src's bytes are
+// copied.
+UnicodeText& UnicodeText::operator=(const UnicodeText& src) {
+  if (this == &src) return *this;
+  Copy(src);
+  return *this;
+}
+
+// Replaces this text with a copy of src's bytes. Returns *this.
+UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
+  const Repr& other = src.repr_;
+  repr_.Copy(other.data_, other.size_);
+  return *this;
+}
+
+// Copies `byte_length` bytes of UTF-8 from `buffer`. If the input is not
+// interchange-valid, a warning is logged and the copy is repaired in place.
+UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
+  repr_.Copy(buffer, byte_length);
+  const bool valid = UniLib::IsInterchangeValid(buffer, byte_length);
+  if (valid) return *this;
+
+  LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
+  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
+  return *this;
+}
+
+// Copies `byte_length` bytes from `buffer` with no validity check.
+UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer,
+                                         int byte_length) {
+  Repr& target = repr_;
+  target.Copy(buffer, byte_length);
+  return *this;
+}
+
+// ----- TakeOwnershipOf  -----
+
+// Takes ownership of `buffer` without copying it. If the contents are not
+// interchange-valid, a warning is logged and the buffer is repaired in
+// place.
+UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,
+                                              int byte_length,
+                                              int byte_capacity) {
+  repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
+  const bool valid = UniLib::IsInterchangeValid(buffer, byte_length);
+  if (valid) return *this;
+
+  LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
+  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
+  return *this;
+}
+
+// Takes ownership of `buffer` with no validity check.
+UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,
+                                                    int byte_length,
+                                                    int byte_capacity) {
+  Repr& target = repr_;
+  target.TakeOwnershipOf(buffer, byte_length, byte_capacity);
+  return *this;
+}
+
+// ----- PointTo -----
+
+// Aliases `buffer` when it is interchange-valid. Otherwise logs a warning
+// and stores a repaired private copy instead, leaving `buffer` untouched.
+UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
+  if (!UniLib::IsInterchangeValid(buffer, byte_length)) {
+    LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
+    repr_.Copy(buffer, byte_length);
+    repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
+    return *this;
+  }
+  repr_.PointTo(buffer, byte_length);
+  return *this;
+}
+
+// Aliases `buffer` with no validity check.
+UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer,
+                                            int byte_length) {
+  Repr& target = repr_;
+  target.PointTo(buffer, byte_length);
+  return *this;
+}
+
+// Makes this text an alias of src's bytes; nothing is copied and no
+// ownership is taken. Returns *this.
+UnicodeText& UnicodeText::PointTo(const UnicodeText& src) {
+  // Pointing at ourselves must be a no-op: Repr::PointTo() first frees an
+  // owned buffer, which would leave us aliasing the memory just released.
+  if (this != &src) {
+    repr_.PointTo(src.repr_.data_, src.repr_.size_);
+  }
+  return *this;
+}
+
+// Makes this text an alias of the bytes in [first, last). The CHECK fails
+// when first is positioned after last.
+UnicodeText& UnicodeText::PointTo(const const_iterator &first,
+                                  const const_iterator &last) {
+  CHECK(first <= last) << " Incompatible iterators";
+  const char* const from = first.utf8_data();
+  repr_.PointTo(from, last.utf8_data() - from);
+  return *this;
+}
+
+// ----- Append -----
+
+// Appends all of u's bytes to this text. Returns *this.
+UnicodeText& UnicodeText::append(const UnicodeText& u) {
+  const Repr& other = u.repr_;
+  repr_.append(other.data_, other.size_);
+  return *this;
+}
+
+// Appends the bytes in [first, last) to this text. The CHECK fails when
+// first is positioned after last.
+UnicodeText& UnicodeText::append(const const_iterator& first,
+                                 const const_iterator& last) {
+  CHECK(first <= last) << " Incompatible iterators";
+  const char* const from = first.it_;
+  repr_.append(from, last.it_ - from);
+  return *this;
+}
+
+// Appends `len` bytes from `utf8` with no validity check.
+UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) {
+  Repr& target = repr_;
+  target.append(utf8, len);
+  return *this;
+}
+
+// ----- substring searching -----
+
+// Searches for `look` starting at start_pos. CHECK-fails unless start_pos
+// lies within this text's UTF-8 bytes (one past the end is allowed).
+UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look,
+                                              const_iterator start_pos) const {
+  // Lower bound first, then upper bound.
+  CHECK_GE(start_pos.utf8_data(), utf8_data());
+  CHECK_LE(start_pos.utf8_data(), utf8_data() + utf8_length());
+  const const_iterator result = UnsafeFind(look, start_pos);
+  return result;
+}
+
+// Searches for `look` from the beginning of this text.
+UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const {
+  const const_iterator from = begin();
+  return UnsafeFind(look, from);
+}
+
+// Returns an iterator to the first occurrence of `look` at or after
+// start_pos, or end() if there is none. start_pos is not validated here;
+// it must point into this text's UTF-8 data (find() CHECKs that).
+//
+// Previously this was a stub that aborted with LOG(FATAL); it is now a
+// plain byte search over the UTF-8 data.
+UnicodeText::const_iterator UnicodeText::UnsafeFind(
+    const UnicodeText& look, const_iterator start_pos) const {
+  // Due to the magic of the UTF8 encoding, searching for a sequence of
+  // letters is equivalent to substring search.
+  const char* const text_end = utf8_data() + utf8_length();
+  const char* const from = start_pos.utf8_data();
+  // A start position past the end can never match.
+  if (from > text_end) return end();
+
+  const char* const pattern = look.utf8_data();
+  const char* const match =
+      std::search(from, text_end, pattern, pattern + look.utf8_length());
+  // std::search returns its `last` argument when nothing is found.
+  if (match == text_end) return end();
+  return const_iterator(match);
+}
+
+// Returns true if this text contains U+FFFD.
+//
+// Previously this was a stub that aborted with LOG(FATAL); it now scans
+// the UTF-8 bytes for the three-byte encoding of U+FFFD.
+bool UnicodeText::HasReplacementChar() const {
+  // Equivalent to:
+  //   UnicodeText replacement_char;
+  //   replacement_char.push_back(0xFFFD);
+  //   return find(replacement_char) != end();
+  static const char kReplacementUtf8[] = "\xEF\xBF\xBD";
+  const char* const text = utf8_data();
+  const char* const text_end = text + utf8_length();
+  return std::search(text, text_end,
+                     kReplacementUtf8, kReplacementUtf8 + 3) != text_end;
+}
+
+// ----- other methods -----
+
+// Empties the text by clearing the underlying Repr.
+void UnicodeText::clear() {
+  Repr& target = repr_;
+  target.clear();
+}
+
+// Destructor. Nothing to do here; repr_ cleans up after itself.
+UnicodeText::~UnicodeText() {
+}
+
+
+// Appends codepoint `c`. If `c` is not a valid codepoint, or its UTF-8
+// encoding is not interchange-valid, a warning is logged and a single
+// space is appended instead.
+void UnicodeText::push_back(char32 c) {
+  if (!UniLib::IsValidCodepoint(c)) {
+    LOG(WARNING) << "Illegal Unicode value: 0x" << std::hex << c;
+    repr_.append(" ", 1);
+    return;
+  }
+
+  char encoded[UTFmax];
+  const int encoded_len = runetochar(encoded, &c);
+  if (!UniLib::IsInterchangeValid(encoded, encoded_len)) {
+    LOG(WARNING) << "Unicode value 0x" << std::hex << c
+                 << " is not valid for interchange";
+    repr_.append(" ", 1);
+    return;
+  }
+  repr_.append(encoded, encoded_len);
+}
+
+// Number of codepoints in the text; computed by scanning the UTF-8 bytes.
+int UnicodeText::size() const {
+  const int codepoints = CodepointCount(repr_.data_, repr_.size_);
+  return codepoints;
+}
+
+// Two texts are equal when their UTF-8 bytes are identical.
+bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) {
+  if (&lhs == &rhs) return true;
+  const int byte_count = lhs.repr_.size_;
+  if (byte_count != rhs.repr_.size_) return false;
+  // Two empty texts are equal. Return before memcmp: an empty text may
+  // have a null data_, and memcmp on null pointers is undefined even for
+  // a zero length.
+  if (byte_count == 0) return true;
+  return memcmp(lhs.repr_.data_, rhs.repr_.data_, byte_count) == 0;
+}
+
+// Describes this text for debugging: its address, codepoint count, and
+// the Repr's own debug string.
+string UnicodeText::DebugString() const {
+  // %p requires a void* argument, so `this` is cast explicitly.
+  return tensorflow::strings::Printf("{UnicodeText %p chars=%d repr=%s}",
+                      static_cast<const void*>(this),
+                      size(),
+                      repr_.DebugString().c_str());
+}
+
+
+// ******************* UnicodeText::const_iterator *********************
+
+// The implementation of const_iterator would be nicer if it
+// inherited from boost::iterator_facade
+// (http://boost.org/libs/iterator/doc/iterator_facade.html).
+
+// Default constructor: the iterator points at nothing (null).
+UnicodeText::const_iterator::const_iterator()
+    : it_(nullptr) {
+}
+
+// Copy constructor: the new iterator refers to the same byte position.
+UnicodeText::const_iterator::const_iterator(const const_iterator& other)
+    : it_(other.it_) {}
+
+// Copy assignment: adopt other's byte position. Self-assignment is a no-op.
+UnicodeText::const_iterator&
+UnicodeText::const_iterator::operator=(const const_iterator& other) {
+  if (this == &other) {
+    return *this;
+  }
+  it_ = other.it_;
+  return *this;
+}
+
+// Iterator positioned at the first byte of the UTF-8 data.
+UnicodeText::const_iterator UnicodeText::begin() const {
+  const char* const first_byte = repr_.data_;
+  return const_iterator(first_byte);
+}
+
+// Iterator positioned one past the last byte of the UTF-8 data.
+UnicodeText::const_iterator UnicodeText::end() const {
+  const char* const past_last = repr_.data_ + repr_.size_;
+  return const_iterator(past_last);
+}
+
+// Orders iterators by the byte position they point at.
+bool operator<(const UnicodeText::const_iterator& lhs,
+               const UnicodeText::const_iterator& rhs) {
+  const char* const left = lhs.it_;
+  const char* const right = rhs.it_;
+  return left < right;
+}
+
+// Decodes and returns the codepoint at the iterator's position.
+char32 UnicodeText::const_iterator::operator*() const {
+  // (We could call chartorune here, but that does some
+  // error-checking, and we're guaranteed that our data is valid
+  // UTF-8. Also, we expect this routine to be called very often. So
+  // for speed, we do the calculation ourselves.)
+
+  // Read the bytes as unsigned so the lead-byte range tests below are
+  // correct whether plain char is signed or unsigned, instead of relying
+  // on the -funsigned-char build flag.
+  const unsigned char* const bytes =
+      reinterpret_cast<const unsigned char*>(it_);
+
+  // Convert from UTF-8. Each further byte is read only once the lead
+  // byte has shown that the sequence is that long.
+  const int byte1 = bytes[0];
+  if (byte1 < 0x80)
+    return byte1;
+
+  const int byte2 = bytes[1];
+  if (byte1 < 0xE0)
+    return ((byte1 & 0x1F) << 6)
+          | (byte2 & 0x3F);
+
+  const int byte3 = bytes[2];
+  if (byte1 < 0xF0)
+    return ((byte1 & 0x0F) << 12)
+         | ((byte2 & 0x3F) << 6)
+         |  (byte3 & 0x3F);
+
+  const int byte4 = bytes[3];
+  return ((byte1 & 0x07) << 18)
+       | ((byte2 & 0x3F) << 12)
+       | ((byte3 & 0x3F) << 6)
+       |  (byte4 & 0x3F);
+}
+
+// Pre-increment: step forward by the length UniLib::OneCharLen reports for
+// the current position.
+UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
+  const auto step = UniLib::OneCharLen(it_);
+  it_ += step;
+  return *this;
+}
+
+// Pre-decrement: step back at least one byte, and keep stepping back
+// while the byte landed on is a trail byte.
+UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
+  do {
+    --it_;
+  } while (UniLib::IsTrailByte(*it_));
+  return *this;
+}
+
+// Copies the UTF-8 bytes of the current character into utf8_output and
+// returns how many bytes (1 to 4) were written.
+int UnicodeText::const_iterator::get_utf8(char* utf8_output) const {
+  // Compare the lead byte as unsigned so the range tests are correct
+  // whether plain char is signed or unsigned, instead of relying on the
+  // -funsigned-char build flag.
+  const unsigned char lead = static_cast<unsigned char>(it_[0]);
+  utf8_output[0] = it_[0]; if (lead < 0x80) return 1;
+  utf8_output[1] = it_[1]; if (lead < 0xE0) return 2;
+  utf8_output[2] = it_[2]; if (lead < 0xF0) return 3;
+  utf8_output[3] = it_[3];
+  return 4;
+}
+
+// Returns the UTF-8 bytes of the current character as a string.
+string UnicodeText::const_iterator::get_utf8_string() const {
+  const char* const bytes = utf8_data();
+  const int num_bytes = utf8_length();
+  return string(bytes, num_bytes);
+}
+
+// Returns the byte length (1 to 4) of the current character, determined
+// from its lead byte.
+int UnicodeText::const_iterator::utf8_length() const {
+  // Compare the lead byte as unsigned so the range tests are correct
+  // whether plain char is signed or unsigned, instead of relying on the
+  // -funsigned-char build flag.
+  const unsigned char lead = static_cast<unsigned char>(it_[0]);
+  if (lead < 0x80) {
+    return 1;
+  } else if (lead < 0xE0) {
+    return 2;
+  } else if (lead < 0xF0) {
+    return 3;
+  } else {
+    return 4;
+  }
+}
+
+// Wraps a raw pointer into this text's UTF-8 data as an iterator.
+// CHECK-fails unless p is non-null, lies within [start, end] of the data,
+// and is either the end or not on a trail byte.
+UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const {
+  CHECK(p != nullptr);
+  const char* const start = utf8_data();
+  const char* const end = start + utf8_length();
+  // Range checks first, then the character-boundary check.
+  CHECK(p >= start);
+  CHECK(p <= end);
+  CHECK(p == end || !UniLib::IsTrailByte(*p));
+  return const_iterator(p);
+}
+
+// Describes the iterator for debugging by printing the address it points at.
+string UnicodeText::const_iterator::DebugString() const {
+  // %p requires a void* argument, so the pointer is cast explicitly.
+  return tensorflow::strings::Printf("{iter %p}",
+                                     static_cast<const void*>(it_));
+}
+
+
+// *************************** Utilities *************************
+
+// Formats every codepoint of `t` as uppercase hex, each followed by a
+// space, and returns the concatenation.
+string CodepointString(const UnicodeText& t) {
+  string hex_codes;
+  const UnicodeText::const_iterator stop = t.end();
+  for (UnicodeText::const_iterator cur = t.begin(); cur != stop; ++cur) {
+    tensorflow::strings::Appendf(&hex_codes, "%X ", *cur);
+  }
+  return hex_codes;
+}
--- a/syntaxnet/util/utf8/unicodetext.h
+++ b/syntaxnet/util/utf8/unicodetext.h
+/**
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef UTIL_UTF8_PUBLIC_UNICODETEXT_H_
+#define UTIL_UTF8_PUBLIC_UNICODETEXT_H_
+
+#include <stddef.h>                     // for NULL, ptrdiff_t
+#include <iterator>                     // for bidirectional_iterator_tag, etc
+#include <string>                       // for string
+#include <utility>                      // for pair
+
+#include "syntaxnet/base.h"
+
+// ***************************** UnicodeText **************************
+//
+// A UnicodeText object is a container for a sequence of Unicode
+// codepoint values. It has default, copy, and assignment constructors.
+// Data can be appended to it from another UnicodeText, from
+// iterators, or from a single codepoint.
+//
+// The internal representation of the text is UTF-8. Since UTF-8 is a
+// variable-width format, UnicodeText does not provide random access
+// to the text, and changes to the text are permitted only at the end.
+//
+// The UnicodeText class defines a const_iterator. The dereferencing
+// operator (*) returns a codepoint (char32). The iterator is a
+// bidirectional, read-only iterator. It becomes invalid if the text
+// is changed.
+//
+// There are methods for appending and retrieving UTF-8 data directly.
+// The 'utf8_data' method returns a const char* that contains the
+// UTF-8-encoded version of the text; 'utf8_length' returns the number
+// of bytes in the UTF-8 data. An iterator's 'get_utf8' method stores up to
+// 4 bytes of UTF-8 data in a char array and returns the number of
+// bytes that it stored.
+//
+// Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
+// 0x10FFFF], but UnicodeText has the additional restriction that it
+// can contain only those characters that are valid for interchange on
+// the Web. This excludes all of the control codes except for carriage
+// return, line feed, and horizontal tab.  It also excludes
+// non-characters, but codepoints that are in the Private Use regions
+// are allowed, as are codepoints that are unassigned. (See the
+// Unicode reference for details.) The function UniLib::IsInterchangeValid
+// can be used as a test for this property.
+//
+// UnicodeTexts are safe. Every method that constructs or modifies a
+// UnicodeText tests for interchange-validity, and will substitute a
+// space for the invalid data. Such cases are reported via
+// LOG(WARNING).
+//
+// MEMORY MANAGEMENT: copy, take ownership, or point to
+//
+// A UnicodeText is either an "owner", meaning that it owns the memory
+// for the data buffer and will free it when the UnicodeText is
+// destroyed, or it is an "alias", meaning that it does not.
+//
+// There are three methods for storing UTF-8 data in a UnicodeText:
+//
+// CopyUTF8(buffer, len) copies buffer.
+//
+// TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer.
+//
+// PointToUTF8(buffer, size) creates an alias pointing to buffer.
+//
+// All three methods perform a validity check on the buffer. There are
+// private, "unsafe" versions of these functions that bypass the
+// validity check. They are used internally and by friend-functions
+// that are handling UTF-8 data that has already been validated.
+//
+// The purpose of an alias is to avoid making an unnecessary copy of a
+// UTF-8 buffer while still providing access to the Unicode values
+// within that text through iterators or the fast scanners that are
+// based on UTF-8 state tables. The lifetime of an alias must not
+// exceed the lifetime of the buffer from which it was constructed.
+//
+// The semantics of an alias might be described as "copy on write or
+// repair." The source data is never modified. If push_back() or
+// append() is called on an alias, a copy of the data will be created,
+// and the UnicodeText will become an owner. If clear() is called on
+// an alias, it becomes an (empty) owner.
+//
+// The copy constructor and the assignment operator produce an owner.
+// That is, after direct initialization ("UnicodeText x(y);") or copy
+// initialization ("UnicodeText x = y;") x will be an owner, even if y
+// was an alias. The assignment operator ("x = y;") also produces an
+// owner unless x and y are the same object and y is an alias.
+//
+// Aliases should be used with care. If the source from which an alias
+// was created is freed, or if the contents are changed, while the
+// alias is still in use, fatal errors could result. But it can be
+// quite useful to have a UnicodeText "window" through which to see a
+// UTF-8 buffer without having to pay the price of making a copy.
+//
+// UTILITIES
+//
+// The interfaces in util/utf8/public/textutils.h provide higher-level
+// utilities for dealing with UnicodeTexts, including routines for
+// creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or
+// strings, creating strings from UnicodeTexts, normalizing text for
+// efficient matching or display, and others.
+
+class UnicodeText {
+ public:
+  class const_iterator;
+
+  typedef char32 value_type;
+
+  // Constructors. These always produce owners.
+  UnicodeText();  // Create an empty text.
+  UnicodeText(const UnicodeText& src);  // copy constructor
+  // Construct a substring (copies the data).
+  UnicodeText(const const_iterator& first, const const_iterator& last);
+
+  // Assignment operator. This copies the data and produces an owner
+  // unless this == &src, e.g., "x = x;", which is a no-op.
+  UnicodeText& operator=(const UnicodeText& src);
+
+  // x.Copy(y) copies the data from y into x.
+  UnicodeText& Copy(const UnicodeText& src);
+  inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); }
+
+  // x.PointTo(y) changes x so that it points to y's data.
+  // It does not copy y or take ownership of y's data.
+  UnicodeText& PointTo(const UnicodeText& src);
+  UnicodeText& PointTo(const const_iterator& first,
+                       const const_iterator& last);
+
+  ~UnicodeText();
+
+  void clear();  // Clear text.
+  bool empty() const { return repr_.size_ == 0; }  // Test if text is empty.
+
+  // Add a codepoint to the end of the text.
+  // If the codepoint is not interchange-valid, add a space instead
+  // and log a warning.
+  void push_back(char32 codepoint);
+
+  // Generic appending operation.
+  // iterator_traits<ForwardIterator>::value_type must be implicitly
+  // convertible to char32. Typical uses of this method might include:
+  //     char32 chars[] = {0x1, 0x2, ...};
+  //     vector<char32> more_chars = ...;
+  //     utext.append(chars, chars+arraysize(chars));
+  //     utext.append(more_chars.begin(), more_chars.end());
+  template<typename ForwardIterator>
+  UnicodeText& append(ForwardIterator first, const ForwardIterator last) {
+    while (first != last) { push_back(*first++); }
+    return *this;
+  }
+
+  // A specialization of the generic append() method.
+  UnicodeText& append(const const_iterator& first, const const_iterator& last);
+
+  // An optimization of append(source.begin(), source.end()).
+  UnicodeText& append(const UnicodeText& source);
+
+  int size() const;  // the number of Unicode characters (codepoints)
+
+  friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
+  friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs);
+
+  class const_iterator {
+    typedef const_iterator CI;
+   public:
+    typedef std::bidirectional_iterator_tag iterator_category;
+    typedef char32 value_type;
+    typedef ptrdiff_t difference_type;
+    typedef void pointer;  // (Not needed.)
+    typedef const char32 reference;  // (Needed for const_reverse_iterator)
+
+    // Iterators are default-constructible.
+    const_iterator();
+
+    // It's safe to make multiple passes over a UnicodeText.
+    const_iterator(const const_iterator& other);
+    const_iterator& operator=(const const_iterator& other);
+
+    char32 operator*() const;  // Dereference
+
+    const_iterator& operator++();  // Advance (++iter)
+    const_iterator operator++(int) {  // (iter++)
+      const_iterator result(*this);
+      ++*this;
+      return result;
+    }
+
+    const_iterator& operator--();  // Retreat (--iter)
+    const_iterator operator--(int) {  // (iter--)
+      const_iterator result(*this);
+      --*this;
+      return result;
+    }
+
+    // We love relational operators.
+    friend bool operator==(const CI& lhs, const CI& rhs) {
+      return lhs.it_ == rhs.it_; }
+    friend bool operator!=(const CI& lhs, const CI& rhs) {
+      return !(lhs == rhs); }
+    friend bool operator<(const CI& lhs, const CI& rhs);
+    friend bool operator>(const CI& lhs, const CI& rhs) {
+      return rhs < lhs; }
+    friend bool operator<=(const CI& lhs, const CI& rhs) {
+      return !(rhs < lhs); }
+    friend bool operator>=(const CI& lhs, const CI& rhs) {
+      return !(lhs < rhs); }
+
+    friend difference_type distance(const CI& first, const CI& last);
+
+    // UTF-8-specific methods
+    // Store the UTF-8 encoding of the current codepoint into buf,
+    // which must be at least 4 bytes long. Return the number of
+    // bytes written.
+    int get_utf8(char* buf) const;
+    // Return the UTF-8 character that the iterator points to.
+    string get_utf8_string() const;
+    // Return the byte length of the UTF-8 character the iterator points to.
+    int utf8_length() const;
+    // Return the iterator's pointer into the UTF-8 data.
+    const char* utf8_data() const { return it_; }
+
+    string DebugString() const;
+
+   private:
+    friend class UnicodeText;
+    friend class UnicodeTextUtils;
+    friend class UTF8StateTableProperty;
+    explicit const_iterator(const char* it) : it_(it) {}
+
+    const char* it_;
+  };
+
+  const_iterator begin() const;
+  const_iterator end() const;
+
+  class const_reverse_iterator : public std::reverse_iterator<const_iterator> {
+   public:
+    explicit const_reverse_iterator(const_iterator it) :
+        std::reverse_iterator<const_iterator>(it) {}
+    const char* utf8_data() const {
+      const_iterator tmp_it = base();
+      return (--tmp_it).utf8_data();
+    }
+    int get_utf8(char* buf) const {
+      const_iterator tmp_it = base();
+      return (--tmp_it).get_utf8(buf);
+    }
+    string get_utf8_string() const {
+      const_iterator tmp_it = base();
+      return (--tmp_it).get_utf8_string();
+    }
+    int utf8_length() const {
+      const_iterator tmp_it = base();
+      return (--tmp_it).utf8_length();
+    }
+  };
+  const_reverse_iterator rbegin() const {
+    return const_reverse_iterator(end());
+  }
+  const_reverse_iterator rend() const {
+    return const_reverse_iterator(begin());
+  }
+
+  // Substring searching.  Returns the beginning of the first
+  // occurrence of "look", or end() if not found.
+  const_iterator find(const UnicodeText& look, const_iterator start_pos) const;
+  // Equivalent to find(look, begin())
+  const_iterator find(const UnicodeText& look) const;
+
+  // Returns whether this contains the character U+FFFD.  This can
+  // occur, for example, if the input to Encodings::Decode() had byte
+  // sequences that were invalid in the source encoding.
+  bool HasReplacementChar() const;
+
+  // UTF-8-specific methods
+  //
+  // Return the data, length, and capacity of UTF-8-encoded version of
+  // the text. Length and capacity are measured in bytes.
+  const char* utf8_data() const { return repr_.data_; }
+  int utf8_length() const { return repr_.size_; }
+  int utf8_capacity() const { return repr_.capacity_; }
+
+  // Return the UTF-8 data as a string.
+  static string UTF8Substring(const const_iterator& first,
+                              const const_iterator& last);
+
+  // There are three methods for initializing a UnicodeText from UTF-8
+  // data. They vary in details of memory management. In all cases,
+  // the data is tested for interchange-validity. If it is not
+  // interchange-valid, a LOG(WARNING) is issued, and each
+  // structurally invalid byte and each interchange-invalid codepoint
+  // is replaced with a space.
+
+  // x.CopyUTF8(buf, len) copies buf into x.
+  UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);
+
+  // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of
+  // buf. buf is not copied.
+  UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer,
+                                   int byte_length,
+                                   int byte_capacity);
+
+  // x.PointToUTF8(buf,len) changes x so that it points to buf
+  // ("becomes an alias"). It does not take ownership or copy buf.
+  // If the buffer is not valid, this has the same effect as
+  // CopyUTF8(utf8_buffer, byte_length).
+  UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);
+
+  // Occasionally it is necessary to use functions that operate on the
+  // pointer returned by utf8_data(). MakeIterator(p) provides a way
+  // to get back to the UnicodeText level. It uses CHECK to ensure
+  // that p is a pointer within this object's UTF-8 data, and that it
+  // points to the beginning of a character.
+  const_iterator MakeIterator(const char* p) const;
+
+  string DebugString() const;
+
+ private:
+  friend class const_iterator;
+  friend class UnicodeTextUtils;
+
+  // A byte-string: a raw buffer plus its size, its capacity and a flag
+  // recording whether this object owns the buffer. Not copyable.
+  class Repr {
+   public:
+    char* data_;
+    int size_;
+    int capacity_;
+    bool ours_;  // Do we own data_?
+
+    // Starts with no buffer, zero size and capacity, and ours_ set.
+    Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {}
+    // Releases data_ only when this object owns it.
+    ~Repr() { if (ours_) delete[] data_; }
+
+    void clear();
+    void reserve(int capacity);
+    void resize(int size);
+
+    void append(const char* bytes, int byte_length);
+    void Copy(const char* data, int size);
+    void TakeOwnershipOf(char* data, int size, int capacity);
+    void PointTo(const char* data, int size);
+
+    string DebugString() const;
+
+   private:
+    // Copying is disallowed. The operations are deleted rather than left
+    // declared-but-undefined, so an attempted copy is rejected by the
+    // compiler instead of surfacing as a link error.
+    Repr& operator=(const Repr&) = delete;
+    Repr(const Repr& other) = delete;
+  };
+
+  Repr repr_;
+
+  // UTF-8-specific private methods.
+  // These routines do not perform a validity check when compiled
+  // in opt mode.
+  // It is an error to call these methods with UTF-8 data that
+  // is not interchange-valid.
+  //
+  UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length);
+  UnicodeText& UnsafeTakeOwnershipOfUTF8(
+      char* utf8_buffer, int byte_length, int byte_capacity);
+  UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length);
+  UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length);
+  const_iterator UnsafeFind(const UnicodeText& look,
+                            const_iterator start_pos) const;
+};
+
+bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
+
+// Inequality is defined as the negation of operator==.
+inline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) {
+  const bool same = (lhs == rhs);
+  return !same;
+}
+
+// A UnicodeTextRange names a text segment by a pair of iterators. The
+// segment is empty when the two iterators compare equal.
+using UnicodeTextRange = pair<UnicodeText::const_iterator,
+                              UnicodeText::const_iterator>;
+
+// True when both ends of the range are the same iterator.
+inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) {
+  const UnicodeText::const_iterator& range_begin = r.first;
+  const UnicodeText::const_iterator& range_end = r.second;
+  return range_begin == range_end;
+}
+
+
+// *************************** Utilities *************************
+
+// A factory function for creating a UnicodeText from a buffer of
+// UTF-8 data. The new UnicodeText takes ownership of the buffer. (It
+// is an "owner.")
+//
+// Each byte that is structurally invalid will be replaced with a
+// space. Each codepoint that is interchange-invalid will also be
+// replaced with a space, even if the codepoint was represented with a
+// multibyte sequence in the UTF-8 data.
+//
+inline UnicodeText MakeUnicodeTextAcceptingOwnership(
+    char* utf8_buffer, int byte_length, int byte_capacity) {
+  // Build the result in a named local, as UTF8ToUnicodeText() does.
+  // Returning UnicodeText().TakeOwnershipOfUTF8(...) directly would
+  // copy-construct the return value from a reference to a temporary, and
+  // copying a UnicodeText copies its data, so the caller would receive a
+  // fresh copy instead of an owner of utf8_buffer.
+  UnicodeText t;
+  t.TakeOwnershipOfUTF8(utf8_buffer, byte_length, byte_capacity);
+  return t;
+}
+
+// A factory function for creating a UnicodeText from a buffer of
+// UTF-8 data. The new UnicodeText does not take ownership of the
+// buffer. (It is an "alias.")
+//
+inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership(
+    const char* utf8_buffer, int byte_length) {
+  // Build the result in a named local, as UTF8ToUnicodeText() does.
+  // Returning UnicodeText().PointToUTF8(...) directly would copy-construct
+  // the return value from a reference to a temporary, and copying a
+  // UnicodeText copies its data, so the result would not be an alias of
+  // utf8_buffer.
+  UnicodeText t;
+  t.PointToUTF8(utf8_buffer, byte_length);
+  return t;
+}
+
+// Create a UnicodeText from a UTF-8 string or buffer.
+//
+// If do_copy is true, then a copy of the string is made. The copy is
+// owned by the resulting UnicodeText object and will be freed when
+// the object is destroyed. This UnicodeText object is referred to
+// as an "owner."
+//
+// If do_copy is false, then no copy is made. The resulting
+// UnicodeText object does NOT take ownership of the string; in this
+// case, the lifetime of the UnicodeText object must not exceed the
+// lifetime of the string. This UnicodeText object is referred to as
+// an "alias." This is the same as MakeUnicodeTextWithoutAcceptingOwnership.
+//
+// If the input string does not contain valid UTF-8, then a copy is
+// made (as if do_copy were true) and coerced to valid UTF-8 by
+// replacing each invalid byte with a space.
+//
+inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len,
+                                     bool do_copy) {
+  UnicodeText result;
+  if (!do_copy) {
+    // Point at the caller's buffer.
+    result.PointToUTF8(utf8_buf, len);
+  } else {
+    // Take a private copy of the bytes.
+    result.CopyUTF8(utf8_buf, len);
+  }
+  return result;
+}
+
+// String overload: forwards the string's bytes and size to the buffer
+// overload.
+inline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) {
+  const char* const bytes = utf_string.data();
+  const int byte_length = utf_string.size();
+  return UTF8ToUnicodeText(bytes, byte_length, do_copy);
+}
+
+// Buffer overload without do_copy: always copies.
+inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) {
+  const bool do_copy = true;
+  return UTF8ToUnicodeText(utf8_buf, len, do_copy);
+}
+// String overload without do_copy: always copies.
+inline UnicodeText UTF8ToUnicodeText(const string& utf8_string) {
+  const bool do_copy = true;
+  return UTF8ToUnicodeText(utf8_string, do_copy);
+}
+
+// Builds a string from the utf8_length() bytes starting at utf8_data(),
+// i.e. the UTF-8 encoding of every character in t.
+inline string UnicodeTextToUTF8(const UnicodeText& t) {
+  const char* const bytes = t.utf8_data();
+  const int byte_length = t.utf8_length();
+  return string(bytes, byte_length);
+}
+
+
+// For debugging.  Return a string of integers, written in uppercase
+// hex (%X), corresponding to the codepoints within the text. Each
+// integer is followed by a space. E.g., "61 62 6A 3005 ".
+string CodepointString(const UnicodeText& t);
+
+#endif  // UTIL_UTF8_PUBLIC_UNICODETEXT_H_
--- a/syntaxnet/util/utf8/unicodetext_main.cc
+++ b/syntaxnet/util/utf8/unicodetext_main.cc
+/**
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: sligocki@google.com (Shawn Ligocki)
+//
+// A basic main function to test that UnicodeText builds.
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <string>
+
+#include "util/utf8/unicodetext.h"
+
+// When given an argument, prints every byte of argv[1] in hex, then every
+// codepoint of the UnicodeText built from those bytes. Always returns
+// EXIT_SUCCESS.
+int main(int argc, char** argv) {
+  if (argc > 1) {
+    printf("Bytes:\n");
+    std::string bytes(argv[1]);
+    for (std::string::const_iterator iter = bytes.begin();
+         iter != bytes.end(); ++iter) {
+      // Convert through unsigned char first: where plain char is signed, a
+      // byte >= 0x80 would otherwise be sign-extended and print as
+      // 0xFFFFFFxx instead of 0xxx.
+      const unsigned int byte = static_cast<unsigned char>(*iter);
+      printf("  0x%02X\n", byte);
+    }
+
+    printf("Unicode codepoints:\n");
+    UnicodeText text(UTF8ToUnicodeText(bytes));
+    for (UnicodeText::const_iterator iter = text.begin();
+         iter != text.end(); ++iter) {
+      // %X expects an unsigned int argument, so pass one explicitly.
+      printf("  U+%X\n", static_cast<unsigned int>(*iter));
+    }
+  }
+  return EXIT_SUCCESS;
+}
--- a/syntaxnet/util/utf8/unicodetext_unittest.cc
+++ b/syntaxnet/util/utf8/unicodetext_unittest.cc
+/**
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util/utf8/unicodetext.h"
+
+#include <iterator>
+#include <set>
+
+#include "gtest/gtest.h"
+#include "third_party/utf/utf.h"
+#include "util/utf8/unilib.h"
+
+namespace {
+
+// ArraySizeHelper is only declared here: it takes a reference to an array
+// of N elements and returns a reference to a char[N]. arraysize(array) is
+// the sizeof that return type, i.e. the element count N, and the helper is
+// never actually called because sizeof does not evaluate its operand.
+template <typename T, size_t N>
+char (&ArraySizeHelper(T (&array)[N]))[N];
+#define arraysize(array) (sizeof(ArraySizeHelper(array)))
+
+// Fixture providing an empty UnicodeText and one holding five codepoints.
+class UnicodeTextTest : public testing::Test {
+ protected:
+  UnicodeTextTest() : empty_text_() {
+    // The codepoints appended to text_.
+    const char32 codepoints[] = {0x1C0, 0x4E8C, 0xD7DB, 0x34, 0x1D11E};
+    const char32* const first = &codepoints[0];
+    const char32* const last = codepoints + arraysize(codepoints);
+    text_.append(first, last);
+  }
+
+  UnicodeText empty_text_;
+  UnicodeText text_;
+};
+
+// Checks that |text| holds exactly U+304A, U+00B0, U+106B, in that order.
+void ExpectOwnershipTestCodepoints(const UnicodeText& text) {
+  UnicodeText::const_iterator it = text.begin();
+  EXPECT_EQ(*it++, 0x304A);
+  EXPECT_EQ(*it++, 0x00B0);
+  EXPECT_EQ(*it++, 0x106B);
+  CHECK(it == text.end());
+}
+
+// Exercises TakeOwnershipOfUTF8, copy construction, assignment, PointToUTF8
+// and PointTo, checking in each case whether the data pointer is shared.
+TEST(UnicodeTextTest, Ownership) {
+  const string src =  "\u304A\u00B0\u106B";
+  {
+    string bytes = src;
+    char* heap_copy = new char[bytes.size()];
+    memcpy(heap_copy, bytes.data(), bytes.size());
+    UnicodeText owned;
+    owned.TakeOwnershipOfUTF8(heap_copy, bytes.size(), bytes.size());
+    EXPECT_EQ(owned.utf8_data(), heap_copy);
+    bytes.clear();
+    // owned should be OK even after the string has been cleared.
+    ExpectOwnershipTestCodepoints(owned);
+  }
+
+  {
+    UnicodeText owner;
+    {  // Create a new scope for the string and the donor text.
+      string bytes = src;
+      char* heap_copy = new char[bytes.size()];
+      memcpy(heap_copy, bytes.data(), bytes.size());
+      UnicodeText donor;
+      donor.TakeOwnershipOfUTF8(heap_copy, bytes.size(), bytes.size());
+      EXPECT_EQ(donor.utf8_data(), heap_copy);
+      owner = donor;  // Copies the data
+      EXPECT_NE(owner.utf8_data(), heap_copy);
+    }
+    // owner should be OK even after the inner scope has ended.
+    ExpectOwnershipTestCodepoints(owner);
+  }
+
+  {
+    UnicodeText alias;
+    alias.PointToUTF8(src.data(), src.size());
+    EXPECT_EQ(alias.utf8_data(), src.data());
+    ExpectOwnershipTestCodepoints(alias);
+
+    UnicodeText copied = alias;  // Copy initialization copies the data.
+    EXPECT_NE(copied.utf8_data(), alias.utf8_data());
+
+    UnicodeText assigned;
+    assigned = alias;  // Assignment copies the data.
+    EXPECT_NE(assigned.utf8_data(), alias.utf8_data());
+
+    // Preserve an alias.
+    copied.PointTo(alias); // This does not copy the data.
+    EXPECT_EQ(copied.utf8_data(), alias.utf8_data());
+
+    copied.push_back(0x0020); // Modify the alias
+    // It's no longer an alias.
+    EXPECT_NE(copied.utf8_data(), alias.utf8_data());
+  }
+}
+
+// Fixture for the iterator tests; adds nothing to UnicodeTextTest.
+class IteratorTest : public UnicodeTextTest {
+};
+
+// Walks text_ forward using both pre- and post-increment, checking each
+// dereferenced codepoint and that the walk ends exactly at end().
+TEST_F(IteratorTest, Iterates) {
+  UnicodeText::const_iterator iter = text_.begin();
+  EXPECT_EQ(0x1C0, *iter);
+  EXPECT_EQ(&iter, &++iter);  // operator++ returns *this.
+  EXPECT_EQ(0x4E8C, *iter++);
+  EXPECT_EQ(0xD7DB, *iter);
+  // Make sure you can dereference more than once.
+  EXPECT_EQ(0xD7DB, *iter);
+  EXPECT_EQ(0x34, *++iter);
+  EXPECT_EQ(0x1D11E, *++iter);
+  // Still on the last codepoint, so not yet at end().
+  ASSERT_TRUE(iter != text_.end());
+  iter++;
+  EXPECT_TRUE(iter == text_.end());
+}
+
+// Walks text_ from rbegin() to rend() with a const_reverse_iterator,
+// expecting the codepoints in reverse order.
+TEST_F(IteratorTest, Reverse) {
+  UnicodeText::const_reverse_iterator iter = text_.rbegin();
+  EXPECT_EQ(0x1D11E, *iter);
+  EXPECT_EQ(&iter, &++iter);  // operator++ returns *this.
+  EXPECT_EQ(0x34, *iter++);
+  EXPECT_EQ(0xD7DB, *iter);
+  // Make sure you can dereference more than once.
+  EXPECT_EQ(0xD7DB, *iter);
+  EXPECT_EQ(0x4E8C, *++iter);
+  EXPECT_EQ(0x1C0, *++iter);
+  // Still on the first codepoint of text_, so not yet at rend().
+  ASSERT_TRUE(iter != text_.rend());
+  iter++;
+  EXPECT_TRUE(iter == text_.rend());
+}
+
+// Checks that a copy of an iterator is unaffected by advancing the
+// original, and that the two compare equal again once the copy catches up.
+TEST_F(IteratorTest, MultiPass) {
+  // Also tests Default Constructible and Assignable.
+  UnicodeText::const_iterator i1, i2;
+  i1 = text_.begin();
+  i2 = i1;
+  EXPECT_EQ(0x4E8C, *++i1);
+  EXPECT_TRUE(i1 != i2);
+  // i2 still refers to the first codepoint.
+  EXPECT_EQ(0x1C0, *i2);
+  ++i2;
+  EXPECT_TRUE(i1 == i2);
+  EXPECT_EQ(0x4E8C, *i2);
+}
+
+// Walks a const_iterator backward from end() to begin() using both pre-
+// and post-decrement, checking each dereferenced codepoint.
+TEST_F(IteratorTest, ReverseIterates) {
+  UnicodeText::const_iterator iter = text_.end();
+  EXPECT_TRUE(iter == text_.end());
+  iter--;
+  ASSERT_TRUE(iter != text_.end());
+  EXPECT_EQ(0x1D11E, *iter--);
+  EXPECT_EQ(0x34, *iter);
+  EXPECT_EQ(0xD7DB, *--iter);
+  // Make sure you can dereference more than once.
+  EXPECT_EQ(0xD7DB, *iter);
+  --iter;
+  EXPECT_EQ(0x4E8C, *iter--);
+  EXPECT_EQ(0x1C0, *iter);
+  EXPECT_TRUE(iter == text_.begin());
+}
+
+// Exercises the ordering operators <, <=, >= and > on const_iterator.
+TEST_F(IteratorTest, Comparable) {
+  UnicodeText::const_iterator i1, i2;
+  i1 = text_.begin();
+  i2 = i1;
+  ++i2;
+
+  // Here i1 is at begin() and i2 is one position after it.
+  EXPECT_TRUE(i1 < i2);
+  EXPECT_TRUE(text_.begin() <= i1);
+  EXPECT_FALSE(i1 >= i2);
+  EXPECT_FALSE(i1 > text_.end());
+}
+
+// std::advance by 4 from begin() must land on the fifth codepoint, one
+// step short of end().
+TEST_F(IteratorTest, Advance) {
+  UnicodeText::const_iterator cursor = text_.begin();
+  EXPECT_EQ(0x1C0, *cursor);
+
+  std::advance(cursor, 4);
+  EXPECT_EQ(0x1D11E, *cursor);
+
+  ++cursor;
+  EXPECT_TRUE(cursor == text_.end());
+}
+
+// distance() between iterators must count codepoints as a cursor moves
+// from begin() to end() of the five-codepoint text.
+TEST_F(IteratorTest, Distance) {
+  UnicodeText::const_iterator cursor = text_.begin();
+  // At the start: nothing before the cursor, everything after it.
+  EXPECT_EQ(0, distance(text_.begin(), cursor));
+  EXPECT_EQ(5, distance(cursor, text_.end()));
+
+  // Two steps in.
+  ++cursor;
+  ++cursor;
+  EXPECT_EQ(2, distance(text_.begin(), cursor));
+  EXPECT_EQ(3, distance(cursor, text_.end()));
+
+  // Four steps in.
+  ++cursor;
+  ++cursor;
+  EXPECT_EQ(4, distance(text_.begin(), cursor));
+
+  // Five steps in: nothing left before end().
+  ++cursor;
+  EXPECT_EQ(0, distance(cursor, text_.end()));
+}
+
+// Checks the UTF-8 encoding of text_ as a whole, then codepoint by
+// codepoint through forward and reverse iterators, and finally checks how
+// push_back grows the size and the encoded length.
+TEST_F(IteratorTest, Encode) {
+  // Expected encoding of the five fixture codepoints and their byte lengths.
+  const string utf8 = "\xC7\x80"
+                      "\xE4\xBA\x8C"
+                      "\xED\x9F\x9B"
+                      "\x34"
+                      "\xF0\x9D\x84\x9E";
+  const int lengths[] = {2, 3, 3, 1, 4};
+  EXPECT_EQ(text_.size(), 5);
+  EXPECT_EQ(text_.utf8_length(), 13);
+  EXPECT_TRUE(memcmp(text_.utf8_data(), utf8.data(), text_.utf8_length())
+              == 0);
+
+  {
+    // Test the iterator
+    UnicodeText::const_iterator iter = text_.begin(), end = text_.end();
+    const char* u = utf8.data();
+    int i = 0;
+    while (iter != end) {
+      char buf[5];  // Room for up to 4 bytes plus the terminator added below.
+      int n = iter.get_utf8(buf);
+      buf[n] = '\0';
+      EXPECT_TRUE(strncmp(buf, u, n) == 0);
+      EXPECT_EQ(buf, iter.get_utf8_string());
+      EXPECT_EQ(lengths[i], iter.utf8_length());
+      u += n;
+      iter++;
+      i++;
+    }
+  }
+
+  {
+    // Test the reverse_iterator
+    UnicodeText::const_reverse_iterator iter = text_.rbegin();
+    UnicodeText::const_reverse_iterator end = text_.rend();
+    const char* u = utf8.data() + utf8.size();
+    int i = 0;
+    while (iter != end) {
+      char buf[5];  // Room for up to 4 bytes plus the terminator added below.
+      int n = iter.get_utf8(buf);
+      buf[n] = '\0';
+      // Step the expected-bytes pointer back before comparing.
+      u -= n;
+      EXPECT_TRUE(strncmp(buf, u, n) == 0);
+      EXPECT_EQ(buf, iter.get_utf8_string());
+      EXPECT_EQ(lengths[text_.size() - i - 1], iter.utf8_length());
+      iter++;
+      i++;
+    }
+  }
+
+  text_.push_back('$');
+  EXPECT_EQ(text_.size(), 6);
+  EXPECT_EQ(text_.utf8_length(), 14);
+
+  // Append U+00AE (registered sign) as an integer codepoint. The character
+  // literal '\xAE' has a negative value wherever plain char is signed, so
+  // it only denoted U+00AE when the build used -funsigned-char.
+  text_.push_back(0xAE);
+  EXPECT_EQ(text_.size(), 7);
+  EXPECT_EQ(text_.utf8_length(), 16);  // 2 bytes long
+}
+
+// Iterating text_ must yield the fixture codepoints in order, and
+// CodepointString must print them as space-terminated uppercase hex.
+TEST_F(IteratorTest, Decode) {
+  const char32 expected[] = {0x1C0, 0x4E8C, 0xD7DB, 0x34, 0x1D11E};
+  UnicodeText::const_iterator iter = text_.begin();
+  int index = 0;
+  while (index < 5) {
+    EXPECT_EQ(expected[index], *iter++);
+    ++index;
+  }
+  const string hex = CodepointString(text_);
+  EXPECT_EQ(hex, "1C0 4E8C D7DB 34 1D11E ");
+}
+
+
+
+// Fixture for the clear()/empty() tests; adds nothing to UnicodeTextTest.
+class OperatorTest : public UnicodeTextTest {
+};
+
+// After clear(), text_ must compare equal to a UnicodeText built from "".
+TEST_F(OperatorTest, Clear) {
+  UnicodeText blank(UTF8ToUnicodeText(""));
+  const bool equal_before = (text_ == blank);
+  EXPECT_FALSE(equal_before);
+
+  text_.clear();
+
+  const bool equal_after = (text_ == blank);
+  EXPECT_TRUE(equal_after);
+}
+
+// empty() is true for the default-constructed fixture text, false for the
+// populated one, and true again once that one has been cleared.
+TEST_F(OperatorTest, Empty) {
+  const bool default_is_empty = empty_text_.empty();
+  EXPECT_TRUE(default_is_empty);
+
+  const bool populated_is_empty = text_.empty();
+  EXPECT_FALSE(populated_is_empty);
+
+  text_.clear();
+  const bool cleared_is_empty = text_.empty();
+  EXPECT_TRUE(cleared_is_empty);
+}
+
+// The bytes EF B7 90 (U+FDD0) are not interchange-valid. Whether they
+// arrive through either factory function or as a pushed codepoint, the
+// text must end up holding a single U+0020.
+TEST(UnicodeTextTest, InterchangeValidity) {
+  char* nonchar_bytes = new char[3];
+  memcpy(nonchar_bytes, "\xEF\xB7\x90", 3);
+  EXPECT_FALSE(UniLib::IsInterchangeValid(nonchar_bytes, 3));
+
+  // Through the non-owning factory.
+  UnicodeText text = MakeUnicodeTextWithoutAcceptingOwnership(nonchar_bytes, 3);
+  EXPECT_EQ(text.size(), 1);
+  EXPECT_EQ(*text.begin(), 0x20);
+  text.clear();
+  text.push_back(0xFDD0);
+  EXPECT_EQ(text.size(), 1);
+  EXPECT_EQ(*text.begin(), 0x20);
+
+  // Through the owning factory, which is handed the heap buffer.
+  text = MakeUnicodeTextAcceptingOwnership(nonchar_bytes, 3, 3);
+  EXPECT_EQ(text.size(), 1);
+  EXPECT_EQ(*text.begin(), 0x20);
+  text.clear();
+  text.push_back(0xFDD0);
+  EXPECT_EQ(text.size(), 1);
+  EXPECT_EQ(*text.begin(), 0x20);
+}
+
+// Fixture for the substring-search tests; adds nothing to UnicodeTextTest.
+class SubstringSearchTest : public UnicodeTextTest {
+};
+
+// TEST_F(SubstringSearchTest, FindEmpty) {
+//   EXPECT_TRUE(text_.find(empty_text_) == text_.begin());
+//   EXPECT_TRUE(empty_text_.find(text_) == empty_text_.end());
+// }
+
+// TEST_F(SubstringSearchTest, Find) {
+//   UnicodeText::const_iterator second_pos = text_.begin();
+//   ++second_pos;
+//   UnicodeText::const_iterator third_pos = second_pos;
+//   ++third_pos;
+//   UnicodeText::const_iterator fourth_pos = third_pos;
+//   ++fourth_pos;
+
+//   // same as text_
+//   const char32 text[] = {0x1C0, 0x4E8C, 0xD7DB, 0x34, 0x1D11E};
+
+//   UnicodeText prefix;
+//   prefix.append(&text[0], &text[2]);
+//   EXPECT_TRUE(text_.find(prefix) == text_.begin());
+//   EXPECT_TRUE(text_.find(prefix, second_pos) == text_.end());
+
+//   UnicodeText suffix;
+//   suffix.append(&text[2], text + arraysize(text));
+//   EXPECT_TRUE(text_.find(suffix) == third_pos);
+//   EXPECT_TRUE(text_.find(suffix, second_pos) == third_pos);
+//   EXPECT_TRUE(text_.find(suffix, third_pos) == third_pos);
+//   EXPECT_TRUE(text_.find(suffix, fourth_pos) == text_.end());
+// }
+
+// TEST_F(SubstringSearchTest, HasConversionError) {
+//   EXPECT_FALSE(text_.HasReplacementChar());
+//   const char32 beg[] = {0xFFFD, 0x1C0, 0x4E8C, 0xD7DB, 0x34, 0x1D11E};
+//   UnicodeText beg_uni;
+//   beg_uni.append(&beg[0], beg + arraysize(beg));
+//   EXPECT_TRUE(beg_uni.HasReplacementChar());
+
+//   const char32 mid[] = {0x1C0, 0x4E8C, 0xFFFD, 0xD7DB, 0x34, 0x1D11E};
+//   UnicodeText mid_uni;
+//   mid_uni.append(&mid[0], mid + arraysize(mid));
+//   EXPECT_TRUE(mid_uni.HasReplacementChar());
+
+//   const char32 end[] = {0x1C0, 0x4E8C, 0xD7DB, 0x34, 0x1D11E, 0xFFFD};
+//   UnicodeText end_uni;
+//   end_uni.append(&end[0], end + arraysize(end));
+//   EXPECT_TRUE(end_uni.HasReplacementChar());
+
+//   const char32 two[] = {0xFFFD, 0x1C0, 0x4E8C, 0xD7DB, 0x34, 0x1D11E, 0xFFFD};
+//   UnicodeText two_uni;
+//   two_uni.append(&two[0], two + arraysize(two));
+//   EXPECT_TRUE(two_uni.HasReplacementChar());
+
+//   const char32 adj[] = {0x1C0, 0xFFFD, 0xFFFD, 0x4E8C, 0xD7DB, 0x34, 0x1D11E};
+//   UnicodeText adj_uni;
+//   adj_uni.append(&adj[0], adj + arraysize(adj));
+//   EXPECT_TRUE(adj_uni.HasReplacementChar());
+// }
+
+}  // namespace
--- a/syntaxnet/util/utf8/unilib.cc
+++ b/syntaxnet/util/utf8/unilib.cc
+/**
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Author: sligocki@google.com (Shawn Ligocki)
+
+#include "util/utf8/unilib.h"
+
+#include "syntaxnet/base.h"
+#include "third_party/utf/utf.h"
+
+namespace UniLib {
+
+// Returns false for the values rejected below and true for everything else:
+//   U+0000..U+0008, U+000B and U+000E..U+001F (the C0 controls other than
+//       HT U+0009, LF U+000A, FF U+000C and CR U+000D)
+//   U+007F..U+009F (DEL and the C1 controls)
+//   U+D800..U+DFFF (surrogates)
+//   U+FDD0..U+FDEF, and any value whose low 16 bits are FFFE or FFFF
+//       (non-characters)
+bool IsInterchangeValid(char32 c) {
+  // Disallowed C0 controls.
+  if (c >= 0x00 && c <= 0x08) return false;
+  if (c == 0x0B) return false;
+  if (c >= 0x0E && c <= 0x1F) return false;
+  // DEL and C1 controls.
+  if (c >= 0x7F && c <= 0x9F) return false;
+  // Surrogates.
+  if (c >= 0xD800 && c <= 0xDFFF) return false;
+  // Non-characters.
+  if (c >= 0xFDD0 && c <= 0xFDEF) return false;
+  if ((c & 0xFFFE) == 0xFFFE) return false;
+  return true;
+}
+
+// Decodes runes from the front of the buffer and returns the number of
+// bytes consumed before the first decoding error or interchange-invalid
+// rune; returns the distance scanned if neither is found.
+int SpanInterchangeValid(const char* begin, int byte_length) {
+  const char* const limit = begin + byte_length;
+  const char* cursor = begin;
+  for (; cursor < limit; ) {
+    char32 rune;
+    const int consumed = charntorune(&rune, cursor, limit - cursor);
+    // Runeerror (U+FFFD) doubles as the decoder's error marker. A real
+    // U+FFFD takes 3 bytes, whereas an error reports at most 1 byte
+    // consumed, which is how the two are told apart.
+    const bool decode_error = (rune == Runeerror) && (consumed <= 1);
+    if (decode_error || !IsInterchangeValid(rune)) {
+      return cursor - begin;  // Found
+    }
+    cursor += consumed;
+  }
+  return cursor - begin;
+}
+
+}  // namespace UniLib
--- a/syntaxnet/util/utf8/unilib.h
+++ b/syntaxnet/util/utf8/unilib.h
+/**
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Routines to do manipulation of Unicode characters or text
+//
+// The StructurallyValid routines accept buffers of arbitrary bytes.
+// For CoerceToStructurallyValid(), the input buffer and output buffers may
+// point to exactly the same memory.
+//
+// In all other cases, the UTF-8 string must be structurally valid and
+// have all codepoints in the range  U+0000 to U+D7FF or U+E000 to U+10FFFF.
+// Debug builds take a fatal error for invalid UTF-8 input.
+// The input and output buffers may not overlap at all.
+//
+// The char32 routines are here only for convenience; they convert to UTF-8
+// internally and use the UTF-8 routines.
+
+#ifndef UTIL_UTF8_UNILIB_H__
+#define UTIL_UTF8_UNILIB_H__
+
+#include <string>
+#include "syntaxnet/base.h"
+
+// OneCharLen, IsValidCodepoint, and IsTrailByte are defined in
+// unilib_utf8_utils.h. The include that would re-export them from here is
+// commented out below, so include that header directly to use them.
+//#include "util/utf8/public/unilib_utf8_utils.h"  // IWYU pragma: export
+
+namespace UniLib {
+
+// Returns the length in bytes of the prefix of src that is all
+//  interchange valid UTF-8
+int SpanInterchangeValid(const char* src, int byte_length);
+// String overload: forwards the string's bytes and size.
+inline int SpanInterchangeValid(const std::string& src) {
+  const char* const bytes = src.data();
+  const int byte_length = src.size();
+  return SpanInterchangeValid(bytes, byte_length);
+}
+
+// Returns true if the source is all interchange valid UTF-8
+// "Interchange valid" is stronger than structurally valid --
+// no C0 or C1 control codes (other than CR LF HT FF) and no non-characters.
+bool IsInterchangeValid(char32 codepoint);
+// True when the interchange-valid prefix spans all byte_length bytes.
+inline bool IsInterchangeValid(const char* src, int byte_length) {
+  const int valid_prefix = SpanInterchangeValid(src, byte_length);
+  return valid_prefix == byte_length;
+}
+// String overload: forwards the string's bytes and size.
+inline bool IsInterchangeValid(const std::string& src) {
+  const char* const bytes = src.data();
+  const int byte_length = src.size();
+  return IsInterchangeValid(bytes, byte_length);
+}
+
+}  // namespace UniLib
+
+#endif  // UTIL_UTF8_UNILIB_H__
--- a/syntaxnet/util/utf8/unilib_utf8_utils.h
+++ b/syntaxnet/util/utf8/unilib_utf8_utils.h
+/**
+ * Copyright 2010 Google Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_
+#define UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_
+
+// These definitions are self-contained and have no dependencies.
+// They are also exported from unilib.h for legacy reasons.
+
+#include "syntaxnet/base.h"
+
+namespace UniLib {
+
+// True when 'c' lies in [0, 0xD800) or [0xE000, 0x10FFFF], i.e. it is not
+// a surrogate codepoint. See also IsValidCodepoint(const char* src) in
+// util/utf8/public/unilib.h.
+inline bool IsValidCodepoint(char32 c) {
+  // The unsigned comparison handles everything below the surrogate block.
+  if (static_cast<uint32>(c) < 0xD800) {
+    return true;
+  }
+  // Otherwise c must sit between the surrogates and the top of the range.
+  return c >= 0xE000 && c <= 0x10FFFF;
+}
+
+// Gives the byte length of the code point that starts at src, decided from
+// that first byte alone. src must point into well-formed UTF-8; otherwise
+// the result is undefined.
+inline int OneCharLen(const char* src) {
+  const unsigned int lead = static_cast<unsigned char>(*src);
+  if (lead < 0xC0) return 1;  // High nibble 0x0..0xB.
+  if (lead < 0xE0) return 2;  // High nibble 0xC or 0xD.
+  if (lead < 0xF0) return 3;  // High nibble 0xE.
+  return 4;                   // High nibble 0xF.
+}
+
+// True when x is a trailing UTF-8 byte, i.e. has the bit pattern 10xx xxxx
+// (a value in [0x80, 0xBF]).
+inline bool IsTrailByte(char x) {
+  // Keep only the two top bits and compare them with 10.
+  const int top_two_bits = x & 0xC0;
+  return top_two_bits == 0x80;
+}
+
+}  // namespace UniLib
+
+#endif  // UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_