Commit 32ab5a58 authored by calberti's avatar calberti Committed by Martin Wicke
Browse files

Adding SyntaxNet to tensorflow/models (#63)

parent 148a15fb
/*
* The authors of this software are Rob Pike and Ken Thompson.
* Copyright (c) 2002 by Lucent Technologies.
* Permission to use, copy, modify, and distribute this software for any
* purpose without fee is hereby granted, provided that this entire notice
* is included in all copies of any software which is or includes a copy
* or modification of this software and in all copies of the supporting
* documentation for such software.
* THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
* WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY
* REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
* OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
*/
#include <stdarg.h>
#include <string.h>
#include "third_party/utf/utf.h"
#include "third_party/utf/utfdef.h"
/*
 * utfutf: find the first occurrence of the UTF-8 string s2 within s1.
 * Returns a pointer into s1 at the start of the match, or 0 if s2
 * does not occur.
 */
const
char*
utfutf(const char *s1, const char *s2)
{
	const char *cur;
	long first, firstlen, needlelen;
	Rune lead;

	/* Decode the first rune of the needle; firstlen is its byte length. */
	firstlen = chartorune(&lead, s2);
	first = lead;
	if(first <= Runesync)		/* represents self */
		return strstr(s1, s2);

	/* Jump from one occurrence of the leading rune to the next. */
	needlelen = strlen(s2);
	cur = utfrune(s1, first);
	while(cur != 0) {
		if(strncmp(cur, s2, needlelen) == 0)
			return cur;
		cur = utfrune(cur + firstlen, first);
	}
	return 0;
}
# Options in the "cuda" config (enabled with --config=cuda): use the GPU
# crosstool that lives under //third_party/gpus.
build:cuda --crosstool_top=//third_party/gpus/crosstool
# --define values applied to every build; presumably consumed by
# protobuf-related build rules defined elsewhere -- TODO confirm.
build --define=use_fast_cpp_protos=true
build --define=allow_oversize_protos=true
# Pass -funsigned-char to the C++ compiler so that plain char is unsigned.
build --copt -funsigned-char
# Build in the optimized compilation mode.
build -c opt
# Use the "standalone" spawn strategy for the build, test and run commands.
build --spawn_strategy=standalone
test --spawn_strategy=standalone
run --spawn_strategy=standalone
licenses(["notice"])
# UnicodeText / UniLib library target.
# Requires --copt -funsigned-char when compiling (unsigned chars).
cc_library(
name = "unicodetext",
srcs = [
"unicodetext.cc",
"unilib.cc",
],
hdrs = [
"unicodetext.h",
"unilib.h",
"unilib_utf8_utils.h",
],
# Usable from any package.
visibility = ["//visibility:public"],
deps = [
"//syntaxnet:base",
"//third_party/utf",
],
)
# Unit test for :unicodetext. gtest_main.cc is compiled into the test
# alongside the test cases themselves.
cc_test(
name = "unicodetext_unittest",
srcs = [
"gtest_main.cc",
"unicodetext_unittest.cc",
],
deps = [
"@tf//tensorflow/core:testlib",
":unicodetext",
],
)
# Stand-alone binary built from unicodetext_main.cc and linked against
# :unicodetext.
cc_binary(
name = "unicodetext_main",
srcs = ["unicodetext_main.cc"],
deps = [":unicodetext"],
)
/**
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: sligocki@google.com (Shawn Ligocki)
//
// Build all tests with this main to run all tests.
#include "gtest/gtest.h"
// Test entry point: lets googletest consume its command-line flags, then
// runs every registered test and returns the resulting status code.
int main(int argc, char **argv) {
  ::testing::InitGoogleTest(&argc, argv);
  const int status = RUN_ALL_TESTS();
  return status;
}
/**
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "util/utf8/unicodetext.h"
#include <string.h> // for memcpy, NULL, memcmp, etc
#include <algorithm> // for max
//#include "base/logging.h" // for operator<<, CHECK, etc
//#include "base/stringprintf.h" // for StringPrintf, StringAppendF
//#include "strings/stringpiece.h" // for StringPiece, etc
#include "third_party/utf/utf.h" // for isvalidcharntorune, etc
#include "util/utf8/unilib.h" // for IsInterchangeValid, etc
#include "util/utf8/unilib_utf8_utils.h" // for OneCharLen
// Counts the codepoints encoded in the byte range [start, end).
// Every byte that is not a UTF-8 trail byte begins a new codepoint, so the
// count is simply the number of non-trail bytes.
static int CodepointDistance(const char* start, const char* end) {
  int count = 0;
  const char* cursor = start;
  while (cursor < end) {
    // Viewed as a signed char, trail bytes (10xxxxxx) are below -0x40.
    const signed char byte = *reinterpret_cast<const signed char*>(cursor);
    if (byte >= -0x40) {
      ++count;
    }
    ++cursor;
  }
  return count;
}
// Counts the codepoints in the len bytes starting at utf8.
static int CodepointCount(const char* utf8, int len) {
  const char* const stop = utf8 + len;
  return CodepointDistance(utf8, stop);
}
// Number of codepoints between two iterators, measured on the bytes
// lying between their underlying pointers.
UnicodeText::const_iterator::difference_type
distance(const UnicodeText::const_iterator& first,
         const UnicodeText::const_iterator& last) {
  const char* const from = first.it_;
  const char* const to = last.it_;
  return CodepointDistance(from, to);
}
// ---------- Utility ----------
// Rewrites the len bytes at start in place so that the result is
// interchange-valid UTF-8, and returns the number of bytes written.
static int ConvertToInterchangeValid(char* start, int len) {
// This routine is called only when we've discovered that a UTF-8 buffer
// that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8
// was not interchange valid. This indicates a bug in the caller, and
// a LOG(WARNING) is done in that case.
// This is similar to CoerceToInterchangeValid, but it replaces each
// structurally invalid byte with a space, and each non-interchange
// character with a space, even when that character requires more
// than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is
// structurally valid UTF8, but U+FDD0 is not an interchange-valid
// code point. The result should contain one space, not three.
//
// Since the conversion never needs to write more data than it
// reads, it is safe to change the buffer in place. It returns the
// number of bytes written.
char* const in = start;
char* out = start;
char* const end = start + len;
// 'start' is the read cursor and 'out' the write cursor; out <= start
// always holds, so writes never overtake unread input.
while (start < end) {
// Copy down the longest interchange-valid prefix of the remaining input.
int good = UniLib::SpanInterchangeValid(start, end - start);
if (good > 0) {
if (out != start) {
// The two ranges may overlap, hence memmove rather than memcpy.
memmove(out, start, good);
}
out += good;
start += good;
if (start == end) {
break;
}
}
// Is the current string invalid UTF8 or just non-interchange UTF8?
char32 rune;
int n;
if (isvalidcharntorune(start, end - start, &rune, &n)) {
// structurally valid UTF8, but not interchange valid
start += n; // Skip over the whole character.
} else { // bad UTF8
start += 1; // Skip over just one byte
}
// Whatever was skipped is replaced by a single space.
*out++ = ' ';
}
return out - in;
}
// *************** Data representation **********
// Note: the copy constructor is undefined.
// After reserve(), resize(), or clear(), we're an owner, not an alias.
// Makes this Repr the owner of a buffer with room for at least
// new_capacity bytes. Existing contents (size_ bytes) are preserved and
// size_ is left untouched.
void UnicodeText::Repr::reserve(int new_capacity) {
  // Nothing to do when we already own a large-enough buffer.
  const bool already_fits = ours_ && capacity_ >= new_capacity;
  if (already_fits) return;

  // Grow by roughly 1.5x (plus a little slack) or to the requested size,
  // whichever is larger.
  const int grown = (3 * capacity_) / 2 + 20;
  capacity_ = std::max(new_capacity, grown);
  char* const fresh = new char[capacity_];

  // Carry over the old bytes; release the old buffer only if it was ours.
  if (data_ != nullptr) {
    memcpy(fresh, data_, size_);
    if (ours_) {
      delete[] data_;
    }
  }
  data_ = fresh;
  ours_ = true;  // The freshly allocated buffer always belongs to us.
}
// Sets the logical size to new_size bytes. Shrinking to zero is the same
// as clear(); growing zero-fills the newly exposed bytes. Afterwards this
// Repr is an owner.
void UnicodeText::Repr::resize(int new_size) {
  if (new_size == 0) {
    clear();
    return;
  }
  // An alias, or an owner whose buffer is too small, needs a new buffer.
  if (!ours_ || new_size > capacity_) {
    reserve(new_size);
  }
  // Zero the bytes between the old and the new end, if any.
  const int grow_by = new_size - size_;
  if (grow_by > 0) {
    memset(data_ + size_, 0, grow_by);
  }
  size_ = new_size;
  ours_ = true;
}
// Resets to the empty state. An owned buffer is freed here even though
// merely zeroing size_ would also be sufficient. The result is an
// (empty) owner.
void UnicodeText::Repr::clear() {
  if (ours_) {
    delete[] data_;
  }
  ours_ = true;
  capacity_ = 0;
  size_ = 0;
  data_ = nullptr;
}
// Replaces the contents with a private copy of the size bytes at data.
// After the call this Repr is an owner holding exactly size bytes.
void UnicodeText::Repr::Copy(const char* data, int size) {
  resize(size);
  // resize(0) leaves data_ null, and passing a null pointer to the mem*
  // functions is undefined even for a zero length, so skip the call then.
  // memmove (not memcpy) keeps a copy from our own buffer well-defined.
  if (size > 0) {
    memmove(data_, data, size);
  }
}
// Adopts the caller's buffer: data becomes our storage with the given
// size and capacity, and we are responsible for freeing it.
void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) {
  // Being handed the pointer we already hold is treated as a no-op.
  if (data_ == data) return;

  // Release the previous buffer when it belonged to us.
  const bool must_free = ours_ && data_;
  if (must_free) {
    delete[] data_;
  }
  ours_ = true;
  capacity_ = capacity;
  size_ = size;
  data_ = data;
}
// Turns this Repr into an alias of the caller's size bytes at data.
// Nothing is copied and the memory stays owned by the caller.
void UnicodeText::Repr::PointTo(const char* data, int size) {
  // Release the previous buffer when it belonged to us.
  const bool must_free = ours_ && data_;
  if (must_free) {
    delete[] data_;
  }
  ours_ = false;
  capacity_ = size;
  size_ = size;
  // data_ is declared non-const; an alias is never written through it.
  data_ = const_cast<char*>(data);
}
// Appends byte_length bytes from bytes to the end of the buffer. The Repr
// is an owner afterwards, even when nothing was appended to an alias.
void UnicodeText::Repr::append(const char* bytes, int byte_length) {
  // The source may lie inside our own buffer (e.g. x.append(x)). reserve()
  // can free that buffer, so remember the offset now and re-derive the
  // source pointer from the new buffer afterwards.
  const bool from_self = data_ != nullptr && bytes >= data_ &&
                         bytes + byte_length <= data_ + size_;
  const int self_offset = from_self ? static_cast<int>(bytes - data_) : 0;

  reserve(size_ + byte_length);
  if (from_self) {
    bytes = data_ + self_offset;
  }

  // Skip the copy for an empty range: data_ and bytes may both be null,
  // and memcpy on null pointers is undefined even for a zero length.
  if (byte_length > 0) {
    memcpy(data_ + size_, bytes, byte_length);
    size_ += byte_length;
  }
}
// Human-readable dump of the Repr: its address, buffer pointer, size,
// capacity, and whether the buffer is owned or aliased.
string UnicodeText::Repr::DebugString() const {
  const char* const ownership = ours_ ? "Owned" : "Alias";
  return tensorflow::strings::Printf(
      "{Repr %p data=%p size=%d capacity=%d %s}",
      this, data_, size_, capacity_, ownership);
}
// *************** UnicodeText ******************
// ----- Constructors -----
// Default constructor: repr_ is default-constructed, which yields an
// empty owner.
UnicodeText::UnicodeText() {}
// Copy constructor: duplicates src's bytes, so the new object is an owner
// regardless of whether src owns or aliases its data.
UnicodeText::UnicodeText(const UnicodeText& src) {
  repr_.Copy(src.repr_.data_, src.repr_.size_);
}
// Substring constructor: copies the bytes in [first, last) into a new
// owner. CHECK-fails if first is after last.
UnicodeText::UnicodeText(const UnicodeText::const_iterator& first,
                         const UnicodeText::const_iterator& last) {
  CHECK(first <= last) << " Incompatible iterators";
  const char* const from = first.it_;
  repr_.append(from, last.it_ - from);
}
// Returns the UTF-8 bytes in [first, last) as a string.
// CHECK-fails if first is after last.
string UnicodeText::UTF8Substring(const const_iterator& first,
                                  const const_iterator& last) {
  CHECK(first <= last) << " Incompatible iterators";
  const char* const from = first.it_;
  return string(from, last.it_ - from);
}
// ----- Copy -----
// Copy assignment: copies src's data into this object. Self-assignment
// is a no-op.
UnicodeText& UnicodeText::operator=(const UnicodeText& src) {
  if (this == &src) return *this;
  Copy(src);
  return *this;
}
// Copies src's bytes into this object and returns *this.
UnicodeText& UnicodeText::Copy(const UnicodeText& src) {
  const Repr& from = src.repr_;
  repr_.Copy(from.data_, from.size_);
  return *this;
}
// Copies byte_length bytes of UTF-8 from buffer. If the input is not
// interchange-valid, a warning is logged and the private copy is repaired
// in place.
UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) {
  repr_.Copy(buffer, byte_length);
  if (UniLib:: IsInterchangeValid(buffer, byte_length)) {
    return *this;
  }
  LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
  // The repair may shrink the data, so adopt the length it reports.
  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
  return *this;
}
// Copies byte_length bytes from buffer without any validity check.
UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer,
                                         int byte_length) {
  Repr& target = repr_;
  target.Copy(buffer, byte_length);
  return *this;
}
// ----- TakeOwnershipOf -----
// Adopts buffer as this text's storage. If its contents are not
// interchange-valid, a warning is logged and the adopted buffer is
// repaired in place.
UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer,
                                              int byte_length,
                                              int byte_capacity) {
  repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity);
  if (UniLib:: IsInterchangeValid(buffer, byte_length)) {
    return *this;
  }
  LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
  // The repair may shrink the data, so adopt the length it reports.
  repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
  return *this;
}
// Adopts buffer as this text's storage without any validity check.
UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer,
                                                    int byte_length,
                                                    int byte_capacity) {
  Repr& target = repr_;
  target.TakeOwnershipOf(buffer, byte_length, byte_capacity);
  return *this;
}
// ----- PointTo -----
// Makes this text an alias of buffer when the bytes are interchange-valid.
// Otherwise a warning is logged and the bytes are copied and repaired, so
// the caller's buffer is never modified.
UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) {
  if (!UniLib:: IsInterchangeValid(buffer, byte_length)) {
    LOG(WARNING) << "UTF-8 buffer is not interchange-valid.";
    repr_.Copy(buffer, byte_length);
    repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length);
    return *this;
  }
  repr_.PointTo(buffer, byte_length);
  return *this;
}
// Makes this text an alias of buffer without any validity check.
UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer,
                                            int byte_length) {
  Repr& target = repr_;
  target.PointTo(buffer, byte_length);
  return *this;
}
// Makes this text an alias of src's data; nothing is copied and src keeps
// ownership of its buffer.
UnicodeText& UnicodeText::PointTo(const UnicodeText& src) {
  // Pointing an owner at itself would free its buffer and then alias the
  // freed memory, so treat self-aliasing as a no-op.
  if (this == &src) {
    return *this;
  }
  repr_.PointTo(src.repr_.data_, src.repr_.size_);
  return *this;
}
// Makes this text an alias of the bytes in [first, last).
// CHECK-fails if first is after last.
UnicodeText& UnicodeText::PointTo(const const_iterator &first,
                                  const const_iterator &last) {
  CHECK(first <= last) << " Incompatible iterators";
  const char* const from = first.utf8_data();
  repr_.PointTo(from, last.utf8_data() - from);
  return *this;
}
// ----- Append -----
// Appends all of u's bytes to this text and returns *this.
UnicodeText& UnicodeText::append(const UnicodeText& u) {
  const Repr& tail = u.repr_;
  repr_.append(tail.data_, tail.size_);
  return *this;
}
// Appends the bytes in [first, last) to this text and returns *this.
// CHECK-fails if first is after last.
UnicodeText& UnicodeText::append(const const_iterator& first,
                                 const const_iterator& last) {
  CHECK(first <= last) << " Incompatible iterators";
  const char* const from = first.it_;
  repr_.append(from, last.it_ - from);
  return *this;
}
// Appends len raw bytes from utf8 without any validity check.
UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) {
  Repr& target = repr_;
  target.append(utf8, len);
  return *this;
}
// ----- substring searching -----
// Searches for look starting at start_pos. CHECK-fails unless start_pos
// points into this text's data (one past the end is allowed).
UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look,
                                              const_iterator start_pos) const {
  CHECK_GE(start_pos.utf8_data(), utf8_data());
  CHECK_LE(start_pos.utf8_data(), utf8_data() + utf8_length());
  const const_iterator hit = UnsafeFind(look, start_pos);
  return hit;
}
// Searches for look from the beginning of the text.
UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const {
  const const_iterator from = begin();
  return UnsafeFind(look, from);
}
// Returns an iterator to the first occurrence of look at or after
// start_pos, or end() if there is none. start_pos is not validated; it
// must point into this text's data (one past the end is allowed).
UnicodeText::const_iterator UnicodeText::UnsafeFind(
    const UnicodeText& look, const_iterator start_pos) const {
  // Due to the magic of the UTF8 encoding, searching for a sequence of
  // letters is equivalent to substring search.
  const char* const haystack_end = utf8_data() + utf8_length();
  const char* const needle = look.utf8_data();
  const char* const needle_end = needle + look.utf8_length();
  const char* const hit =
      std::search(start_pos.utf8_data(), haystack_end, needle, needle_end);
  // std::search returns its 'last' argument when there is no match.
  if (hit == haystack_end) return end();
  return const_iterator(hit);
}
bool UnicodeText::HasReplacementChar() const {
// Equivalent to:
// UnicodeText replacement_char;
// replacement_char.push_back(0xFFFD);
// return find(replacement_char) != end();
StringPiece searching(utf8_data(), utf8_length());
StringPiece looking_for("\xEF\xBF\xBD", 3);
LOG(FATAL) << "Not implemented";
//return searching.find(looking_for) != StringPiece::npos;
return false;
}
// ----- other methods -----
// Clear operator
void UnicodeText::clear() {
repr_.clear();
}
// Destructor. There is nothing to do explicitly: repr_'s own destructor
// frees the buffer when it is owned.
UnicodeText::~UnicodeText() {
}
// Appends codepoint c to the text. If c is not a valid codepoint, or its
// encoding is not interchange-valid, a warning is logged and a single
// space is appended instead.
void UnicodeText::push_back(char32 c) {
  if (!UniLib::IsValidCodepoint(c)) {
    LOG(WARNING) << "Illegal Unicode value: 0x" << std::hex << c;
    repr_.append(" ", 1);
    return;
  }

  // Encode the codepoint as UTF-8 and validate the encoded form.
  char encoded[UTFmax];
  const int encoded_len = runetochar(encoded, &c);
  if (!UniLib::IsInterchangeValid(encoded, encoded_len)) {
    LOG(WARNING) << "Unicode value 0x" << std::hex << c
                 << " is not valid for interchange";
    repr_.append(" ", 1);
    return;
  }
  repr_.append(encoded, encoded_len);
}
int UnicodeText::size() const {
return CodepointCount(repr_.data_, repr_.size_);
}
// Two texts are equal when their UTF-8 byte sequences are identical.
bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) {
  if (&lhs == &rhs) return true;
  const int size = lhs.repr_.size_;
  if (size != rhs.repr_.size_) return false;
  // Empty texts may hold null data pointers; memcmp on null pointers is
  // undefined even for a zero length, so answer without calling it.
  if (size == 0) return true;
  return memcmp(lhs.repr_.data_, rhs.repr_.data_, size) == 0;
}
// Human-readable dump: this object's address, its codepoint count, and
// the debug string of its representation.
string UnicodeText::DebugString() const {
  const string repr_dump = repr_.DebugString();
  return tensorflow::strings::Printf(
      "{UnicodeText %p chars=%d repr=%s}", this, size(), repr_dump.c_str());
}
// ******************* UnicodeText::const_iterator *********************
// The implementation of const_iterator would be nicer if it
// inherited from boost::iterator_facade
// (http://boost.org/libs/iterator/doc/iterator_facade.html).
// Default-constructed iterators hold a null pointer.
UnicodeText::const_iterator::const_iterator()
    : it_(nullptr) {
}
// Copying an iterator copies its position in the UTF-8 data.
UnicodeText::const_iterator::const_iterator(const const_iterator& other)
    : it_(other.it_) {}
// Assignment takes over other's position; self-assignment is a no-op.
UnicodeText::const_iterator&
UnicodeText::const_iterator::operator=(const const_iterator& other) {
  if (this == &other) {
    return *this;
  }
  it_ = other.it_;
  return *this;
}
// Iterator at the first byte of the UTF-8 data.
UnicodeText::const_iterator UnicodeText::begin() const {
  const char* const first_byte = repr_.data_;
  return const_iterator(first_byte);
}
// Iterator one past the last byte of the UTF-8 data.
UnicodeText::const_iterator UnicodeText::end() const {
  const char* const past_last = repr_.data_ + repr_.size_;
  return const_iterator(past_last);
}
// Iterators are ordered by their position in the underlying byte buffer.
bool operator<(const UnicodeText::const_iterator& lhs,
               const UnicodeText::const_iterator& rhs) {
  const char* const left = lhs.it_;
  const char* const right = rhs.it_;
  return left < right;
}
// Decodes and returns the codepoint the iterator points at. The data is
// assumed to be valid UTF-8; no error checking is done.
char32 UnicodeText::const_iterator::operator*() const {
  // (We could call chartorune here, but that does some
  // error-checking, and we're guaranteed that our data is valid
  // UTF-8. Also, we expect this routine to be called very often. So
  // for speed, we do the calculation ourselves.)

  // Read the bytes as unsigned so the range tests below are correct
  // whether or not plain char is signed on this compiler.
  const unsigned char* const bytes =
      reinterpret_cast<const unsigned char*>(it_);

  // Convert from UTF-8
  const int byte1 = bytes[0];
  if (byte1 < 0x80)  // 1-byte sequence
    return byte1;
  const int byte2 = bytes[1];
  if (byte1 < 0xE0)  // 2-byte sequence
    return ((byte1 & 0x1F) << 6)
        | (byte2 & 0x3F);
  const int byte3 = bytes[2];
  if (byte1 < 0xF0)  // 3-byte sequence
    return ((byte1 & 0x0F) << 12)
        | ((byte2 & 0x3F) << 6)
        | (byte3 & 0x3F);
  const int byte4 = bytes[3];  // 4-byte sequence
  return ((byte1 & 0x07) << 18)
      | ((byte2 & 0x3F) << 12)
      | ((byte3 & 0x3F) << 6)
      | (byte4 & 0x3F);
}
// Pre-increment: advances past the current character, whose byte length
// is obtained from UniLib::OneCharLen.
UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() {
  const int step = UniLib::OneCharLen(it_);
  it_ += step;
  return *this;
}
// Pre-decrement: steps back at least one byte and keeps going while the
// byte reached is a UTF-8 trail byte.
UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() {
  do {
    --it_;
  } while (UniLib::IsTrailByte(*it_));
  return *this;
}
// Copies the UTF-8 bytes of the current character into utf8_output
// (room for 4 bytes is needed) and returns how many bytes were written.
int UnicodeText::const_iterator::get_utf8(char* utf8_output) const {
  // Classify the lead byte as unsigned so the range tests are correct
  // whether or not plain char is signed on this compiler.
  const unsigned char lead = static_cast<unsigned char>(it_[0]);
  utf8_output[0] = it_[0];
  if (lead < 0x80) return 1;
  utf8_output[1] = it_[1];
  if (lead < 0xE0) return 2;
  utf8_output[2] = it_[2];
  if (lead < 0xF0) return 3;
  utf8_output[3] = it_[3];
  return 4;
}
// Returns the UTF-8 bytes of the current character as a string.
string UnicodeText::const_iterator::get_utf8_string() const {
  const char* const bytes = utf8_data();
  return string(bytes, utf8_length());
}
// Returns the byte length (1 to 4) of the current character, derived
// from its lead byte.
int UnicodeText::const_iterator::utf8_length() const {
  // Classify the lead byte as unsigned so the range tests are correct
  // whether or not plain char is signed on this compiler.
  const unsigned char lead = static_cast<unsigned char>(it_[0]);
  if (lead < 0x80) return 1;
  if (lead < 0xE0) return 2;
  if (lead < 0xF0) return 3;
  return 4;
}
// Wraps a raw pointer into this text's UTF-8 data as a const_iterator.
// CHECK-fails unless p is non-null, lies within [data, data + length],
// and is either the end position or not a trail byte.
UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const {
  CHECK(p != nullptr);
  const char* const start = utf8_data();
  const char* const end = start + utf8_length();
  CHECK(p >= start);
  CHECK(p <= end);
  CHECK(p == end || !UniLib::IsTrailByte(*p));
  return const_iterator(p);
}
// Human-readable dump showing the iterator's underlying pointer.
string UnicodeText::const_iterator::DebugString() const {
  const char* const position = it_;
  return tensorflow::strings::Printf("{iter %p}", position);
}
// *************************** Utilities *************************
// Formats every codepoint of t as uppercase hex followed by a space and
// returns the concatenation.
string CodepointString(const UnicodeText& t) {
  string s;
  const UnicodeText::const_iterator stop = t.end();
  for (UnicodeText::const_iterator it = t.begin(); it != stop; ++it) {
    tensorflow::strings::Appendf(&s, "%X ", *it);
  }
  return s;
}
/**
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef UTIL_UTF8_PUBLIC_UNICODETEXT_H_
#define UTIL_UTF8_PUBLIC_UNICODETEXT_H_
#include <stddef.h> // for NULL, ptrdiff_t
#include <iterator> // for bidirectional_iterator_tag, etc
#include <string> // for string
#include <utility> // for pair
#include "syntaxnet/base.h"
// ***************************** UnicodeText **************************
//
// A UnicodeText object is a container for a sequence of Unicode
// codepoint values. It has default and copy constructors and an assignment operator.
// Data can be appended to it from another UnicodeText, from
// iterators, or from a single codepoint.
//
// The internal representation of the text is UTF-8. Since UTF-8 is a
// variable-width format, UnicodeText does not provide random access
// to the text, and changes to the text are permitted only at the end.
//
// The UnicodeText class defines a const_iterator. The dereferencing
// operator (*) returns a codepoint (char32). The iterator is a
// bidirectional, read-only iterator. It becomes invalid if the text
// is changed.
//
// There are methods for appending and retrieving UTF-8 data directly.
// The 'utf8_data' method returns a const char* that contains the
// UTF-8-encoded version of the text; 'utf8_length' returns the number
// of bytes in the UTF-8 data. An iterator's 'get_utf8' method stores up to
// 4 bytes of UTF-8 data in a char array and returns the number of
// bytes that it stored.
//
// Codepoints are integers in the range [0, 0xD7FF] or [0xE000,
// 0x10FFFF], but UnicodeText has the additional restriction that it
// can contain only those characters that are valid for interchange on
// the Web. This excludes all of the control codes except for carriage
// return, line feed, and horizontal tab. It also excludes
// non-characters, but codepoints that are in the Private Use regions
// are allowed, as are codepoints that are unassigned. (See the
// Unicode reference for details.) The function UniLib::IsInterchangeValid
// can be used as a test for this property.
//
// UnicodeTexts are safe. Every method that constructs or modifies a
// UnicodeText tests for interchange-validity, and will substitute a
// space for the invalid data. Such cases are reported via
// LOG(WARNING).
//
// MEMORY MANAGEMENT: copy, take ownership, or point to
//
// A UnicodeText is either an "owner", meaning that it owns the memory
// for the data buffer and will free it when the UnicodeText is
// destroyed, or it is an "alias", meaning that it does not.
//
// There are three methods for storing UTF-8 data in a UnicodeText:
//
// CopyUTF8(buffer, len) copies buffer.
//
// TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer.
//
// PointToUTF8(buffer, size) creates an alias pointing to buffer.
//
// All three methods perform a validity check on the buffer. There are
// private, "unsafe" versions of these functions that bypass the
// validity check. They are used internally and by friend-functions
// that are handling UTF-8 data that has already been validated.
//
// The purpose of an alias is to avoid making an unnecessary copy of a
// UTF-8 buffer while still providing access to the Unicode values
// within that text through iterators or the fast scanners that are
// based on UTF-8 state tables. The lifetime of an alias must not
// exceed the lifetime of the buffer from which it was constructed.
//
// The semantics of an alias might be described as "copy on write or
// repair." The source data is never modified. If push_back() or
// append() is called on an alias, a copy of the data will be created,
// and the UnicodeText will become an owner. If clear() is called on
// an alias, it becomes an (empty) owner.
//
// The copy constructor and the assignment operator produce an owner.
// That is, after direct initialization ("UnicodeText x(y);") or copy
// initialization ("UnicodeText x = y;") x will be an owner, even if y
// was an alias. The assignment operator ("x = y;") also produces an
// owner unless x and y are the same object and y is an alias.
//
// Aliases should be used with care. If the source from which an alias
// was created is freed, or if the contents are changed, while the
// alias is still in use, fatal errors could result. But it can be
// quite useful to have a UnicodeText "window" through which to see a
// UTF-8 buffer without having to pay the price of making a copy.
//
// UTILITIES
//
// The interfaces in util/utf8/public/textutils.h provide higher-level
// utilities for dealing with UnicodeTexts, including routines for
// creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or
// strings, creating strings from UnicodeTexts, normalizing text for
// efficient matching or display, and others.
// Container for a sequence of Unicode codepoints stored as UTF-8 bytes.
// An instance either owns its byte buffer or aliases memory owned by
// someone else; Repr::ours_ records which.
class UnicodeText {
public:
class const_iterator;
typedef char32 value_type;
// Constructors. These always produce owners.
UnicodeText(); // Create an empty text.
UnicodeText(const UnicodeText& src); // copy constructor
// Construct a substring (copies the data).
UnicodeText(const const_iterator& first, const const_iterator& last);
// Assignment operator. This copies the data and produces an owner
// unless this == &src, e.g., "x = x;", which is a no-op.
UnicodeText& operator=(const UnicodeText& src);
// x.Copy(y) copies the data from y into x.
UnicodeText& Copy(const UnicodeText& src);
inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); }
// x.PointTo(y) changes x so that it points to y's data.
// It does not copy y or take ownership of y's data.
UnicodeText& PointTo(const UnicodeText& src);
UnicodeText& PointTo(const const_iterator& first,
const const_iterator& last);
~UnicodeText();
void clear(); // Clear text.
bool empty() const { return repr_.size_ == 0; } // Test if text is empty.
// Add a codepoint to the end of the text.
// If the codepoint is not interchange-valid, add a space instead
// and log a warning.
void push_back(char32 codepoint);
// Generic appending operation.
// iterator_traits<ForwardIterator>::value_type must be implicitly
// convertible to char32. Typical uses of this method might include:
// char32 chars[] = {0x1, 0x2, ...};
// vector<char32> more_chars = ...;
// utext.append(chars, chars+arraysize(chars));
// utext.append(more_chars.begin(), more_chars.end());
template<typename ForwardIterator>
UnicodeText& append(ForwardIterator first, const ForwardIterator last) {
while (first != last) { push_back(*first++); }
return *this;
}
// A specialization of the generic append() method.
UnicodeText& append(const const_iterator& first, const const_iterator& last);
// An optimization of append(source.begin(), source.end()).
UnicodeText& append(const UnicodeText& source);
int size() const; // the number of Unicode characters (codepoints)
// Equality and inequality of two texts.
friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs);
// Bidirectional, read-only iterator over the codepoints of a UnicodeText.
// It wraps a single pointer (it_) into the UTF-8 data.
class const_iterator {
typedef const_iterator CI;
public:
typedef std::bidirectional_iterator_tag iterator_category;
typedef char32 value_type;
typedef ptrdiff_t difference_type;
typedef void pointer; // (Not needed.)
typedef const char32 reference; // (Needed for const_reverse_iterator)
// Iterators are default-constructible.
const_iterator();
// It's safe to make multiple passes over a UnicodeText.
const_iterator(const const_iterator& other);
const_iterator& operator=(const const_iterator& other);
char32 operator*() const; // Dereference
const_iterator& operator++(); // Advance (++iter)
const_iterator operator++(int) { // (iter++)
const_iterator result(*this);
++*this;
return result;
}
const_iterator& operator--(); // Retreat (--iter)
const_iterator operator--(int) { // (iter--)
const_iterator result(*this);
--*this;
return result;
}
// We love relational operators.
// All of them are derived from == and <, which compare the underlying
// pointers.
friend bool operator==(const CI& lhs, const CI& rhs) {
return lhs.it_ == rhs.it_; }
friend bool operator!=(const CI& lhs, const CI& rhs) {
return !(lhs == rhs); }
friend bool operator<(const CI& lhs, const CI& rhs);
friend bool operator>(const CI& lhs, const CI& rhs) {
return rhs < lhs; }
friend bool operator<=(const CI& lhs, const CI& rhs) {
return !(rhs < lhs); }
friend bool operator>=(const CI& lhs, const CI& rhs) {
return !(lhs < rhs); }
// Number of codepoints between first and last.
friend difference_type distance(const CI& first, const CI& last);
// UTF-8-specific methods
// Store the UTF-8 encoding of the current codepoint into buf,
// which must be at least 4 bytes long. Return the number of
// bytes written.
int get_utf8(char* buf) const;
// Return the UTF-8 character that the iterator points to.
string get_utf8_string() const;
// Return the byte length of the UTF-8 character the iterator points to.
int utf8_length() const;
// Return the iterator's pointer into the UTF-8 data.
const char* utf8_data() const { return it_; }
string DebugString() const;
private:
friend class UnicodeText;
friend class UnicodeTextUtils;
friend class UTF8StateTableProperty;
// Only UnicodeText and the friends above may build an iterator from a
// raw pointer.
explicit const_iterator(const char* it) : it_(it) {}
const char* it_;
};
const_iterator begin() const;
const_iterator end() const;
// Reverse iterator. base() sits one position after the character this
// iterator refers to, so each accessor below steps a copy of base() back
// by one character before delegating to const_iterator.
class const_reverse_iterator : public std::reverse_iterator<const_iterator> {
public:
explicit const_reverse_iterator(const_iterator it) :
std::reverse_iterator<const_iterator>(it) {}
const char* utf8_data() const {
const_iterator tmp_it = base();
return (--tmp_it).utf8_data();
}
int get_utf8(char* buf) const {
const_iterator tmp_it = base();
return (--tmp_it).get_utf8(buf);
}
string get_utf8_string() const {
const_iterator tmp_it = base();
return (--tmp_it).get_utf8_string();
}
int utf8_length() const {
const_iterator tmp_it = base();
return (--tmp_it).utf8_length();
}
};
const_reverse_iterator rbegin() const {
return const_reverse_iterator(end());
}
const_reverse_iterator rend() const {
return const_reverse_iterator(begin());
}
// Substring searching. Returns the beginning of the first
// occurrence of "look", or end() if not found.
const_iterator find(const UnicodeText& look, const_iterator start_pos) const;
// Equivalent to find(look, begin())
const_iterator find(const UnicodeText& look) const;
// Returns whether this contains the character U+FFFD. This can
// occur, for example, if the input to Encodings::Decode() had byte
// sequences that were invalid in the source encoding.
bool HasReplacementChar() const;
// UTF-8-specific methods
//
// Return the data, length, and capacity of UTF-8-encoded version of
// the text. Length and capacity are measured in bytes.
const char* utf8_data() const { return repr_.data_; }
int utf8_length() const { return repr_.size_; }
int utf8_capacity() const { return repr_.capacity_; }
// Return the UTF-8 data as a string.
static string UTF8Substring(const const_iterator& first,
const const_iterator& last);
// There are three methods for initializing a UnicodeText from UTF-8
// data. They vary in details of memory management. In all cases,
// the data is tested for interchange-validity. If it is not
// interchange-valid, a LOG(WARNING) is issued, and each
// structurally invalid byte and each interchange-invalid codepoint
// is replaced with a space.
// x.CopyUTF8(buf, len) copies buf into x.
UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length);
// x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of
// buf. buf is not copied.
UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer,
int byte_length,
int byte_capacity);
// x.PointToUTF8(buf,len) changes x so that it points to buf
// ("becomes an alias"). It does not take ownership or copy buf.
// If the buffer is not valid, this has the same effect as
// CopyUTF8(utf8_buffer, byte_length).
UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length);
// Occasionally it is necessary to use functions that operate on the
// pointer returned by utf8_data(). MakeIterator(p) provides a way
// to get back to the UnicodeText level. It uses CHECK to ensure
// that p is a pointer within this object's UTF-8 data, and that it
// points to the beginning of a character.
const_iterator MakeIterator(const char* p) const;
string DebugString() const;
private:
friend class const_iterator;
friend class UnicodeTextUtils;
class Repr { // A byte-string.
public:
char* data_; // Start of the byte buffer (null when empty).
int size_; // Number of bytes in use.
int capacity_; // Number of bytes available in the buffer.
bool ours_; // Do we own data_?
Repr() : data_(nullptr), size_(0), capacity_(0), ours_(true) {}
// The buffer is released only when this Repr owns it.
~Repr() { if (ours_) delete[] data_; }
void clear();
void reserve(int capacity);
void resize(int size);
void append(const char* bytes, int byte_length);
void Copy(const char* data, int size);
void TakeOwnershipOf(char* data, int size, int capacity);
void PointTo(const char* data, int size);
string DebugString() const;
private:
// Declared private to forbid copying a Repr.
Repr& operator=(const Repr&);
Repr(const Repr& other);
};
Repr repr_;
// UTF-8-specific private methods.
// These routines do not perform a validity check when compiled
// in opt mode.
// It is an error to call these methods with UTF-8 data that
// is not interchange-valid.
//
UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length);
UnicodeText& UnsafeTakeOwnershipOfUTF8(
char* utf8_buffer, int byte_length, int byte_capacity);
UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length);
UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length);
const_iterator UnsafeFind(const UnicodeText& look,
const_iterator start_pos) const;
};
bool operator==(const UnicodeText& lhs, const UnicodeText& rhs);
// Inequality is defined as the negation of operator==.
inline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) {
  const bool same = (lhs == rhs);
  return !same;
}
// UnicodeTextRange is a pair of iterators, useful for specifying text
// segments. If the iterators are ==, the segment is empty.
typedef pair<UnicodeText::const_iterator,
UnicodeText::const_iterator> UnicodeTextRange;
inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) {
return r.first == r.second;
}
// *************************** Utilities *************************
// A factory function for creating a UnicodeText from a buffer of
// UTF-8 data. The new UnicodeText takes ownership of the buffer. (It
// is an "owner.")
//
// Each byte that is structurally invalid will be replaced with a
// space. Each codepoint that is interchange-invalid will also be
// replaced with a space, even if the codepoint was represented with a
// multibyte sequence in the UTF-8 data.
//
inline UnicodeText MakeUnicodeTextAcceptingOwnership(
    char* utf8_buffer, int byte_length, int byte_capacity) {
  // The local adopts the buffer; the returned value is copy-constructed
  // from the reference that TakeOwnershipOfUTF8 returns.
  UnicodeText adopter;
  return adopter.TakeOwnershipOfUTF8(utf8_buffer, byte_length, byte_capacity);
}
// Factory that builds a UnicodeText from a caller-owned buffer of UTF-8
// data via PointToUTF8. The new UnicodeText does not take ownership of
// the buffer. (It is an "alias.")
//
// NOTE(review): the value handed back is copy-constructed from a local
// object. The unit test's copy checks indicate that copying a UnicodeText
// duplicates its bytes, so the result may own a copy rather than alias
// utf8_buffer -- confirm before relying on alias semantics here.
inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership(
    const char* utf8_buffer, int byte_length) {
  UnicodeText scratch;
  // Deliberately returned through the result of PointToUTF8 (not by naming
  // scratch) so the return still goes through the copy constructor.
  const UnicodeText& pointed = scratch.PointToUTF8(utf8_buffer, byte_length);
  return pointed;
}
// Builds a UnicodeText from len bytes of UTF-8 starting at utf8_buf.
//
// do_copy == true: the bytes are copied (CopyUTF8). The copy is owned by
// the resulting UnicodeText and is freed when that object is destroyed.
// Such an object is referred to as an "owner."
//
// do_copy == false: no copy is requested (PointToUTF8). The resulting
// UnicodeText does NOT take ownership of the buffer, so the UnicodeText
// must not outlive it. Such an object is referred to as an "alias." This
// is the same as MakeUnicodeTextWithoutAcceptingOwnership.
//
// If the input does not contain valid UTF-8, a copy is made (as if
// do_copy were true) and coerced to valid UTF-8 by replacing each invalid
// byte with a space.
//
inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len,
                                     bool do_copy) {
  UnicodeText converted;
  if (!do_copy) {
    converted.PointToUTF8(utf8_buf, len);
  } else {
    converted.CopyUTF8(utf8_buf, len);
  }
  return converted;
}
// String overload: forwards the string's bytes and size to the
// (buffer, length, do_copy) overload.
inline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) {
  const char* const bytes = utf_string.data();
  const int byte_count = utf_string.size();
  return UTF8ToUnicodeText(bytes, byte_count, do_copy);
}
// Buffer overload without do_copy: always requests a copy.
inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) {
  const bool make_copy = true;
  return UTF8ToUnicodeText(utf8_buf, len, make_copy);
}
// String overload without do_copy: always requests a copy.
inline UnicodeText UTF8ToUnicodeText(const string& utf8_string) {
  const bool make_copy = true;
  return UTF8ToUnicodeText(utf8_string, make_copy);
}
// Returns a string holding the utf8_length() bytes found at utf8_data(),
// i.e. the UTF-8 encoded version of all the Unicode characters in t.
inline string UnicodeTextToUTF8(const UnicodeText& t) {
  string utf8;
  utf8.assign(t.utf8_data(), t.utf8_length());
  return utf8;
}
// For debugging. Return a string of integers, written in uppercase
// hex (%X), corresponding to the codepoints within the text. Each
// integer is followed by a space, so a non-empty result ends with a
// trailing space. E.g., "61 62 6A 3005 ".
// Only declared in this header.
string CodepointString(const UnicodeText& t);
#endif // UTIL_UTF8_PUBLIC_UNICODETEXT_H_
/**
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: sligocki@google.com (Shawn Ligocki)
//
// A basic main function to test that UnicodeText builds.
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include "util/utf8/unicodetext.h"
// Debugging aid. When a command-line argument is given, prints each of its
// bytes in hex, then each Unicode codepoint obtained by converting the
// argument with UTF8ToUnicodeText. With no argument nothing is printed.
// Always returns EXIT_SUCCESS.
int main(int argc, char** argv) {
  if (argc > 1) {
    printf("Bytes:\n");
    std::string bytes(argv[1]);
    for (std::string::const_iterator iter = bytes.begin();
         iter != bytes.end(); ++iter) {
      // Convert through unsigned char first: where plain char is signed, a
      // byte >= 0x80 would otherwise be sign-extended and print as
      // 0xFFFFFFxx instead of two hex digits.
      const unsigned int byte_value = static_cast<unsigned char>(*iter);
      printf(" 0x%02X\n", byte_value);
    }
    printf("Unicode codepoints:\n");
    UnicodeText text(UTF8ToUnicodeText(bytes));
    for (UnicodeText::const_iterator iter = text.begin();
         iter != text.end(); ++iter) {
      // %X takes an unsigned int argument.
      printf(" U+%X\n", static_cast<unsigned int>(*iter));
    }
  }
  return EXIT_SUCCESS;
}
/**
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "util/utf8/unicodetext.h"
#include <iterator>
#include <set>
#include "gtest/gtest.h"
#include "third_party/utf/utf.h"
#include "util/utf8/unilib.h"
namespace {
// Declared but never defined. Its return type is a reference to a char array
// with exactly as many elements as the array passed in, so applying sizeof
// to a call expression yields that element count at compile time.
template <typename Element, size_t Count>
char (&ArraySizeHelper(Element (&array)[Count]))[Count];
#define arraysize(array) (sizeof(ArraySizeHelper(array)))
// Fixture providing an empty UnicodeText (empty_text_) and one holding five
// codepoints (text_): U+01C0, U+4E8C, U+D7DB, U+0034, U+1D11E.
class UnicodeTextTest : public testing::Test {
 protected:
  UnicodeTextTest() : empty_text_() {
    const char32 codepoints[] = {0x1C0, 0x4E8C, 0xD7DB, 0x34, 0x1D11E};
    const char32* const first = &codepoints[0];
    const char32* const last = codepoints + arraysize(codepoints);
    // Append the whole array so text_ holds exactly those codepoints.
    text_.append(first, last);
  }
  UnicodeText empty_text_;
  UnicodeText text_;
};
// Covers the three relationships a UnicodeText can have with its bytes:
// owning a buffer handed to it, owning a copy produced by assignment, and
// aliasing a caller-owned buffer.
TEST(UnicodeTextTest, Ownership) {
  const string src = "\u304A\u00B0\u106B";
  {
    // Hand a heap buffer over to the UnicodeText.
    string scratch = src;
    char* heap_bytes = new char[scratch.size()];
    memcpy(heap_bytes, scratch.data(), scratch.size());
    UnicodeText owned;
    owned.TakeOwnershipOfUTF8(heap_bytes, scratch.size(), scratch.size());
    EXPECT_EQ(owned.utf8_data(), heap_bytes);
    scratch.clear();
    // The text must remain readable once the source string is cleared.
    UnicodeText::const_iterator pos = owned.begin();
    EXPECT_EQ(*pos++, 0x304A);
    EXPECT_EQ(*pos++, 0x00B0);
    EXPECT_EQ(*pos++, 0x106B);
    CHECK(pos == owned.end());
  }
  {
    UnicodeText owner;
    {  // Inner scope bounds the lifetime of the source string and of donor.
      string scratch = src;
      char* heap_bytes = new char[scratch.size()];
      memcpy(heap_bytes, scratch.data(), scratch.size());
      UnicodeText donor;
      donor.TakeOwnershipOfUTF8(heap_bytes, scratch.size(), scratch.size());
      EXPECT_EQ(donor.utf8_data(), heap_bytes);
      owner = donor;  // Assignment copies the data.
      EXPECT_NE(owner.utf8_data(), heap_bytes);
    }
    // owner must remain readable after the inner scope has ended.
    UnicodeText::const_iterator pos = owner.begin();
    EXPECT_EQ(*pos++, 0x304A);
    EXPECT_EQ(*pos++, 0x00B0);
    EXPECT_EQ(*pos++, 0x106B);
    CHECK(pos == owner.end());
  }
  {
    // Point at src's bytes without copying them.
    UnicodeText alias;
    alias.PointToUTF8(src.data(), src.size());
    EXPECT_EQ(alias.utf8_data(), src.data());
    UnicodeText::const_iterator pos = alias.begin();
    EXPECT_EQ(*pos++, 0x304A);
    EXPECT_EQ(*pos++, 0x00B0);
    EXPECT_EQ(*pos++, 0x106B);
    CHECK(pos == alias.end());
    UnicodeText copied = alias;  // Copy initialization copies the data.
    EXPECT_NE(copied.utf8_data(), alias.utf8_data());
    UnicodeText assigned;
    assigned = alias;  // Assignment copies the data.
    EXPECT_NE(assigned.utf8_data(), alias.utf8_data());
    // PointTo keeps the alias relationship: no data is copied.
    copied.PointTo(alias);
    EXPECT_EQ(copied.utf8_data(), alias.utf8_data());
    copied.push_back(0x0020);  // Modifying an alias...
    EXPECT_NE(copied.utf8_data(), alias.utf8_data());  // ...detaches it.
  }
}
class IteratorTest : public UnicodeTextTest {};
TEST_F(IteratorTest, Iterates) {
UnicodeText::const_iterator iter = text_.begin();
EXPECT_EQ(0x1C0, *iter);
EXPECT_EQ(&iter, &++iter); // operator++ returns *this.
EXPECT_EQ(0x4E8C, *iter++);
EXPECT_EQ(0xD7DB, *iter);
// Make sure you can dereference more than once.
EXPECT_EQ(0xD7DB, *iter);
EXPECT_EQ(0x34, *++iter);
EXPECT_EQ(0x1D11E, *++iter);
ASSERT_TRUE(iter != text_.end());
iter++;
EXPECT_TRUE(iter == text_.end());
}
// Same walk as Iterates, but from rbegin() to rend() with a
// const_reverse_iterator, so the codepoints are seen last to first.
TEST_F(IteratorTest, Reverse) {
  UnicodeText::const_reverse_iterator cursor = text_.rbegin();
  EXPECT_EQ(0x1D11E, *cursor);
  EXPECT_EQ(&cursor, &++cursor);  // operator++ returns *this.
  EXPECT_EQ(0x34, *cursor++);
  EXPECT_EQ(0xD7DB, *cursor);
  // Dereferencing twice in a row must give the same codepoint.
  EXPECT_EQ(0xD7DB, *cursor);
  EXPECT_EQ(0x4E8C, *++cursor);
  EXPECT_EQ(0x1C0, *++cursor);
  ASSERT_TRUE(cursor != text_.rend());
  cursor++;
  EXPECT_TRUE(cursor == text_.rend());
}
// Two iterators over the same text advance independently of each other.
TEST_F(IteratorTest, MultiPass) {
  // Default construction followed by assignment is exercised on purpose.
  UnicodeText::const_iterator leader, follower;
  leader = text_.begin();
  follower = leader;
  EXPECT_EQ(0x4E8C, *++leader);
  EXPECT_TRUE(leader != follower);
  EXPECT_EQ(0x1C0, *follower);
  ++follower;
  EXPECT_TRUE(leader == follower);
  EXPECT_EQ(0x4E8C, *follower);
}
// Walks a forward const_iterator backwards from end() to begin() using a
// mix of pre- and post-decrement.
TEST_F(IteratorTest, ReverseIterates) {
  UnicodeText::const_iterator cursor = text_.end();
  EXPECT_TRUE(cursor == text_.end());
  cursor--;
  ASSERT_TRUE(cursor != text_.end());
  EXPECT_EQ(0x1D11E, *cursor--);
  EXPECT_EQ(0x34, *cursor);
  EXPECT_EQ(0xD7DB, *--cursor);
  // Dereferencing twice in a row must give the same codepoint.
  EXPECT_EQ(0xD7DB, *cursor);
  --cursor;
  EXPECT_EQ(0x4E8C, *cursor--);
  EXPECT_EQ(0x1C0, *cursor);
  EXPECT_TRUE(cursor == text_.begin());
}
// Checks the ordering operators (<, <=, >=, >) on const_iterator.
TEST_F(IteratorTest, Comparable) {
  UnicodeText::const_iterator earlier, later;
  earlier = text_.begin();
  later = earlier;
  ++later;  // later now sits one codepoint past earlier.
  EXPECT_TRUE(earlier < later);
  EXPECT_TRUE(text_.begin() <= earlier);
  EXPECT_FALSE(earlier >= later);
  EXPECT_FALSE(earlier > text_.end());
}
// std::advance by four steps from begin() lands on the fifth (last)
// codepoint; one more increment reaches end().
TEST_F(IteratorTest, Advance) {
  UnicodeText::const_iterator cursor = text_.begin();
  EXPECT_EQ(0x1C0, *cursor);
  std::advance(cursor, 4);
  EXPECT_EQ(0x1D11E, *cursor);
  ++cursor;
  EXPECT_TRUE(cursor == text_.end());
}
// distance() between iterators counts codepoints: checked from begin() to
// the cursor and from the cursor to end() as the cursor moves through the
// five-codepoint text. The calls are intentionally left unqualified.
TEST_F(IteratorTest, Distance) {
  UnicodeText::const_iterator cursor = text_.begin();
  EXPECT_EQ(0, distance(text_.begin(), cursor));
  EXPECT_EQ(5, distance(cursor, text_.end()));
  // Two steps in.
  ++cursor;
  ++cursor;
  EXPECT_EQ(2, distance(text_.begin(), cursor));
  EXPECT_EQ(3, distance(cursor, text_.end()));
  // Four steps in.
  ++cursor;
  ++cursor;
  EXPECT_EQ(4, distance(text_.begin(), cursor));
  // Five steps in: the cursor is now at end().
  ++cursor;
  EXPECT_EQ(0, distance(cursor, text_.end()));
}
// Checks the UTF-8 encoding held by text_: overall sizes, the raw bytes,
// the per-character bytes and lengths reported by forward and reverse
// iterators, and how push_back grows the encoding.
TEST_F(IteratorTest, Encode) {
  // Expected encoding of the fixture's five codepoints, one literal each.
  const string utf8 = "\xC7\x80"
                      "\xE4\xBA\x8C"
                      "\xED\x9F\x9B"
                      "\x34"
                      "\xF0\x9D\x84\x9E";
  // Encoded length in bytes of each codepoint, in order.
  const int lengths[] = {2, 3, 3, 1, 4};
  EXPECT_EQ(text_.size(), 5);
  EXPECT_EQ(text_.utf8_length(), 13);
  EXPECT_TRUE(memcmp(text_.utf8_data(), utf8.data(), text_.utf8_length())
              == 0);
  {
    // Test the iterator
    UnicodeText::const_iterator iter = text_.begin(), end = text_.end();
    const char* u = utf8.data();  // Start of the expected bytes for *iter.
    int i = 0;
    while (iter != end) {
      char buf[5];  // Up to 4 encoded bytes plus the terminator added below.
      int n = iter.get_utf8(buf);
      buf[n] = '\0';
      EXPECT_TRUE(strncmp(buf, u, n) == 0);
      EXPECT_EQ(buf, iter.get_utf8_string());
      EXPECT_EQ(lengths[i], iter.utf8_length());
      u += n;
      iter++;
      i++;
    }
  }
  {
    // Test the reverse_iterator
    UnicodeText::const_reverse_iterator iter = text_.rbegin();
    UnicodeText::const_reverse_iterator end = text_.rend();
    const char* u = utf8.data() + utf8.size();  // One past the expected bytes.
    int i = 0;
    while (iter != end) {
      char buf[5];
      int n = iter.get_utf8(buf);
      buf[n] = '\0';
      u -= n;  // Step back to the start of this character's bytes.
      EXPECT_TRUE(strncmp(buf, u, n) == 0);
      EXPECT_EQ(buf, iter.get_utf8_string());
      EXPECT_EQ(lengths[text_.size() - i - 1], iter.utf8_length());
      iter++;
      i++;
    }
  }
  text_.push_back('$');
  EXPECT_EQ(text_.size(), 6);
  EXPECT_EQ(text_.utf8_length(), 14);
  // U+00AE (registered sign), passed as an integer codepoint. The character
  // literal '\xAE' is negative where plain char is signed, so it denotes
  // U+00AE only when the code is built with unsigned chars.
  text_.push_back(0xAE);
  EXPECT_EQ(text_.size(), 7);
  EXPECT_EQ(text_.utf8_length(), 16);  // 2 bytes long
}
// Iterating text_ yields the original codepoints in order, and
// CodepointString renders them as space-terminated uppercase hex.
TEST_F(IteratorTest, Decode) {
  const char32 expected[] = {0x1C0, 0x4E8C, 0xD7DB, 0x34, 0x1D11E};
  UnicodeText::const_iterator cursor = text_.begin();
  for (const char32* want = expected; want != expected + 5; ++want) {
    EXPECT_EQ(*want, *cursor++);
  }
  string rendered = CodepointString(text_);
  EXPECT_EQ(rendered, "1C0 4E8C D7DB 34 1D11E ");
}
class OperatorTest : public UnicodeTextTest {};
TEST_F(OperatorTest, Clear) {
UnicodeText empty_text(UTF8ToUnicodeText(""));
EXPECT_FALSE(text_ == empty_text);
text_.clear();
EXPECT_TRUE(text_ == empty_text);
}
// empty() is true for the default-constructed fixture text, false for the
// populated one, and true again once the populated one is cleared.
TEST_F(OperatorTest, Empty) {
  EXPECT_TRUE(empty_text_.empty());
  EXPECT_FALSE(text_.empty());
  text_.clear();
  EXPECT_TRUE(text_.empty());
}
// U+FDD0 is not interchange-valid. Whether it arrives as UTF-8 bytes (via
// either factory) or as a pushed-back codepoint, the resulting text must
// contain a single space (U+0020) in its place.
TEST(UnicodeTextTest, InterchangeValidity) {
  // The three bytes EF B7 90 are the UTF-8 encoding of U+FDD0.
  char* fdd0_bytes = new char[3];
  memcpy(fdd0_bytes, "\xEF\xB7\x90", 3);
  EXPECT_FALSE(UniLib::IsInterchangeValid(fdd0_bytes, 3));
  // Factory that does not accept ownership of the buffer.
  UnicodeText text = MakeUnicodeTextWithoutAcceptingOwnership(fdd0_bytes, 3);
  EXPECT_EQ(text.size(), 1);
  EXPECT_EQ(*text.begin(), 0x20);
  text.clear();
  text.push_back(0xFDD0);
  EXPECT_EQ(text.size(), 1);
  EXPECT_EQ(*text.begin(), 0x20);
  // Factory that accepts ownership; the buffer is handed over here.
  text = MakeUnicodeTextAcceptingOwnership(fdd0_bytes, 3, 3);
  EXPECT_EQ(text.size(), 1);
  EXPECT_EQ(*text.begin(), 0x20);
  text.clear();
  text.push_back(0xFDD0);
  EXPECT_EQ(text.size(), 1);
  EXPECT_EQ(*text.begin(), 0x20);
}
// Empty fixture subclass intended for substring-search tests. Every test
// written against it (FindEmpty, Find, HasConversionError) is currently
// commented out below.
// NOTE(review): presumably the public find()/HasReplacementChar() API those
// tests call is not available in this copy of UnicodeText -- confirm before
// re-enabling them.
class SubstringSearchTest : public UnicodeTextTest {};
// TEST_F(SubstringSearchTest, FindEmpty) {
// EXPECT_TRUE(text_.find(empty_text_) == text_.begin());
// EXPECT_TRUE(empty_text_.find(text_) == empty_text_.end());
// }
// TEST_F(SubstringSearchTest, Find) {
// UnicodeText::const_iterator second_pos = text_.begin();
// ++second_pos;
// UnicodeText::const_iterator third_pos = second_pos;
// ++third_pos;
// UnicodeText::const_iterator fourth_pos = third_pos;
// ++fourth_pos;
// // same as text_
// const char32 text[] = {0x1C0, 0x4E8C, 0xD7DB, 0x34, 0x1D11E};
// UnicodeText prefix;
// prefix.append(&text[0], &text[2]);
// EXPECT_TRUE(text_.find(prefix) == text_.begin());
// EXPECT_TRUE(text_.find(prefix, second_pos) == text_.end());
// UnicodeText suffix;
// suffix.append(&text[2], text + arraysize(text));
// EXPECT_TRUE(text_.find(suffix) == third_pos);
// EXPECT_TRUE(text_.find(suffix, second_pos) == third_pos);
// EXPECT_TRUE(text_.find(suffix, third_pos) == third_pos);
// EXPECT_TRUE(text_.find(suffix, fourth_pos) == text_.end());
// }
// TEST_F(SubstringSearchTest, HasConversionError) {
// EXPECT_FALSE(text_.HasReplacementChar());
// const char32 beg[] = {0xFFFD, 0x1C0, 0x4E8C, 0xD7DB, 0x34, 0x1D11E};
// UnicodeText beg_uni;
// beg_uni.append(&beg[0], beg + arraysize(beg));
// EXPECT_TRUE(beg_uni.HasReplacementChar());
// const char32 mid[] = {0x1C0, 0x4E8C, 0xFFFD, 0xD7DB, 0x34, 0x1D11E};
// UnicodeText mid_uni;
// mid_uni.append(&mid[0], mid + arraysize(mid));
// EXPECT_TRUE(mid_uni.HasReplacementChar());
// const char32 end[] = {0x1C0, 0x4E8C, 0xD7DB, 0x34, 0x1D11E, 0xFFFD};
// UnicodeText end_uni;
// end_uni.append(&end[0], end + arraysize(end));
// EXPECT_TRUE(end_uni.HasReplacementChar());
// const char32 two[] = {0xFFFD, 0x1C0, 0x4E8C, 0xD7DB, 0x34, 0x1D11E, 0xFFFD};
// UnicodeText two_uni;
// two_uni.append(&two[0], two + arraysize(two));
// EXPECT_TRUE(two_uni.HasReplacementChar());
// const char32 adj[] = {0x1C0, 0xFFFD, 0xFFFD, 0x4E8C, 0xD7DB, 0x34, 0x1D11E};
// UnicodeText adj_uni;
// adj_uni.append(&adj[0], adj + arraysize(adj));
// EXPECT_TRUE(adj_uni.HasReplacementChar());
// }
} // namespace
/**
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: sligocki@google.com (Shawn Ligocki)
#include "util/utf8/unilib.h"
#include "syntaxnet/base.h"
#include "third_party/utf/utf.h"
namespace UniLib {
// Returns false for codepoints that are not allowed for interchange:
//   C0 (ASCII) controls: U+0000 to U+001F, except Horizontal Tab
//                        (HT, U+0009), Line-Feed (LF, U+000A),
//                        Form Feed (FF, U+000C) and Carriage-Return
//                        (CR, U+000D)
//   C1 controls: U+007F to U+009F
//   Surrogates: U+D800 to U+DFFF
//   Non-characters: U+FDD0 to U+FDEF and U+xxFFFE to U+xxFFFF for all xx
// Returns true for every other value.
bool IsInterchangeValid(char32 c) {
  // C0 controls other than HT, LF, FF and CR.
  if ((c >= 0x00 && c <= 0x08) || c == 0x0B || (c >= 0x0E && c <= 0x1F)) {
    return false;
  }
  // DEL and the C1 controls.
  if (c >= 0x7F && c <= 0x9F) {
    return false;
  }
  // Surrogates.
  if (c >= 0xD800 && c <= 0xDFFF) {
    return false;
  }
  // Non-character block U+FDD0..U+FDEF.
  if (c >= 0xFDD0 && c <= 0xFDEF) {
    return false;
  }
  // Low 16 bits equal to FFFE or FFFF: the last two codepoints of a plane.
  if ((c & 0xFFFE) == 0xFFFE) {
    return false;
  }
  return true;
}
int SpanInterchangeValid(const char* begin, int byte_length) {
char32 rune;
const char* p = begin;
const char* end = begin + byte_length;
while (p < end) {
int bytes_consumed = charntorune(&rune, p, end - p);
// We want to accept Runeerror == U+FFFD as a valid char, but it is used
// by chartorune to indicate error. Luckily, the real codepoint is size 3
// while errors return bytes_consumed <= 1.
if ((rune == Runeerror && bytes_consumed <= 1) ||
!IsInterchangeValid(rune)) {
break; // Found
}
p += bytes_consumed;
}
return p - begin;
}
} // namespace UniLib
/**
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Routines to do manipulation of Unicode characters or text
//
// The StructurallyValid routines accept buffers of arbitrary bytes.
// For CoerceToStructurallyValid(), the input buffer and output buffers may
// point to exactly the same memory.
//
// In all other cases, the UTF-8 string must be structurally valid and
// have all codepoints in the range U+0000 to U+D7FF or U+E000 to U+10FFFF.
// Debug builds take a fatal error for invalid UTF-8 input.
// The input and output buffers may not overlap at all.
//
// The char32 routines are here only for convenience; they convert to UTF-8
// internally and use the UTF-8 routines.
#ifndef UTIL_UTF8_UNILIB_H__
#define UTIL_UTF8_UNILIB_H__
#include <string>
#include "syntaxnet/base.h"
// We export OneCharLen, IsValidCodepoint, and IsTrailByte from here,
// but they are defined in unilib_utf8_utils.h.
//#include "util/utf8/public/unilib_utf8_utils.h" // IWYU pragma: export
namespace UniLib {
// Returns the length in bytes of the longest prefix of src that consists
// entirely of interchange-valid UTF-8.
int SpanInterchangeValid(const char* src, int byte_length);
// String overload: forwards the string's bytes and size to the overload
// above.
inline int SpanInterchangeValid(const std::string& src) {
  const char* const bytes = src.data();
  const int byte_length = src.size();
  return SpanInterchangeValid(bytes, byte_length);
}
// Returns true if the source is entirely interchange-valid UTF-8.
// "Interchange valid" is stronger than structurally valid --
// no C0 or C1 control codes (other than CR LF HT FF) and no non-characters.
// The first overload tests a single codepoint.
bool IsInterchangeValid(char32 codepoint);
// A buffer is interchange-valid when its interchange-valid prefix covers
// all byte_length bytes.
inline bool IsInterchangeValid(const char* src, int byte_length) {
  const int valid_prefix = SpanInterchangeValid(src, byte_length);
  return byte_length == valid_prefix;
}
// String overload: forwards the string's bytes and size to the buffer
// overload.
inline bool IsInterchangeValid(const std::string& src) {
  const char* const bytes = src.data();
  const int byte_length = src.size();
  return IsInterchangeValid(bytes, byte_length);
}
} // namespace UniLib
#endif // UTIL_UTF8_UNILIB_H__
/**
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_
#define UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_
// These definitions are self-contained and have no dependencies.
// They are also exported from unilib.h for legacy reasons.
#include "syntaxnet/base.h"
namespace UniLib {
// Returns true if 'c' is in the range [0, 0xD800) or [0xE000, 0x10FFFF]
// (i.e., is not a surrogate codepoint). See also
// IsValidCodepoint(const char* src) in util/utf8/public/unilib.h.
inline bool IsValidCodepoint(char32 c) {
  // The comparison is done on the value converted to uint32, so a negative
  // input does not fall into the low range.
  if (static_cast<uint32>(c) < 0xD800) {
    return true;
  }
  return c >= 0xE000 && c <= 0x10FFFF;
}
// Returns the length (number of bytes) of the Unicode code point
// starting at src, based on inspecting just that one byte. This
// requires that src point to a well-formed UTF-8 string; the result
// is undefined otherwise.
inline int OneCharLen(const char* src) {
  // Indexed by the high four bits of the first byte: 0x0-0xB -> 1,
  // 0xC-0xD -> 2, 0xE -> 3, 0xF -> 4.
  static const char kLengthByHighNibble[16] = {1, 1, 1, 1, 1, 1, 1, 1,
                                               1, 1, 1, 1, 2, 2, 3, 4};
  const unsigned int first_byte = static_cast<unsigned char>(*src);
  return kLengthByHighNibble[first_byte >> 4];
}
// Returns true if this byte is a trailing UTF-8 byte (10xx xxxx).
inline bool IsTrailByte(char x) {
  // Equivalent to (x & 0xC0) == 0x80. Trail bytes are exactly 0x80..0xBF,
  // which read as a signed char are the values -128..-65, so one signed
  // comparison is enough.
  const signed char as_signed = static_cast<signed char>(x);
  return as_signed <= -0x41;
}
} // namespace UniLib
#endif // UTIL_UTF8_PUBLIC_UNILIB_UTF8_UTILS_H_
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment