Unverified Commit f91b59c6 authored by thunderfyc's avatar thunderfyc Committed by GitHub
Browse files

Initial checkin of sequence_projection (#9153)



* Initial checkin of sequence_projection

* Fix the path

* Fix paths and deps

* Fix path and deps
Co-authored-by: default avatarLearn2Compress <expander-robot@google.com>
parent 67efd3ab
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "sgnn/sgnn_projection_op_resolver.h" // sequence_projection
#include "tensorflow/lite/mutable_op_resolver.h"
#include "sgnn/sgnn_projection.h" // sequence_projection
namespace tflite {
namespace ops {
namespace custom {

// Registers the SgnnProjection custom op with |resolver| so that models
// containing the "tftext:custom:SgnnProjection" op can be resolved.
void AddSgnnProjectionCustomOp(MutableOpResolver* resolver) {
  static constexpr char kSgnnProjectionOpName[] =
      "tftext:custom:SgnnProjection";
  resolver->AddCustom(kSgnnProjectionOpName,
                      Register_tftext_SGNN_PROJECTION());
}

}  // namespace custom
}  // namespace ops
}  // namespace tflite
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_MODELS_SEQUENCE_PROJECTION_SGNN_SGNN_PROJECTION_OP_RESOLVER_H_
#define TENSORFLOW_MODELS_SEQUENCE_PROJECTION_SGNN_SGNN_PROJECTION_OP_RESOLVER_H_

#include "tensorflow/lite/mutable_op_resolver.h"

namespace tflite {
namespace ops {
namespace custom {

// Adds the SgnnProjection custom op to an op resolver, under the op name
// "tftext:custom:SgnnProjection".
// This function can be loaded using dlopen. Since C++ function names get
// mangled, declare this function as extern C, so its name is unchanged.
extern "C" void AddSgnnProjectionCustomOp(MutableOpResolver* resolver);

}  // namespace custom
}  // namespace ops
}  // namespace tflite

#endif  // TENSORFLOW_MODELS_SEQUENCE_PROJECTION_SGNN_SGNN_PROJECTION_OP_RESOLVER_H_
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "sgnn/sgnn_projection.h" // sequence_projection
#include <string>
#include <vector>
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "flatbuffers/flexbuffers.h" // flatbuffer
#include "tensorflow/lite/kernels/test_util.h"
#include "tensorflow/lite/schema/schema_generated.h"
#include "tensorflow/lite/string_util.h"
namespace tflite {
namespace ops {
namespace custom {
namespace sgnn_projection {
namespace test {
namespace {
using ::testing::ElementsAre;
using ::testing::ElementsAreArray;
} // namespace
// Test harness that wraps the SgnnProjection custom op in a single-op
// TFLite model, feeds it ragged string input, and invokes the interpreter.
class SgnnProjectionModel : public SingleOpModel {
 public:
  // Constructor for testing the op with a tf.Tensor.
  //
  // `input_values` holds the ngram strings, `input_row_splits` the ragged
  // row boundaries; `hash_seed` and `buckets` are the op's options. The
  // interpreter is built, populated, and invoked immediately.
  SgnnProjectionModel(const std::vector<std::string>& input_values,
                      const std::vector<int64_t>& input_row_splits,
                      const std::vector<int64_t>& hash_seed, int64_t buckets) {
    input_values_index_ = AddInput(TensorType_STRING);
    input_row_splits_index_ = AddInput(TensorType_INT64);
    output_values_index_ = AddOutput(TensorType_FLOAT32);
    BuildCustomOp(hash_seed, buckets);
    BuildInterpreter({{static_cast<int>(input_values.size())},
                      {static_cast<int>(input_row_splits.size())}});
    PopulateStringTensor(input_values_index_, input_values);
    PopulateTensor(input_row_splits_index_, input_row_splits);
    Invoke();
  }

  std::vector<int> GetOutputShape() {
    return GetTensorShape(output_values_index_);
  }

  std::vector<float> ExtractOutputValue() {
    return ExtractVector<float>(output_values_index_);
  }

 private:
  // Serializes {hash_seed, buckets} as the flexbuffer map the custom op
  // expects for its options, then registers the op under its custom name.
  void BuildCustomOp(const std::vector<int64_t>& hash_seed, int64_t buckets) {
    flexbuffers::Builder fbb;
    size_t start_map = fbb.StartMap();
    auto vector_start = fbb.StartVector("hash_seed");
    // Range-for avoids the signed/unsigned comparison of the original
    // `int i < hash_seed.size()` index loop.
    for (const int64_t seed : hash_seed) {
      fbb.Add(seed);
    }
    fbb.EndVector(vector_start, /*typed=*/true, /*fixed=*/false);
    fbb.Int("buckets", buckets);
    fbb.EndMap(start_map);
    fbb.Finish();
    SetCustomOp("tftext:custom:SgnnProjection", fbb.GetBuffer(),
                Register_tftext_SGNN_PROJECTION);
  }

  int input_values_index_;
  int input_row_splits_index_;
  int output_values_index_;
};
// Keep same result of test_projection in sgnn_test.py.
TEST(SgnnProjectionTest, TensorSgnnProjection) {
  const std::vector<std::string> ngrams = {"^h", "he", "el", "ll", "lo",
                                           "o$", "^h", "hi", "i$"};
  SgnnProjectionModel model(ngrams,
                            /*input_row_splits=*/{0, 6, 9},
                            /*hash_seed=*/{5, 7},
                            /*buckets=*/0x7FFFFFFF);
  // Two ragged rows projected onto two seeds -> a 2x2 output.
  EXPECT_THAT(model.GetOutputShape(), ElementsAre(2, 2));
  EXPECT_THAT(model.ExtractOutputValue(),
              ElementsAreArray(ArrayFloatNear(
                  {0.448691, -0.238499, -0.037561, 0.080748})));
}
} // namespace test
} // namespace sgnn_projection
} // namespace custom
} // namespace ops
} // namespace tflite
# Copyright 2020 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Lint as: python3
"""Tests for sequence_projection.sgnn."""
import tensorflow as tf
from tensorflow.python.framework import test_util # pylint: disable=g-direct-tensorflow-import
import sgnn # import sequence_projection module
@test_util.run_all_in_graph_and_eager_modes
class SgnnTest(tf.test.TestCase):
  """Unit tests for the sgnn preprocessing, ngram and projection helpers."""

  def test_preprocess(self):
    # preprocess lower-cases and splits input strings; expected values are
    # byte strings because TF string tensors hold bytes.
    self.assertAllEqual(
        sgnn.preprocess(
            tf.constant([['Hello World!'], [u'你好'],
                         [u'مرحبا بالعالم']])),
        [['hello'.encode(), 'world!'.encode()], [u'你好'.encode()],
         [u'مرحبا'.encode(), u'بالعالم'.encode()]])

  def test_get_ngram(self):
    # Character trigrams, with '^' / '$' marking token start and end.
    tokens = tf.ragged.constant([['hello', 'world'], [u'你好'],
                                 [u'مرحبا', u'بالعالم']])
    self.assertAllEqual(
        sgnn.get_ngrams(tokens, 3),
        [[
            b'^he', b'hel', b'ell', b'llo', b'lo$', b'^wo', b'wor', b'orl',
            b'rld', b'ld$'
        ], [u'^你好'.encode(), u'你好$'.encode()],
         [
             u'^مر'.encode(), u'مرح'.encode(), u'رحب'.encode(),
             u'حبا'.encode(), u'با$'.encode(), u'^با'.encode(),
             u'بال'.encode(), u'الع'.encode(), u'لعا'.encode(),
             u'عال'.encode(), u'الم'.encode(), u'لم$'.encode()
         ]])

  def test_project(self):
    # Golden values are shared with the C++ kernel test
    # (sgnn_projection_test.cc); fused_project and project must agree.
    ngrams = tf.ragged.constant([[b'^h', b'he', b'el', b'll', b'lo', b'o$'],
                                 [b'^h', b'hi', b'i$']])
    self.assertAllClose(
        sgnn.fused_project(ngrams, [5, 7], 0x7FFFFFFF),
        [[0.448691, -0.238499], [-0.037561, 0.080748]])
    self.assertAllClose(
        sgnn.fused_project(ngrams, [5, 7], 0x7FFFFFFF),
        sgnn.project(ngrams, [5, 7], 0x7FFFFFFF))

  def test_sgnn(self):
    # End-to-end sgnn() on raw strings; argument semantics (seeds, ngram
    # size) are defined by the sgnn module -- verify against sgnn.py.
    self.assertAllClose(
        sgnn.sgnn(tf.constant([['hello'], ['hi']]), [3, 5, 7], 2),
        [[0.268503, 0.448691, -0.238499], [0.093143, -0.037561, 0.080748]])

  def test_keras_model(self):
    # Smoke test: model construction succeeds with a small configuration.
    hparams = sgnn.Hparams(learning_rate=2e-4)
    model = sgnn.keras_model([1, 2, 3, 4], 2, [100, 50], hparams)
    self.assertIsNotNone(model)
# Copyright 2020 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Script to train langid model.
The script builds language detection from wikipedia dataset,
builds SGNN model to train an on-device model to
predict the language of the given text.
"""
import os
from absl import app
from absl import flags
import numpy as np
import tensorflow.compat.v2 as tf
import tensorflow_datasets as tfds
import sgnn # import sequence_projection module
# Command-line configuration for the langid training script.
FLAGS = flags.FLAGS

flags.DEFINE_string('output_dir', '/tmp/langid',
                    'Path for the output directory.')
flags.DEFINE_integer('projection_size', 600, 'Size of projection layer.')
flags.DEFINE_integer('ngram_size', 3, 'Max size of ngram to project features.')
flags.DEFINE_string('fc_layer', '256,128',
                    'Size of fully connected layer, separated by comma.')
flags.DEFINE_integer('batch_size', 160, 'Batch size for training.')
flags.DEFINE_integer('epochs', 10, 'Num of epochs for training.')
flags.DEFINE_float('learning_rate', 2e-4, 'learning rate for optimizer.')

# Wikipedia language codes the classifier is trained on; an extra output
# class is appended in train_and_evaluate (len(LANGIDS) + 1).
LANGIDS = ['ar', 'en', 'es', 'fr', 'ru', 'zh']
def dataset_fn(batch_size, is_training, split, try_gcs, max_input_len):
  """Creates dataset to train and evaluate.

  Args:
    batch_size: Batch size for training or evaluation.
    is_training: True if the dataset is for training.
    split: Split of dataset, follow the pattern defined in
      https://www.tensorflow.org/datasets/splits
    try_gcs: True if loading the data from gcs.
    max_input_len: Max length of input string.

  Returns:
    Dataset object yielding (text, label_index) batches.
  """

  def _get_text(item):
    # Truncate each wikipedia article to the first max_input_len bytes.
    return tf.strings.substr(item['text'], 0, max_input_len)

  all_data = []
  for idx, langid in enumerate(LANGIDS):
    dataset = tfds.load(
        'wikipedia/20190301.%s' % langid, try_gcs=try_gcs, split=split)
    # Bind the current label index through a default argument. A plain
    # closure would capture `idx` by reference, making the labels depend
    # on when tf.data traces the function rather than on this iteration.
    map_fn = lambda item, label=idx: (_get_text(item), label)
    dataset = dataset.map(map_fn)
    all_data.append(dataset)
  # Sample uniformly across the per-language datasets.
  datasets = tf.data.experimental.sample_from_datasets(
      all_data, [1. / len(all_data)] * len(all_data))
  # Training repeats indefinitely; evaluation makes a single pass.
  repeat_count = None if is_training else 1
  return datasets.cache().shuffle(100000).batch(batch_size).repeat(repeat_count)
def save_and_convert(model, output_dir):
  """Save keras model and convert to tflite."""
  # Export in SavedModel format first: the TFLite converter consumes the
  # exported directory, not the in-memory keras model.
  saved_model_path = os.path.join(output_dir, 'saved_model')
  tf.saved_model.save(model, saved_model_path)

  converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_path)
  # The SGNN projection op is a custom op, and some ops fall back to the
  # TF runtime via SELECT_TF_OPS.
  converter.allow_custom_ops = True
  converter.target_spec.supported_ops = [
      tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS
  ]

  tflite_model = converter.convert()
  tflite_path = os.path.join(output_dir, 'model.tflite')
  with open(tflite_path, 'wb') as tflite_file:
    tflite_file.write(tflite_model)
def train_and_evaluate():
  """Train and evaluate the model, then export it under FLAGS.output_dir."""
  # Random projection seeds scaled to the int32 range used by the SGNN op.
  hash_seed = np.random.uniform(-1, 1, FLAGS.projection_size) * 0x7FFFFFFF
  fc_layer = [int(fc) for fc in FLAGS.fc_layer.split(',')]
  # Output layer: one class per language plus one extra class.
  fc_layer.append(len(LANGIDS) + 1)
  hparams = sgnn.Hparams(learning_rate=FLAGS.learning_rate)
  model = sgnn.keras_model(hash_seed, FLAGS.ngram_size, fc_layer, hparams)
  model.fit(
      dataset_fn(FLAGS.batch_size, True, 'train[:10%]', True, 100),
      epochs=FLAGS.epochs,
      steps_per_epoch=1000,
      validation_steps=100,
      # Validate on the 10%-11% slice. The original 'train[10:11%]' mixed an
      # absolute boundary with a percent boundary, which tfds rejects.
      validation_data=dataset_fn(FLAGS.batch_size, False, 'train[10%:11%]',
                                 True, 100),
  )
  save_and_convert(model, FLAGS.output_dir)
def main(_):
  """Entry point: ensures the output directory exists, then trains."""
  # makedirs(exist_ok=True) also creates missing parent directories and
  # avoids the check-then-create race of `if not exists: os.mkdir(...)`.
  os.makedirs(FLAGS.output_dir, exist_ok=True)
  train_and_evaluate()


if __name__ == '__main__':
  app.run(main)
# Tensorflow ops for sequence string projection.

load("//tf_ops:build_def.bzl", "gen_op_wrapper_py")

licenses(["notice"])

package(
    default_visibility = [
        "//:__subpackages__",
    ],
)

# TF op kernel: projects sequences of string tokens into features.
cc_library(
    name = "sequence_string_projection_op",
    srcs = [
        "sequence_string_projection.cc",
    ],
    deps = [
        ":projection_normalizer_util",
        ":projection_tokenizer_util",
        ":projection_util",
        ":text_distorter",
        "@com_google_absl//absl/container:flat_hash_map",
        "@com_google_absl//absl/random",
        "@tensorflow_includes//:includes",
        "@tensorflow_solib//:framework_lib",
    ],
    # Kernels register themselves via static initializers; keep them linked.
    alwayslink = 1,
)

# Unicode-aware string helpers shared by the projection ops.
cc_library(
    name = "projection_util",
    srcs = ["projection_util.cc"],
    hdrs = ["projection_util.h"],
    deps = [
        "@utf_archive//:utf",
    ],
)

# Separator-based tokenization helpers.
cc_library(
    name = "projection_tokenizer_util",
    srcs = ["projection_tokenizer_util.cc"],
    hdrs = ["projection_tokenizer_util.h"],
    deps = [
        ":projection_util",
        "@utf_archive//:utf",
    ],
)

# Text normalization (repetition contraction, separator spacing).
cc_library(
    name = "projection_normalizer_util",
    srcs = ["projection_normalizer_util.cc"],
    hdrs = ["projection_normalizer_util.h"],
    deps = [
        ":projection_util",
        "@utf_archive//:utf",
    ],
)

cc_library(
    name = "text_distorter",
    srcs = ["text_distorter.cc"],
    hdrs = ["text_distorter.h"],
    deps = [
        "@com_google_absl//absl/strings",
        "@icu4c",
        "@tensorflow_includes//:includes",
        "@tensorflow_solib//:framework_lib",
    ],
)

cc_test(
    name = "sequence_string_projection_test",
    size = "small",
    srcs = ["sequence_string_projection_test.cc"],
    deps = [
        ":sequence_string_projection_op",
        "@tensorflow_includes//:includes",
        "@tensorflow_solib//:framework_lib",
    ],
)

# V2 variant of the projection op (no tokenizer dependency).
cc_library(
    name = "sequence_string_projection_op_v2",
    srcs = [
        "sequence_string_projection_op_v2.cc",
    ],
    deps = [
        ":projection_normalizer_util",
        ":projection_util",
        ":text_distorter",
        "@tensorflow_includes//:includes",
        "@tensorflow_solib//:framework_lib",
    ],
    alwayslink = 1,
)

cc_test(
    name = "sequence_string_projection_op_v2_test",
    size = "small",
    srcs = ["sequence_string_projection_op_v2_test.cc"],
    deps = [
        ":sequence_string_projection_op_v2",
        "@tensorflow_includes//:includes",
        "@tensorflow_solib//:framework_lib",
    ],
)

# Python wrappers that load the op kernels via tf.load_op_library.
gen_op_wrapper_py(
    name = "sequence_string_projection_op_v2_py",
    out = "sequence_string_projection_op_v2.py",
    kernel_lib = ":sequence_string_projection_op_v2",
)

gen_op_wrapper_py(
    name = "sequence_string_projection_op_py",
    out = "sequence_string_projection_op.py",
    kernel_lib = ":sequence_string_projection_op",
)
def tf_deps():
    """Returns the TensorFlow headers/solib deps for custom-op targets."""
    return [
        "@tensorflow_includes//:includes",
        "@tensorflow_solib//:framework_lib",
    ]
def tf_copts():
    """Returns compiler options used when building TensorFlow custom ops."""
    return ["-Wno-sign-compare"]
def _make_search_paths(prefix, levels_to_root):
    """Builds a comma-joined list of -rpath entries.

    One entry is produced per directory level, from `prefix` itself up to
    `levels_to_root` parent levels (e.g. prefix/, prefix/.., prefix/../..).
    """
    search_paths = []
    for search_level in range(levels_to_root + 1):
        suffix = "/".join([".."] * search_level)
        search_paths.append("-rpath,%s/%s" % (prefix, suffix))
    return ",".join(search_paths)
def _rpath_linkopts(name):
    """Returns -Wl,-rpath linkopts so the DSO can find libtensorflow at runtime.

    Search parent directories up to the TensorFlow root directory for shared
    object dependencies, even if this op shared object is deeply nested
    (e.g. tensorflow/contrib/package:python/ops/_op_lib.so). tensorflow/ is then
    the root and tensorflow/libtensorflow_framework.so should exist when
    deployed. Other shared object dependencies (e.g. shared between contrib/
    ops) are picked up as long as they are in either the same or a parent
    directory in the tensorflow/ tree.
    """
    levels_to_root = native.package_name().count("/") + name.count("/")
    return ["-Wl,%s" % (_make_search_paths("$$ORIGIN", levels_to_root),)]
def gen_op_wrapper_py(name, out, kernel_lib, linkopts = [], **kwargs):
    """Generates the py_library `name` with a data dep on the ops in kernel_lib.

    The resulting py_library creates file `$out`, and has a dependency on a
    symbolic library called lib{$name}_gen_op.so, which contains the kernels
    and ops and can be loaded via `tf.load_op_library`.

    Args:
      name: The name of the py_library.
      out: The name of the python file. Use "gen_{name}_ops.py".
      kernel_lib: A cc_kernel_library target to generate for.
      linkopts: Extra linkopts passed to the generated cc_binary.
      **kwargs: Any args to the `cc_binary` and `py_library` internal rules.
    """
    if not out.endswith(".py"):
        fail("Argument out must end with '.py', but saw: {}".format(out))

    module_name = "lib{}_gen_op".format(name)
    version_script_file = "%s-version-script.lds" % module_name

    # Version script restricting the DSO's exported symbols.
    native.genrule(
        name = module_name + "_version_script",
        outs = [version_script_file],
        cmd = "echo '{global:\n *tensorflow*;\n *deepmind*;\n local: *;};' >$@",
        output_licenses = ["unencumbered"],
        visibility = ["//visibility:private"],
    )

    # The shared object holding the kernels/ops, loaded at runtime via
    # tf.load_op_library from the generated python stub below.
    native.cc_binary(
        name = "{}.so".format(module_name),
        deps = [kernel_lib] + tf_deps() + [version_script_file],
        copts = tf_copts() + [
            "-fno-strict-aliasing",  # allow a wider range of code [aliasing] to compile.
            "-fvisibility=hidden",  # avoid symbol clashes between DSOs.
        ],
        linkshared = 1,
        linkopts = linkopts + _rpath_linkopts(module_name) + [
            "-Wl,--version-script",
            "$(location %s)" % version_script_file,
        ],
        **kwargs
    )

    # Writes the python stub that loads the DSO and re-exports its symbols.
    # NOTE(review): the `_reverb_gen_op` variable name in the emitted file
    # appears copied from another project; it is local to the stub.
    native.genrule(
        name = "{}_genrule".format(out),
        outs = [out],
        cmd = """
        echo 'import tensorflow as tf
_reverb_gen_op = tf.load_op_library(
    tf.compat.v1.resource_loader.get_path_to_datafile(
        "lib{}_gen_op.so"))
_locals = locals()
for k in dir(_reverb_gen_op):
  _locals[k] = getattr(_reverb_gen_op, k)
del _locals' > $@""".format(name),
    )

    native.py_library(
        name = name,
        srcs = [out],
        data = [":lib{}_gen_op.so".format(name)],
        **kwargs
    )
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tf_ops/projection_normalizer_util.h" // sequence_projection
#include <algorithm>
#include <cstddef>
#include <memory>
#include <sstream>
#include <utility>
#include "tf_ops/projection_util.h" // sequence_projection
// Returns true if the given UTF-8 text contains an ASCII digit.
bool IsDigit(const std::string& text) {
  Rune rune;
  for (size_t i = 0; i < text.length();) {
    // Decode the rune at offset i with the remaining byte count. The
    // original call used text.data() with a fixed length of 1, so it never
    // advanced past the first character and could not decode multi-byte
    // runes (same decode pattern as the other tf_ops utilities).
    const int bytes_read =
        charntorune(&rune, text.data() + i, text.length() - i);
    if (rune == Runeerror || bytes_read == 0) break;
    if (rune >= static_cast<Rune>('0') && rune <= static_cast<Rune>('9')) {
      return true;
    }
    i += bytes_read;
  }
  return false;
}
// Gets the string containing |num_chars| characters from |start| position.
// Returns the empty string when the requested span runs past the end of
// |char_tokens|.
std::string GetCharToken(const std::vector<std::string>& char_tokens,
                         size_t start, size_t num_chars) {
  if (start + num_chars > char_tokens.size()) {
    return "";
  }
  std::string result;
  for (size_t offset = 0; offset < num_chars; ++offset) {
    result += char_tokens[start + offset];
  }
  return result;
}
// Counts how many consecutive times |pattern| appears in |char_tokens|,
// starting at |start| and stepping |num_chars| characters at a time.
int GetNumPattern(const std::vector<std::string>& char_tokens, size_t start,
                  size_t num_chars, const std::string& pattern) {
  int repeats = 0;
  size_t pos = start;
  while (pos < char_tokens.size() &&
         GetCharToken(char_tokens, pos, num_chars) == pattern) {
    ++repeats;
    pos += num_chars;
  }
  return repeats;
}
// Contracts runs of a repeated |num_chars|-long pattern down to exactly two
// occurrences (e.g. num_chars == 2: "hahaha" -> "haha"). Patterns containing
// a space or a digit are never contracted.
std::string ContractToken(const char* input_ptr, size_t len, size_t num_chars) {
  // This function contracts patterns whose length is |num_chars| and appeared
  // more than twice. So if the input is shorter than 3 * |num_chars|, do not
  // apply any contraction.
  if (len < 3 * num_chars) {
    // Construct from the explicit length. The original `return input_ptr;`
    // relied on a terminating NUL, which would truncate or over-read inputs
    // that are not exactly NUL-terminated at |len|.
    return std::string(input_ptr, len);
  }
  std::vector<std::string> char_tokens = SplitByChar(input_ptr, len, len);
  std::string token;
  token.reserve(len);
  for (size_t i = 0; i < char_tokens.size();) {
    std::string cur_pattern = GetCharToken(char_tokens, i, num_chars);
    // Count how many additional times this pattern appears right after i.
    int num_cur_patterns = 0;
    if (cur_pattern.find(" ") == std::string::npos && !IsDigit(cur_pattern)) {
      num_cur_patterns =
          GetNumPattern(char_tokens, i + num_chars, num_chars, cur_pattern);
    }
    if (num_cur_patterns >= 2) {
      // If this pattern is repeated, store it only twice.
      token.append(cur_pattern);
      token.append(cur_pattern);
      i += (num_cur_patterns + 1) * num_chars;
    } else {
      token.append(char_tokens[i]);
      ++i;
    }
  }
  return token;
}
// Records every non-space character of |separators| as a separator.
void ProjectionNormalizer::InitializeSeparators(const std::string& separators) {
  for (const char separator : separators) {
    if (separator != ' ') {
      separators_.insert(separator);
    }
  }
}
// Inserts a space before and after every separator character so separators
// become standalone tokens. A space is only inserted when the neighboring
// character is not already a space, and no space is added after an
// apostrophe — presumably so contractions stay attached to the following
// token, matching ProjectionTokenizer's apostrophe handling (confirm).
std::string ProjectionNormalizer::NormalizeInternal(const char* input_ptr,
                                                    size_t len) {
  std::string normalized;
  // Worst case every input character gains a surrounding space.
  normalized.reserve(len * 2);
  for (size_t i = 0; i < len; ++i) {
    char c = input_ptr[i];
    bool matched_separator = separators_.find(c) != separators_.end();
    if (matched_separator) {
      // Space before the separator. i > 0 guarantees a prior iteration
      // appended at least one char, so normalized.back() is safe.
      if (i > 0 && input_ptr[i - 1] != ' ' && normalized.back() != ' ') {
        normalized.append(" ");
      }
    }
    normalized.append(1, c);
    if (matched_separator) {
      // Space after the separator, except after an apostrophe.
      if (i + 1 < len && input_ptr[i + 1] != ' ' && c != '\'') {
        normalized.append(" ");
      }
    }
  }
  return normalized;
}
// Convenience overload: normalizes a std::string by delegating to the
// pointer/length overload.
std::string ProjectionNormalizer::Normalize(const std::string& input,
                                            size_t max_input) {
  return Normalize(input.data(), input.length(), max_input);
}
// Normalizes up to |max_input| bytes of |input_ptr|: optionally contracts
// character repetitions, then pads separators with spaces.
std::string ProjectionNormalizer::Normalize(const char* input_ptr, size_t len,
                                            size_t max_input) {
  std::string normalized(input_ptr, std::min(len, max_input));
  if (normalize_repetition_) {
    // Contract repeated patterns of length 1, 2 and 3, in that order:
    //   1: soooo => soo
    //   2: hahaha => haha, xhahaha => xhaha, xyhahaha => xyhaha
    //   3: wowwowwow => wowwow, abcdbcdbcd => abcdbcd
    for (const size_t pattern_length : {1, 2, 3}) {
      normalized = ContractToken(normalized.data(), normalized.length(),
                                 pattern_length);
    }
  }
  if (!separators_.empty()) {
    // Add space around separators_.
    normalized = NormalizeInternal(normalized.data(), normalized.length());
  }
  return normalized;
}
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_PROJECTION_NORMALIZER_UTIL_H_
#define TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_PROJECTION_NORMALIZER_UTIL_H_

#include <string>
#include <unordered_set>
#include <vector>

#include "libutf/utf.h"

// Normalizes the input with the given |separators| by adding a space before and
// after each separator. When |normalize_repetition| is true, it removes the
// repeated characters (except numbers) which consecutively appeared more than
// twice in a word.
// Examples: arwwwww -> arww, good!!!!! -> good!!, hahaha => haha.
class ProjectionNormalizer {
 public:
  explicit ProjectionNormalizer(const std::string& separators,
                                bool normalize_repetition = false) {
    InitializeSeparators(separators);
    normalize_repetition_ = normalize_repetition;
  }

  // Normalizes the repeated characters (except numbers) which consecutively
  // appeared more than twice in a word, then spaces out separators. Only the
  // first |max_input| bytes of the input are processed.
  std::string Normalize(const std::string& input, size_t max_input = 300);
  std::string Normalize(const char* input_ptr, size_t len,
                        size_t max_input = 300);

 private:
  // Parses and extracts supported separators (spaces are ignored).
  void InitializeSeparators(const std::string& separators);

  // Adds a space before/after each separator character in the input.
  std::string NormalizeInternal(const char* input_ptr, size_t len);

  // Characters treated as separators.
  std::unordered_set<char> separators_;
  // Whether Normalize() contracts repeated patterns before separating.
  bool normalize_repetition_;
};

#endif  // TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_PROJECTION_NORMALIZER_UTIL_H_
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tf_ops/projection_tokenizer_util.h" // sequence_projection
#include <cstddef>
#include <iostream>
#include <memory>
#include <sstream>
#include <utility>
#include "tf_ops/projection_util.h" // sequence_projection
namespace {
// Characters with special tokenization behavior.
constexpr char kApostrophe = '\'';
constexpr char kSpace = ' ';
constexpr char kComma = ',';
constexpr char kDot = '.';
// Sentinel index meaning "not found" (size_t(-1) is the maximum value).
constexpr size_t kInvalid = -1;
}  // namespace

// Returns true if the input |c| is ascii number.
bool is_numeric(char c) { return c >= '0' && c <= '9'; }

// Returns true if we want to prepend the separator to the next token.
// Only the apostrophe is prepended, so "I'm" becomes "I", "'m".
bool prepend_separator(char separator) { return separator == kApostrophe; }
// Records every character of |separators| as a separator (unlike the
// normalizer, spaces are not filtered out here).
void ProjectionTokenizer::InitializeSeparators(const std::string& separators) {
  for (const char separator : separators) {
    separators_.insert(separator);
  }
}
// Starting from input_ptr[from], returns the index of the next separator
// character, or kInvalid when none occurs before input_ptr[length]
// (non-inclusive).
size_t ProjectionTokenizer::FindNextSeparator(const char* input_ptr,
                                              size_t from,
                                              size_t length) const {
  auto index = from;
  while (index < length) {
    char c = input_ptr[index];
    // Do not break a number (e.g. "10,000", "0.23"): a comma or dot that is
    // immediately followed by a digit is skipped along with that digit
    // (note ++index advances past the punctuation here).
    if (c == kComma || c == kDot) {
      if (index + 1 < length && is_numeric(input_ptr[index + 1])) {
        c = input_ptr[++index];
      }
    }
    if (separators_.find(c) != separators_.end()) {
      break;
    }
    ++index;
  }
  return index == length ? kInvalid : index;
}
// Tokenizes input_ptr[0, min(len, max_input)) on separators_. Separator
// characters other than space/apostrophe are emitted as single-character
// tokens of their own; an apostrophe is kept attached to the token that
// follows it. At most max_tokens tokens are produced when max_tokens is
// not kAllTokens.
std::vector<std::string> ProjectionTokenizer::Tokenize(
    const char* input_ptr, size_t len, size_t max_input,
    size_t max_tokens) const {
  // If separators_ is not given, tokenize the input with a space.
  if (separators_.empty()) {
    return SplitBySpace(input_ptr, len, max_input, max_tokens);
  }
  std::vector<std::string> tokens;
  // kEntireString disables truncation at max_input.
  size_t last_index =
      max_input == kEntireString ? len : (len < max_input ? len : max_input);
  size_t start = 0;
  // Skip leading spaces.
  while (start < last_index && input_ptr[start] == kSpace) {
    start++;
  }
  auto end = FindNextSeparator(input_ptr, start, last_index);
  while (end != kInvalid &&
         (max_tokens == kAllTokens || tokens.size() < max_tokens - 1)) {
    auto length = end - start;
    if (length > 0) tokens.emplace_back(input_ptr + start, length);
    // Add the separator (except space and apostrophe) as a token.
    char separator = input_ptr[end];
    if (separator != kSpace && separator != kApostrophe) {
      tokens.emplace_back(input_ptr + end, 1);
    }
    // An apostrophe stays at the start of the next token (offset 0).
    start = end + (prepend_separator(separator) ? 0 : 1);
    end = FindNextSeparator(input_ptr, end + 1, last_index);
  }
  // Emit the final token: the tail of the input when no further separator
  // was found, otherwise the span up to the next separator.
  auto length = end == kInvalid ? (last_index - start) : (end - start);
  if (length > 0) tokens.emplace_back(input_ptr + start, length);
  return tokens;
}
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_PROJECTION_TOKENIZER_UTIL_H_
#define TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_PROJECTION_TOKENIZER_UTIL_H_

#include <string>
#include <unordered_set>
#include <vector>

#include "libutf/utf.h"

// Tokenizes the input with the given separators. To properly tokenize a text
// containing contractions in English (e.g. I'm), it combines the apostrophe
// with the token coming after it. For example, the text "I'm happy" is
// tokenized into three tokens: "I", "'m", "happy". When |separators| is not
// given, use the space to tokenize the input.
// Note) This tokenization supports only English.
class ProjectionTokenizer {
 public:
  explicit ProjectionTokenizer(const std::string& separators) {
    InitializeSeparators(separators);
  }

  // Tokenizes the input by separators_. Limit to max_tokens, when it is not -1.
  // Only the first |max_input| bytes of the input are considered.
  std::vector<std::string> Tokenize(const std::string& input, size_t max_input,
                                    size_t max_tokens) const {
    return Tokenize(input.c_str(), input.size(), max_input, max_tokens);
  }

  std::vector<std::string> Tokenize(const char* input_ptr, size_t len,
                                    size_t max_input, size_t max_tokens) const;

 private:
  // Parses and extracts supported separators.
  void InitializeSeparators(const std::string& separators);

  // Starting from input_ptr[from], search for the next occurrence of
  // separators_. Don't search beyond input_ptr[length](non-inclusive). Return
  // -1 if not found.
  size_t FindNextSeparator(const char* input_ptr, size_t from,
                           size_t length) const;

  // Characters treated as token boundaries.
  std::unordered_set<char> separators_;
};

#endif  // TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_PROJECTION_TOKENIZER_UTIL_H_
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tf_ops/projection_util.h" // sequence_projection
#include <cstddef>
#include <iostream>
#include <memory>
#include <sstream>
namespace {
// Sentinel index meaning "not found" (size_t(-1) is the maximum value).
constexpr size_t kInvalid = -1;
constexpr char kSpace = ' ';
}  // namespace
// Lower-cases the UTF-8 text given as a (pointer, byte-length) pair, keeping
// only runes accepted by the vocabulary (all runes when the vocabulary is
// unrestricted). When exclude_nonalphaspace_unicodes_ is set, runes that are
// neither alphabetic nor the space character are dropped as well.
std::string ProjectionUnicodeHandler::LowerCaseUTF8WithSupportedUnicodes(
    const std::pair<const char*, size_t>& source) const {
  // Ideally the size of target should be less than or equal to source. But
  // when we do to_lower the number of bytes needed to encode a unicode
  // character could increase. To account for this 4 times the source length
  // is allocated for target.
  const char* csource = source.first;
  int len = source.second;
  auto target = std::unique_ptr<char[]>(new char[len * 4]);
  auto target_ptr = target.get();
  int i = 0;
  while (i < len) {
    Rune rune;
    const int bytes_read = charntorune(&rune, csource + i, len - i);
    if (bytes_read == 0) {
      break;
    }
    i += bytes_read;
    // Decoding errors are skipped rather than aborting the whole string.
    if (rune != Runeerror) {
      Rune lower = tolowerrune(rune);
      // Skip processing the unicode if exclude_nonalphaspace_unicodes_ is true
      // and the unicode is not alpha and not space.
      const Rune kSpaceRune = ' ';
      if (exclude_nonalphaspace_unicodes_ && !isalpharune(lower) &&
          lower != kSpaceRune) {
        continue;
      }
      if (IsUnrestrictedVocabulary() || IsValidUnicode(lower)) {
        const int bytes_written = runetochar(target_ptr, &lower);
        target_ptr += bytes_written;
      }
    }
  }
  // Construct from the [begin, end) range of bytes actually written.
  return std::string(target.get(), target_ptr);
}
// Parses |vocabulary| as UTF-8 and assigns each new, lower-case rune a
// consecutive index in valid_chars_. Malformed, duplicate, and upper-case
// runes are logged and skipped (note: the log messages have no trailing
// newline, so consecutive messages run together on one line).
void ProjectionUnicodeHandler::InitializeVocabulary(
    const std::string& vocabulary) {
  for (size_t i = 0, index = 0; i < vocabulary.length();) {
    Rune rune;
    const int bytes_read =
        charntorune(&rune, vocabulary.c_str() + i, vocabulary.length() - i);
    if (!bytes_read) {
      break;
    }
    i += bytes_read;
    // Include novel lower case unicode segments as part of valid chars.
    if (rune == Runeerror) {
      std::clog << "Invalid rune in vocabulary.";
    } else if (IsValidUnicode(rune)) {
      // Already registered earlier in the vocabulary string.
      std::clog << "Duplicate rune " << rune << " found in vocabulary.";
    } else if (rune != tolowerrune(rune)) {
      std::clog << "Upper case rune " << rune << " found in vocabulary.";
    } else {
      valid_chars_[rune] = index++;
    }
  }
}
// Scans input_ptr[from, length) for the next ' ' character and returns its
// index; returns size_t(-1) (the file's kInvalid sentinel) when the range
// contains no space.
inline size_t FindNextSpace(const char* input_ptr, size_t from, size_t length) {
  constexpr size_t kNotFound = static_cast<size_t>(-1);
  for (size_t pos = from; pos < length; ++pos) {
    if (input_ptr[pos] == ' ') {
      return pos;
    }
  }
  return kNotFound;
}
// Splits input_ptr[0, min(len, max_input)) on single spaces, appending each
// non-empty piece to *tokens as a T (std::string or (ptr, len) pair —
// whichever T's (const char*, size_t) constructor builds).
// Emits at most max_tokens pieces (kAllTokens == unlimited). Note: when the
// cap is hit, the final piece is the next space-delimited word, not the
// remainder of the input window.
template <typename T>
void SplitBySpaceInternal(std::vector<T>* tokens, const char* input_ptr,
                          size_t len, size_t max_input, size_t max_tokens) {
  // Effective scan window: the whole buffer, or its max_input-byte prefix.
  size_t last_index =
      max_input == kEntireString ? len : (len < max_input ? len : max_input);
  size_t start = 0;
  // skip leading spaces
  while (start < last_index && input_ptr[start] == kSpace) {
    start++;
  }
  auto end = FindNextSpace(input_ptr, start, last_index);
  // Emit full tokens while another space exists and the cap leaves room for
  // one more token after this one (the last slot is filled below).
  while (end != kInvalid &&
         (max_tokens == kAllTokens || tokens->size() < max_tokens - 1)) {
    auto length = end - start;
    if (length > 0) {  // Consecutive spaces produce empty spans; skip them.
      tokens->emplace_back(input_ptr + start, length);
    }
    start = end + 1;
    end = FindNextSpace(input_ptr, start, last_index);
  }
  // Final token: up to the next space if one remains, else to the window end.
  auto length = end == kInvalid ? (last_index - start) : (end - start);
  if (length > 0) {
    tokens->emplace_back(input_ptr + start, length);
  }
}
// Space-tokenizes the first `len` bytes of input_ptr into non-owning
// (pointer, length) pieces, emitting at most max_tokens of them.
std::vector<std::pair<const char*, size_t>> SplitBySpaceAsPairs(
    const char* input_ptr, size_t len, size_t max_tokens) {
  std::vector<std::pair<const char*, size_t>> pieces;
  SplitBySpaceInternal(&pieces, input_ptr, len, kEntireString, max_tokens);
  return pieces;
}
// Space-tokenizes input_ptr into owned std::string tokens, honoring both the
// max_input byte window and the max_tokens cap.
std::vector<std::string> SplitBySpace(const char* input_ptr, size_t len,
                                      size_t max_input, size_t max_tokens) {
  std::vector<std::string> pieces;
  SplitBySpaceInternal(&pieces, input_ptr, len, max_input, max_tokens);
  return pieces;
}
// Splits the first `len` bytes of input_ptr into one piece per decoded
// unicode character, stopping at max_tokens pieces (kInvalid == no cap) or
// at the first undecodable/truncated rune.
template <typename T>
void SplitByCharInternal(std::vector<T>* tokens, const char* input_ptr,
                         size_t len, size_t max_tokens) {
  size_t offset = 0;
  while (offset < len) {
    Rune rune;
    const int consumed = charntorune(&rune, input_ptr + offset, len - offset);
    if (consumed == 0) {
      break;  // Truncated rune at the end of the buffer.
    }
    tokens->emplace_back(input_ptr + offset, consumed);
    if (max_tokens != kInvalid && tokens->size() == max_tokens) {
      return;
    }
    offset += consumed;
  }
}
// Character-tokenizes the first `len` bytes of input_ptr into non-owning
// (pointer, length) pieces, one per unicode character, capped at max_tokens.
std::vector<std::pair<const char*, size_t>> SplitByCharAsPairs(
    const char* input_ptr, size_t len, size_t max_tokens) {
  std::vector<std::pair<const char*, size_t>> pieces;
  SplitByCharInternal(&pieces, input_ptr, len, max_tokens);
  return pieces;
}
// Character-tokenizes the first `len` bytes of input_ptr into owned
// std::string pieces, one per unicode character, capped at max_tokens.
std::vector<std::string> SplitByChar(const char* input_ptr, size_t len,
                                     size_t max_tokens) {
  std::vector<std::string> pieces;
  SplitByCharInternal(&pieces, input_ptr, len, max_tokens);
  return pieces;
}
// Concatenates the given (pointer, length) string pieces, separated by a
// single space, and returns the joined string.
std::string JoinPairsBySpace(
    std::vector<std::pair<const char*, size_t>> words) {
  std::string joined;
  const char* separator = "";
  for (const auto& piece : words) {
    joined.append(separator);
    joined.append(piece.first, piece.second);
    separator = " ";
  }
  return joined;
}
// Tokenizes `str` either on spaces or per unicode character, returning
// non-owning (pointer, length) pieces capped at max_tokens.
std::vector<std::pair<const char*, size_t>> ProjectionUnicodeHandler::Tokenize(
    const char* str, size_t len, bool by_space, int max_tokens) const {
  if (by_space) {
    return SplitBySpaceAsPairs(str, len, max_tokens);
  }
  return SplitByCharAsPairs(str, len, max_tokens);
}
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_PROJECTION_UTIL_H_
#define TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_PROJECTION_UTIL_H_
#include <cassert>
#include <string>
#include <unordered_map>
#include <vector>
#include "libutf/utf.h"
// Decodes a single rune from the first `n` bytes of `s`. Returns the number
// of bytes consumed, or 0 (with *r set to Runeerror) when `n` is empty or the
// buffer ends mid-rune.
// NOTE(review): chartorune itself may still examine bytes past s + n while
// decoding a multi-byte rune before the bytes_read > n check rejects it;
// callers in this file always pass NUL-terminated storage, which makes this
// safe in practice — verify before using with non-terminated buffers.
inline int charntorune(Rune* r, const char* s, int n) {
  // chartorune unconditionally reads at least one byte, so reject an empty
  // window before decoding.
  if (n <= 0) {
    *r = Runeerror;
    return 0;
  }
  const int bytes_read = chartorune(r, const_cast<char *>(s));
  if (bytes_read > n) {
    *r = Runeerror;
    return 0;
  }
  return bytes_read;
}
// A hashing wrapper class that can hash a string and generate a hash code with
// requested number of features (two bit values). Some of the implementations
// are copied from murmurhash.
// A hashing wrapper class that can hash a string and generate a hash code
// with the requested number of features (two-bit values). Some of the
// implementations are copied from murmurhash.
class Hasher {
 public:
  // Precomputes the hash codes for the empty-word sentinel ("<null>") so
  // GetHashCodes can return them without rehashing.
  explicit Hasher(int feature_size) : feature_size_(feature_size) {
    GetHashCodesInternal(empty_string_, &null_hash_codes_);
  }
  // Fills *hash_codes with ceil(feature_size / 64) pairs of 64-bit codes for
  // `word`; an empty word maps to the precomputed "<null>" codes. The vector
  // is overwritten, not appended to.
  void GetHashCodes(const std::string& word,
                    std::vector<uint64_t>* hash_codes) {
    if (word.empty()) {
      *hash_codes = null_hash_codes_;
    } else {
      hash_codes->clear();
      GetHashCodesInternal(word, hash_codes);
    }
  }

 private:
  // Murmur multipliers; kMul2 is an alternative mixer (prime just above the
  // golden ratio) used for the second 64-bit lane.
  static constexpr uint64_t kMul = 0xc6a4a7935bd1e995ULL;
  static constexpr uint64_t kMul2 = 0x9e3779b97f4a7835ULL;
  // Folds the high bits of `val` into its low bits.
  inline uint64_t ShiftMix(uint64_t val) { return val ^ (val >> 47); }
  // One murmur round: mix one 64-bit word of input into the running hash.
  inline uint64_t MurmurStep(uint64_t hash, uint64_t data) {
    hash ^= ShiftMix(data * kMul) * kMul;
    hash *= kMul;
    return hash;
  }
  // Loads the first `len` (1..8) bytes of p into a uint64, last byte in the
  // lowest position read order-wise.
  // NOTE(review): buf is plain char, which is signed on most platforms; for
  // bytes >= 0x80 the `| buf[len]` int promotion sign-extends and ORs 1-bits
  // into the high bits of val. Deterministic, but likely not the intended
  // murmur tail load — fixing it would change every hash code (and break any
  // trained model that depends on them), so it is flagged rather than fixed.
  inline uint64_t Load64VariableLength(const void* p, int len) {
    assert(len >= 1 && len <= 8);
    const char* buf = static_cast<const char*>(p);
    uint64_t val = 0;
    --len;
    do {
      val = (val << 8) | buf[len];
      // (--len >= 0) is about 10 % faster than (len--) in some benchmarks.
    } while (--len >= 0);
    // No ToHost64(...) needed. The bytes are accessed in little-endian manner
    // on every architecture.
    return val;
  }
  // Derives the next 128 bits (two 64-bit codes) from the previous pair, used
  // when feature_size_ needs more bits than one MurmurHash128 call yields.
  void GetMoreBits(uint64_t hash, uint64_t hash2, uint64_t* rlow,
                   uint64_t* rhigh) {
    hash = ShiftMix(hash) * kMul;
    hash2 ^= hash;
    *rhigh = ShiftMix(hash);
    *rlow = ShiftMix(hash2 * kMul2) * kMul2;
  }
  // 128-bit murmur-style hash of buf[0, len), returned as (low, high).
  std::pair<uint64_t, uint64_t> MurmurHash128(const char* buf,
                                              const size_t len) {
    // Initialize the hashing value.
    uint64_t hash = len * kMul;
    // hash2 will be xored by hash during the hash computation iterations.
    // In the end we use an alternative mixture multiplier for mixing
    // the bits in hash2.
    uint64_t hash2 = 0;
    // Let's remove the bytes not divisible by the sizeof(uint64_t).
    // This allows the inner loop to process the data as 64 bit integers.
    const size_t len_aligned = len & ~0x7;
    const char* end = buf + len_aligned;
    for (const char* p = buf; p != end; p += 8) {
      // Manually unrolling this loop 2x did not help on Intel Core 2.
      hash = MurmurStep(hash, Load64VariableLength(p, 8));
      hash2 ^= hash;
    }
    // Mix in the 1..7 trailing bytes, if any.
    if ((len & 0x7) != 0) {
      const uint64_t data = Load64VariableLength(end, len & 0x7);
      hash ^= data;
      hash *= kMul;
      hash2 ^= hash;
    }
    hash = ShiftMix(hash) * kMul;
    hash2 ^= hash;
    hash = ShiftMix(hash);
    // mul2 is a prime just above golden ratio. mul2 is used to ensure that the
    // impact of the last few bytes is different to the upper and lower 64 bits.
    hash2 = ShiftMix(hash2 * kMul2) * kMul2;
    return std::make_pair(hash, hash2);
  }
  // Appends 128 bits (two codes) per 64 requested features: one murmur call
  // for the first pair, then GetMoreBits for each subsequent pair.
  void GetHashCodesInternal(const std::string& word,
                            std::vector<uint64_t>* hash_codes) {
    uint64_t hash_low = 0;
    uint64_t hash_high = 0;
    for (int i = 0; i < feature_size_; i += 64) {
      if (i == 0) {
        auto hash = MurmurHash128(word.c_str(), word.size());
        hash_low = hash.first;
        hash_high = hash.second;
      } else {
        GetMoreBits(hash_low, hash_high, &hash_low, &hash_high);
      }
      hash_codes->push_back(hash_low);
      hash_codes->push_back(hash_high);
    }
  }
  // Sentinel hashed in place of an empty word.
  const std::string empty_string_ = "<null>";
  // Number of two-bit features requested per word.
  const int feature_size_;
  // Cached codes for empty_string_, computed once in the constructor.
  std::vector<uint64_t> null_hash_codes_;
};
// Unicode processor for tensorflow and tflite string projection ops.
// Unicode processor for tensorflow and tflite string projection ops.
class ProjectionUnicodeHandler {
 public:
  // Takes an utf8 string which lists the unicodes that are supported and are
  // part of the vocabulary of this instance. When the utf8 string is empty,
  // all unicode segments are supported by this instance. The boolean
  // flag exclude_nonalphaspace_unicodes is used to indicate if nonalpha and
  // space unicode segments from the input should be stripped out.
  // Another way to analyse the filtering logic is as below.
  // Vocabulary acts as a allowlist when provided and all unicode set when
  // empty. The flag exclude_nonalphaspace_unicodes when true acts as a
  // allowlist on all alpha characters and space. It includes the entire unicode
  // set when false. Valid unicode segments are the intersection of these 2
  // sets.
  explicit ProjectionUnicodeHandler(const std::string& vocabulary,
                                    bool exclude_nonalphaspace_unicodes = false)
      : exclude_nonalphaspace_unicodes_(exclude_nonalphaspace_unicodes) {
    InitializeVocabulary(vocabulary);
  }
  // Performs language independent lower case and returns a string with
  // supported unicode segments.
  std::string LowerCaseUTF8WithSupportedUnicodes(
      const std::pair<const char*, size_t>& source) const;
  // Returns a boolean flag indicating if the unicode segment is part of the
  // vocabulary.
  bool IsValidUnicode(Rune rune) const {
    return valid_chars_.find(rune) != valid_chars_.end();
  }
  // Returns an index in [0, |vocabulary|), if the unicode is part of the
  // vocabulary and -1 if it's not.
  int UnicodeIndex(Rune rune) const {
    return IsValidUnicode(rune) ? valid_chars_.at(rune) : -1;
  }
  // Returns |vocabulary|.
  size_t NumberOfValidUnicodes() const { return valid_chars_.size(); }
  // Returns true if the vocabulary is empty which means all unicode segments
  // are supported.
  bool IsUnrestrictedVocabulary() const { return valid_chars_.empty(); }
  // Tokenizes input by space or unicode point segmentation. Limit to
  // max_tokens, when it is not -1.
  // Convenience overload: forwards to the (ptr, len) overload below.
  std::vector<std::pair<const char*, size_t>> Tokenize(const std::string& input,
                                                       bool by_space,
                                                       int max_tokens) const {
    return Tokenize(input.c_str(), input.size(), by_space, max_tokens);
  }
  std::vector<std::pair<const char*, size_t>> Tokenize(const char* str,
                                                       size_t len,
                                                       bool by_space,
                                                       int max_tokens) const;

 private:
  // Parses and extracts supported unicode segments from a utf8 string.
  void InitializeVocabulary(const std::string& vocabulary);
  // Maps each vocabulary rune to its dense index in [0, |vocabulary|).
  std::unordered_map<Rune, int> valid_chars_;
  // When true, non-alpha, non-space runes are dropped during lower-casing.
  bool exclude_nonalphaspace_unicodes_;
};
// Sentinel limits for the split helpers: scan the whole input string / put no
// cap on the number of emitted tokens.
// NOTE(review): SIZE_MAX comes from <cstdint>/<cstddef>, which this header
// does not include directly — it currently relies on a transitive include.
static constexpr size_t kEntireString = SIZE_MAX;
static constexpr size_t kAllTokens = SIZE_MAX;
// Splits input_ptr[0, min(len, max_input)) on ASCII spaces into at most
// max_tokens owned strings; runs of spaces yield no empty tokens.
std::vector<std::string> SplitBySpace(const char* input_ptr, size_t len,
                                      size_t max_input, size_t max_tokens);
// Splits the first len bytes of input_ptr into one owned string per unicode
// character, at most max_tokens of them.
std::vector<std::string> SplitByChar(const char* input_ptr, size_t len,
                                     size_t max_tokens);
// Joins the given (pointer, length) pieces with single spaces.
std::string JoinPairsBySpace(std::vector<std::pair<const char*, size_t>> words);
#endif  // TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_PROJECTION_UTIL_H_
"""Reverb custom external dependencies."""
# Sanitize a dependency so that it works correctly from code that includes
# reverb as a submodule.
def clean_dep(dep):
    """Returns the dependency as an absolute label string."""
    return str(Label(dep))
def get_python_path(ctx):
    """Returns the python interpreter path from $PYTHON_BIN_PATH, or fails."""
    path = ctx.os.environ.get("PYTHON_BIN_PATH")
    if not path:
        fail(
            "Could not get environment variable PYTHON_BIN_PATH. " +
            "Check your .bazelrc file.",
        )
    return path
def _find_tf_include_path(repo_ctx):
    """Returns the header (include) directory of the installed tensorflow pip package."""
    exec_result = repo_ctx.execute(
        [
            get_python_path(repo_ctx),
            "-c",
            "import tensorflow as tf; import sys; " +
            "sys.stdout.write(tf.sysconfig.get_include())",
        ],
        quiet = True,
    )
    if exec_result.return_code != 0:
        fail("Could not locate tensorflow installation path:\n{}"
            .format(exec_result.stderr))
    # Last stdout line, in case the interpreter printed banners first.
    return exec_result.stdout.splitlines()[-1]
def _find_tf_lib_path(repo_ctx):
    """Returns the shared-library directory of the installed tensorflow pip package."""
    exec_result = repo_ctx.execute(
        [
            get_python_path(repo_ctx),
            "-c",
            "import tensorflow as tf; import sys; " +
            "sys.stdout.write(tf.sysconfig.get_lib())",
        ],
        quiet = True,
    )
    if exec_result.return_code != 0:
        fail("Could not locate tensorflow installation path:\n{}"
            .format(exec_result.stderr))
    # Last stdout line, in case the interpreter printed banners first.
    return exec_result.stdout.splitlines()[-1]
def _find_numpy_include_path(repo_ctx):
    """Returns the C header directory of the installed numpy package."""
    exec_result = repo_ctx.execute(
        [
            get_python_path(repo_ctx),
            "-c",
            "import numpy; import sys; " +
            "sys.stdout.write(numpy.get_include())",
        ],
        quiet = True,
    )
    if exec_result.return_code != 0:
        fail("Could not locate numpy includes path:\n{}"
            .format(exec_result.stderr))
    return exec_result.stdout.splitlines()[-1]
def _find_python_include_path(repo_ctx):
    """Returns the CPython C-API header directory of the interpreter in use.

    NOTE(review): the probe uses distutils.sysconfig, which is deprecated in
    newer pythons; switching to the stdlib `sysconfig` module would change
    the command string, so it is only flagged here.
    """
    exec_result = repo_ctx.execute(
        [
            get_python_path(repo_ctx),
            "-c",
            "from distutils import sysconfig; import sys; " +
            "sys.stdout.write(sysconfig.get_python_inc())",
        ],
        quiet = True,
    )
    if exec_result.return_code != 0:
        fail("Could not locate python includes path:\n{}"
            .format(exec_result.stderr))
    return exec_result.stdout.splitlines()[-1]
def _find_python_solib_path(repo_ctx):
    """Returns the directory and basename of the libpythonX.Y.so in use.

    Probes the interpreter for its major.minor version, asks
    `pythonX.Y-config --configdir` for the directory holding the shared
    library, and fails if either probe fails or the file does not exist.
    """
    exec_result = repo_ctx.execute(
        [
            get_python_path(repo_ctx),
            "-c",
            "import sys; vi = sys.version_info; " +
            "sys.stdout.write('python{}.{}'.format(vi.major, vi.minor))",
        ],
        # Suppress subprocess output, consistent with every other probe in
        # this file (this call was the only one missing `quiet`).
        quiet = True,
    )
    if exec_result.return_code != 0:
        fail("Could not locate python shared library path:\n{}"
            .format(exec_result.stderr))
    version = exec_result.stdout.splitlines()[-1]
    basename = "lib{}.so".format(version)
    exec_result = repo_ctx.execute(
        ["{}-config".format(version), "--configdir"],
        quiet = True,
    )
    if exec_result.return_code != 0:
        fail("Could not locate python shared library path:\n{}"
            .format(exec_result.stderr))
    solib_dir = exec_result.stdout.splitlines()[-1]
    full_path = repo_ctx.path("{}/{}".format(solib_dir, basename))
    if not full_path.exists:
        fail("Unable to find python shared library file:\n{}/{}"
            .format(solib_dir, basename))
    return struct(dir = solib_dir, basename = basename)
def _eigen_archive_repo_impl(repo_ctx):
    """Exposes the Eigen headers bundled inside the tensorflow pip package.

    NOTE(review): this impl defines a target named ":includes", while the
    tensorflow_includes rule below depends on "@eigen_archive//:eigen", and
    cc_tf_configure() never registers this rule — @eigen_archive presumably
    comes from the WORKSPACE. Verify before wiring this rule in.
    """
    tf_include_path = _find_tf_include_path(repo_ctx)
    repo_ctx.symlink(tf_include_path, "tf_includes")
    repo_ctx.file(
        "BUILD",
        content = """
cc_library(
    name = "includes",
    hdrs = glob(["tf_includes/Eigen/**/*.h",
                 "tf_includes/Eigen/**",
                 "tf_includes/unsupported/Eigen/**/*.h",
                 "tf_includes/unsupported/Eigen/**"]),
    # https://groups.google.com/forum/#!topic/bazel-discuss/HyyuuqTxKok
    includes = ["tf_includes"],
    visibility = ["//visibility:public"],
)
""",
        executable = False,
    )
def _nsync_includes_repo_impl(repo_ctx):
    """Exposes the nsync public headers shipped under tensorflow's external/ dir."""
    tf_include_path = _find_tf_include_path(repo_ctx)
    repo_ctx.symlink(tf_include_path + "/external", "nsync_includes")
    repo_ctx.file(
        "BUILD",
        content = """
cc_library(
    name = "includes",
    hdrs = glob(["nsync_includes/nsync/public/*.h"]),
    includes = ["nsync_includes"],
    visibility = ["//visibility:public"],
)
""",
        executable = False,
    )
def _zlib_includes_repo_impl(repo_ctx):
    """Exposes the zlib headers shipped under tensorflow's external/ dir."""
    tf_include_path = _find_tf_include_path(repo_ctx)
    repo_ctx.symlink(
        tf_include_path + "/external/zlib",
        "zlib",
    )
    repo_ctx.file(
        "BUILD",
        content = """
cc_library(
    name = "includes",
    hdrs = glob(["zlib/**/*.h"]),
    includes = ["zlib"],
    visibility = ["//visibility:public"],
)
""",
        executable = False,
    )
def _snappy_includes_repo_impl(repo_ctx):
    """Exposes the snappy headers shipped under tensorflow's external/ dir."""
    tf_include_path = _find_tf_include_path(repo_ctx)
    repo_ctx.symlink(
        tf_include_path + "/external/snappy",
        "snappy",
    )
    repo_ctx.file(
        "BUILD",
        content = """
cc_library(
    name = "includes",
    hdrs = glob(["snappy/*.h"]),
    includes = ["snappy"],
    visibility = ["//visibility:public"],
)
""",
        executable = False,
    )
def _protobuf_includes_repo_impl(repo_ctx):
    """Exposes tensorflow's bundled protobuf headers via //third_party:protobuf.BUILD."""
    tf_include_path = _find_tf_include_path(repo_ctx)
    repo_ctx.symlink(tf_include_path, "tf_includes")
    # Build-file contents live in third_party/protobuf.BUILD, not inline here.
    repo_ctx.symlink(Label("//third_party:protobuf.BUILD"), "BUILD")
def _tensorflow_includes_repo_impl(repo_ctx):
    """Exposes all tensorflow pip-package headers plus the protos they use.

    NOTE(review): the dep "@eigen_archive//:eigen" does not match the
    ":includes" target defined by _eigen_archive_repo_impl in this file;
    @eigen_archive is presumably declared elsewhere (WORKSPACE) — verify.
    """
    tf_include_path = _find_tf_include_path(repo_ctx)
    repo_ctx.symlink(tf_include_path, "tensorflow_includes")
    repo_ctx.file(
        "BUILD",
        content = """
cc_library(
    name = "includes",
    hdrs = glob(
        [
            "tensorflow_includes/**/*.h",
            "tensorflow_includes/third_party/eigen3/**",
        ],
        exclude = ["tensorflow_includes/absl/**/*.h"],
    ),
    includes = ["tensorflow_includes"],
    deps = [
        "@eigen_archive//:eigen",
        "@protobuf_archive//:includes",
        "@zlib_includes//:includes",
        "@snappy_includes//:includes",
    ],
    visibility = ["//visibility:public"],
)
filegroup(
    name = "protos",
    srcs = glob(["tensorflow_includes/**/*.proto"]),
    visibility = ["//visibility:public"],
)
""",
        executable = False,
    )
def _tensorflow_solib_repo_impl(repo_ctx):
    """Exposes the installed libtensorflow_framework shared library.

    NOTE(review): the ".so.2" suffix pins this to tensorflow 2.x pip
    packages — confirm against the supported TF versions.
    """
    tf_lib_path = _find_tf_lib_path(repo_ctx)
    repo_ctx.symlink(tf_lib_path, "tensorflow_solib")
    repo_ctx.file(
        "BUILD",
        content = """
cc_library(
    name = "framework_lib",
    srcs = ["tensorflow_solib/libtensorflow_framework.so.2"],
    deps = ["@python_includes", "@python_includes//:numpy_includes"],
    visibility = ["//visibility:public"],
)
""",
    )
def _python_includes_repo_impl(repo_ctx):
    """Exposes CPython + numpy headers and the libpythonX.Y.so of the interpreter."""
    python_include_path = _find_python_include_path(repo_ctx)
    python_solib = _find_python_solib_path(repo_ctx)
    repo_ctx.symlink(python_include_path, "python_includes")
    numpy_include_path = _find_numpy_include_path(repo_ctx)
    repo_ctx.symlink(numpy_include_path, "numpy_includes")
    repo_ctx.symlink(
        "{}/{}".format(python_solib.dir, python_solib.basename),
        python_solib.basename,
    )

    # Note, "@python_includes" is a misnomer since we include the
    # libpythonX.Y.so in the srcs, so we can get access to python's various
    # symbols at link time.
    repo_ctx.file(
        "BUILD",
        content = """
cc_library(
    name = "python_includes",
    hdrs = glob(["python_includes/**/*.h"]),
    srcs = ["{}"],
    includes = ["python_includes"],
    visibility = ["//visibility:public"],
)
cc_library(
    name = "numpy_includes",
    hdrs = glob(["numpy_includes/**/*.h"]),
    includes = ["numpy_includes"],
    visibility = ["//visibility:public"],
)
""".format(python_solib.basename),
        executable = False,
    )
def cc_tf_configure():
    """Autoconf pre-installed tensorflow repo.

    Declares the repositories @nsync_includes, @zlib_includes,
    @snappy_includes, @protobuf_archive, @tensorflow_includes,
    @tensorflow_solib and @python_includes, all derived from the locally
    installed tensorflow pip package and python interpreter.
    """
    make_nsync_repo = repository_rule(
        implementation = _nsync_includes_repo_impl,
    )
    make_nsync_repo(name = "nsync_includes")
    make_zlib_repo = repository_rule(
        implementation = _zlib_includes_repo_impl,
    )
    make_zlib_repo(name = "zlib_includes")
    make_snappy_repo = repository_rule(
        implementation = _snappy_includes_repo_impl,
    )
    make_snappy_repo(name = "snappy_includes")
    make_protobuf_repo = repository_rule(
        implementation = _protobuf_includes_repo_impl,
    )
    make_protobuf_repo(name = "protobuf_archive")
    make_tfinc_repo = repository_rule(
        implementation = _tensorflow_includes_repo_impl,
    )
    make_tfinc_repo(name = "tensorflow_includes")
    make_tflib_repo = repository_rule(
        implementation = _tensorflow_solib_repo_impl,
    )
    make_tflib_repo(name = "tensorflow_solib")
    make_python_inc_repo = repository_rule(
        implementation = _python_includes_repo_impl,
    )
    make_python_inc_repo(name = "python_includes")
def _reverb_protoc_archive(ctx):
    """Downloads a pinned protoc release and exposes its binary as :protoc_bin.

    NOTE(review): setting $REVERB_PROTOC_VERSION clears sha256, which
    disables download checksum verification for the override — intentional
    escape hatch, but worth knowing.
    """
    version = ctx.attr.version
    sha256 = ctx.attr.sha256

    # Environment override: swap in a different protoc version, unverified.
    override_version = ctx.os.environ.get("REVERB_PROTOC_VERSION")
    if override_version:
        sha256 = ""
        version = override_version
    urls = [
        "https://github.com/protocolbuffers/protobuf/releases/download/v%s/protoc-%s-linux-x86_64.zip" % (version, version),
    ]
    ctx.download_and_extract(
        url = urls,
        sha256 = sha256,
    )
    ctx.file(
        "BUILD",
        content = """
filegroup(
    name = "protoc_bin",
    srcs = ["bin/protoc"],
    visibility = ["//visibility:public"],
)
""",
        executable = False,
    )
# Repository rule wrapping _reverb_protoc_archive; both attributes are
# mandatory (sha256 may still be bypassed via $REVERB_PROTOC_VERSION).
reverb_protoc_archive = repository_rule(
    implementation = _reverb_protoc_archive,
    attrs = {
        "version": attr.string(mandatory = True),
        "sha256": attr.string(mandatory = True),
    },
)
def reverb_protoc_deps(version, sha256):
    """Declares the @protobuf_protoc repository pinned to version/sha256."""
    reverb_protoc_archive(name = "protobuf_protoc", version = version, sha256 = sha256)
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tf_ops/projection_normalizer_util.h" // sequence_projection
#include "tf_ops/projection_tokenizer_util.h" // sequence_projection
#include "tf_ops/projection_util.h" // sequence_projection
#include "tf_ops/text_distorter.h" // sequence_projection
#include "absl/container/flat_hash_map.h"
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
using ::tensorflow::int32;
using ::tensorflow::int64;
using ::tensorflow::uint64;
using ::tensorflow::OpKernel;
using ::tensorflow::OpKernelConstruction;
using ::tensorflow::OpKernelContext;
using ::tensorflow::Tensor;
using ::tensorflow::TensorShape;
using ::tensorflow::TensorShapeUtils;
using ::tensorflow::errors::InvalidArgument;
using tensorflow::shape_inference::DimensionHandle;
using tensorflow::shape_inference::InferenceContext;
// Sentinel tokens emitted when add_bos_tag / add_eos_tag are enabled.
constexpr char kBeginTokenTSP[] = "<BOS>";
constexpr char kEndTokenTSP[] = "<EOS>";
// Allocates the named float output tensor of the given shape and returns a
// pointer to its first element. On allocation failure, records the failure on
// ctx (with file/line for attribution) and returns nullptr.
float* AllocateTensor(OpKernelContext* ctx, const std::string& tensor_name,
                      const TensorShape& tensor_shape) {
  Tensor* tensor = nullptr;
  auto status = ctx->allocate_output(tensor_name, tensor_shape, &tensor);
  if (!TF_PREDICT_TRUE(status.ok())) {
    ctx->CtxFailureWithWarning(__FILE__, __LINE__, status);
    return nullptr;
  }
  return &tensor->flat<float>()(0);
}
// CPU kernel for the SequenceStringProjection op: tokenizes each input
// string, hashes each token, and maps every 2 hash bits to a ternary feature
// in {-1, 0, 1}, producing a [batch, max_seq_len, feature_size] float tensor.
// See the REGISTER_OP .Doc below for attribute semantics.
class SequenceStringProjectionOp : public OpKernel {
 public:
  // Reads and validates all attributes, and builds the hasher, distorter,
  // unicode handler and (optional) normalizer used by Compute.
  explicit SequenceStringProjectionOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("feature_size", &feature_size_));
    hasher_ = absl::make_unique<Hasher>(feature_size_);
    float distortion_probability = 0.0;
    OP_REQUIRES_OK(context, context->GetAttr("distortion_probability",
                                             &distortion_probability));
    text_distorter_ = absl::make_unique<TextDistorter>(distortion_probability);
    OP_REQUIRES_OK(context,
                   context->GetAttr("split_on_space", &split_on_space_));
    OP_REQUIRES_OK(context, context->GetAttr("max_splits", &max_splits_));
    OP_REQUIRES_OK(context, context->GetAttr("vocabulary", &vocabulary_));
    // bos/eos flags are stored as 0/1 counts so they can be summed directly
    // into sequence lengths and loop bounds below.
    bool add_bos_tag;
    OP_REQUIRES_OK(context, context->GetAttr("add_bos_tag", &add_bos_tag));
    bos_tag_ = add_bos_tag ? 1 : 0;
    bool add_eos_tag;
    OP_REQUIRES_OK(context, context->GetAttr("add_eos_tag", &add_eos_tag));
    eos_tag_ = add_eos_tag ? 1 : 0;
    // When word_novelty_bits is set to a positive integer, the last feature
    // generated by the op captures the token frequency.
    OP_REQUIRES_OK(context,
                   context->GetAttr("word_novelty_bits", &word_novelty_bits_));
    CHECK_GE(word_novelty_bits_, 0);
    CHECK_LE(word_novelty_bits_, 7);
    if (word_novelty_bits_ != 0) {
      // The novelty feature occupies the last feature slot.
      CHECK_GE(feature_size_, 1);
    }
    // When doc_size_levels is set to a positive integer, the second to last
    // feature generated by the op is derived from the log of the document
    // size.
    OP_REQUIRES_OK(context,
                   context->GetAttr("doc_size_levels", &doc_size_levels_));
    CHECK_GE(doc_size_levels_, 0);
    CHECK_LE(doc_size_levels_, 16);
    if (doc_size_levels_ != 0) {
      // The doc-size feature occupies the second-to-last feature slot.
      CHECK_GE(feature_size_, 2);
    }
    // Per-repetition increment of the novelty feature: 1 / 2^bits.
    word_novelty_offset_ = 1.0f / (1 << word_novelty_bits_);
    bool exclude_nonalphaspace_unicodes;
    OP_REQUIRES_OK(context, context->GetAttr("exclude_nonalphaspace_unicodes",
                                             &exclude_nonalphaspace_unicodes));
    if (!vocabulary_.empty()) {
      // The two filters are mutually exclusive by contract.
      CHECK(!exclude_nonalphaspace_unicodes);
    }
    unicode_handler_ = absl::make_unique<ProjectionUnicodeHandler>(
        vocabulary_, exclude_nonalphaspace_unicodes);
    vocabulary_size_ = unicode_handler_->NumberOfValidUnicodes();
    bool normalize_repetition;
    OP_REQUIRES_OK(context, context->GetAttr("normalize_repetition",
                                             &normalize_repetition));
    std::string separators;
    OP_REQUIRES_OK(context, context->GetAttr("token_separators", &separators));
    // The normalizer is only constructed when it would do any work.
    if (!separators.empty() || normalize_repetition) {
      projection_normalizer_ = absl::make_unique<ProjectionNormalizer>(
          separators, normalize_repetition);
    }
  }
  // Pass 1 tokenizes every batch entry to find max_seq_len; pass 2 hashes
  // each token into ternary features and zero-pads the tail of each row.
  void Compute(OpKernelContext* ctx) override {
    const Tensor* input_tensor;
    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(input_tensor->shape()),
                InvalidArgument("input must be a vector, got shape: ",
                                input_tensor->shape().DebugString()));
    auto input_vec = input_tensor->vec<::tensorflow::tstring>();
    const int64 batch_size = input_vec.dimension(0);
    std::vector<std::vector<std::pair<const char*, size_t>>> words_batches;
    int64 max_seq_len = 0;
    words_batches.reserve(batch_size);
    // Normalized strings must outlive the (ptr, len) tokens that point into
    // them, hence the batch-sized backing vector.
    std::vector<std::string> normalized_input_vec(batch_size);
    for (int64 i = 0; i < batch_size; ++i) {
      std::vector<std::pair<const char*, size_t>> words;
      if (projection_normalizer_ == nullptr) {
        words =
            unicode_handler_->Tokenize(input_vec(i).data(), input_vec(i).size(),
                                       split_on_space_, max_splits_);
      } else {
        normalized_input_vec[i] = projection_normalizer_->Normalize(
            input_vec(i).data(), input_vec(i).size(), SIZE_MAX);
        words = unicode_handler_->Tokenize(normalized_input_vec[i],
                                           split_on_space_, max_splits_);
      }
      const int64 seq_len =
          static_cast<int64>(bos_tag_ + words.size() + eos_tag_);
      CHECK_GT(seq_len, 0);
      max_seq_len = std::max(max_seq_len, seq_len);
      words_batches.emplace_back(std::move(words));
    }
    auto projection =
        AllocateTensor(ctx, "projection",
                       TensorShape({batch_size, max_seq_len, feature_size_}));
    AllocateTensor(ctx, "dummy_output", TensorShape({1}));
    auto sequence_length =
        AllocateTensor(ctx, "sequence_length", TensorShape({batch_size}));
    if (!projection || !sequence_length) {
      // AllocateTensor already recorded the ctx failure.
      LOG(ERROR) << "Unable to create buffer!";
      return;
    }
    // Maps each 2-bit hash chunk to a ternary value: 00->0, 01->1, 10->-1,
    // 11->0.
    const float mapping_table[4] = {0, 1, -1, 0};
    // Features consumed per 64-bit hash code (2 bits per feature).
    const int increment = 32;
    std::vector<uint64_t> hash_codes;
    // Per-row token-frequency counter for the novelty feature.
    absl::flat_hash_map<uint64, int> word_counter;
    for (int64 i = 0; i < batch_size; ++i) {
      word_counter.clear();
      const int64 num_tokens = words_batches[i].size();
      // NOTE: sequence_length is a float tensor by op contract.
      sequence_length[i] = bos_tag_ + num_tokens + eos_tag_;
      // Flat offset of the current timestep within the projection buffer.
      int64 offset0 = i * max_seq_len * feature_size_;
      // Calculate doc_size_feature in [0, infinity)
      float doc_size_feature =
          (doc_size_levels_ != 0)
              ? std::log2(static_cast<float>(num_tokens)) / doc_size_levels_
              : 0.0f;
      // Rescale doc_size_feature to [-1, 1].
      doc_size_feature = std::min(doc_size_feature, 1.0f) * 2.0f - 1.0f;
      // j == -1 is the BOS slot (when enabled); j == num_tokens is EOS.
      for (int64 j = -bos_tag_; j < num_tokens + eos_tag_; ++j) {
        std::string word;
        if (j < 0) {
          // Use a special tag for begin of sentence.
          word = kBeginTokenTSP;
        } else if (j < num_tokens) {
          auto uword = icu::UnicodeString::fromUTF8(
              unicode_handler_->LowerCaseUTF8WithSupportedUnicodes(
                  words_batches[i][j]));
          word = text_distorter_->DistortText(&uword);
        } else {
          // Use a special tag for end of sentence.
          CHECK_EQ(eos_tag_, 1);
          word = kEndTokenTSP;
        }
        hasher_->GetHashCodes(word, &hash_codes);
        // Unpack 2 bits per feature from the hash codes, capped at
        // feature_size_ total features for this timestep.
        for (int hindex = 0, k = 0; hindex < hash_codes.size(); hindex++) {
          auto hash = hash_codes[hindex];
          for (int kmax = std::min(k + increment, feature_size_); k < kmax;) {
            projection[offset0 + k++] = mapping_table[hash & 0x3];
            hash >>= 2;
          }
        }
        if (word_novelty_bits_ != 0 && !hash_codes.empty()) {
          // Overwrite the last feature with a repetition count, rescaled to
          // [-1, 1] and saturating at 1.
          const auto word_hash = hash_codes[0];
          projection[offset0 + feature_size_ - 1] =
              std::min((word_counter[word_hash]++ * word_novelty_offset_),
                       1.0f) *
                  2.0f -
              1.0f;
        }
        if (doc_size_levels_ != 0) {
          // Overwrite the second-to-last feature with the doc-size signal.
          projection[offset0 + feature_size_ - 2] = doc_size_feature;
        }
        offset0 += feature_size_;
      }
      // Zero-fill the timesteps this (shorter) row does not use.
      const int pending = (max_seq_len - (bos_tag_ + num_tokens + eos_tag_));
      memset(projection + offset0, 0, pending * feature_size_ * sizeof(float));
    }
  }

 private:
  int32 feature_size_;        // Ternary features per token.
  std::unique_ptr<Hasher> hasher_;
  std::unique_ptr<TextDistorter> text_distorter_;
  std::unique_ptr<ProjectionUnicodeHandler> unicode_handler_;
  std::unique_ptr<ProjectionNormalizer> projection_normalizer_;  // May be null.
  std::string vocabulary_;
  int vocabulary_size_;
  int32 max_splits_;          // -1 == unlimited tokens.
  bool split_on_space_;       // Space vs unicode-character tokenization.
  int eos_tag_;               // 1 when an <EOS> slot is appended, else 0.
  int bos_tag_;               // 1 when a <BOS> slot is prepended, else 0.
  int word_novelty_bits_;
  int doc_size_levels_;
  float word_novelty_offset_;  // 1 / 2^word_novelty_bits_.
};
// Register the CPU kernel implementation above.
REGISTER_KERNEL_BUILDER(
    Name("SequenceStringProjection").Device(::tensorflow::DEVICE_CPU),
    SequenceStringProjectionOp);
// Op interface, attribute defaults, shape function, and documentation.
REGISTER_OP("SequenceStringProjection")
    .Input("input: string")
    .Output("projection: float32")
    .Output("dummy_output: float32")
    // NOTE: sequence lengths are emitted as float32, matching the kernel's
    // float AllocateTensor outputs.
    .Output("sequence_length: float32")
    .Attr("feature_size: int")
    .Attr("distortion_probability: float = 0.0")
    .Attr("vocabulary: string = ''")
    .Attr("max_splits: int = -1")
    .Attr("exclude_nonalphaspace_unicodes: bool = False")
    .Attr("add_bos_tag: bool = False")
    .Attr("add_eos_tag: bool = True")
    .Attr("word_novelty_bits: int = 0")
    .Attr("doc_size_levels: int = 0")
    .Attr("split_on_space: bool = True")
    .Attr("token_separators: string = ''")
    .Attr("normalize_repetition: bool = false")
    // Output shapes: [batch, unknown seq len, feature_size], [1], [batch].
    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
      DimensionHandle size;
      int32 feature_size;
      TF_RETURN_IF_ERROR(c->GetAttr("feature_size", &feature_size));
      const int kMaxFeatureSize = 4096;
      CHECK_GE(feature_size, 0);
      CHECK_LE(feature_size, kMaxFeatureSize);
      auto batch_size = c->Dim(c->input(0), 0);
      c->set_output(0, c->MakeShape({batch_size, InferenceContext::kUnknownDim,
                                     feature_size}));
      c->set_output(1, c->MakeShape({1}));
      c->set_output(2, c->MakeShape({batch_size}));
      return tensorflow::Status::OK();
    })
    .Doc(R"doc(
This op referred to as Ternary Sequence String Projection op (TSP), tokenizes
input text either on space or unicode boundary. Fingerprint for each token is
computed using murmur hash and bit features are extracted from the fingerprint
that maps every 2 bits to the ternary output {-1, 0, 1}. This effectively turns
a batch of text input into a ternary rank 3 tensor (in float format) of shape
[batch size, max token length, requested number of features].
Input(s):
- input: A string tensor with batch size number of elements.
Attribute(s):
- feature_size: Length of the ternary vector generated for each token.
- distortion_probability: When non zero distort the input text with this
    probability. Helps as a regularization method when training data set is
    small.
- vocabulary: When not empty provides a list of unique unicode characters that
    will be allowed in the input text before fingerprinting. Another way to
    say it is that the vocabulary is an optional character allowlist for the
    input text. It helps normalize the text.
- max_splits: Maximum number of tokens that are allowed. It helps restrict the
    max token length of the projection output. When the value is -1 the op
    does not restrict the number of tokens in the output.
- exclude_nonalphaspace_unicodes: When set to true excludes unicodes that are
    not alphabets or space character. This is multilingual. Though the effect
    of this flag can be achieved using vocabulary, the vocabulary will have to
    be very large for multilingual input.
- add_bos_tag: When true inserts a begin of sentence tag.
- add_eos_tag: When true inserts a end of sentence tag.
- word_novelty_bits: When true adds a special feature to the ternary output
    that captures the frequency of occurrence of a particular token. This is an
    experimental feature.
- doc_size_levels: When true adds a special feature to the ternary projection
    output the document size in log scale. This is an experimental feature.
- split_on_space: When true tokenization is done on space segmentation.
    Otherwise tokenization is done by segmenting on unicode boundary.
Output(s):
- projection: Floating point tensor with ternary values of shape
    [batch size, max token length, requested number of features].
- dummy_output: Ignore this output, will be eliminated in a subsequent version.
- sequence_length: Batch size length vector containing the number of tokens for
    each input text entry.
)doc");
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tf_ops/projection_normalizer_util.h" // sequence_projection
#include "tf_ops/projection_util.h" // sequence_projection
#include "tf_ops/text_distorter.h" // sequence_projection
using ::tensorflow::int32;
using ::tensorflow::int64;
using ::tensorflow::OpKernel;
using ::tensorflow::OpKernelConstruction;
using ::tensorflow::OpKernelContext;
using ::tensorflow::Tensor;
using ::tensorflow::TensorShapeUtils;
using ::tensorflow::uint64;
using ::tensorflow::errors::InvalidArgument;
using ::tensorflow::shape_inference::DimensionHandle;
using ::tensorflow::shape_inference::InferenceContext;
using ::tensorflow::shape_inference::ShapeHandle;
// Sentinel tokens emitted for the optional begin/end-of-sentence tags.
constexpr char kBeginTokenTSP[] = "<BOS>";
constexpr char kEndTokenTSP[] = "<EOS>";
// Maps each 2-bit slice of a hash code to a ternary feature value:
// 0b00 -> 0, 0b01 -> 1, 0b10 -> -1, 0b11 -> 0.
constexpr float kMappingTable[4] = {0, 1, -1, 0};
// Number of features extracted per 64-bit hash code (2 bits per feature).
constexpr int kIncrement = 32;
// CPU kernel for the SequenceStringProjectionV2 op (TSPV2).
//
// Consumes a presegmented [batch, max_seq_len] string matrix plus a
// per-row `sequence_length` vector, hashes every token, and expands each
// hash into `feature_size` ternary features in {-1, 0, 1}, producing a
// float tensor of shape [batch, bos_tag + max_seq_len + eos_tag,
// feature_size].
class SequenceStringProjectionOpV2 : public OpKernel {
 public:
  explicit SequenceStringProjectionOpV2(OpKernelConstruction* context)
      : OpKernel(context) {
    // Number of ternary features emitted per token.
    OP_REQUIRES_OK(context, context->GetAttr("feature_size", &feature_size_));
    hasher_ = absl::make_unique<Hasher>(feature_size_);
    // Training-time regularization: tokens are randomly distorted with this
    // probability (0.0 disables distortion).
    float distortion_probability = 0.0;
    OP_REQUIRES_OK(context, context->GetAttr("distortion_probability",
                                             &distortion_probability));
    text_distorter_ = absl::make_unique<TextDistorter>(distortion_probability);
    // Optional character allowlist used to normalize the input text.
    OP_REQUIRES_OK(context, context->GetAttr("vocabulary", &vocabulary_));
    unicode_handler_ = absl::make_unique<ProjectionUnicodeHandler>(vocabulary_);
    // The bos/eos flags are stored as 0/1 counts so they can be added
    // directly into the sequence-length arithmetic in Compute().
    bool add_bos_tag;
    OP_REQUIRES_OK(context, context->GetAttr("add_bos_tag", &add_bos_tag));
    bos_tag_ = add_bos_tag ? 1 : 0;
    bool add_eos_tag;
    OP_REQUIRES_OK(context, context->GetAttr("add_eos_tag", &add_eos_tag));
    eos_tag_ = add_eos_tag ? 1 : 0;
    bool normalize_repetition;
    OP_REQUIRES_OK(context, context->GetAttr("normalize_repetition",
                                             &normalize_repetition));
    if (normalize_repetition) {
      // Only constructed when requested; Compute() tests for nullptr.
      projection_normalizer_ = absl::make_unique<ProjectionNormalizer>(
          std::string(), normalize_repetition);
    }
  }

  void Compute(OpKernelContext* ctx) override {
    // Validate `input`: must be a [batch, max_seq_len] string matrix.
    const Tensor* input_tensor;
    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(input_tensor->shape()),
                InvalidArgument("`input` must be a matrix, got shape: ",
                                input_tensor->shape().DebugString()));
    auto input_matrix = input_tensor->matrix<::tensorflow::tstring>();
    const int64 batch_size = input_matrix.dimension(0);
    const int64 max_seq_len = input_matrix.dimension(1);
    // Validate `sequence_length`: a batch-size vector of valid token counts.
    const Tensor* seq_len;
    OP_REQUIRES_OK(ctx, ctx->input("sequence_length", &seq_len));
    OP_REQUIRES(
        ctx, TensorShapeUtils::IsVector(seq_len->shape()),
        InvalidArgument("`sequence_length` must be a vector, got shape: ",
                        seq_len->shape().DebugString()));
    auto seq_len_vector = seq_len->vec<int32>();
    OP_REQUIRES(
        ctx, seq_len_vector.size() == batch_size,
        InvalidArgument("`sequence_length` should have batch size number "
                        "of elements, got size ",
                        seq_len_vector.size(), ", batch size is ", batch_size));
    // The output reserves bos_tag_ + eos_tag_ extra positions per row;
    // positions past each row's own token count are zero-filled below.
    Tensor* output_tensor = nullptr;
    OP_REQUIRES_OK(
        ctx, ctx->allocate_output(
                 "projection",
                 {batch_size, bos_tag_ + max_seq_len + eos_tag_, feature_size_},
                 &output_tensor));
    float* projection = &output_tensor->flat<float>()(0);
    std::vector<uint64_t> hash_codes;
    for (int64 i = 0; i < batch_size; ++i) {
      const int64 num_tokens = seq_len_vector(i);
      OP_REQUIRES(ctx, num_tokens > 0,
                  InvalidArgument(
                      "`sequence_length` should have values greater than 0"));
      OP_REQUIRES(ctx, num_tokens <= max_seq_len,
                  InvalidArgument("`sequence_length` should have values less "
                                  "than or equal to max_seq_len"));
      // Flat offset of the first feature value of row i.
      int64 offset0 = i * (bos_tag_ + max_seq_len + eos_tag_) * feature_size_;
      // j spans [-bos_tag_, num_tokens + eos_tag_): j < 0 emits <BOS>,
      // j >= num_tokens emits <EOS>, everything in between is a real token.
      for (int64 j = -bos_tag_; j < num_tokens + eos_tag_; ++j) {
        std::string word;
        if (j < 0) {
          word = kBeginTokenTSP;
        } else if (j < num_tokens) {
          // Lowercase / vocabulary-filter the raw token, then (optionally)
          // distort and repetition-normalize it before hashing.
          auto token = std::pair<const char*, int32>(input_matrix(i, j).data(),
                                                     input_matrix(i, j).size());
          auto uword = icu::UnicodeString::fromUTF8(
              unicode_handler_->LowerCaseUTF8WithSupportedUnicodes(token));
          word = text_distorter_->DistortText(&uword);
          if (projection_normalizer_) {
            word = projection_normalizer_->Normalize(word.data(), word.size(),
                                                     SIZE_MAX);
          }
        } else {
          word = kEndTokenTSP;
        }
        // Expand the token's hash codes into ternary features: each 64-bit
        // code yields up to kIncrement features, 2 bits apiece, mapped
        // through kMappingTable.  NOTE(review): assumes Hasher produces
        // enough codes to cover feature_size_ values -- TODO confirm; any
        // shortfall would leave trailing features unwritten.
        hasher_->GetHashCodes(word, &hash_codes);
        for (int hindex = 0, k = 0; hindex < hash_codes.size(); hindex++) {
          auto hash = hash_codes[hindex];
          for (int kmax = std::min(k + kIncrement, feature_size_); k < kmax;) {
            projection[offset0 + k++] = kMappingTable[hash & 0x3];
            hash >>= 2;
          }
        }
        offset0 += feature_size_;
      }
      // Zero-fill the feature slots of the padding positions that follow the
      // last emitted token of this row.
      const int fill_length = (max_seq_len - num_tokens) * feature_size_;
      float* fill_start = projection + offset0;
      std::fill(fill_start, fill_start + fill_length, 0.0f);
    }
  }

 private:
  int32 feature_size_;  // Features emitted per token.
  std::unique_ptr<Hasher> hasher_;
  std::unique_ptr<TextDistorter> text_distorter_;
  std::unique_ptr<ProjectionUnicodeHandler> unicode_handler_;
  // Null unless the normalize_repetition attribute is true.
  std::unique_ptr<ProjectionNormalizer> projection_normalizer_;
  std::string vocabulary_;
  int eos_tag_;  // 1 if an <EOS> token is appended to each row, else 0.
  int bos_tag_;  // 1 if a <BOS> token is prepended to each row, else 0.
};
// Binds the SequenceStringProjectionV2 op to its CPU kernel implementation.
REGISTER_KERNEL_BUILDER(
    Name("SequenceStringProjectionV2").Device(::tensorflow::DEVICE_CPU),
    SequenceStringProjectionOpV2);
// Registers the SequenceStringProjectionV2 op interface: inputs, attribute
// defaults, shape inference and documentation.
REGISTER_OP("SequenceStringProjectionV2")
    .Input("input: string")
    .Input("sequence_length: int32")
    .Output("projection: float32")
    .Attr("feature_size: int")
    .Attr("distortion_probability: float = 0.0")
    .Attr("vocabulary: string = ''")
    .Attr("add_bos_tag: bool = False")
    .Attr("add_eos_tag: bool = False")
    .Attr("normalize_repetition: bool = False")
    .SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
      int32 feature_size;
      TF_RETURN_IF_ERROR(c->GetAttr("feature_size", &feature_size));
      // Report an invalid attribute as a graph-construction error instead of
      // CHECK-crashing the whole process.
      const int kMaxFeatureSize = 4096;
      if (feature_size <= 0 || feature_size > kMaxFeatureSize) {
        return InvalidArgument("feature_size must be in (0, ", kMaxFeatureSize,
                               "], got ", feature_size);
      }
      // Inferred shape: input shape with feature_size appended, i.e.
      // [batch size, max sequence length, feature_size].
      // NOTE(review): the kernel allocates a second dimension of
      // bos_tag + max_seq_len + eos_tag, so this inference understates the
      // length when add_bos_tag/add_eos_tag are set -- confirm whether that
      // is intentional.
      ShapeHandle output_shape;
      TF_RETURN_IF_ERROR(c->Concatenate(
          c->input(0), c->MakeShape({feature_size}), &output_shape));
      c->set_output(0, output_shape);
      return tensorflow::Status::OK();
    })
    .Doc(R"doc(
This op referred to as Ternary Sequence String Projection Op V2 (TSPV2),
works with presegmented string `input`. It fingerprints each token using murmur
hash and extracts bit features from the fingerprint that maps every 2 bits to
the ternary output {-1, 0, 1}. This effectively turns a batch of text segments
into a ternary rank 3 tensor (in float format) of shape
[batch size, max sequence length, requested number of features].
Input(s):
- input: A string tensor with [batch size, max sequence length] tokens.
- sequence_length: A vector with batch size number of integers, where each
    integer is in (0, max sequence length], and represents the number of valid
    text segments in each batch entry.
Attribute(s):
- feature_size: Length of the ternary vector generated for each token.
- distortion_probability: When non zero distort the input tokens with this
    probability. Helps as a regularization method when training data set is
    small.
- vocabulary: When not empty provides a list of unique unicode characters that
    will be allowed in the input text before fingerprinting. Expressed another
    way the vocabulary is an optional character allowlist for the
    input tokens. It helps normalize the text.
- add_bos_tag: When true inserts a begin of sentence tag.
- add_eos_tag: When true inserts a end of sentence tag.
- normalize_repetition: When true normalizes repetition in text tokens before
    fingerprinting.
Output(s):
- projection: Floating point tensor with ternary values of shape
    [batch size, max sequence length, requested number of features].
)doc");
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/framework/node_def_builder.h"
#include "tensorflow/core/framework/shape_inference_testutil.h"
#include "tensorflow/core/kernels/ops_testutil.h"
#include "tensorflow/core/kernels/ops_util.h"
namespace {
using ::tensorflow::DT_INT32;
using ::tensorflow::DT_STRING;
using ::tensorflow::int32;
using ::tensorflow::NodeDefBuilder;
using ::tensorflow::OpsTestBase;
using ::tensorflow::Tensor;
using ::tensorflow::TensorShape;
// Test fixture for SequenceStringProjectionV2 with helpers that compare
// whole feature vectors inside the rank-3 projection output.
class SequenceStringProjectionOpV2Test : public OpsTestBase {
 protected:
  // Returns true iff the feature vectors at positions (i1, j1) and (i2, j2)
  // of `output` are element-wise equal.
  bool FeatureMatches(const Tensor& output, int i1, int j1, int i2, int j2) {
    auto values = output.tensor<float, 3>();
    const int feature_size = output.dim_size(2);
    for (int k = 0; k < feature_size; ++k) {
      if (values(i1, j1, k) != values(i2, j2, k)) return false;
    }
    return true;
  }
  // Returns true iff every feature value at position (i, j) of `output` is
  // exactly zero (i.e. the position is padding).
  bool FeatureIsZero(const Tensor& output, int i, int j) {
    auto values = output.tensor<float, 3>();
    const int feature_size = output.dim_size(2);
    for (int k = 0; k < feature_size; ++k) {
      if (values(i, j, k) != 0.0f) return false;
    }
    return true;
  }
};
// Exercises the op's input-validation errors, then verifies the projection
// features for a plain (no bos/eos tags) configuration.
TEST_F(SequenceStringProjectionOpV2Test, TestOutput) {
  TF_ASSERT_OK(NodeDefBuilder("test_op", "SequenceStringProjectionV2")
                   .Input({"input", 1, DT_STRING})
                   .Input({"sequence_length", 1, DT_INT32})
                   .Attr("vocabulary", "abcdefghijklmnopqrstuvwxyz")
                   .Attr("feature_size", 16)
                   .Finalize(node_def()));
  TF_ASSERT_OK(InitOp());
  // Deliberately rank-3 input to trigger the "must be a matrix" error.
  AddInputFromArray<::tensorflow::tstring>(
      TensorShape({2, 8, 1}),
      {"hello", "world", "147", "dog", "xyz", "abc", "efg", "hij", "quick",
       "hel1lo", "123", "jumped", "over", "the", "lazy", "dog"});
  AddInputFromArray<int32>(TensorShape({3, 1}), {9, 0, 9});
  EXPECT_EQ(RunOpKernel().error_message(),
            "`input` must be a matrix, got shape: [2,8,1]");
  // Reshape the input to a valid [2, 8] matrix, keeping the same values;
  // sequence_length is still a matrix, so the next error fires.
  auto old = *mutable_input(0).tensor;
  *mutable_input(0).tensor = Tensor(DT_STRING, TensorShape({2, 8}));
  (*mutable_input(0).tensor).flat<::tensorflow::tstring>() =
      old.flat<::tensorflow::tstring>();
  EXPECT_EQ(RunOpKernel().error_message(),
            "`sequence_length` must be a vector, got shape: [3,1]");
  // A vector of the wrong length: 3 entries for batch size 2.
  *mutable_input(1).tensor = Tensor(DT_INT32, TensorShape({3}));
  EXPECT_EQ(RunOpKernel().error_message(),
            "`sequence_length` should have batch size number of elements, got "
            "size 3, batch size is 2");
  *mutable_input(1).tensor = Tensor(DT_INT32, TensorShape({2}));
  (*mutable_input(1).tensor).flat<int32>()(0) = 9;  // 9 > max_seq_len of 8.
  (*mutable_input(1).tensor).flat<int32>()(1) = 0;  // 0 is not a valid count.
  EXPECT_EQ(
      RunOpKernel().error_message(),
      "`sequence_length` should have values less than or equal to max_seq_len");
  (*mutable_input(1).tensor).flat<int32>()(0) = 4;
  EXPECT_EQ(RunOpKernel().error_message(),
            "`sequence_length` should have values greater than 0");
  (*mutable_input(1).tensor).flat<int32>()(1) = 8;
  TF_EXPECT_OK(RunOpKernel());
  const Tensor& output = *GetOutput(0);
  // First checks dimensions.
  ASSERT_EQ(output.dims(), 3);
  EXPECT_EQ(output.dim_size(0), 2);   // Batch size
  EXPECT_EQ(output.dim_size(1), 8);   // Max sequence length
  EXPECT_EQ(output.dim_size(2), 16);  // Feature size
  // Tokens that are identical after vocabulary filtering (digits are OOV,
  // so they are stripped) project identically; distinct tokens do not.
  EXPECT_FALSE(FeatureMatches(output, 0, 0, 1, 0));  // hello != quick.
  EXPECT_FALSE(FeatureMatches(output, 0, 1, 1, 1));  // world != hello.
  EXPECT_TRUE(FeatureMatches(output, 0, 0, 1, 1));   // hello == hel1lo.
  EXPECT_TRUE(FeatureMatches(output, 0, 2, 1, 2));   // 147 == 123 (oov values).
  EXPECT_TRUE(FeatureMatches(output, 0, 3, 1, 7));   // dog == dog.
  // Check zero padding for first sentence.
  for (int i = 4; i < 8; ++i) {
    EXPECT_TRUE(FeatureIsZero(output, 0, i));
  }
}
// With add_bos_tag the output grows to max_seq_len + 1 positions and every
// row starts with the shared <BOS> feature vector.
TEST_F(SequenceStringProjectionOpV2Test, TestOutputBoS) {
  TF_ASSERT_OK(NodeDefBuilder("test_op", "SequenceStringProjectionV2")
                   .Input({"input", 1, DT_STRING})
                   .Input({"sequence_length", 1, DT_INT32})
                   .Attr("add_bos_tag", true)
                   .Attr("vocabulary", "abcdefghijklmnopqrstuvwxyz")
                   .Attr("feature_size", 16)
                   .Finalize(node_def()));
  TF_ASSERT_OK(InitOp());
  AddInputFromArray<::tensorflow::tstring>(
      TensorShape({2, 8}),
      {"hello", "world", "147", "dog", "", "", "", "", "quick", "hel1lo", "123",
       "jumped", "over", "the", "lazy", "dog"});
  AddInputFromArray<int32>(TensorShape({2}), {4, 8});
  TF_ASSERT_OK(RunOpKernel());
  const Tensor& output = *GetOutput(0);
  // First checks dimensions.
  ASSERT_EQ(output.dims(), 3);
  EXPECT_EQ(output.dim_size(0), 2);   // Batch size
  EXPECT_EQ(output.dim_size(1), 9);   // Max sequence length (8 + bos tag)
  EXPECT_EQ(output.dim_size(2), 16);  // Feature size
  // Token positions are shifted right by one relative to the no-tag test.
  EXPECT_TRUE(FeatureMatches(output, 0, 0, 1, 0));   // <bos> == <bos>.
  EXPECT_FALSE(FeatureMatches(output, 0, 1, 1, 1));  // hello != quick.
  EXPECT_FALSE(FeatureMatches(output, 0, 2, 1, 2));  // world != hello.
  EXPECT_TRUE(FeatureMatches(output, 0, 1, 1, 2));   // hello == hel1lo.
  EXPECT_TRUE(FeatureMatches(output, 0, 3, 1, 3));   // 147 == 123 (oov values).
  EXPECT_TRUE(FeatureMatches(output, 0, 4, 1, 8));   // dog == dog.
  // Check zero padding for first sentence.
  for (int i = 5; i < 9; ++i) {
    EXPECT_TRUE(FeatureIsZero(output, 0, i));
  }
}
// With add_eos_tag the output grows to max_seq_len + 1 positions and an
// <EOS> feature vector is emitted right after each row's last valid token.
TEST_F(SequenceStringProjectionOpV2Test, TestOutputEoS) {
  TF_ASSERT_OK(NodeDefBuilder("test_op", "SequenceStringProjectionV2")
                   .Input({"input", 1, DT_STRING})
                   .Input({"sequence_length", 1, DT_INT32})
                   .Attr("add_eos_tag", true)
                   .Attr("vocabulary", "abcdefghijklmnopqrstuvwxyz")
                   .Attr("feature_size", 16)
                   .Finalize(node_def()));
  TF_ASSERT_OK(InitOp());
  AddInputFromArray<::tensorflow::tstring>(
      TensorShape({2, 8}),
      {"hello", "world", "147", "dog", "", "", "", "", "quick", "hel1lo", "123",
       "jumped", "over", "the", "lazy", "dog"});
  AddInputFromArray<int32>(TensorShape({2}), {4, 8});
  TF_ASSERT_OK(RunOpKernel());
  const Tensor& output = *GetOutput(0);
  // First checks dimensions.
  ASSERT_EQ(output.dims(), 3);
  EXPECT_EQ(output.dim_size(0), 2);   // Batch size
  EXPECT_EQ(output.dim_size(1), 9);   // Max sequence length (8 + eos tag)
  EXPECT_EQ(output.dim_size(2), 16);  // Feature size
  EXPECT_FALSE(FeatureMatches(output, 0, 0, 1, 0));  // hello != quick.
  EXPECT_FALSE(FeatureMatches(output, 0, 1, 1, 1));  // world != hello.
  EXPECT_TRUE(FeatureMatches(output, 0, 0, 1, 1));   // hello == hel1lo.
  EXPECT_TRUE(FeatureMatches(output, 0, 2, 1, 2));   // 147 == 123 (oov values).
  EXPECT_TRUE(FeatureMatches(output, 0, 3, 1, 7));   // dog == dog.
  // Row 0 has 4 tokens, so its <eos> sits at position 4; row 1's at 8.
  EXPECT_TRUE(FeatureMatches(output, 0, 4, 1, 8));   // <eos> == <eos>.
  // Check zero padding for first sentence.
  for (int i = 5; i < 9; ++i) {
    EXPECT_TRUE(FeatureIsZero(output, 0, i));
  }
}
// With both tags the output grows to max_seq_len + 2 positions: <BOS> at
// position 0 and <EOS> right after each row's last valid token.
TEST_F(SequenceStringProjectionOpV2Test, TestOutputBoSEoS) {
  TF_ASSERT_OK(NodeDefBuilder("test_op", "SequenceStringProjectionV2")
                   .Input({"input", 1, DT_STRING})
                   .Input({"sequence_length", 1, DT_INT32})
                   .Attr("add_bos_tag", true)
                   .Attr("add_eos_tag", true)
                   .Attr("vocabulary", "abcdefghijklmnopqrstuvwxyz.")
                   .Attr("feature_size", 16)
                   .Finalize(node_def()));
  TF_ASSERT_OK(InitOp());
  AddInputFromArray<::tensorflow::tstring>(
      TensorShape({2, 8}),
      {"hello", "world", "147", "dog", "...", "..", "", "", "quick", "hel1lo",
       "123", "jumped", "over", "the", "lazy", "dog"});
  AddInputFromArray<int32>(TensorShape({2}), {6, 8});
  TF_ASSERT_OK(RunOpKernel());
  const Tensor& output = *GetOutput(0);
  // First checks dimensions.
  ASSERT_EQ(output.dims(), 3);
  EXPECT_EQ(output.dim_size(0), 2);   // Batch size
  EXPECT_EQ(output.dim_size(1), 10);  // Max sequence length (8 + both tags)
  EXPECT_EQ(output.dim_size(2), 16);  // Feature size
  EXPECT_TRUE(FeatureMatches(output, 0, 0, 1, 0));   // <bos> == <bos>.
  EXPECT_FALSE(FeatureMatches(output, 0, 1, 1, 1));  // hello != quick.
  EXPECT_FALSE(FeatureMatches(output, 0, 2, 1, 2));  // world != hello.
  EXPECT_TRUE(FeatureMatches(output, 0, 1, 1, 2));   // hello == hel1lo.
  EXPECT_TRUE(FeatureMatches(output, 0, 3, 1, 3));   // 147 == 123 (oov values).
  EXPECT_TRUE(FeatureMatches(output, 0, 4, 1, 8));   // dog == dog.
  EXPECT_TRUE(FeatureMatches(output, 0, 7, 1, 9));   // <eos> == <eos>.
  // Check for default normalize_repetition=false: repeated-punctuation
  // tokens of different lengths must NOT collapse to the same projection.
  EXPECT_FALSE(FeatureMatches(output, 0, 4, 0, 5));  // ... != ..
  // Check zero padding for first sentence.
  for (int i = 8; i < 10; ++i) {
    EXPECT_TRUE(FeatureIsZero(output, 0, i));
  }
}
// With normalize_repetition=true, tokens that are repetitions of the same
// character(s) (e.g. "..", "....") are normalized to a canonical form before
// hashing, so they all project to the same feature vector.
TEST_F(SequenceStringProjectionOpV2Test, TestOutputNormalize) {
  TF_ASSERT_OK(NodeDefBuilder("test_op", "SequenceStringProjectionV2")
                   .Input({"input", 1, DT_STRING})
                   .Input({"sequence_length", 1, DT_INT32})
                   .Attr("normalize_repetition", true)
                   .Attr("vocabulary", "abcdefghijklmnopqrstuvwxyz.")
                   .Attr("feature_size", 16)
                   .Finalize(node_def()));
  TF_ASSERT_OK(InitOp());
  AddInputFromArray<::tensorflow::tstring>(
      TensorShape({2, 8}),
      {"hello", "world", "..", "....", "", "", "", "", "quick", "hel1lo", "123",
       "jumped", "over", "...", ".....", "dog"});
  AddInputFromArray<int32>(TensorShape({2}), {4, 8});
  TF_ASSERT_OK(RunOpKernel());
  const Tensor& output = *GetOutput(0);
  // First checks dimensions.
  ASSERT_EQ(output.dims(), 3);
  EXPECT_EQ(output.dim_size(0), 2);   // Batch size
  EXPECT_EQ(output.dim_size(1), 8);   // Max sequence length
  EXPECT_EQ(output.dim_size(2), 16);  // Feature size
  // All dot-run tokens collapse to the same normalized form.
  EXPECT_TRUE(FeatureMatches(output, 0, 2, 0, 3));  // .. == ....
  EXPECT_TRUE(FeatureMatches(output, 1, 5, 1, 6));  // ... == ..
  EXPECT_TRUE(FeatureMatches(output, 0, 3, 1, 6));  // .... == ...
  // Check zero padding for first sentence.
  for (int i = 4; i < 8; ++i) {
    EXPECT_TRUE(FeatureIsZero(output, 0, i));
  }
}
} // namespace
// Test driver: discovers and runs all gtest cases defined above.
int main(int argc, char** argv) {
  // On Linux, add: absl::SetFlag(&FLAGS_logtostderr, true);
  ::testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tf_ops/text_distorter.h" // sequence_projection
using tensorflow::uint32;
// Distorts the words in the text by inserting, deleting and swapping
// unicodes randomly with probability one third of distortion_probability.
// Randomly perturbs `uword` in place -- deleting, swapping or inserting a
// single code point, each branch taken with one third of
// distortion_probability_ -- and returns the (possibly modified) word
// re-encoded as UTF-8.  The RNG draws happen in a fixed order (gate, action,
// position, optional swap position) so results are deterministic per seed.
std::string TextDistorter::DistortText(icu::UnicodeString* uword) {
  const bool distort = distortion_probability_ > 0.0 &&
                       generator_.RandFloat() < distortion_probability_ &&
                       uword->length();
  if (distort) {
    // Choose which of the three distortions to apply, then a target position.
    const float action = generator_.RandFloat();
    const uint32 pos = generator_.Rand32() % uword->length();
    if (action < 0.33f) {
      // Delete: remember the removed code point so a later insert can reuse it.
      random_char_ = (*uword)[pos];
      uword->remove(pos, 1);
    } else if (action < 0.66f) {
      // Swap: move one code point to a random new position, but only when the
      // word has more than 2 characters.
      if (uword->length() > 2) {
        random_char_ = (*uword)[pos];
        uword->remove(pos, 1);
        uword->insert(generator_.Rand32() % uword->length(), random_char_);
      }
    } else if (random_char_) {
      // Insert: reuse the most recently removed code point, if any.
      uword->insert(pos, random_char_);
    }
  }
  // Re-encode the unicode sequence as a UTF-8 std::string.
  std::string result;
  icu::StringByteSink<std::string> sink(&result);
  uword->toUTF8(sink);
  return result;
}
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_TEXT_DISTORTER_H_
#define TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_TEXT_DISTORTER_H_
#include <assert.h>
#include "icu4c/source/common/unicode/unistr.h"
#include "tensorflow/core/lib/random/simple_philox.h"
// A class that can be used to distort text randomly: each word passed to
// DistortText() may have one code point deleted, swapped or inserted.
class TextDistorter {
 public:
  // `distortion_probability` is the chance a given word is distorted at all;
  // must lie in [0.0, 1.0].
  // TODO: add a random seed for the PhiloxRandom constructor -- 171 is a
  // fixed seed, so the distortion sequence is deterministic across runs.
  // NOTE: generator_ holds a pointer to philox_, so philox_ must be declared
  // (and therefore initialized) first.
  explicit TextDistorter(float distortion_probability)
      : philox_(171),
        generator_(&philox_),
        distortion_probability_(distortion_probability) {
    assert(distortion_probability_ >= 0.0);
    assert(distortion_probability_ <= 1.0);
  }
  // Distorts `uword` in place with probability distortion_probability_ and
  // returns it re-encoded as UTF-8.
  std::string DistortText(icu::UnicodeString* uword);

 private:
  tensorflow::random::PhiloxRandom philox_;
  tensorflow::random::SimplePhilox generator_;
  float distortion_probability_;
  // Most recently removed code point; reused by the insert distortion.
  // 0 means no character has been removed yet.
  UChar32 random_char_ = 0;
};
#endif // TENSORFLOW_MODELS_SEQUENCE_PROJECTION_TF_OPS_TEXT_DISTORTER_H_
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment