Unverified Commit fec0338f authored by pyoung2778's avatar pyoung2778 Committed by GitHub
Browse files

Checkin seq_flow_lite (#10219)

parent c6d7d57d
...@@ -16,10 +16,10 @@ http_archive( ...@@ -16,10 +16,10 @@ http_archive(
http_archive( http_archive(
name = "org_tensorflow", name = "org_tensorflow",
sha256 = "fc6d7c57cd9427e695a38ad00fb6ecc3f623bac792dd44ad73a3f85b338b68be", sha256 = "40d3203ab5f246d83bae328288a24209a2b85794f1b3e2cd0329458d8e7c1985",
strip_prefix = "tensorflow-8a4ffe2e1ae722cff5306778df0cfca8b7f503fe", strip_prefix = "tensorflow-2.6.0",
urls = [ urls = [
"https://github.com/tensorflow/tensorflow/archive/8a4ffe2e1ae722cff5306778df0cfca8b7f503fe.tar.gz", "https://github.com/tensorflow/tensorflow/archive/v2.6.0.zip",
], ],
) )
...@@ -49,41 +49,6 @@ PROTOC_VERSION = "3.9.0" ...@@ -49,41 +49,6 @@ PROTOC_VERSION = "3.9.0"
PROTOC_SHA256 = "15e395b648a1a6dda8fd66868824a396e9d3e89bc2c8648e3b9ab9801bea5d55" PROTOC_SHA256 = "15e395b648a1a6dda8fd66868824a396e9d3e89bc2c8648e3b9ab9801bea5d55"
reverb_protoc_deps(version = PROTOC_VERSION, sha256 = PROTOC_SHA256) reverb_protoc_deps(version = PROTOC_VERSION, sha256 = PROTOC_SHA256)
# ABSL cpp library.
http_archive(
name = "com_google_absl",
sha256 = "f368a8476f4e2e0eccf8a7318b98dafbe30b2600f4e3cf52636e5eb145aba06a", # SHARED_ABSL_SHA
strip_prefix = "abseil-cpp-df3ea785d8c30a9503321a3d35ee7d35808f190d",
urls = [
"https://storage.googleapis.com/mirror.tensorflow.org/github.com/abseil/abseil-cpp/archive/df3ea785d8c30a9503321a3d35ee7d35808f190d.tar.gz",
"https://github.com/abseil/abseil-cpp/archive/df3ea785d8c30a9503321a3d35ee7d35808f190d.tar.gz",
],
)
http_archive(
name = "rules_cc",
strip_prefix = "rules_cc-master",
urls = ["https://github.com/bazelbuild/rules_cc/archive/master.zip"],
)
# GoogleTest/GoogleMock framework. Used by most unit-tests.
http_archive(
name = "com_google_googletest",
urls = ["https://github.com/google/googletest/archive/master.zip"],
strip_prefix = "googletest-master",
)
# gflags needed by glog
http_archive(
name = "com_github_gflags_gflags",
sha256 = "6e16c8bc91b1310a44f3965e616383dbda48f83e8c1eaa2370a215057b00cabe",
strip_prefix = "gflags-77592648e3f3be87d6c7123eb81cbad75f9aef5a",
urls = [
"https://mirror.bazel.build/github.com/gflags/gflags/archive/77592648e3f3be87d6c7123eb81cbad75f9aef5a.tar.gz",
"https://github.com/gflags/gflags/archive/77592648e3f3be87d6c7123eb81cbad75f9aef5a.tar.gz",
],
)
# glog # glog
http_archive( http_archive(
name = "com_google_glog", name = "com_google_glog",
...@@ -92,16 +57,6 @@ http_archive( ...@@ -92,16 +57,6 @@ http_archive(
urls = ["https://github.com/google/glog/archive/v0.4.0.tar.gz"], urls = ["https://github.com/google/glog/archive/v0.4.0.tar.gz"],
) )
http_archive(
name = "absl_py",
sha256 = "603febc9b95a8f2979a7bdb77d2f5e4d9b30d4e0d59579f88eba67d4e4cc5462",
strip_prefix = "abseil-py-pypi-v0.9.0",
urls = [
"https://storage.googleapis.com/mirror.tensorflow.org/github.com/abseil/abseil-py/archive/pypi-v0.9.0.tar.gz",
"https://github.com/abseil/abseil-py/archive/pypi-v0.9.0.tar.gz",
],
)
http_archive( http_archive(
name = "utf_archive", name = "utf_archive",
build_file = "@//third_party:utf.BUILD", build_file = "@//third_party:utf.BUILD",
...@@ -113,25 +68,17 @@ http_archive( ...@@ -113,25 +68,17 @@ http_archive(
) )
#----------------------------------------------------------------------------- load("@org_tensorflow//tensorflow:workspace3.bzl", "tf_workspace3")
# proto tf_workspace3()
#-----------------------------------------------------------------------------
# proto_library, cc_proto_library and java_proto_library rules implicitly depend load("@org_tensorflow//tensorflow:workspace2.bzl", "tf_workspace2")
# on @com_google_protobuf//:proto, @com_google_protobuf//:cc_toolchain and tf_workspace2()
# @com_google_protobuf//:java_toolchain, respectively.
# This statement defines the @com_google_protobuf repo.
http_archive(
name = "com_google_protobuf",
strip_prefix = "protobuf-3.8.0",
urls = ["https://github.com/google/protobuf/archive/v3.8.0.zip"],
sha256 = "1e622ce4b84b88b6d2cdf1db38d1a634fe2392d74f0b7b74ff98f3a51838ee53",
)
load("//third_party/flatbuffers:workspace.bzl", flatbuffers = "repo") load("@org_tensorflow//tensorflow:workspace1.bzl", "tf_workspace1")
flatbuffers() tf_workspace1()
load("@org_tensorflow//tensorflow:workspace.bzl", "tf_workspace") load("@org_tensorflow//tensorflow:workspace0.bzl", "tf_workspace0")
tf_workspace(tf_repo_name = "org_tensorflow") tf_workspace0()
# TF submodule compilation doesn't take care of grpc deps. Do it manually here. # TF submodule compilation doesn't take care of grpc deps. Do it manually here.
...@@ -168,7 +115,7 @@ new_git_repository( ...@@ -168,7 +115,7 @@ new_git_repository(
remote = "https://github.com/unicode-org/icu", remote = "https://github.com/unicode-org/icu",
build_file = "@//third_party:icu.BUILD", build_file = "@//third_party:icu.BUILD",
patch_cmds = [ patch_cmds = [
"find . -type f -exec sed -i 's/#\s*include \"unicode/#include \"icu4c\/source\/common\/unicode/g' {} \;", "find . -type f -exec sed -i 's/#\\s*include \"unicode/#include \"icu4c\\/source\\/common\\/unicode/g' {} \\;",
], ],
) )
......
...@@ -5,5 +5,6 @@ sh_binary( ...@@ -5,5 +5,6 @@ sh_binary(
"//tf_ops:sequence_string_projection_op_py", "//tf_ops:sequence_string_projection_op_py",
"//tf_ops:sequence_string_projection_op_v2_py", "//tf_ops:sequence_string_projection_op_v2_py",
"//tf_ops:tf_custom_ops_py", "//tf_ops:tf_custom_ops_py",
"//tflite_ops:registerer",
], ],
) )
...@@ -30,3 +30,5 @@ cp -f "${RUNFILES_DIR}/tf_ops/libtf_custom_ops_py_gen_op.so" \ ...@@ -30,3 +30,5 @@ cp -f "${RUNFILES_DIR}/tf_ops/libtf_custom_ops_py_gen_op.so" \
cp -f "${RUNFILES_DIR}/tf_ops/tf_custom_ops_py.py" \ cp -f "${RUNFILES_DIR}/tf_ops/tf_custom_ops_py.py" \
"${BUILD_WORKSPACE_DIRECTORY}/tf_ops" "${BUILD_WORKSPACE_DIRECTORY}/tf_ops"
cp -f "${RUNFILES_DIR}/tflite_ops/registerer.so" \
"${BUILD_WORKSPACE_DIRECTORY}/tflite_ops"
...@@ -44,7 +44,7 @@ class _BazelBuildCommand(setuptools.Command): ...@@ -44,7 +44,7 @@ class _BazelBuildCommand(setuptools.Command):
setuptools.setup( setuptools.setup(
name='seq_flow_lite', name='seq_flow_lite',
version='0.1', version='0.1',
packages=['tf_ops'], packages=['tf_ops', 'tflite_ops'],
package_data={'': ['*.so']}, package_data={'': ['*.so']},
cmdclass={ cmdclass={
'build': _BuildCommand, 'build': _BuildCommand,
......
...@@ -48,9 +48,9 @@ std::unique_ptr<tflite::Interpreter> CreateInterpreter( ...@@ -48,9 +48,9 @@ std::unique_ptr<tflite::Interpreter> CreateInterpreter(
tflite::ops::builtin::BuiltinOpResolver resolver; tflite::ops::builtin::BuiltinOpResolver resolver;
resolver.AddCustom( resolver.AddCustom(
"SEQUENCE_STRING_PROJECTION", "SEQUENCE_STRING_PROJECTION",
tflite::ops::custom::Register_SEQUENCE_STRING_PROJECTION()); ::seq_flow_lite::ops::custom::Register_SEQUENCE_STRING_PROJECTION());
resolver.AddCustom("ExpectedValueOp", resolver.AddCustom("ExpectedValueOp",
tflite::ops::custom::Register_EXPECTED_VALUE()); ::seq_flow_lite::ops::custom::Register_EXPECTED_VALUE());
tflite::InterpreterBuilder(model, resolver, tflite::InterpreterBuilder(model, resolver,
/*error_reporter=*/nullptr)(&interpreter); /*error_reporter=*/nullptr)(&interpreter);
if (!interpreter) { if (!interpreter) {
...@@ -105,7 +105,7 @@ std::vector<float> InvokeModel( ...@@ -105,7 +105,7 @@ std::vector<float> InvokeModel(
const size_t num_classes = output_dims[kClassOutputClassIndex]; const size_t num_classes = output_dims[kClassOutputClassIndex];
for (int i = 0; i < num_classes; ++i) { for (int i = 0; i < num_classes; ++i) {
// Find class probability or log probability for the class index // Find class probability or log probability for the class index
classes.push_back(tflite::PodDequantize(*class_output, i)); classes.push_back(::seq_flow_lite::PodDequantize(*class_output, i));
} }
return classes; return classes;
} }
......
...@@ -30,6 +30,8 @@ from utils import tflite_utils # import seq_flow_lite module ...@@ -30,6 +30,8 @@ from utils import tflite_utils # import seq_flow_lite module
FLAGS = flags.FLAGS FLAGS = flags.FLAGS
flags.DEFINE_string("output_dir", None, "The output or model directory.") flags.DEFINE_string("output_dir", None, "The output or model directory.")
flags.DEFINE_enum("output", "sigmoid", ["logits", "sigmoid", "softmax"],
"Specification of the output tensor.")
def load_runner_config(): def load_runner_config():
...@@ -51,12 +53,20 @@ def main(_): ...@@ -51,12 +53,20 @@ def main(_):
encoder = model.Encoder(model_config, base_layers.TFLITE) encoder = model.Encoder(model_config, base_layers.TFLITE)
projection, seq_lengh = prxlayer(text) projection, seq_lengh = prxlayer(text)
logits = encoder(projection, seq_lengh) logits = encoder(projection, seq_lengh)
if FLAGS.output == "logits":
outputs = logits
elif FLAGS.output == "sigmoid":
outputs = tf.math.sigmoid(logits)
else:
assert FLAGS.output == "softmax", "Unexpected output"
outputs = tf.nn.softmax(logits)
session.run(tf.global_variables_initializer()) session.run(tf.global_variables_initializer())
session.run(tf.local_variables_initializer()) session.run(tf.local_variables_initializer())
saver = tf.train.Saver() saver = tf.train.Saver()
saver.restore(session, tf.train.latest_checkpoint(FLAGS.output_dir)) saver.restore(session, tf.train.latest_checkpoint(FLAGS.output_dir))
tflite_fb = tflite_utils.generate_tflite(session, graph, [text], [logits]) tflite_fb = tflite_utils.generate_tflite(session, graph, [text],
[outputs])
output_file_name = os.path.join(FLAGS.output_dir, "tflite.fb") output_file_name = os.path.join(FLAGS.output_dir, "tflite.fb")
with tf.gfile.Open(output_file_name, "wb") as f: with tf.gfile.Open(output_file_name, "wb") as f:
f.write(tflite_fb) f.write(tflite_fb)
......
...@@ -54,19 +54,23 @@ class BaseLayer(tf.keras.layers.Layer): ...@@ -54,19 +54,23 @@ class BaseLayer(tf.keras.layers.Layer):
assert len(tensor.get_shape().as_list()) == rank assert len(tensor.get_shape().as_list()) == rank
assert tensor.dtype == dtype assert tensor.dtype == dtype
def add_qweight(self, shape, num_bits=8): def add_weight_wrapper(self, shape):
"""Return a quantized weight variable for the given shape.""" """Return a weight variable for the given shape."""
if self.parameters.initializer is not None: if self.parameters.initializer is not None:
initializer = self.parameters.initializer initializer = self.parameters.initializer
else: else:
initializer = tf.keras.initializers.GlorotUniform() initializer = tf.keras.initializers.GlorotUniform()
weight = self.add_weight( weight = self.add_weight(
"weight", shape, initializer=initializer, trainable=True) "weight",
shape,
initializer=initializer,
trainable=True,
dtype=tf.float32)
self.add_reg_loss(weight) self.add_reg_loss(weight)
return self._weight_quantization(weight, num_bits=num_bits) return weight
def _weight_quantization(self, tensor, num_bits=8): def quantize_parameter(self, tensor, num_bits=8):
"""Quantize weights when enabled.""" """Quantize parameters when enabled."""
# For infer mode, toco computes the min/max from the weights offline to # For infer mode, toco computes the min/max from the weights offline to
# quantize it. During train/eval this is computed from the current value # quantize it. During train/eval this is computed from the current value
# in the session by the graph itself. # in the session by the graph itself.
...@@ -98,21 +102,37 @@ class BaseLayer(tf.keras.layers.Layer): ...@@ -98,21 +102,37 @@ class BaseLayer(tf.keras.layers.Layer):
def assign_moving_average(self, var, update, ema_decay): def assign_moving_average(self, var, update, ema_decay):
return var.assign(var.read_value() * (1 - ema_decay) + (ema_decay) * update) return var.assign(var.read_value() * (1 - ema_decay) + (ema_decay) * update)
def qrange_sigmoid(self, tensor): def quantize_tensor(self, tf_only):
if self.parameters.quantize: if tf_only and self.parameters.mode == TFLITE:
return False
return self.parameters.quantize
def qrange_sigmoid(self, tensor, tf_only=False):
if self.quantize_tensor(tf_only):
return tf.quantization.fake_quant_with_min_max_args(tensor, 0.0, 1.0) return tf.quantization.fake_quant_with_min_max_args(tensor, 0.0, 1.0)
return tensor return tensor
def qrange_tanh(self, tensor): def qrange_tanh(self, tensor, tf_only=False):
if self.parameters.quantize: if self.quantize_tensor(tf_only):
return tf.quantization.fake_quant_with_min_max_args(tensor, -1.0, 1.0) return tf.quantization.fake_quant_with_min_max_args(tensor, -1.0, 1.0)
return tensor return tensor
def quantized_tanh(self, tensor): def quantized_tanh(self, tensor, tf_only=False):
return self.qrange_tanh(tf.tanh(tensor)) return self.qrange_tanh(tf.tanh(tensor), tf_only)
def quantized_sigmoid(self, tensor): def quantized_sigmoid(self, tensor, tf_only=False):
return self.qrange_sigmoid(tf.sigmoid(tensor)) return self.qrange_sigmoid(tf.sigmoid(tensor), tf_only)
def get_batch_dimension(self, tensor): def get_batch_dimension(self, tensor):
return tensor.get_shape().as_list()[0] or tf.shape(tensor)[0] return tensor.get_shape().as_list()[0] or tf.shape(tensor)[0]
def inverse_normalizer(self, mask):
return tf.math.reciprocal(tf.reduce_sum(mask))
def random_drop_to_zero(self, tensor, zero_probability):
rnd = tf.random.uniform(
shape=tf.shape(tensor),
minval=-zero_probability,
maxval=(1.0 - zero_probability),
dtype=tensor.dtype)
return tf.math.ceil(rnd)
...@@ -60,7 +60,7 @@ class EncoderQConvolution(base_layers.BaseLayer): ...@@ -60,7 +60,7 @@ class EncoderQConvolution(base_layers.BaseLayer):
assert len(input_shapes) == self.rank assert len(input_shapes) == self.rank
self.in_filters = input_shapes[-1] self.in_filters = input_shapes[-1]
shape = self.ksize + [self.in_filters, self.out_filters] shape = self.ksize + [self.in_filters, self.out_filters]
self.filters = self.add_qweight(shape=shape) self.filters = self.add_weight_wrapper(shape=shape)
if self.bias: if self.bias:
self.b = self.add_bias(shape=[self.out_filters]) self.b = self.add_bias(shape=[self.out_filters])
...@@ -70,7 +70,7 @@ class EncoderQConvolution(base_layers.BaseLayer): ...@@ -70,7 +70,7 @@ class EncoderQConvolution(base_layers.BaseLayer):
def _conv_r4(self, inputs, normalize_method): def _conv_r4(self, inputs, normalize_method):
outputs = tf.nn.conv2d( outputs = tf.nn.conv2d(
inputs, inputs,
self.filters, self.quantize_parameter(self.filters),
strides=self.strides, strides=self.strides,
padding=self.padding, padding=self.padding,
dilations=self.dilations) dilations=self.dilations)
......
...@@ -47,7 +47,7 @@ class BaseQDense(base_layers.BaseLayer): ...@@ -47,7 +47,7 @@ class BaseQDense(base_layers.BaseLayer):
assert input_shapes[1] == 1 or input_shapes[2] == 1 assert input_shapes[1] == 1 or input_shapes[2] == 1
self.in_units = input_shapes[-1] self.in_units = input_shapes[-1]
shape = [self.in_units, self.units] shape = [self.in_units, self.units]
self.w = self.add_qweight(shape=shape) self.w = self.add_weight_wrapper(shape=shape)
if self.bias: if self.bias:
self.b = self.add_bias(shape=[self.units]) self.b = self.add_bias(shape=[self.units])
...@@ -55,7 +55,7 @@ class BaseQDense(base_layers.BaseLayer): ...@@ -55,7 +55,7 @@ class BaseQDense(base_layers.BaseLayer):
self.normalization = normalization_layers.BatchNormalization(**kwargs) self.normalization = normalization_layers.BatchNormalization(**kwargs)
def _dense_r2(self, inputs, normalize_method): def _dense_r2(self, inputs, normalize_method):
outputs = tf.matmul(inputs, self.w) outputs = tf.matmul(inputs, self.quantize_parameter(self.w))
if self.bias: if self.bias:
outputs = tf.nn.bias_add(outputs, self.b) outputs = tf.nn.bias_add(outputs, self.b)
if self.normalize: if self.normalize:
...@@ -98,7 +98,9 @@ class BaseQDenseVarLen(BaseQDense): ...@@ -98,7 +98,9 @@ class BaseQDenseVarLen(BaseQDense):
self.normalization = normalization_layers.VarLenBatchNormalization( self.normalization = normalization_layers.VarLenBatchNormalization(
rank=2, **kwargs) rank=2, **kwargs)
def call(self, inputs, mask, inverse_normalizer): def call(self, inputs, mask, inverse_normalizer=None):
if inverse_normalizer is None:
inverse_normalizer = self.inverse_normalizer(mask)
def normalize_method(tensor): def normalize_method(tensor):
maskr2 = tf.reshape(mask, [-1, 1]) maskr2 = tf.reshape(mask, [-1, 1])
......
...@@ -25,7 +25,7 @@ from tf_ops import sequence_string_projection_op_v2 as sspv2 # import seq_flow_l ...@@ -25,7 +25,7 @@ from tf_ops import sequence_string_projection_op_v2 as sspv2 # import seq_flow_l
class ProjectionLayer(base_layers.BaseLayer): class ProjectionLayer(base_layers.BaseLayer):
"""Base class for encoders.""" """Base class for encoders."""
def __init__(self, model_config, mode): def __init__(self, model_config, mode, **kwargs):
"""Create projection.""" """Create projection."""
def _get_params(varname, default_value=None): def _get_params(varname, default_value=None):
...@@ -50,7 +50,7 @@ class ProjectionLayer(base_layers.BaseLayer): ...@@ -50,7 +50,7 @@ class ProjectionLayer(base_layers.BaseLayer):
if mode == base_layers.TRAIN: if mode == base_layers.TRAIN:
_get_params("distortion_probability", 0.0) _get_params("distortion_probability", 0.0)
parameters = base_layers.Parameters(mode, self.quantize) parameters = base_layers.Parameters(mode, self.quantize)
super(ProjectionLayer, self).__init__(parameters=parameters) super(ProjectionLayer, self).__init__(parameters=parameters, **kwargs)
def call(self, inputs): def call(self, inputs):
projection, _, seq_length = ssp.sequence_string_projection( projection, _, seq_length = ssp.sequence_string_projection(
...@@ -74,7 +74,6 @@ class ProjectionLayer(base_layers.BaseLayer): ...@@ -74,7 +74,6 @@ class ProjectionLayer(base_layers.BaseLayer):
batch_size = self.get_batch_dimension(inputs) batch_size = self.get_batch_dimension(inputs)
projection = tf.reshape(projection, projection = tf.reshape(projection,
[batch_size, self.max_seq_len, self.feature_size]) [batch_size, self.max_seq_len, self.feature_size])
if self.mode in modes:
projection = self.qrange_tanh(projection) projection = self.qrange_tanh(projection)
return projection, seq_length return projection, seq_length
...@@ -82,7 +81,7 @@ class ProjectionLayer(base_layers.BaseLayer): ...@@ -82,7 +81,7 @@ class ProjectionLayer(base_layers.BaseLayer):
class ProjectionLayerPreSegmented(base_layers.BaseLayer): class ProjectionLayerPreSegmented(base_layers.BaseLayer):
"""Base class for encoders.""" """Base class for encoders."""
def __init__(self, model_config, mode): def __init__(self, model_config, mode, **kwargs):
"""Create projection.""" """Create projection."""
def _get_params(varname, default_value=None): def _get_params(varname, default_value=None):
...@@ -101,11 +100,13 @@ class ProjectionLayerPreSegmented(base_layers.BaseLayer): ...@@ -101,11 +100,13 @@ class ProjectionLayerPreSegmented(base_layers.BaseLayer):
if mode == base_layers.TRAIN: if mode == base_layers.TRAIN:
_get_params("distortion_probability", 0.0) _get_params("distortion_probability", 0.0)
parameters = base_layers.Parameters(mode, self.quantize) parameters = base_layers.Parameters(mode, self.quantize)
super(ProjectionLayerPreSegmented, self).__init__(parameters=parameters) super(ProjectionLayerPreSegmented, self).__init__(
parameters=parameters, **kwargs)
def call(self, inputs, sequence_length): def call(self, inputs):
tokens, sequence_length = inputs
projection = sspv2.sequence_string_projection_v2( projection = sspv2.sequence_string_projection_v2(
input=inputs, input=tokens,
sequence_length=sequence_length, sequence_length=sequence_length,
feature_size=self.feature_size, feature_size=self.feature_size,
distortion_probability=self.distortion_probability, distortion_probability=self.distortion_probability,
......
...@@ -27,6 +27,8 @@ class ActivationQuantization(base_layers.BaseLayer): ...@@ -27,6 +27,8 @@ class ActivationQuantization(base_layers.BaseLayer):
self.ema_decay = ema_decay self.ema_decay = ema_decay
self.num_bits = num_bits self.num_bits = num_bits
super(ActivationQuantization, self).__init__(**kwargs) super(ActivationQuantization, self).__init__(**kwargs)
def build(self, input_shapes):
if self.parameters.quantize: if self.parameters.quantize:
self.min_var = self.add_weight( self.min_var = self.add_weight(
"min", initializer=tf.keras.initializers.Zeros(), trainable=False) "min", initializer=tf.keras.initializers.Zeros(), trainable=False)
...@@ -53,6 +55,7 @@ class ActivationQuantization(base_layers.BaseLayer): ...@@ -53,6 +55,7 @@ class ActivationQuantization(base_layers.BaseLayer):
return inputs return inputs
def quantize_using_range(self, inputs): def quantize_using_range(self, inputs):
# This method can only be called after a call to "call" method in this class
if self.parameters.quantize: if self.parameters.quantize:
return tf.quantization.fake_quant_with_min_max_vars( return tf.quantization.fake_quant_with_min_max_vars(
inputs, self.min_var, self.max_var, num_bits=self.num_bits) inputs, self.min_var, self.max_var, num_bits=self.num_bits)
...@@ -66,21 +69,24 @@ class ConcatQuantization(ActivationQuantization): ...@@ -66,21 +69,24 @@ class ConcatQuantization(ActivationQuantization):
self.axis = axis self.axis = axis
super(ConcatQuantization, self).__init__(**kwargs) super(ConcatQuantization, self).__init__(**kwargs)
def reduce_list(self, tensor_list, functor): def _reduce_list(self, tensor_list, functor):
reduce_result = [functor(tensor) for tensor in tensor_list] reduce_result = [functor(tensor) for tensor in tensor_list]
# Toco expects 0.0 to be part of the quantization range. # Toco expects 0.0 to be part of the quantization range.
reduce_result.append(tf.constant(0.0)) reduce_result.append(tf.constant(0.0))
return functor(tf.stack(reduce_result)) return functor(tf.stack(reduce_result))
def call(self, tensors): def call(self, tensors):
# Ignore empty invocations done to build the keras layer.
if tensors is None:
return
if self.parameters.quantize: if self.parameters.quantize:
if self.parameters.mode == base_layers.TRAIN: if self.parameters.mode == base_layers.TRAIN:
# Toco expects 0.0 to be part of the quantization range. # Toco expects 0.0 to be part of the quantization range.
batch_min = self.reduce_list(tensors, tf.reduce_min) batch_min = self._reduce_list(tensors, tf.reduce_min)
min_var = self.assign_moving_average(self.min_var, batch_min, min_var = self.assign_moving_average(self.min_var, batch_min,
self.ema_decay) self.ema_decay)
batch_max = self.reduce_list(tensors, tf.reduce_max) batch_max = self._reduce_list(tensors, tf.reduce_max)
max_var = self.assign_moving_average(self.max_var, batch_max, max_var = self.assign_moving_average(self.max_var, batch_max,
self.ema_decay) self.ema_decay)
else: else:
......
...@@ -27,21 +27,17 @@ def classification_metric(per_example_loss, label_ids, logits): ...@@ -27,21 +27,17 @@ def classification_metric(per_example_loss, label_ids, logits):
} }
THRESHOLDS = [0.5]
def labeling_metric(per_example_loss, label_ids, logits): def labeling_metric(per_example_loss, label_ids, logits):
"""Compute eval metrics.""" """Compute eval metrics."""
scores = tf.math.sigmoid(logits) scores = tf.math.sigmoid(logits)
binary_prediction = tf.math.greater_equal(scores, 0.5)
num_classes = label_ids.get_shape().as_list()[-1] num_classes = label_ids.get_shape().as_list()[-1]
return_dict = {"eval_loss": tf.metrics.mean(per_example_loss)} return_dict = {"eval_loss": tf.metrics.mean(per_example_loss)}
for idx in range(num_classes): for idx in range(num_classes):
return_dict["auc/" + str(idx)] = tf.metrics.auc(label_ids[:, idx], return_dict["auc/" + str(idx)] = tf.metrics.auc(label_ids[:, idx],
scores[:, idx]) scores[:, idx])
return_dict["precision@" + str(THRESHOLDS) + "/" + return_dict["precision/" + str(idx)] = tf.metrics.precision(
str(idx)] = tf.metrics.precision_at_thresholds( label_ids[:, idx], binary_prediction[:, idx])
label_ids[:, idx], scores[:, idx], thresholds=THRESHOLDS) return_dict["recall/" + str(idx)] = tf.metrics.recall(
return_dict["recall@" + str(THRESHOLDS) + "/" + label_ids[:, idx], binary_prediction[:, idx])
str(idx)] = tf.metrics.recall_at_thresholds(
label_ids[:, idx], scores[:, idx], thresholds=THRESHOLDS)
return return_dict return return_dict
...@@ -38,6 +38,7 @@ class PaddedMaskedVarLenConv(conv_layers.EncoderQConvolutionVarLen): ...@@ -38,6 +38,7 @@ class PaddedMaskedVarLenConv(conv_layers.EncoderQConvolutionVarLen):
assert bool(ngram is None) != bool(skip_bigram is None) assert bool(ngram is None) != bool(skip_bigram is None)
self.kwidth = ngram if ngram is not None else (skip_bigram + 2) self.kwidth = ngram if ngram is not None else (skip_bigram + 2)
mask = [1] * self.kwidth mask = [1] * self.kwidth
self.skipgram = skip_bigram is not None
if skip_bigram is not None: if skip_bigram is not None:
mask[1], mask[skip_bigram] = 0, 0 mask[1], mask[skip_bigram] = 0, 0
self.mask = np.array(mask, dtype="float32").reshape((1, self.kwidth, 1, 1)) self.mask = np.array(mask, dtype="float32").reshape((1, self.kwidth, 1, 1))
...@@ -56,10 +57,10 @@ class PaddedMaskedVarLenConv(conv_layers.EncoderQConvolutionVarLen): ...@@ -56,10 +57,10 @@ class PaddedMaskedVarLenConv(conv_layers.EncoderQConvolutionVarLen):
return result * mask + (1 - mask) * self.invalid_value return result * mask + (1 - mask) * self.invalid_value
return result return result
def add_qweight(self, shape, num_bits=8): def quantize_parameter(self, weight, num_bits=8):
weight = super(PaddedMaskedVarLenConv, self).add_qweight( weight = super(PaddedMaskedVarLenConv, self).quantize_parameter(
shape=shape, num_bits=num_bits) weight, num_bits=num_bits)
return weight * tf.convert_to_tensor(self.mask) return weight * tf.convert_to_tensor(self.mask) if self.skipgram else weight
class AttentionPoolReduce(base_layers.BaseLayer): class AttentionPoolReduce(base_layers.BaseLayer):
...@@ -97,8 +98,8 @@ class AttentionPoolReduce(base_layers.BaseLayer): ...@@ -97,8 +98,8 @@ class AttentionPoolReduce(base_layers.BaseLayer):
class Encoder(tf.keras.layers.Layer): class Encoder(tf.keras.layers.Layer):
"""A PRADO keras model.""" """A PRADO keras model."""
def __init__(self, config, mode): def __init__(self, config, mode, **kwargs):
super(Encoder, self).__init__() super(Encoder, self).__init__(**kwargs)
def _get_params(varname, default_value=None): def _get_params(varname, default_value=None):
value = config[varname] if varname in config else default_value value = config[varname] if varname in config else default_value
...@@ -118,7 +119,7 @@ class Encoder(tf.keras.layers.Layer): ...@@ -118,7 +119,7 @@ class Encoder(tf.keras.layers.Layer):
_get_params("skip1bigram_channels", 0) _get_params("skip1bigram_channels", 0)
_get_params("skip2bigram_channels", 0) _get_params("skip2bigram_channels", 0)
_get_params("network_regularizer_scale", 1e-4) _get_params("network_regularizer_scale", 1e-4)
_get_params("keep_prob", 0.5) _get_params("keep_prob", 1.0)
self.num_classes = len(self.labels) self.num_classes = len(self.labels)
self.parameters = base_layers.Parameters( self.parameters = base_layers.Parameters(
...@@ -129,7 +130,6 @@ class Encoder(tf.keras.layers.Layer): ...@@ -129,7 +130,6 @@ class Encoder(tf.keras.layers.Layer):
units=self.embedding_size, rank=3, parameters=self.parameters) units=self.embedding_size, rank=3, parameters=self.parameters)
self.attention_fc = dense_layers.BaseQDenseVarLen( self.attention_fc = dense_layers.BaseQDenseVarLen(
units=self.embedding_size, rank=3, parameters=self.parameters) units=self.embedding_size, rank=3, parameters=self.parameters)
self.dropout = tf.keras.layers.Dropout(rate=(1 - self.keep_prob))
self.parameters = copy.copy(self.parameters) self.parameters = copy.copy(self.parameters)
self.parameters.regularizer_scale = self.network_regularizer_scale self.parameters.regularizer_scale = self.network_regularizer_scale
...@@ -161,8 +161,8 @@ class Encoder(tf.keras.layers.Layer): ...@@ -161,8 +161,8 @@ class Encoder(tf.keras.layers.Layer):
def _apply_fc_dropout(self, layer, inputs, mask, inverse_normalizer): def _apply_fc_dropout(self, layer, inputs, mask, inverse_normalizer):
outputs = layer(inputs, mask, inverse_normalizer) outputs = layer(inputs, mask, inverse_normalizer)
if self.parameters.mode == base_layers.TRAIN: if self.parameters.mode == base_layers.TRAIN and self.keep_prob < 1.0:
return self.dropout(outputs) return tf.nn.dropout(outputs, rate=(1 - self.keep_prob))
return outputs return outputs
def call(self, projection, seq_length): def call(self, projection, seq_length):
...@@ -178,14 +178,17 @@ class Encoder(tf.keras.layers.Layer): ...@@ -178,14 +178,17 @@ class Encoder(tf.keras.layers.Layer):
layer(values_in, attention_in, maskr3, inverse_normalizer) layer(values_in, attention_in, maskr3, inverse_normalizer)
for layer in self.attention_pool_layers for layer in self.attention_pool_layers
] ]
assert tensors, "no ngram channels have been configured"
pre_logits = self.concat_quantizer(tensors) pre_logits = self.concat_quantizer(tensors)
return self.final_fc(pre_logits) return self.final_fc(pre_logits)
class Model(Encoder): class Model(Encoder):
def __init__(self, config, mode): def __init__(self, config, mode, **kwargs):
super(Model, self).__init__(config, mode) super(Model, self).__init__(config, mode, **kwargs)
self.projection = projection_layers.ProjectionLayer(config, mode) self.projection = projection_layers.ProjectionLayer(config, mode)
def call(self, inputs): def call(self, inputs):
......
...@@ -93,8 +93,8 @@ py_binary( ...@@ -93,8 +93,8 @@ py_binary(
# Expect numpy installed # Expect numpy installed
# package TFLite flex delegate # package TFLite flex delegate
# package TFLite interpreter # package TFLite interpreter
"@org_tflite_support//tensorflow_lite_support/custom_ops/kernel:ngrams_op_resolver", "@org_tflite_support//tensorflow_lite_support/custom_ops/kernel:_pywrap_ngrams_op_resolver",
"@org_tflite_support//tensorflow_lite_support/custom_ops/kernel:whitespace_tokenizer_op_resolver", "@org_tflite_support//tensorflow_lite_support/custom_ops/kernel:_pywrap_whitespace_tokenizer_op_resolver",
# Expect tensorflow text installed # Expect tensorflow text installed
], ],
) )
......
...@@ -10,15 +10,6 @@ package( ...@@ -10,15 +10,6 @@ package(
], ],
) )
py_library(
name = "text_projection",
srcs = ["text_projection.py"],
srcs_version = "PY3",
deps = [
":sequence_string_projection_op_py",
],
)
cc_library( cc_library(
name = "sequence_string_projection_op", name = "sequence_string_projection_op",
srcs = [ srcs = [
...@@ -30,7 +21,6 @@ cc_library( ...@@ -30,7 +21,6 @@ cc_library(
":projection_util", ":projection_util",
":text_distorter", ":text_distorter",
"@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/random",
"@tensorflow_includes//:includes", "@tensorflow_includes//:includes",
"@tensorflow_solib//:framework_lib", "@tensorflow_solib//:framework_lib",
], ],
...@@ -71,11 +61,9 @@ cc_library( ...@@ -71,11 +61,9 @@ cc_library(
srcs = ["text_distorter.cc"], srcs = ["text_distorter.cc"],
hdrs = ["text_distorter.h"], hdrs = ["text_distorter.h"],
deps = [ deps = [
"@com_google_absl//absl/strings",
"@icu4c", "@icu4c",
"@tensorflow_includes//:includes", "@tensorflow_includes//:includes",
"@tensorflow_solib//:framework_lib", "@tensorflow_solib//:framework_lib",
"@utf_archive//:utf",
], ],
) )
...@@ -102,7 +90,6 @@ cc_library( ...@@ -102,7 +90,6 @@ cc_library(
"@tensorflow_includes//:includes", "@tensorflow_includes//:includes",
"@tensorflow_solib//:framework_lib", "@tensorflow_solib//:framework_lib",
"@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/random",
], ],
alwayslink = 1, alwayslink = 1,
) )
......
...@@ -79,7 +79,7 @@ std::string ContractToken(const char* input_ptr, size_t len, size_t num_chars) { ...@@ -79,7 +79,7 @@ std::string ContractToken(const char* input_ptr, size_t len, size_t num_chars) {
// Count how many times this pattern appeared. // Count how many times this pattern appeared.
int num_cur_patterns = 0; int num_cur_patterns = 0;
if (cur_pattern.find(" ") == std::string::npos && !IsDigit(cur_pattern)) { if (cur_pattern.find(' ') == std::string::npos && !IsDigit(cur_pattern)) {
num_cur_patterns = num_cur_patterns =
GetNumPattern(char_tokens, i + num_chars, num_chars, cur_pattern); GetNumPattern(char_tokens, i + num_chars, num_chars, cur_pattern);
} }
......
...@@ -25,25 +25,28 @@ limitations under the License. ...@@ -25,25 +25,28 @@ limitations under the License.
namespace { namespace {
constexpr int kInvalid = -1; constexpr int kInvalid = -1;
constexpr char kSpace = ' '; constexpr char kSpace = ' ';
} // namespace
// A HashEngine that uses MurmurHash to convert text to hashcodes.
class MurmurHash : public HashEngine { class MurmurHash : public HashEngine {
public: public:
void GetHashCodes(const std::string& word, std::vector<uint64_t>* hash_codes, std::vector<uint64_t> GetHashCodes(const std::string& word,
int feature_size) override { int feature_size) override {
std::vector<uint64_t> hash_codes;
hash_codes.reserve(2 * (feature_size / 64 + 1));
uint64_t hash_low = 0; uint64_t hash_low = 0;
uint64_t hash_high = 0; uint64_t hash_high = 0;
for (int i = 0; i < feature_size; i += 64) { for (int i = 0; i < feature_size; i += 64) {
if (i == 0) { if (i == 0) {
auto hash = MurmurHash128(word.c_str(), word.size()); auto hash = MurmurHash128(word.data(), word.size());
hash_low = hash.first; hash_low = hash.first;
hash_high = hash.second; hash_high = hash.second;
} else { } else {
GetMoreBits(hash_low, hash_high, &hash_low, &hash_high); GetMoreBits(hash_low, hash_high, &hash_low, &hash_high);
} }
hash_codes->push_back(hash_low); hash_codes.push_back(hash_low);
hash_codes->push_back(hash_high); hash_codes.push_back(hash_high);
} }
return hash_codes;
} }
private: private:
...@@ -78,7 +81,7 @@ class MurmurHash : public HashEngine { ...@@ -78,7 +81,7 @@ class MurmurHash : public HashEngine {
std::pair<uint64_t, uint64_t> MurmurHash128(const char* buf, std::pair<uint64_t, uint64_t> MurmurHash128(const char* buf,
const size_t len) { const size_t len) {
// Initialize the hashing value. // Initialize the hashing value.
uint64_t hash = len * kMul; uint64_t hash1 = len * kMul;
// hash2 will be xored by hash during the hash computation iterations. // hash2 will be xored by hash during the hash computation iterations.
// In the end we use an alternative mixture multiplier for mixing // In the end we use an alternative mixture multiplier for mixing
// the bits in hash2. // the bits in hash2.
...@@ -90,34 +93,38 @@ class MurmurHash : public HashEngine { ...@@ -90,34 +93,38 @@ class MurmurHash : public HashEngine {
for (const char* p = buf; p != end; p += 8) { for (const char* p = buf; p != end; p += 8) {
// Manually unrolling this loop 2x did not help on Intel Core 2. // Manually unrolling this loop 2x did not help on Intel Core 2.
hash = MurmurStep(hash, Load64VariableLength(p, 8)); hash1 = MurmurStep(hash1, Load64VariableLength(p, 8));
hash2 ^= hash; hash2 ^= hash1;
} }
if ((len & 0x7) != 0) { if ((len & 0x7) != 0) {
const uint64_t data = Load64VariableLength(end, len & 0x7); const uint64_t data = Load64VariableLength(end, len & 0x7);
hash ^= data; hash1 ^= data;
hash *= kMul; hash1 *= kMul;
hash2 ^= hash; hash2 ^= hash1;
} }
hash = ShiftMix(hash) * kMul; hash1 = ShiftMix(hash1) * kMul;
hash2 ^= hash; hash2 ^= hash1;
hash = ShiftMix(hash); hash1 = ShiftMix(hash1);
// mul2 is a prime just above golden ratio. mul2 is used to ensure that the // mul2 is a prime just above golden ratio. mul2 is used to ensure that the
// impact of the last few bytes is different to the upper and lower 64 bits. // impact of the last few bytes is different to the upper and lower 64 bits.
hash2 = ShiftMix(hash2 * kMul2) * kMul2; hash2 = ShiftMix(hash2 * kMul2) * kMul2;
return std::make_pair(hash, hash2); return {hash1, hash2};
} }
}; };
// A HashEngine that uses a prefix and suffix preserving hash to convert text
// to hashcodes.
class XFixHash : public HashEngine { class XFixHash : public HashEngine {
public: public:
explicit XFixHash(int bits_per_char) explicit XFixHash(int bits_per_char)
: bits_per_char_(bits_per_char), bit_mask_((1ULL << bits_per_char) - 1) {} : bits_per_char_(bits_per_char), bit_mask_((1ULL << bits_per_char) - 1) {}
void GetHashCodes(const std::string& word, std::vector<uint64_t>* hash_codes, std::vector<uint64_t> GetHashCodes(const std::string& word,
int feature_size) override { int feature_size) override {
std::vector<uint64_t> hash_codes;
hash_codes.reserve(2 * (feature_size / 64 + 1));
auto token_ptr = reinterpret_cast<const uint8_t*>(word.c_str()); auto token_ptr = reinterpret_cast<const uint8_t*>(word.c_str());
size_t token_size = word.size(); size_t token_size = word.size();
int token_idx = 0; int token_idx = 0;
...@@ -134,9 +141,10 @@ class XFixHash : public HashEngine { ...@@ -134,9 +141,10 @@ class XFixHash : public HashEngine {
hash_low = (hash_low << bits_per_char_) | (frhash & bit_mask_); hash_low = (hash_low << bits_per_char_) | (frhash & bit_mask_);
hash_high = (hash_high << bits_per_char_) | (brhash & bit_mask_); hash_high = (hash_high << bits_per_char_) | (brhash & bit_mask_);
} }
hash_codes->push_back(hash_low); hash_codes.push_back(hash_low);
hash_codes->push_back(hash_high); hash_codes.push_back(hash_high);
} }
return hash_codes;
} }
private: private:
...@@ -146,6 +154,8 @@ class XFixHash : public HashEngine { ...@@ -146,6 +154,8 @@ class XFixHash : public HashEngine {
const uint64_t bit_mask_; const uint64_t bit_mask_;
}; };
// A HashEngine that performs a position preserving unicode level hashing to
// convert text to hashcodes.
class UnicodeHash : public HashEngine { class UnicodeHash : public HashEngine {
public: public:
// bits_per_unicode should be a divisor of 64. // bits_per_unicode should be a divisor of 64.
...@@ -154,8 +164,10 @@ class UnicodeHash : public HashEngine { ...@@ -154,8 +164,10 @@ class UnicodeHash : public HashEngine {
bit_mask_(((1ULL << bits_per_unicode) - 1) << (64 - bits_per_unicode)) { bit_mask_(((1ULL << bits_per_unicode) - 1) << (64 - bits_per_unicode)) {
} }
void GetHashCodes(const std::string& word, std::vector<uint64_t>* hash_codes, std::vector<uint64_t> GetHashCodes(const std::string& word,
int feature_size) override { int feature_size) override {
std::vector<uint64_t> hash_codes;
hash_codes.reserve(2 * (feature_size / 64 + 1));
auto word_ptr = word.c_str(); auto word_ptr = word.c_str();
int utflength = utflen(const_cast<char*>(word_ptr)); int utflength = utflen(const_cast<char*>(word_ptr));
// Both `feature_size` and `bits_per_unicode` are bit lengths. // Both `feature_size` and `bits_per_unicode` are bit lengths.
...@@ -187,8 +199,9 @@ class UnicodeHash : public HashEngine { ...@@ -187,8 +199,9 @@ class UnicodeHash : public HashEngine {
hash = hash >> bits_per_unicode_; hash = hash >> bits_per_unicode_;
} }
} }
hash_codes->push_back(hash); hash_codes.push_back(hash);
} }
return hash_codes;
} }
private: private:
...@@ -197,6 +210,8 @@ class UnicodeHash : public HashEngine { ...@@ -197,6 +210,8 @@ class UnicodeHash : public HashEngine {
const uint64_t bit_mask_; const uint64_t bit_mask_;
}; };
} // namespace
bool Hasher::SupportedHashType(const std::string& hash_type) { bool Hasher::SupportedHashType(const std::string& hash_type) {
std::unordered_set<std::string> supported({kMurmurHash, kUnicodeHash8, std::unordered_set<std::string> supported({kMurmurHash, kUnicodeHash8,
kUnicodeHash16, kXfixHash8, kUnicodeHash16, kXfixHash8,
...@@ -225,7 +240,7 @@ Hasher* Hasher::CreateHasher(int feature_size, const std::string& hash_type) { ...@@ -225,7 +240,7 @@ Hasher* Hasher::CreateHasher(int feature_size, const std::string& hash_type) {
Hasher::Hasher(int feature_size, HashEngine* hash_engine) Hasher::Hasher(int feature_size, HashEngine* hash_engine)
: feature_size_(feature_size), hash_engine_(hash_engine) { : feature_size_(feature_size), hash_engine_(hash_engine) {
hash_engine_->GetHashCodes(empty_string_, &null_hash_codes_, feature_size_); null_hash_codes_ = hash_engine_->GetHashCodes(empty_string_, feature_size_);
} }
std::string ProjectionUnicodeHandler::LowerCaseUTF8WithSupportedUnicodes( std::string ProjectionUnicodeHandler::LowerCaseUTF8WithSupportedUnicodes(
......
...@@ -21,22 +21,25 @@ limitations under the License. ...@@ -21,22 +21,25 @@ limitations under the License.
#include "libutf/utf.h" #include "libutf/utf.h"
constexpr int kFirstCapOffset = 3; inline constexpr int kFirstCapOffset = 3;
constexpr int kAllCapsOffset = 4; inline constexpr int kAllCapsOffset = 4;
constexpr int kWordNoveltyOffset = 1; inline constexpr int kWordNoveltyOffset = 1;
constexpr int kDocSizeOffset = 2; inline constexpr int kDocSizeOffset = 2;
const char kMurmurHash[] = "murmur"; inline constexpr char kMurmurHash[] = "murmur";
const char kXfixHash8[] = "xfixhash8"; inline constexpr char kXfixHash8[] = "xfixhash8";
const char kXfixHash16[] = "xfixhash16"; inline constexpr char kXfixHash16[] = "xfixhash16";
const char kXfixHash32[] = "xfixhash32"; inline constexpr char kXfixHash32[] = "xfixhash32";
const char kUnicodeHash8[] = "unicodehash8"; inline constexpr char kUnicodeHash8[] = "unicodehash8";
const char kUnicodeHash16[] = "unicodehash16"; inline constexpr char kUnicodeHash16[] = "unicodehash16";
// A base class that specifies the interface for a hash engine used by the
// projection operator.
class HashEngine { class HashEngine {
public: public:
virtual void GetHashCodes(const std::string& word, // Takes a string token `word` and a `feature_size` (measured in bits) and
std::vector<uint64_t>* hash_codes, // returns hash codes that represent the token.
virtual std::vector<uint64_t> GetHashCodes(const std::string& word,
int feature_size) = 0; int feature_size) = 0;
virtual ~HashEngine() {} virtual ~HashEngine() {}
}; };
...@@ -50,13 +53,12 @@ class Hasher { ...@@ -50,13 +53,12 @@ class Hasher {
const std::string& hash_type = kMurmurHash); const std::string& hash_type = kMurmurHash);
static bool SupportedHashType(const std::string& hash_type); static bool SupportedHashType(const std::string& hash_type);
bool GetHashCodes(const std::string& word, bool GetHashCodes(const std::string& word,
std::vector<uint64_t>* hash_codes) { std::vector<uint64_t>& hash_codes) {
if (!hash_engine_) return false; if (!hash_engine_) return false;
if (word.empty()) { if (word.empty()) {
*hash_codes = null_hash_codes_; hash_codes = null_hash_codes_;
} else { } else {
hash_codes->clear(); hash_codes = hash_engine_->GetHashCodes(word, feature_size_);
hash_engine_->GetHashCodes(word, hash_codes, feature_size_);
} }
return true; return true;
} }
...@@ -64,8 +66,13 @@ class Hasher { ...@@ -64,8 +66,13 @@ class Hasher {
private: private:
explicit Hasher(int feature_size, HashEngine* hash_engine); explicit Hasher(int feature_size, HashEngine* hash_engine);
const std::string empty_string_ = "<null>"; const std::string empty_string_ = "<null>";
// Size of the projection feature which represents the number of bits of
// hash codes that will be generated by this class.
const int feature_size_; const int feature_size_;
// The hash engine used by this class.
std::unique_ptr<HashEngine> hash_engine_; std::unique_ptr<HashEngine> hash_engine_;
// Hash codes for empty text are precalculated and stored below to speed
// up projection.
std::vector<uint64_t> null_hash_codes_; std::vector<uint64_t> null_hash_codes_;
}; };
...@@ -90,7 +97,8 @@ class ProjectionUnicodeHandler { ...@@ -90,7 +97,8 @@ class ProjectionUnicodeHandler {
} }
// Performs language independent lower case and returns a string with // Performs language independent lower case and returns a string with
// supported unicode segments. // supported unicode segments and two additional flags first_cap and all_caps
// which when true indicate the text is Firstcap or ALLCAPS.
std::string LowerCaseUTF8WithSupportedUnicodes( std::string LowerCaseUTF8WithSupportedUnicodes(
const std::pair<const char*, size_t>& source, bool* first_cap = nullptr, const std::pair<const char*, size_t>& source, bool* first_cap = nullptr,
bool* all_caps = nullptr) const; bool* all_caps = nullptr) const;
...@@ -126,14 +134,19 @@ class ProjectionUnicodeHandler { ...@@ -126,14 +134,19 @@ class ProjectionUnicodeHandler {
int max_tokens); int max_tokens);
private: private:
// Parses and extracts supported unicode segments from a utf8 string. // Parses and extracts supported or allowed unicode segments, also referred
// to as vocabulary, from a utf8 string.
void InitializeVocabulary(const std::string& vocabulary); void InitializeVocabulary(const std::string& vocabulary);
// A variable that maps a valid Unicode rune to its index in valid character
// vocabulary.
std::unordered_map<Rune, int> valid_chars_; std::unordered_map<Rune, int> valid_chars_;
// Controls whether to exclude non-alphabetic, non-space characters from the
// output text.
bool exclude_nonalphaspace_unicodes_; bool exclude_nonalphaspace_unicodes_;
}; };
static constexpr size_t kEntireString = SIZE_MAX; inline constexpr size_t kEntireString = SIZE_MAX;
static constexpr size_t kAllTokens = SIZE_MAX; inline constexpr size_t kAllTokens = SIZE_MAX;
std::vector<std::string> SplitBySpace(const char* input_ptr, size_t len, std::vector<std::string> SplitBySpace(const char* input_ptr, size_t len,
size_t max_input, size_t max_tokens); size_t max_input, size_t max_tokens);
......
...@@ -198,7 +198,7 @@ cc_library( ...@@ -198,7 +198,7 @@ cc_library(
), ),
includes = ["tensorflow_includes"], includes = ["tensorflow_includes"],
deps = [ deps = [
"@eigen_archive//:eigen", "@eigen_archive//:eigen3",
"@protobuf_archive//:includes", "@protobuf_archive//:includes",
"@zlib_includes//:includes", "@zlib_includes//:includes",
"@snappy_includes//:includes", "@snappy_includes//:includes",
......
...@@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. limitations under the License.
==============================================================================*/ ==============================================================================*/
#include "tf_ops/projection_normalizer_util.h" // seq_flow_lite
#include "tf_ops/projection_tokenizer_util.h" // seq_flow_lite
#include "tf_ops/projection_util.h" // seq_flow_lite
#include "tf_ops/text_distorter.h" // seq_flow_lite
#include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_map.h"
#include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/shape_inference.h" #include "tensorflow/core/framework/shape_inference.h"
#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_shape.h"
#include "tf_ops/projection_normalizer_util.h" // seq_flow_lite
#include "tf_ops/projection_tokenizer_util.h" // seq_flow_lite
#include "tf_ops/projection_util.h" // seq_flow_lite
#include "tf_ops/text_distorter.h" // seq_flow_lite
using ::tensorflow::int32; using ::tensorflow::int32;
using ::tensorflow::int64; using ::tensorflow::int64;
...@@ -51,10 +51,11 @@ float* AllocateTensor(OpKernelContext* ctx, const std::string& tensor_name, ...@@ -51,10 +51,11 @@ float* AllocateTensor(OpKernelContext* ctx, const std::string& tensor_name,
return &tensor->flat<float>()(0); return &tensor->flat<float>()(0);
} }
// OpKernel for the sequence string projection op.
class SequenceStringProjectionOp : public OpKernel { class SequenceStringProjectionOp : public OpKernel {
public: public:
explicit SequenceStringProjectionOp(OpKernelConstruction* context) explicit SequenceStringProjectionOp(OpKernelConstruction* context)
: OpKernel(context) { : OpKernel(context), philox_(171), generator_(&philox_) {
OP_REQUIRES_OK(context, context->GetAttr("feature_size", &feature_size_)); OP_REQUIRES_OK(context, context->GetAttr("feature_size", &feature_size_));
std::string hashtype; std::string hashtype;
OP_REQUIRES_OK(context, context->GetAttr("hashtype", &hashtype)); OP_REQUIRES_OK(context, context->GetAttr("hashtype", &hashtype));
...@@ -159,7 +160,10 @@ class SequenceStringProjectionOp : public OpKernel { ...@@ -159,7 +160,10 @@ class SequenceStringProjectionOp : public OpKernel {
} }
const int64 seq_len = const int64 seq_len =
static_cast<int64>(bos_tag_ + words.size() + eos_tag_); static_cast<int64>(bos_tag_ + words.size() + eos_tag_);
CHECK_GT(seq_len, 0); CHECK_GT(seq_len, 0)
<< "Projection models expect input text to have at-least one valid "
"token. If empty text is a valid input for your model, please set "
"add_bos_tag to true.";
max_seq_len = std::max(max_seq_len, seq_len); max_seq_len = std::max(max_seq_len, seq_len);
words_batches.emplace_back(std::move(words)); words_batches.emplace_back(std::move(words));
} }
...@@ -208,7 +212,7 @@ class SequenceStringProjectionOp : public OpKernel { ...@@ -208,7 +212,7 @@ class SequenceStringProjectionOp : public OpKernel {
CHECK_EQ(eos_tag_, 1); CHECK_EQ(eos_tag_, 1);
word = kEndTokenTSP; word = kEndTokenTSP;
} }
hasher_->GetHashCodes(word, &hash_codes); hasher_->GetHashCodes(word, hash_codes);
for (int hindex = 0, k = 0; hindex < hash_codes.size(); hindex++) { for (int hindex = 0, k = 0; hindex < hash_codes.size(); hindex++) {
auto hash = hash_codes[hindex]; auto hash = hash_codes[hindex];
for (int kmax = std::min(k + increment, feature_size_); k < kmax;) { for (int kmax = std::min(k + increment, feature_size_); k < kmax;) {
...@@ -229,7 +233,7 @@ class SequenceStringProjectionOp : public OpKernel { ...@@ -229,7 +233,7 @@ class SequenceStringProjectionOp : public OpKernel {
doc_size_feature; doc_size_feature;
} }
if (add_first_cap_feature_ > 0.0f) { if (add_first_cap_feature_ > 0.0f) {
if (text_distorter_->BernouilleSample(add_first_cap_feature_)) { if (generator_.RandFloat() <= add_first_cap_feature_) {
projection[offset0 + feature_size_ - kFirstCapOffset] = projection[offset0 + feature_size_ - kFirstCapOffset] =
first_cap ? 1.0 : -1.0; first_cap ? 1.0 : -1.0;
} else { } else {
...@@ -237,7 +241,7 @@ class SequenceStringProjectionOp : public OpKernel { ...@@ -237,7 +241,7 @@ class SequenceStringProjectionOp : public OpKernel {
} }
} }
if (add_all_caps_feature_ > 0.0f) { if (add_all_caps_feature_ > 0.0f) {
if (text_distorter_->BernouilleSample(add_all_caps_feature_)) { if (generator_.RandFloat() <= add_all_caps_feature_) {
projection[offset0 + feature_size_ - kAllCapsOffset] = projection[offset0 + feature_size_ - kAllCapsOffset] =
all_caps ? 1.0 : -1.0; all_caps ? 1.0 : -1.0;
} else { } else {
...@@ -252,21 +256,49 @@ class SequenceStringProjectionOp : public OpKernel { ...@@ -252,21 +256,49 @@ class SequenceStringProjectionOp : public OpKernel {
} }
private: private:
// Objects used for random number generator.
tensorflow::random::PhiloxRandom philox_;
tensorflow::random::SimplePhilox generator_;
// Dimensionality of the ternary vector for each token in the text.
int32 feature_size_; int32 feature_size_;
// An object used to hash tokens in the text.
std::unique_ptr<Hasher> hasher_; std::unique_ptr<Hasher> hasher_;
// An object used for distorting text before projection.
std::unique_ptr<TextDistorter> text_distorter_; std::unique_ptr<TextDistorter> text_distorter_;
// An object used for manipulating unicode in the text. It performs tasks such
// as retaining only whitelisted unicodes in the text tokens and lowercasing
// them.
std::unique_ptr<ProjectionUnicodeHandler> unicode_handler_; std::unique_ptr<ProjectionUnicodeHandler> unicode_handler_;
// An object used for normalizing tokens in the text. This performs tasks
// such as identifying repeated characters and replacing them with a single
// instance.
std::unique_ptr<ProjectionNormalizer> projection_normalizer_; std::unique_ptr<ProjectionNormalizer> projection_normalizer_;
// Character whitelist used by the projection operator.
std::string vocabulary_; std::string vocabulary_;
// Size of the character whitelist.
int vocabulary_size_; int vocabulary_size_;
// Maximum number of splits allowed in the text. The number of tokens in text
// post segmentation will be at most max_splits_ + 1.
int32 max_splits_; int32 max_splits_;
// A flag that indicates how to segment text. When true text is segmented by
// space. Otherwise it is segmented on unicode boundaries.
bool split_on_space_; bool split_on_space_;
// When true include an end of sentence token in the projection.
int eos_tag_; int eos_tag_;
// When true include a begin of sentence token in the projection.
int bos_tag_; int bos_tag_;
// Number of bits used to capture word novelty. See tensorflow op
// documentation below for details.
int word_novelty_bits_; int word_novelty_bits_;
// Number of levels used to capture document size. See tensorflow op
// documentation below for details.
int doc_size_levels_; int doc_size_levels_;
// Distance between levels used for word novelty.
float word_novelty_offset_; float word_novelty_offset_;
// Adds boolean feature to indicate first_cap text with the below probability.
float add_first_cap_feature_; float add_first_cap_feature_;
// Adds boolean feature to indicate all_cap text with the below probability.
float add_all_caps_feature_; float add_all_caps_feature_;
}; };
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment