Commit b2c3fee4 authored by A. Unique TensorFlower

Comment/message updates: b/181866850 tracks next steps after b/149576200.

PiperOrigin-RevId: 360932205
parent 00633c22
@@ -330,8 +330,7 @@ class SentencepieceTokenizer(tf.keras.layers.Layer):
     if bool(model_file_path) == bool(model_serialized_proto):
       raise ValueError("Exact one of `model_file_path` and "
                        "`model_serialized_proto` can be specified.")
-    # TODO(chendouble): After b/149576200 is resolved, support
-    # tokenize_with_offsets when strip_diacritics is True,
+    # TODO(b/181866850): Support tokenize_with_offsets for strip_diacritics=True
     if tokenize_with_offsets and strip_diacritics:
       raise ValueError("`tokenize_with_offsets` is not supported when "
                        "`strip_diacritics` is set to True.")
@@ -378,8 +377,8 @@ class SentencepieceTokenizer(tf.keras.layers.Layer):
     """
     if self._strip_diacritics:
       if self.tokenize_with_offsets:
-        raise ValueError("`tokenize_with_offsets` is not supported yet due to "
-                         "b/149576200, when `strip_diacritics` is set to True.")
+        raise ValueError("`tokenize_with_offsets` is not supported yet when "
+                         "`strip_diacritics` is set to True (b/181866850).")
       inputs = text.normalize_utf8(inputs, "NFD")
       inputs = tf.strings.regex_replace(inputs, r"\p{Mn}", "")
...
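The last two context lines of the hunk above show how diacritics are stripped when strip_diacritics is True: NFD-normalize, then delete nonspacing marks. A self-contained sketch of just those two operations, using tensorflow_text; the example strings are made up.

    import tensorflow as tf
    import tensorflow_text as text

    inputs = tf.constant(["café", "naïve"])
    inputs = text.normalize_utf8(inputs, "NFD")                # decompose accented characters
    inputs = tf.strings.regex_replace(inputs, r"\p{Mn}", "")   # drop marks -> ["cafe", "naive"]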
@@ -145,7 +145,7 @@ flags.DEFINE_integer(
     "sequence length for the bert_pack_inputs subobject."
     "Needed for --export_type preprocessing.")
 flags.DEFINE_bool(
-    "tokenize_with_offsets", False,  # Broken by b/149576200.
+    "tokenize_with_offsets", False,  # TODO(b/181866850)
     "Whether to export a .tokenize_with_offsets subobject for "
     "--export_type preprocessing.")
 flags.DEFINE_multi_string(
...
@@ -564,7 +564,7 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
   def test_exported_callables(self, use_sp_model):
     preprocess = tf.saved_model.load(self._do_export(
         ["d", "ef", "abc", "xy"], do_lower_case=True,
-        tokenize_with_offsets=not use_sp_model,  # TODO(b/149576200): drop this.
+        tokenize_with_offsets=not use_sp_model,  # TODO(b/181866850): drop this.
         experimental_disable_assert=True,  # TODO(b/175369555): drop this.
         use_sp_model=use_sp_model))
@@ -590,7 +590,7 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
     # .tokenize_with_offsets()
     if use_sp_model:
-      # TODO(b/149576200): Enable tokenize_with_offsets when it works and test.
+      # TODO(b/181866850): Enable tokenize_with_offsets when it works and test.
       self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))
     else:
       token_ids, start_offsets, limit_offsets = (
@@ -691,7 +691,7 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
   def test_shapes(self, use_sp_model):
     preprocess = tf.saved_model.load(self._do_export(
         ["abc", "def"], do_lower_case=True,
-        tokenize_with_offsets=not use_sp_model,  # TODO(b/149576200): drop this.
+        tokenize_with_offsets=not use_sp_model,  # TODO(b/181866850): drop this.
         experimental_disable_assert=True,  # TODO(b/175369555): drop this.
         use_sp_model=use_sp_model))
@@ -711,7 +711,7 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
           tf.TensorSpec([batch_size], tf.string)),
          token_out_shape,
          "with batch_size=%s" % batch_size)
-    # TODO(b/149576200): Enable tokenize_with_offsets when it works and test.
+    # TODO(b/181866850): Enable tokenize_with_offsets when it works and test.
     if use_sp_model:
       self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))
     else:
@@ -771,7 +771,7 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
         "quick", "fox", "lazy", "dog"]
     preprocess = tf.saved_model.load(self._do_export(
         non_special_tokens, do_lower_case=True,
-        tokenize_with_offsets=use_bert,  # TODO(b/149576200): drop this.
+        tokenize_with_offsets=use_bert,  # TODO(b/181866850): drop this.
         experimental_disable_assert=True,  # TODO(b/175369555): drop this.
         add_mask_token=True, use_sp_model=not use_bert))
     vocab_size = len(non_special_tokens) + (5 if use_bert else 7)
...
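For readers skimming the test hunks above: when the preprocessing SavedModel is exported with tokenize_with_offsets=True (currently only the BERT/WordPiece path, per the TODOs), the tests call the tokenize_with_offsets subobject and unpack three outputs. A minimal sketch of the same call outside the test, assuming an export at a hypothetical path:

    import tensorflow as tf

    preprocess = tf.saved_model.load("/tmp/preprocessing_export")  # hypothetical export path
    token_ids, start_offsets, limit_offsets = preprocess.tokenize_with_offsets(
        tf.constant(["abc def", "xy"]))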