Commit b2c3fee4 authored by A. Unique TensorFlower

Comment/message updates: b/181866850 tracks next steps after b/149576200.

PiperOrigin-RevId: 360932205
parent 00633c22
@@ -330,8 +330,7 @@ class SentencepieceTokenizer(tf.keras.layers.Layer):
     if bool(model_file_path) == bool(model_serialized_proto):
       raise ValueError("Exact one of `model_file_path` and "
                        "`model_serialized_proto` can be specified.")
-    # TODO(chendouble): After b/149576200 is resolved, support
-    # tokenize_with_offsets when strip_diacritics is True,
+    # TODO(b/181866850): Support tokenize_with_offsets for strip_diacritics=True
     if tokenize_with_offsets and strip_diacritics:
       raise ValueError("`tokenize_with_offsets` is not supported when "
                        "`strip_diacritics` is set to True.")
@@ -378,8 +377,8 @@ class SentencepieceTokenizer(tf.keras.layers.Layer):
     """
     if self._strip_diacritics:
       if self.tokenize_with_offsets:
-        raise ValueError("`tokenize_with_offsets` is not supported yet due to "
-                         "b/149576200, when `strip_diacritics` is set to True.")
+        raise ValueError("`tokenize_with_offsets` is not supported yet when "
+                         "`strip_diacritics` is set to True (b/181866850).")
       inputs = text.normalize_utf8(inputs, "NFD")
       inputs = tf.strings.regex_replace(inputs, r"\p{Mn}", "")
...
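The last two context lines of the hunk above show how diacritics are stripped when strip_diacritics is True: NFD-normalize, then delete nonspacing marks. A self-contained sketch of just those two operations, using tensorflow_text; the example strings are made up.

    import tensorflow as tf
    import tensorflow_text as text

    inputs = tf.constant(["café", "naïve"])
    inputs = text.normalize_utf8(inputs, "NFD")                # decompose accented characters
    inputs = tf.strings.regex_replace(inputs, r"\p{Mn}", "")   # drop marks -> ["cafe", "naive"]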
@@ -145,7 +145,7 @@ flags.DEFINE_integer(
     "sequence length for the bert_pack_inputs subobject."
     "Needed for --export_type preprocessing.")
 flags.DEFINE_bool(
-    "tokenize_with_offsets", False,  # Broken by b/149576200.
+    "tokenize_with_offsets", False,  # TODO(b/181866850)
     "Whether to export a .tokenize_with_offsets subobject for "
     "--export_type preprocessing.")
 flags.DEFINE_multi_string(
...
@@ -564,7 +564,7 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
   def test_exported_callables(self, use_sp_model):
     preprocess = tf.saved_model.load(self._do_export(
         ["d", "ef", "abc", "xy"], do_lower_case=True,
-        tokenize_with_offsets=not use_sp_model,  # TODO(b/149576200): drop this.
+        tokenize_with_offsets=not use_sp_model,  # TODO(b/181866850): drop this.
         experimental_disable_assert=True,  # TODO(b/175369555): drop this.
         use_sp_model=use_sp_model))
@@ -590,7 +590,7 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
     # .tokenize_with_offsets()
     if use_sp_model:
-      # TODO(b/149576200): Enable tokenize_with_offsets when it works and test.
+      # TODO(b/181866850): Enable tokenize_with_offsets when it works and test.
       self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))
     else:
       token_ids, start_offsets, limit_offsets = (
@@ -691,7 +691,7 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
   def test_shapes(self, use_sp_model):
     preprocess = tf.saved_model.load(self._do_export(
         ["abc", "def"], do_lower_case=True,
-        tokenize_with_offsets=not use_sp_model,  # TODO(b/149576200): drop this.
+        tokenize_with_offsets=not use_sp_model,  # TODO(b/181866850): drop this.
         experimental_disable_assert=True,  # TODO(b/175369555): drop this.
         use_sp_model=use_sp_model))
@@ -711,7 +711,7 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
           tf.TensorSpec([batch_size], tf.string)),
          token_out_shape,
          "with batch_size=%s" % batch_size)
-    # TODO(b/149576200): Enable tokenize_with_offsets when it works and test.
+    # TODO(b/181866850): Enable tokenize_with_offsets when it works and test.
     if use_sp_model:
       self.assertFalse(hasattr(preprocess, "tokenize_with_offsets"))
     else:
@@ -771,7 +771,7 @@ class ExportPreprocessingTest(tf.test.TestCase, parameterized.TestCase):
         "quick", "fox", "lazy", "dog"]
     preprocess = tf.saved_model.load(self._do_export(
         non_special_tokens, do_lower_case=True,
-        tokenize_with_offsets=use_bert,  # TODO(b/149576200): drop this.
+        tokenize_with_offsets=use_bert,  # TODO(b/181866850): drop this.
         experimental_disable_assert=True,  # TODO(b/175369555): drop this.
         add_mask_token=True, use_sp_model=not use_bert))
     vocab_size = len(non_special_tokens) + (5 if use_bert else 7)
...
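For readers skimming the test hunks above: when the preprocessing SavedModel is exported with tokenize_with_offsets=True (currently only the BERT/WordPiece path, per the TODOs), the tests call the tokenize_with_offsets subobject and unpack three outputs. A minimal sketch of the same call outside the test, assuming an export at a hypothetical path:

    import tensorflow as tf

    preprocess = tf.saved_model.load("/tmp/preprocessing_export")  # hypothetical export path
    token_ids, start_offsets, limit_offsets = preprocess.tokenize_with_offsets(
        tf.constant(["abc def", "xy"]))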