Unverified Commit c127d527 authored by Srihari Humbarwadi, committed by GitHub

Merge branch 'panoptic-segmentation' into panoptic-deeplab-modeling

parents 78657911 457bcb85
......@@ -44,6 +44,7 @@ DEEPMAC_PROTO_TEXT = """
box_consistency_loss_normalize: NORMALIZE_AUTO
color_consistency_warmup_steps: 20
color_consistency_warmup_start: 10
use_only_last_stage: false
"""
......@@ -117,10 +118,11 @@ def build_meta_arch(**override_params):
mask_size=16,
postprocess_crop_size=128,
max_roi_jitter_ratio=0.0,
roi_jitter_mode='random',
roi_jitter_mode='default',
color_consistency_dilation=2,
color_consistency_warmup_steps=0,
color_consistency_warmup_start=0)
color_consistency_warmup_start=0,
use_only_last_stage=True)
params.update(override_params)
......@@ -185,6 +187,7 @@ class DeepMACUtilsTest(tf.test.TestCase, parameterized.TestCase):
self.assertIsInstance(params, deepmac_meta_arch.DeepMACParams)
self.assertEqual(params.dim, 153)
self.assertEqual(params.box_consistency_loss_normalize, 'normalize_auto')
self.assertFalse(params.use_only_last_stage)
def test_subsample_trivial(self):
"""Test subsampling masks."""
......@@ -201,32 +204,71 @@ class DeepMACUtilsTest(tf.test.TestCase, parameterized.TestCase):
self.assertAllClose(result[2], boxes)
self.assertAllClose(result[3], masks)
def test_filter_masked_classes(self):
classes = np.zeros((2, 3, 5), dtype=np.float32)
classes[0, 0] = [1.0, 0.0, 0.0, 0.0, 0.0]
classes[0, 1] = [0.0, 1.0, 0.0, 0.0, 0.0]
classes[0, 2] = [0.0, 0.0, 1.0, 0.0, 0.0]
classes[1, 0] = [0.0, 0.0, 0.0, 1.0, 0.0]
classes[1, 1] = [0.0, 0.0, 0.0, 0.0, 1.0]
classes[1, 2] = [0.0, 0.0, 0.0, 0.0, 1.0]
classes = tf.constant(classes)
weights = tf.constant([[1.0, 1.0, 1.0], [1.0, 1.0, 0.0]])
masks = tf.ones((2, 3, 32, 32), dtype=tf.float32)
classes, weights, masks = deepmac_meta_arch.filter_masked_classes(
[3, 4], classes, weights, masks)
expected_classes = np.zeros((2, 3, 5))
expected_classes[0, 0] = [0.0, 0.0, 0.0, 0.0, 0.0]
expected_classes[0, 1] = [0.0, 0.0, 0.0, 0.0, 0.0]
expected_classes[0, 2] = [0.0, 0.0, 1.0, 0.0, 0.0]
expected_classes[1, 0] = [0.0, 0.0, 0.0, 1.0, 0.0]
expected_classes[1, 1] = [0.0, 0.0, 0.0, 0.0, 0.0]
expected_classes[1, 2] = [0.0, 0.0, 0.0, 0.0, 0.0]
self.assertAllClose(expected_classes, classes.numpy())
self.assertAllClose(np.array(([0.0, 0.0, 1.0], [1.0, 0.0, 0.0])), weights)
self.assertAllClose(masks[0, 0], np.zeros((32, 32)))
self.assertAllClose(masks[0, 1], np.zeros((32, 32)))
self.assertAllClose(masks[0, 2], np.ones((32, 32)))
self.assertAllClose(masks[1, 0], np.ones((32, 32)))
self.assertAllClose(masks[1, 1], np.zeros((32, 32)))
def test_fill_boxes(self):
boxes = tf.constant([[0., 0., 0.5, 0.5], [0.5, 0.5, 1.0, 1.0]])
boxes = tf.constant([[[0., 0., 0.5, 0.5], [0.5, 0.5, 1.0, 1.0]],
[[0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 0.0, 0.0]]])
filled_boxes = deepmac_meta_arch.fill_boxes(boxes, 32, 32)
expected = np.zeros((2, 32, 32))
expected[0, :17, :17] = 1.0
expected[1, 16:, 16:] = 1.0
expected = np.zeros((2, 2, 32, 32))
expected[0, 0, :17, :17] = 1.0
expected[0, 1, 16:, 16:] = 1.0
expected[1, 0, :, :] = 1.0
filled_boxes = filled_boxes.numpy()
self.assertAllClose(expected[0, 0], filled_boxes[0, 0], rtol=1e-3)
self.assertAllClose(expected[0, 1], filled_boxes[0, 1], rtol=1e-3)
self.assertAllClose(expected[1, 0], filled_boxes[1, 0], rtol=1e-3)
self.assertAllClose(expected, filled_boxes.numpy(), rtol=1e-3)
def test_flatten_and_unpack(self):
t = tf.random.uniform((2, 3, 4, 5, 6))
flatten = tf.function(deepmac_meta_arch.flatten_first2_dims)
unpack = tf.function(deepmac_meta_arch.unpack_first2_dims)
result, d1, d2 = flatten(t)
result = unpack(result, d1, d2)
self.assertAllClose(result.numpy(), t)
def test_crop_and_resize_instance_masks(self):
boxes = tf.zeros((5, 4))
masks = tf.zeros((5, 128, 128))
boxes = tf.zeros((8, 5, 4))
masks = tf.zeros((8, 5, 128, 128))
output = deepmac_meta_arch.crop_and_resize_instance_masks(
masks, boxes, 32)
self.assertEqual(output.shape, (5, 32, 32))
def test_crop_and_resize_feature_map(self):
boxes = tf.zeros((5, 4))
features = tf.zeros((128, 128, 7))
output = deepmac_meta_arch.crop_and_resize_feature_map(
features, boxes, 32)
self.assertEqual(output.shape, (5, 32, 32, 7))
self.assertEqual(output.shape, (8, 5, 32, 32))
def test_embedding_projection_prob_shape(self):
dist = deepmac_meta_arch.embedding_projection(
......@@ -262,73 +304,75 @@ class DeepMACUtilsTest(tf.test.TestCase, parameterized.TestCase):
def test_generate_2d_neighbors_shape(self):
inp = tf.zeros((13, 14, 3))
inp = tf.zeros((5, 13, 14, 3))
out = deepmac_meta_arch.generate_2d_neighbors(inp)
self.assertEqual((8, 13, 14, 3), out.shape)
self.assertEqual((8, 5, 13, 14, 3), out.shape)
def test_generate_2d_neighbors(self):
inp = np.arange(16).reshape(4, 4).astype(np.float32)
inp = tf.stack([inp, inp * 2], axis=2)
inp = tf.reshape(inp, (1, 4, 4, 2))
out = deepmac_meta_arch.generate_2d_neighbors(inp, dilation=1)
self.assertEqual((8, 4, 4, 2), out.shape)
self.assertEqual((8, 1, 4, 4, 2), out.shape)
for i in range(2):
expected = np.array([0, 1, 2, 4, 6, 8, 9, 10]) * (i + 1)
self.assertAllEqual(out[:, 1, 1, i], expected)
self.assertAllEqual(out[:, 0, 1, 1, i], expected)
expected = np.array([1, 2, 3, 5, 7, 9, 10, 11]) * (i + 1)
self.assertAllEqual(out[:, 1, 2, i], expected)
self.assertAllEqual(out[:, 0, 1, 2, i], expected)
expected = np.array([4, 5, 6, 8, 10, 12, 13, 14]) * (i + 1)
self.assertAllEqual(out[:, 2, 1, i], expected)
self.assertAllEqual(out[:, 0, 2, 1, i], expected)
expected = np.array([5, 6, 7, 9, 11, 13, 14, 15]) * (i + 1)
self.assertAllEqual(out[:, 2, 2, i], expected)
self.assertAllEqual(out[:, 0, 2, 2, i], expected)
def test_generate_2d_neighbors_dilation2(self):
inp = np.arange(16).reshape(4, 4, 1).astype(np.float32)
inp = np.arange(16).reshape(1, 4, 4, 1).astype(np.float32)
out = deepmac_meta_arch.generate_2d_neighbors(inp, dilation=2)
self.assertEqual((8, 4, 4, 1), out.shape)
self.assertEqual((8, 1, 4, 4, 1), out.shape)
expected = np.array([0, 0, 0, 0, 2, 0, 8, 10])
self.assertAllEqual(out[:, 0, 0, 0], expected)
self.assertAllEqual(out[:, 0, 0, 0, 0], expected)
def test_dilated_similarity_shape(self):
fmap = tf.zeros((32, 32, 9))
fmap = tf.zeros((5, 32, 32, 9))
similarity = deepmac_meta_arch.dilated_cross_pixel_similarity(
fmap)
self.assertEqual((8, 32, 32), similarity.shape)
self.assertEqual((8, 5, 32, 32), similarity.shape)
def test_dilated_similarity(self):
fmap = np.zeros((5, 5, 2), dtype=np.float32)
fmap = np.zeros((1, 5, 5, 2), dtype=np.float32)
fmap[0, 0, :] = 1.0
fmap[4, 4, :] = 1.0
fmap[0, 0, 0, :] = 1.0
fmap[0, 4, 4, :] = 1.0
similarity = deepmac_meta_arch.dilated_cross_pixel_similarity(
fmap, theta=1.0, dilation=2)
self.assertAlmostEqual(similarity.numpy()[0, 2, 2],
self.assertAlmostEqual(similarity.numpy()[0, 0, 2, 2],
np.exp(-np.sqrt(2)))
def test_dilated_same_instance_mask_shape(self):
instances = tf.zeros((5, 32, 32))
instances = tf.zeros((2, 5, 32, 32))
output = deepmac_meta_arch.dilated_cross_same_mask_label(instances)
self.assertEqual((8, 5, 32, 32), output.shape)
self.assertEqual((8, 2, 5, 32, 32), output.shape)
def test_dilated_same_instance_mask(self):
instances = np.zeros((3, 2, 5, 5), dtype=np.float32)
instances[0, 0, 0, 0] = 1.0
instances[0, 0, 2, 2] = 1.0
instances[0, 0, 4, 4] = 1.0
instances[2, 0, 0, 0] = 1.0
instances[2, 0, 2, 2] = 1.0
instances[2, 0, 4, 4] = 0.0
instances = np.zeros((2, 5, 5), dtype=np.float32)
instances[0, 0, 0] = 1.0
instances[0, 2, 2] = 1.0
instances[0, 4, 4] = 1.0
output = deepmac_meta_arch.dilated_cross_same_mask_label(instances).numpy()
self.assertAllClose(np.ones((8, 5, 5)), output[:, 1, :, :])
self.assertAllClose([1, 0, 0, 0, 0, 0, 0, 1], output[:, 0, 2, 2])
self.assertAllClose(np.ones((8, 2, 5, 5)), output[:, 1, :, :])
self.assertAllClose([1, 0, 0, 0, 0, 0, 0, 1], output[:, 0, 0, 2, 2])
self.assertAllClose([1, 0, 0, 0, 0, 0, 0, 0], output[:, 2, 0, 2, 2])
def test_per_pixel_single_conv_multiple_instance(self):
......@@ -550,151 +594,184 @@ class DeepMACMaskHeadTest(tf.test.TestCase, parameterized.TestCase):
@unittest.skipIf(tf_version.is_tf1(), 'Skipping TF2.X only test.')
class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
# TODO(vighneshb): Add batch_size > 1 tests for loss functions.
def setUp(self): # pylint:disable=g-missing-super-call
self.model = build_meta_arch()
def test_get_mask_head_input(self):
boxes = tf.constant([[0., 0., 0.25, 0.25], [0.75, 0.75, 1.0, 1.0]],
boxes = tf.constant([[[0., 0., 0.25, 0.25], [0.75, 0.75, 1.0, 1.0]],
[[0., 0., 0.25, 0.25], [0.75, 0.75, 1.0, 1.0]]],
dtype=tf.float32)
pixel_embedding = np.zeros((32, 32, 4), dtype=np.float32)
pixel_embedding[:16, :16] = 1.0
pixel_embedding[16:, 16:] = 2.0
pixel_embedding = np.zeros((2, 32, 32, 4), dtype=np.float32)
pixel_embedding[0, :16, :16] = 1.0
pixel_embedding[0, 16:, 16:] = 2.0
pixel_embedding[1, :16, :16] = 3.0
pixel_embedding[1, 16:, 16:] = 4.0
pixel_embedding = tf.constant(pixel_embedding)
mask_inputs = self.model._get_mask_head_input(boxes, pixel_embedding)
self.assertEqual(mask_inputs.shape, (2, 16, 16, 6))
self.assertEqual(mask_inputs.shape, (2, 2, 16, 16, 6))
y_grid, x_grid = tf.meshgrid(np.linspace(-1.0, 1.0, 16),
np.linspace(-1.0, 1.0, 16), indexing='ij')
for i in range(2):
mask_input = mask_inputs[i]
self.assertAllClose(y_grid, mask_input[:, :, 0])
self.assertAllClose(x_grid, mask_input[:, :, 1])
pixel_embedding = mask_input[:, :, 2:]
self.assertAllClose(np.zeros((16, 16, 4)) + i + 1, pixel_embedding)
for i, j in ([0, 0], [0, 1], [1, 0], [1, 1]):
self.assertAllClose(y_grid, mask_inputs[i, j, :, :, 0])
self.assertAllClose(x_grid, mask_inputs[i, j, :, :, 1])
zeros = np.zeros((16, 16, 4))
self.assertAllClose(zeros + 1, mask_inputs[0, 0, :, :, 2:])
self.assertAllClose(zeros + 2, mask_inputs[0, 1, :, :, 2:])
self.assertAllClose(zeros + 3, mask_inputs[1, 0, :, :, 2:])
self.assertAllClose(zeros + 4, mask_inputs[1, 1, :, :, 2:])
def test_get_mask_head_input_no_crop_resize(self):
model = build_meta_arch(predict_full_resolution_masks=True)
boxes = tf.constant([[0., 0., 1.0, 1.0], [0.0, 0.0, 0.5, 1.0]],
dtype=tf.float32)
boxes = tf.constant([[[0., 0., 1.0, 1.0], [0.0, 0.0, 0.5, 1.0]],
[[0.5, 0.5, 1.0, 1.0], [0.0, 0.0, 0.0, 0.0]]])
pixel_embedding_np = np.random.randn(32, 32, 4).astype(np.float32)
pixel_embedding_np = np.random.randn(2, 32, 32, 4).astype(np.float32)
pixel_embedding = tf.constant(pixel_embedding_np)
mask_inputs = model._get_mask_head_input(boxes, pixel_embedding)
self.assertEqual(mask_inputs.shape, (2, 32, 32, 6))
self.assertEqual(mask_inputs.shape, (2, 2, 32, 32, 6))
y_grid, x_grid = tf.meshgrid(np.linspace(.0, 1.0, 32),
np.linspace(.0, 1.0, 32), indexing='ij')
ys = [0.5, 0.25]
xs = [0.5, 0.5]
for i in range(2):
mask_input = mask_inputs[i]
self.assertAllClose(y_grid - ys[i], mask_input[:, :, 0])
self.assertAllClose(x_grid - xs[i], mask_input[:, :, 1])
pixel_embedding = mask_input[:, :, 2:]
self.assertAllClose(pixel_embedding_np, pixel_embedding)
self.assertAllClose(y_grid - 0.5, mask_inputs[0, 0, :, :, 0])
self.assertAllClose(x_grid - 0.5, mask_inputs[0, 0, :, :, 1])
self.assertAllClose(y_grid - 0.25, mask_inputs[0, 1, :, :, 0])
self.assertAllClose(x_grid - 0.5, mask_inputs[0, 1, :, :, 1])
self.assertAllClose(y_grid - 0.75, mask_inputs[1, 0, :, :, 0])
self.assertAllClose(x_grid - 0.75, mask_inputs[1, 0, :, :, 1])
self.assertAllClose(y_grid, mask_inputs[1, 1, :, :, 0])
self.assertAllClose(x_grid, mask_inputs[1, 1, :, :, 1])
def test_get_instance_embeddings(self):
embeddings = np.zeros((32, 32, 2))
embeddings[8, 8] = 1.0
embeddings[24, 16] = 2.0
embeddings = np.zeros((2, 32, 32, 2))
embeddings[0, 8, 8] = 1.0
embeddings[0, 24, 16] = 2.0
embeddings[1, 8, 16] = 3.0
embeddings = tf.constant(embeddings)
boxes = tf.constant([[0., 0., 0.5, 0.5], [0.5, 0.0, 1.0, 1.0]])
boxes = np.zeros((2, 2, 4), dtype=np.float32)
boxes[0, 0] = [0.0, 0.0, 0.5, 0.5]
boxes[0, 1] = [0.5, 0.0, 1.0, 1.0]
boxes[1, 0] = [0.0, 0.0, 0.5, 1.0]
boxes = tf.constant(boxes)
center_embeddings = self.model._get_instance_embeddings(boxes, embeddings)
self.assertAllClose(center_embeddings, [[1.0, 1.0], [2.0, 2.0]])
self.assertAllClose(center_embeddings[0, 0], [1.0, 1.0])
self.assertAllClose(center_embeddings[0, 1], [2.0, 2.0])
self.assertAllClose(center_embeddings[1, 0], [3.0, 3.0])
def test_get_groundtruth_mask_output(self):
boxes = tf.constant([[0., 0., 0.25, 0.25], [0.75, 0.75, 1.0, 1.0]],
dtype=tf.float32)
masks = np.zeros((2, 32, 32), dtype=np.float32)
masks[0, :16, :16] = 0.5
masks[1, 16:, 16:] = 0.1
boxes = np.zeros((2, 2, 4))
masks = np.zeros((2, 2, 32, 32))
boxes[0, 0] = [0.0, 0.0, 0.25, 0.25]
boxes[0, 1] = [0.75, 0.75, 1.0, 1.0]
boxes[1, 0] = [0.0, 0.0, 0.5, 1.0]
masks = np.zeros((2, 2, 32, 32), dtype=np.float32)
masks[0, 0, :16, :16] = 0.5
masks[0, 1, 16:, 16:] = 0.1
masks[1, 0, :17, :] = 0.3
masks = self.model._get_groundtruth_mask_output(boxes, masks)
self.assertEqual(masks.shape, (2, 16, 16))
self.assertEqual(masks.shape, (2, 2, 16, 16))
self.assertAllClose(masks[0], np.zeros((16, 16)) + 0.5)
self.assertAllClose(masks[1], np.zeros((16, 16)) + 0.1)
self.assertAllClose(masks[0, 0], np.zeros((16, 16)) + 0.5)
self.assertAllClose(masks[0, 1], np.zeros((16, 16)) + 0.1)
self.assertAllClose(masks[1, 0], np.zeros((16, 16)) + 0.3)
def test_get_groundtruth_mask_output_crop_resize(self):
def test_get_groundtruth_mask_output_no_crop_resize(self):
model = build_meta_arch(predict_full_resolution_masks=True)
boxes = tf.constant([[0., 0., 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]],
dtype=tf.float32)
masks = tf.ones((2, 32, 32))
boxes = tf.zeros((2, 5, 4))
masks = tf.ones((2, 5, 32, 32))
masks = model._get_groundtruth_mask_output(boxes, masks)
self.assertAllClose(masks, np.ones((2, 32, 32)))
self.assertAllClose(masks, np.ones((2, 5, 32, 32)))
def test_per_instance_loss(self):
def test_predict(self):
model = build_meta_arch()
model._mask_net = MockMaskNet()
boxes = tf.constant([[0.0, 0.0, 0.25, 0.25], [0.75, 0.75, 1.0, 1.0]])
masks = np.zeros((2, 32, 32), dtype=np.float32)
masks[0, :16, :16] = 1.0
masks[1, 16:, 16:] = 1.0
masks = tf.constant(masks)
tf.keras.backend.set_learning_phase(True)
self.model.provide_groundtruth(
groundtruth_boxes_list=[tf.convert_to_tensor([[0., 0., 1., 1.]] * 5)],
groundtruth_classes_list=[tf.one_hot([1, 0, 1, 1, 1], depth=6)],
groundtruth_weights_list=[tf.ones(5)],
groundtruth_masks_list=[tf.ones((5, 32, 32))])
prediction = self.model.predict(tf.zeros((1, 32, 32, 3)), None)
self.assertEqual(prediction['MASK_LOGITS_GT_BOXES'][0].shape,
(1, 5, 16, 16))
def test_loss(self):
loss_dict = model._compute_per_instance_deepmac_losses(
boxes, masks, tf.zeros((32, 32, 2)), tf.zeros((32, 32, 2)),
tf.zeros((16, 16, 3)))
model = build_meta_arch()
boxes = tf.constant([[[0.0, 0.0, 0.25, 0.25], [0.75, 0.75, 1.0, 1.0]]])
masks = np.zeros((1, 2, 32, 32), dtype=np.float32)
masks[0, 0, :16, :16] = 1.0
masks[0, 1, 16:, 16:] = 1.0
masks_pred = tf.fill((1, 2, 32, 32), 0.9)
loss_dict = model._compute_deepmac_losses(
boxes, masks_pred, masks, tf.zeros((1, 16, 16, 3)))
self.assertAllClose(
loss_dict[deepmac_meta_arch.DEEP_MASK_ESTIMATION],
np.zeros(2) - tf.math.log(tf.nn.sigmoid(0.9)))
np.zeros((1, 2)) - tf.math.log(tf.nn.sigmoid(0.9)))
def test_per_instance_loss_no_crop_resize(self):
def test_loss_no_crop_resize(self):
model = build_meta_arch(predict_full_resolution_masks=True)
model._mask_net = MockMaskNet()
boxes = tf.constant([[0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 1.0, 1.0]])
masks = np.ones((2, 128, 128), dtype=np.float32)
masks = tf.constant(masks)
boxes = tf.constant([[[0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 1.0, 1.0]]])
masks = tf.ones((1, 2, 128, 128), dtype=tf.float32)
masks_pred = tf.fill((1, 2, 32, 32), 0.9)
loss_dict = model._compute_per_instance_deepmac_losses(
boxes, masks, tf.zeros((32, 32, 2)), tf.zeros((32, 32, 2)),
tf.zeros((32, 32, 3)))
loss_dict = model._compute_deepmac_losses(
boxes, masks_pred, masks, tf.zeros((1, 32, 32, 3)))
self.assertAllClose(
loss_dict[deepmac_meta_arch.DEEP_MASK_ESTIMATION],
np.zeros(2) - tf.math.log(tf.nn.sigmoid(0.9)))
np.zeros((1, 2)) - tf.math.log(tf.nn.sigmoid(0.9)))
def test_per_instance_loss_no_crop_resize_dice(self):
def test_loss_no_crop_resize_dice(self):
model = build_meta_arch(predict_full_resolution_masks=True,
use_dice_loss=True)
model._mask_net = MockMaskNet()
boxes = tf.constant([[0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 1.0, 1.0]])
masks = np.ones((2, 128, 128), dtype=np.float32)
boxes = tf.constant([[[0.0, 0.0, 1.0, 1.0], [0.0, 0.0, 1.0, 1.0]]])
masks = np.ones((1, 2, 128, 128), dtype=np.float32)
masks = tf.constant(masks)
masks_pred = tf.fill((1, 2, 32, 32), 0.9)
loss_dict = model._compute_per_instance_deepmac_losses(
boxes, masks, tf.zeros((32, 32, 2)), tf.zeros((32, 32, 2)),
tf.zeros((32, 32, 3)))
loss_dict = model._compute_deepmac_losses(
boxes, masks_pred, masks, tf.zeros((1, 32, 32, 3)))
pred = tf.nn.sigmoid(0.9)
expected = (1.0 - ((2.0 * pred) / (1.0 + pred)))
self.assertAllClose(loss_dict[deepmac_meta_arch.DEEP_MASK_ESTIMATION],
[expected, expected], rtol=1e-3)
[[expected, expected]], rtol=1e-3)
def test_empty_masks(self):
boxes = tf.zeros([0, 4])
masks = tf.zeros([0, 128, 128])
loss_dict = self.model._compute_per_instance_deepmac_losses(
boxes, masks, tf.zeros((32, 32, 2)), tf.zeros((32, 32, 2)),
tf.zeros((16, 16, 3)))
boxes = tf.zeros([1, 0, 4])
masks = tf.zeros([1, 0, 128, 128])
loss_dict = self.model._compute_deepmac_losses(
boxes, masks, masks,
tf.zeros((1, 16, 16, 3)))
self.assertEqual(loss_dict[deepmac_meta_arch.DEEP_MASK_ESTIMATION].shape,
(0,))
(1, 0))
def test_postprocess(self):
model = build_meta_arch()
model._mask_net = MockMaskNet()
boxes = np.zeros((2, 3, 4), dtype=np.float32)
......@@ -708,7 +785,6 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
self.assertAllClose(masks, prob * np.ones((2, 3, 16, 16)))
def test_postprocess_emb_proj(self):
model = build_meta_arch(network_type='embedding_projection',
use_instance_embedding=False,
use_xy=False, pixel_embedding_dim=8,
......@@ -724,7 +800,6 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
self.assertEqual(masks.shape, (2, 3, 16, 16))
def test_postprocess_emb_proj_fullres(self):
model = build_meta_arch(network_type='embedding_projection',
predict_full_resolution_masks=True,
use_instance_embedding=False,
......@@ -751,17 +826,6 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
prob = tf.nn.sigmoid(0.9).numpy()
self.assertAllClose(masks, prob * np.ones((2, 3, 128, 128)))
def test_crop_masks_within_boxes(self):
masks = np.zeros((2, 32, 32))
masks[0, :16, :16] = 1.0
masks[1, 16:, 16:] = 1.0
boxes = tf.constant([[0.0, 0.0, 15.0 / 32, 15.0 / 32],
[0.5, 0.5, 1.0, 1]])
masks = deepmac_meta_arch.crop_masks_within_boxes(
masks, boxes, 128)
masks = (masks.numpy() > 0.0).astype(np.float32)
self.assertAlmostEqual(masks.sum(), 2 * 128 * 128)
def test_transform_boxes_to_feature_coordinates(self):
batch_size = 2
model = build_meta_arch()
......@@ -816,13 +880,13 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
def test_box_consistency_loss(self):
boxes_gt = tf.constant([[0., 0., 0.49, 1.0]])
boxes_jittered = tf.constant([[0.0, 0.0, 1.0, 1.0]])
boxes_gt = tf.constant([[[0., 0., 0.49, 1.0]]])
boxes_jittered = tf.constant([[[0.0, 0.0, 1.0, 1.0]]])
mask_prediction = np.zeros((1, 32, 32)).astype(np.float32)
mask_prediction[0, :24, :24] = 1.0
mask_prediction = np.zeros((1, 1, 32, 32)).astype(np.float32)
mask_prediction[0, 0, :24, :24] = 1.0
loss = self.model._compute_per_instance_box_consistency_loss(
loss = self.model._compute_box_consistency_loss(
boxes_gt, boxes_jittered, tf.constant(mask_prediction))
yloss = tf.nn.sigmoid_cross_entropy_with_logits(
......@@ -834,39 +898,39 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
yloss_mean = tf.reduce_mean(yloss)
xloss_mean = tf.reduce_mean(xloss)
self.assertAllClose(loss, [yloss_mean + xloss_mean])
self.assertAllClose(loss[0], [yloss_mean + xloss_mean])
def test_box_consistency_loss_with_tightness(self):
boxes_gt = tf.constant([[0., 0., 0.49, 0.49]])
boxes_gt = tf.constant([[[0., 0., 0.49, 0.49]]])
boxes_jittered = None
mask_prediction = np.zeros((1, 8, 8)).astype(np.float32) - 1e10
mask_prediction[0, :4, :4] = 1e10
mask_prediction = np.zeros((1, 1, 8, 8)).astype(np.float32) - 1e10
mask_prediction[0, 0, :4, :4] = 1e10
model = build_meta_arch(box_consistency_tightness=True,
predict_full_resolution_masks=True)
loss = model._compute_per_instance_box_consistency_loss(
loss = model._compute_box_consistency_loss(
boxes_gt, boxes_jittered, tf.constant(mask_prediction))
self.assertAllClose(loss, [0.0])
self.assertAllClose(loss[0], [0.0])
def test_box_consistency_loss_gt_count(self):
boxes_gt = tf.constant([
boxes_gt = tf.constant([[
[0., 0., 1.0, 1.0],
[0., 0., 0.49, 0.49]])
[0., 0., 0.49, 0.49]]])
boxes_jittered = None
mask_prediction = np.zeros((2, 32, 32)).astype(np.float32)
mask_prediction[0, :16, :16] = 1.0
mask_prediction[1, :8, :8] = 1.0
mask_prediction = np.zeros((1, 2, 32, 32)).astype(np.float32)
mask_prediction[0, 0, :16, :16] = 1.0
mask_prediction[0, 1, :8, :8] = 1.0
model = build_meta_arch(
box_consistency_loss_normalize='normalize_groundtruth_count',
predict_full_resolution_masks=True)
loss_func = tf.function(
model._compute_per_instance_box_consistency_loss)
loss_func = (
model._compute_box_consistency_loss)
loss = loss_func(
boxes_gt, boxes_jittered, tf.constant(mask_prediction))
......@@ -877,7 +941,7 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
xloss = yloss
xloss_mean = tf.reduce_sum(xloss)
self.assertAllClose(loss[0], yloss_mean + xloss_mean)
self.assertAllClose(loss[0, 0], yloss_mean + xloss_mean)
yloss = tf.nn.sigmoid_cross_entropy_with_logits(
labels=tf.constant([1.0] * 16 + [0.0] * 16),
......@@ -885,21 +949,20 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
yloss_mean = tf.reduce_sum(yloss)
xloss = yloss
xloss_mean = tf.reduce_sum(xloss)
self.assertAllClose(loss[1], yloss_mean + xloss_mean)
self.assertAllClose(loss[0, 1], yloss_mean + xloss_mean)
def test_box_consistency_loss_balanced(self):
boxes_gt = tf.constant([
[0., 0., 0.49, 0.49]])
boxes_gt = tf.constant([[
[0., 0., 0.49, 0.49]]])
boxes_jittered = None
mask_prediction = np.zeros((1, 32, 32)).astype(np.float32)
mask_prediction[0] = 1.0
mask_prediction = np.zeros((1, 1, 32, 32)).astype(np.float32)
mask_prediction[0, 0] = 1.0
model = build_meta_arch(box_consistency_loss_normalize='normalize_balanced',
predict_full_resolution_masks=True)
loss_func = tf.function(
model._compute_per_instance_box_consistency_loss)
model._compute_box_consistency_loss)
loss = loss_func(
boxes_gt, boxes_jittered, tf.constant(mask_prediction))
......@@ -909,63 +972,64 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
yloss_mean = tf.reduce_sum(yloss) / 16.0
xloss_mean = yloss_mean
self.assertAllClose(loss[0], yloss_mean + xloss_mean)
self.assertAllClose(loss[0, 0], yloss_mean + xloss_mean)
def test_box_consistency_dice_loss(self):
model = build_meta_arch(use_dice_loss=True)
boxes_gt = tf.constant([[0., 0., 0.49, 1.0]])
boxes_jittered = tf.constant([[0.0, 0.0, 1.0, 1.0]])
boxes_gt = tf.constant([[[0., 0., 0.49, 1.0]]])
boxes_jittered = tf.constant([[[0.0, 0.0, 1.0, 1.0]]])
almost_inf = 1e10
mask_prediction = np.full((1, 32, 32), -almost_inf, dtype=np.float32)
mask_prediction[0, :24, :24] = almost_inf
mask_prediction = np.full((1, 1, 32, 32), -almost_inf, dtype=np.float32)
mask_prediction[0, 0, :24, :24] = almost_inf
loss = model._compute_per_instance_box_consistency_loss(
loss = model._compute_box_consistency_loss(
boxes_gt, boxes_jittered, tf.constant(mask_prediction))
yloss = 1 - 6.0 / 7
xloss = 0.2
self.assertAllClose(loss, [yloss + xloss])
self.assertAllClose(loss, [[yloss + xloss]])
def test_color_consistency_loss_full_res_shape(self):
model = build_meta_arch(use_dice_loss=True,
predict_full_resolution_masks=True)
boxes = tf.zeros((3, 4))
img = tf.zeros((32, 32, 3))
mask_logits = tf.zeros((3, 32, 32))
boxes = tf.zeros((5, 3, 4))
img = tf.zeros((5, 32, 32, 3))
mask_logits = tf.zeros((5, 3, 32, 32))
loss = model._compute_per_instance_color_consistency_loss(
loss = model._compute_color_consistency_loss(
boxes, img, mask_logits)
self.assertEqual([3], loss.shape)
self.assertEqual([5, 3], loss.shape)
def test_color_consistency_1_threshold(self):
model = build_meta_arch(predict_full_resolution_masks=True,
color_consistency_threshold=0.99)
boxes = tf.zeros((3, 4))
img = tf.zeros((32, 32, 3))
mask_logits = tf.zeros((3, 32, 32)) - 1e4
boxes = tf.zeros((5, 3, 4))
img = tf.zeros((5, 32, 32, 3))
mask_logits = tf.zeros((5, 3, 32, 32)) - 1e4
loss = model._compute_per_instance_color_consistency_loss(
loss = model._compute_color_consistency_loss(
boxes, img, mask_logits)
self.assertAllClose(loss, np.zeros(3))
self.assertAllClose(loss, np.zeros((5, 3)))
def test_box_consistency_dice_loss_full_res(self):
model = build_meta_arch(use_dice_loss=True,
predict_full_resolution_masks=True)
boxes_gt = tf.constant([[0., 0., 1.0, 1.0]])
boxes_gt = tf.constant([[[0., 0., 1.0, 1.0]]])
boxes_jittered = None
size = 32
almost_inf = 1e10
mask_prediction = np.full((1, 32, 32), -almost_inf, dtype=np.float32)
mask_prediction[0, :16, :32] = almost_inf
mask_prediction = np.full((1, 1, size, size), -almost_inf, dtype=np.float32)
mask_prediction[0, 0, :(size // 2), :] = almost_inf
loss = model._compute_per_instance_box_consistency_loss(
loss = model._compute_box_consistency_loss(
boxes_gt, boxes_jittered, tf.constant(mask_prediction))
self.assertAlmostEqual(loss[0].numpy(), 1 / 3)
self.assertAlmostEqual(loss[0, 0].numpy(), 1 / 3)
def test_get_lab_image_shape(self):
......@@ -975,18 +1039,18 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
def test_loss_keys(self):
model = build_meta_arch(use_dice_loss=True)
prediction = {
'preprocessed_inputs': tf.random.normal((1, 32, 32, 3)),
'INSTANCE_EMBEDDING': [tf.random.normal((1, 8, 8, 17))] * 2,
'PIXEL_EMBEDDING': [tf.random.normal((1, 8, 8, 19))] * 2,
'object_center': [tf.random.normal((1, 8, 8, 6))] * 2,
'box/offset': [tf.random.normal((1, 8, 8, 2))] * 2,
'box/scale': [tf.random.normal((1, 8, 8, 2))] * 2
'preprocessed_inputs': tf.random.normal((3, 32, 32, 3)),
'MASK_LOGITS_GT_BOXES': [tf.random.normal((3, 5, 8, 8))] * 2,
'object_center': [tf.random.normal((3, 8, 8, 6))] * 2,
'box/offset': [tf.random.normal((3, 8, 8, 2))] * 2,
'box/scale': [tf.random.normal((3, 8, 8, 2))] * 2
}
model.provide_groundtruth(
groundtruth_boxes_list=[tf.convert_to_tensor([[0., 0., 1., 1.]] * 5)],
groundtruth_classes_list=[tf.one_hot([1, 0, 1, 1, 1], depth=6)],
groundtruth_weights_list=[tf.ones(5)],
groundtruth_masks_list=[tf.ones((5, 32, 32))])
groundtruth_boxes_list=[
tf.convert_to_tensor([[0., 0., 1., 1.]] * 5)] * 3,
groundtruth_classes_list=[tf.one_hot([1, 0, 1, 1, 1], depth=6)] * 3,
groundtruth_weights_list=[tf.ones(5)] * 3,
groundtruth_masks_list=[tf.ones((5, 32, 32))] * 3)
loss = model.loss(prediction, tf.constant([[32, 32, 3.0]]))
self.assertGreater(loss['Loss/deep_mask_estimation'], 0.0)
......@@ -1008,8 +1072,7 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
num_stages = 1
prediction = {
'preprocessed_inputs': tf.random.normal((1, 32, 32, 3)),
'INSTANCE_EMBEDDING': [tf.random.normal((1, 8, 8, 9))] * num_stages,
'PIXEL_EMBEDDING': [tf.random.normal((1, 8, 8, 8))] * num_stages,
'MASK_LOGITS_GT_BOXES': [tf.random.normal((1, 5, 8, 8))] * num_stages,
'object_center': [tf.random.normal((1, 8, 8, 6))] * num_stages,
'box/offset': [tf.random.normal((1, 8, 8, 2))] * num_stages,
'box/scale': [tf.random.normal((1, 8, 8, 2))] * num_stages
......@@ -1066,6 +1129,7 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
f'{mask_loss} did not respond to change in weight.')
def test_color_consistency_warmup(self):
tf.keras.backend.set_learning_phase(True)
model = build_meta_arch(
use_dice_loss=True,
predict_full_resolution_masks=True,
......@@ -1079,8 +1143,7 @@ class DeepMACMetaArchTest(tf.test.TestCase, parameterized.TestCase):
num_stages = 1
prediction = {
'preprocessed_inputs': tf.random.normal((1, 32, 32, 3)),
'INSTANCE_EMBEDDING': [tf.random.normal((1, 8, 8, 9))] * num_stages,
'PIXEL_EMBEDDING': [tf.random.normal((1, 8, 8, 8))] * num_stages,
'MASK_LOGITS_GT_BOXES': [tf.random.normal((1, 5, 8, 8))] * num_stages,
'object_center': [tf.random.normal((1, 8, 8, 6))] * num_stages,
'box/offset': [tf.random.normal((1, 8, 8, 2))] * num_stages,
'box/scale': [tf.random.normal((1, 8, 8, 2))] * num_stages
......
......@@ -403,7 +403,7 @@ message CenterNet {
// Mask prediction support using DeepMAC. See https://arxiv.org/abs/2104.00613
// Next ID 24
// Next ID 25
message DeepMACMaskEstimation {
// The loss used for penalizing mask predictions.
optional ClassificationLoss classification_loss = 1;
......@@ -485,6 +485,14 @@ message CenterNet {
optional int32 color_consistency_warmup_start = 23 [default=0];
// DeepMAC has been refactored to process the entire batch at once
// instead of the previous (simpler) approach of processing one sample
// at a time. This increases memory consumption, so it is important to
// feed the mask head only the last-stage outputs from the hourglass
// backbone. Doing so halves the memory requirement of the mask head
// and does not cause a drop in evaluation metrics.
optional bool use_only_last_stage = 24 [default=false];
}
optional DeepMACMaskEstimation deepmac_mask_estimation = 14;
......
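# Example (sketch, not part of this diff): parsing a DeepMACMaskEstimation
# message with the new `use_only_last_stage` field enabled. The
# `center_net_pb2` import path is an assumption based on the TF Object
# Detection API layout.
from google.protobuf import text_format
from object_detection.protos import center_net_pb2  # assumed module path

deepmac_config = text_format.Parse(
    """
    color_consistency_warmup_steps: 20
    use_only_last_stage: true
    """,
    center_net_pb2.CenterNet.DeepMACMaskEstimation())
assert deepmac_config.use_only_last_stage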
......@@ -76,3 +76,29 @@ py_strict_library(
"//tf_ops:sequence_string_projection_op_v2_py", # sequence projection
],
)
py_strict_library(
name = "misc_layers",
srcs = ["misc_layers.py"],
srcs_version = "PY3",
deps = [
# package tensorflow
"//layers:base_layers", # sequence projection
"//layers:dense_layers", # sequence projection
"//layers:quantization_layers", # sequence projection
],
)
py_strict_library(
name = "qrnn_layers",
srcs = ["qrnn_layers.py"],
srcs_version = "PY3",
deps = [
":base_layers",
":conv_layers",
":dense_layers",
":quantization_layers",
# package tensorflow
"//tf_ops:tf_custom_ops_py", # sequence projection
],
)
# Copyright 2020 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Lint as: python3
"""Layers for embedding."""
import tensorflow as tf
from layers import base_layers # import seq_flow_lite module
from layers import dense_layers # import seq_flow_lite module
from layers import quantization_layers # import seq_flow_lite module
class AttentionPooling(base_layers.BaseLayer):
"""A basic attention pooling layer."""
def __init__(self, scalar=True, **kwargs):
self.scalar = scalar
# Attention logits should not have an activation after the linear layer,
# so they can be positive or negative; this lets the attention
# distribution take whatever shape the network prefers. A relu
# activation would bias the attention distribution towards uniform.
# Leaving the logits unconstrained gives better results for attention
# pooling: although some outputs are emphasized for the classification
# decision, all other outputs keep a non-zero probability of influencing
# the class, which seems to result in better backprop.
self.attention = dense_layers.BaseQDenseVarLen(units=1, rank=3, **kwargs)
self.qactivation = quantization_layers.ActivationQuantization(**kwargs)
super(AttentionPooling, self).__init__(**kwargs)
def build(self, input_shapes):
self.feature_size = input_shapes[-1]
def call(self, inputs, mask, inverse_normalizer):
self._assert_rank_and_type(inputs, 3)
self._assert_rank_and_type(mask, 3)
batch_size = self.get_batch_dimension(inputs)
attn_logits = self.attention(inputs, mask, inverse_normalizer)
if self.parameters.mode not in [base_layers.PREDICT, base_layers.TFLITE]:
invalid_mask = (1 - mask) * self.parameters.invalid_logit
attn_logits = attn_logits * mask + invalid_mask
attn_logits = tf.reshape(attn_logits, [batch_size, -1])
attention = tf.nn.softmax(attn_logits, axis=-1)
attention = self.qrange_sigmoid(attention, tf_only=True)
if self.parameters.mode in [base_layers.PREDICT, base_layers.TFLITE]:
inputs = tf.reshape(inputs, [-1, self.feature_size])
else:
attention = tf.expand_dims(attention, axis=1)
pre_logits = self.qactivation(tf.matmul(attention, inputs))
return tf.reshape(pre_logits, [batch_size, self.feature_size])
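# Example (sketch, not part of this change): the masked-softmax pooling that
# AttentionPooling implements, written in plain TF with an unquantized Dense
# layer standing in for the quantized BaseQDenseVarLen attention layer.
def _attention_pooling_sketch():
  """Illustrative only; assumes [batch, time, features] float inputs."""
  inputs = tf.random.normal([2, 7, 16])
  mask = tf.sequence_mask([5, 7], 7, dtype=tf.float32)            # [batch, time]
  attn_logits = tf.squeeze(tf.keras.layers.Dense(1)(inputs), -1)  # no activation
  attn_logits = attn_logits * mask + (1.0 - mask) * -1e9          # mask padding
  attention = tf.nn.softmax(attn_logits, axis=-1)
  return tf.matmul(attention[:, None, :], inputs)[:, 0, :]        # [batch, features]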
class TreeInductionLayer(base_layers.BaseLayer):
"""A basic tree induction layer."""
def __init__(self, **kwargs):
self.qactivation = quantization_layers.ActivationQuantization(**kwargs)
super(TreeInductionLayer, self).__init__(**kwargs)
def call(self, keys, queries, sequence_length):
key_dim = keys.get_shape().as_list()[-1]
query_dim = queries.get_shape().as_list()[-1]
assert key_dim == query_dim, "Last dimension of keys/queries should match."
if self.parameters.mode not in [base_layers.PREDICT, base_layers.TFLITE]:
sequence_mask = tf.sequence_mask(
sequence_length, maxlen=tf.shape(keys)[1], dtype=tf.float32)
sequence_mask = tf.expand_dims(sequence_mask, axis=2)
attn_mask = tf.matmul(sequence_mask, sequence_mask, transpose_b=True)
attn_logits = self.qactivation(tf.matmul(keys, queries, transpose_b=True))
invalid_attn_mask = (1 - attn_mask) * self.parameters.invalid_logit
return attn_logits * attn_mask + invalid_attn_mask
else:
assert self.get_batch_dimension(keys) == 1
assert self.get_batch_dimension(queries) == 1
keys = tf.reshape(keys, [-1, key_dim])
queries = tf.reshape(queries, [-1, key_dim])
result = self.qactivation(tf.matmul(keys, queries, transpose_b=True))
# TODO(b/171063452): Bug needs to be fixed to handle this correctly.
# seq_dim = tf.shape(result)[1]
# result = tf.reshape(result, [1, seq_dim, seq_dim])
return result
# Copyright 2020 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Lint as: python3
"""Layers for QRNN."""
import tensorflow as tf
from layers import base_layers # import seq_flow_lite module
from layers import conv_layers # import seq_flow_lite module
from layers import dense_layers # import seq_flow_lite module
from layers import quantization_layers # import seq_flow_lite module
from tf_ops import tf_custom_ops_py # import seq_flow_lite module
QUASI_RNN_POOLING_F = "f"
QUASI_RNN_POOLING_FO = "fo"
QUASI_RNN_POOLING_IFO = "ifo"
_QUASI_RNN_POOLING_TO_NUMBER_OF_GATES_MAP = {
QUASI_RNN_POOLING_F: 2,
QUASI_RNN_POOLING_FO: 3,
QUASI_RNN_POOLING_IFO: 4,
}
class QRNNUnidirectionalPoolingCore(base_layers.BaseLayer):
"""Create a unidirectional QRNN pooling inner loop."""
def __init__(self, forward=True, **kwargs):
self.forward = forward
super(QRNNUnidirectionalPoolingCore, self).__init__(**kwargs)
def call(self, multiplier, constant):
if self.parameters.mode != base_layers.TFLITE:
return self._qrnn_pooling(multiplier, constant)
else:
return tf_custom_ops_py.pooling_op(multiplier, constant,
[1.0 if self.forward else 0.0])
def _qrnn_pooling(self, multipler, constant):
"""Pooling step computes the internal states for all timesteps."""
assert multipler.get_shape().as_list() == constant.get_shape().as_list()
gate_static_shape = multipler.get_shape().as_list()
gate_shape = tf.shape(multipler)
feature_size = gate_static_shape[2]
assert feature_size is not None
batch_size = gate_static_shape[0] or gate_shape[0]
max_timestep = gate_static_shape[1] or gate_shape[1]
dynamic_loop = gate_static_shape[1] is None
# Get multiplier/constant in [timestep, batch, feature_size] format
multiplier_transposed = tf.transpose(multipler, [1, 0, 2])
constant_transposed = tf.transpose(constant, [1, 0, 2])
# Start state
state = tf.zeros((batch_size, feature_size), tf.float32)
if dynamic_loop:
# One pooling step
def _step(index, state, states):
m = multiplier_transposed[index, :, :]
c = constant_transposed[index, :, :]
new_state = state * m + c
next_index = index + 1 if self.forward else index - 1
return next_index, new_state, states.write(index, new_state)
# Termination condition
def _termination(index, state, states):
del state, states
return (index < max_timestep) if self.forward else (index >= 0)
states = tf.TensorArray(tf.float32, size=max_timestep)
index = 0 if self.forward else max_timestep - 1
# Dynamic pooling loop
_, state, states = tf.while_loop(_termination, _step,
[index, state, states])
states = states.stack()
else:
# Unstack them to process one timestep at a time
multiplier_list = tf.unstack(multiplier_transposed)
constant_list = tf.unstack(constant_transposed)
states = []
# Unroll either forward or backward based on the flag `forward`
timesteps = list(range(max_timestep)) if self.forward else reversed(
list(range(max_timestep)))
# Static pooling loop
for time in timesteps:
state = state * multiplier_list[time] + constant_list[time]
states.append(state)
# Stack them back in the right order
states = tf.stack(states if self.forward else list(reversed(states)))
# Change to [batch, timestep, feature_size]
return tf.transpose(states, [1, 0, 2])
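# Example (sketch, not part of this change): the recurrence that the pooling
# core evaluates, state_t = state_{t-1} * multiplier_t + constant_t, written
# as a plain forward Python loop over the time axis.
def _qrnn_pooling_sketch(multiplier, constant):
  """Illustrative only; expects [batch, time, features] tensors with a
  statically known time dimension."""
  state = tf.zeros_like(multiplier[:, 0, :])
  states = []
  for t in range(multiplier.shape[1]):
    state = state * multiplier[:, t, :] + constant[:, t, :]
    states.append(state)
  return tf.stack(states, axis=1)  # [batch, time, features]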
class QRNNUnidirectionalPooling(base_layers.BaseLayer):
"""Create a unidirectional QRNN pooling."""
def __init__(self,
zoneout_probability=0.0,
forward=True,
pooling=QUASI_RNN_POOLING_FO,
output_quantized=True,
**kwargs):
self.zoneout_probability = zoneout_probability
self.pooling = pooling
self.forward = forward
self.output_quantized = output_quantized
if output_quantized and self.pooling == QUASI_RNN_POOLING_IFO:
self.qoutputs = quantization_layers.ActivationQuantization()
self.num_gates = _QUASI_RNN_POOLING_TO_NUMBER_OF_GATES_MAP[pooling]
assert pooling in _QUASI_RNN_POOLING_TO_NUMBER_OF_GATES_MAP.keys()
self.pooling_core = QRNNUnidirectionalPoolingCore(forward=forward, **kwargs)
super(QRNNUnidirectionalPooling, self).__init__(**kwargs)
def call(self, gates, mask):
return self._create_qrnn_pooling_unidirectional(gates, mask)
def _qrnn_preprocess(self, gates):
"""Preprocess the gate inputs to the pooling layer."""
assert self.num_gates == len(gates)
dim = lambda tensor, index: tensor.get_shape().as_list()[index]
for tensor in gates:
assert len(tensor.get_shape().as_list()) == 3
for idx in range(3):
assert dim(gates[0], idx) == dim(tensor, idx)
if self.pooling == QUASI_RNN_POOLING_F:
z = self.quantized_tanh(gates[0], tf_only=True)
f = self.quantized_sigmoid(gates[1], tf_only=True)
return f, self.qrange_tanh(self.qrange_sigmoid(1 - f) * z), 1
elif self.pooling == QUASI_RNN_POOLING_FO:
z = self.quantized_tanh(gates[0], tf_only=True)
f = self.quantized_sigmoid(gates[1], tf_only=True)
o = self.quantized_sigmoid(gates[2], tf_only=True)
return f, self.qrange_tanh(self.qrange_sigmoid(1 - f) * z), o
else: # self.pooling == QUASI_RNN_POOLING_IFO:
z = self.quantized_tanh(gates[0], tf_only=True)
i = self.quantized_sigmoid(gates[1], tf_only=True)
f = self.quantized_sigmoid(gates[2], tf_only=True)
o = self.quantized_sigmoid(gates[3], tf_only=True)
return f, self.qrange_tanh(i * z), o
def _qrnn_postprocess(self, states, multiplier):
"""Postprocess the states and return the output tensors."""
if self.pooling == QUASI_RNN_POOLING_F:
return states
elif self.pooling == QUASI_RNN_POOLING_FO:
return self.qrange_tanh(states) * multiplier
else: # self.pooling == QUASI_RNN_POOLING_IFO
return self.qoutputs(states) * multiplier
def _qrnn_zoneout(self, multipler, constant):
"""Zoneout regularization for Quasi RNN."""
enable_zoneout = self.zoneout_probability > 0.0
if enable_zoneout and self.parameters.mode == base_layers.TRAIN:
# zoneout_mask is 1.0 with probability self.zoneout_probability and 0.0
# with probability (1 - self.zoneout_probability).
zoneout_mask = tf.random.uniform(tf.shape(multipler), maxval=1.0)
zoneout_mask = tf.floor(zoneout_mask + self.zoneout_probability)
# When zoneout_mask is 1.0 the state is not updated and the old state is
# retained; this is achieved by forcing the multiplier to 1.0 and the
# constant to 0.0. When zoneout_mask is 0.0 the multiplier and constant
# are unaffected. The multiplier is expected to lie in [0.0, 1.0], which
# holds since it is the output of a sigmoid.
multipler = tf.maximum(zoneout_mask, multipler)
constant *= (1 - zoneout_mask)
return multipler, constant
def _create_qrnn_pooling_unidirectional(self, gates, mask):
"""Create QRNN Pooling in either forward or backward direction."""
m1, c1, outgate = self._qrnn_preprocess(gates)
# At inference time zero padding is not used, so masking by sequence
# length is unnecessary.
if self.parameters.mode not in [base_layers.PREDICT, base_layers.TFLITE]:
m1 = m1 * mask + (1 - mask) * tf.ones_like(m1)
c1 *= mask
m1, c1 = self._qrnn_zoneout(m1, c1)
states = self.pooling_core(m1, c1)
outputs = self._qrnn_postprocess(states, outgate)
# At inference time zero padding is not used, so masking by sequence
# length is unnecessary.
if self.parameters.mode not in [base_layers.PREDICT, base_layers.TFLITE]:
outputs *= mask
if self.output_quantized:
if self.pooling in [QUASI_RNN_POOLING_FO, QUASI_RNN_POOLING_F]:
outputs = self.qrange_tanh(outputs)
else:
outputs = self.qoutputs.quantize_using_range(outputs)
return outputs
class QRNNUnidirectional(base_layers.BaseLayer):
"""Create a unidirectional QRNN encoder."""
def __init__(self,
kwidth,
state_size,
zoneout_probability=0.0,
forward=True,
pooling=QUASI_RNN_POOLING_FO,
output_quantized=True,
**kwargs):
self.forward = forward
self.kwidth = kwidth
self.pooling = pooling
self.state_size = state_size
assert pooling in _QUASI_RNN_POOLING_TO_NUMBER_OF_GATES_MAP.keys()
self.num_gates = _QUASI_RNN_POOLING_TO_NUMBER_OF_GATES_MAP[pooling]
self.gate_layers = []
for _ in range(self.num_gates):
self.gate_layers.append(
conv_layers.EncoderQConvolutionVarLen(
filters=state_size,
ksize=kwidth,
rank=3,
padding="VALID",
activation=None,
**kwargs))
padding = [kwidth - 1, 0] if forward else [0, kwidth - 1]
self.zero_pad = tf.keras.layers.ZeroPadding1D(padding=padding)
self.qrnn_pooling = QRNNUnidirectionalPooling(
forward=forward,
zoneout_probability=zoneout_probability,
output_quantized=output_quantized,
pooling=pooling,
**kwargs)
super(QRNNUnidirectional, self).__init__(**kwargs)
def call(self, inputs, mask, inverse_normalizer=None):
if inverse_normalizer is None:
inverse_normalizer = tf.math.reciprocal(tf.reduce_sum(mask))
self._assert_rank_and_type(inputs, 3)
self._assert_rank_and_type(mask, 3)
maskr4 = tf.expand_dims(mask, axis=1)
padded_inputs = self.zero_pad(inputs)
gates = [
layer(padded_inputs, maskr4, inverse_normalizer)
for layer in self.gate_layers
]
return self.qrnn_pooling(gates, mask)
class QRNNUnidirectionalWithBottleneck(base_layers.BaseLayer):
"""Create a unidirectional QRNN encoder with bottlenecks."""
def __init__(self,
kwidth,
state_size,
bottleneck_size,
zoneout_probability=0.0,
forward=True,
pooling=QUASI_RNN_POOLING_FO,
output_quantized=True,
**kwargs):
self.bottleneck_size = bottleneck_size
self.state_size = state_size
self.forward = forward
self.kwidth = kwidth
self.pooling = pooling
self.state_size = state_size
assert pooling in _QUASI_RNN_POOLING_TO_NUMBER_OF_GATES_MAP.keys()
self.num_gates = _QUASI_RNN_POOLING_TO_NUMBER_OF_GATES_MAP[pooling]
self.qrnn_pooling = QRNNUnidirectionalPooling(
forward=forward,
zoneout_probability=zoneout_probability,
output_quantized=output_quantized,
pooling=pooling,
**kwargs)
self.pre_conv_layers = []
self.gate_layers = []
self.post_conv_layers = []
for _ in range(self.num_gates):
self.pre_conv_layers.append(
dense_layers.BaseQDense(bottleneck_size, rank=3, **kwargs))
self.gate_layers.append(
conv_layers.EncoderQConvolution(
filters=bottleneck_size,
ksize=kwidth,
rank=3,
padding="SAME",
**kwargs))
self.post_conv_layers.append(
dense_layers.BaseQDense(
state_size, rank=3, activation=None, **kwargs))
super(QRNNUnidirectionalWithBottleneck, self).__init__(**kwargs)
def call(self, inputs, mask, inverse_normalizer=None):
if inverse_normalizer is None:
inverse_normalizer = tf.math.reciprocal(tf.reduce_sum(mask))
self._assert_rank_and_type(inputs, 3)
self._assert_rank_and_type(mask, 3)
pre_conv_out = [layer(inputs) for layer in self.pre_conv_layers]
gates = [layer(pre_conv_out[i]) for i, layer in enumerate(self.gate_layers)]
post_conv_out = [
layer(gates[i]) for i, layer in enumerate(self.post_conv_layers)
]
return self.qrnn_pooling(post_conv_out, mask)
class QRNNBidirectional(base_layers.BaseLayer):
"""Create a bidirectional QRNN encoder."""
def __init__(self,
kwidth,
state_size,
zoneout_probability=0.0,
pooling=QUASI_RNN_POOLING_FO,
bottleneck_size=None,
**kwargs):
self.pooling = pooling
if bottleneck_size is None:
self.forward = QRNNUnidirectional(
kwidth=kwidth,
state_size=state_size,
forward=True,
output_quantized=False,
zoneout_probability=zoneout_probability,
pooling=pooling,
**kwargs)
self.backward = QRNNUnidirectional(
kwidth=kwidth,
state_size=state_size,
forward=False,
output_quantized=False,
zoneout_probability=zoneout_probability,
pooling=pooling,
**kwargs)
else:
self.forward = QRNNUnidirectionalWithBottleneck(
kwidth=kwidth,
state_size=state_size,
bottleneck_size=bottleneck_size,
forward=True,
output_quantized=False,
zoneout_probability=zoneout_probability,
pooling=pooling,
**kwargs)
self.backward = QRNNUnidirectionalWithBottleneck(
kwidth=kwidth,
state_size=state_size,
bottleneck_size=bottleneck_size,
forward=False,
output_quantized=False,
zoneout_probability=zoneout_probability,
pooling=pooling,
**kwargs)
self.qconcat = quantization_layers.ConcatQuantization(axis=2, **kwargs)
super(QRNNBidirectional, self).__init__(**kwargs)
def call(self, inputs, mask, inverse_normalizer=None):
if inverse_normalizer is None:
inverse_normalizer = tf.math.reciprocal(tf.reduce_sum(mask))
fwd_outputs = self.forward(inputs, mask, inverse_normalizer)
bwd_outputs = self.backward(inputs, mask, inverse_normalizer)
if self.pooling in [QUASI_RNN_POOLING_FO, QUASI_RNN_POOLING_F]:
outputs = [self.qrange_tanh(fwd_outputs), self.qrange_tanh(bwd_outputs)]
outputs = self.qrange_tanh(tf.concat(outputs, axis=2))
else:
outputs = self.qconcat([fwd_outputs, bwd_outputs])
return outputs
class QRNNBidirectionalStack(base_layers.BaseLayer):
"""Create a stack of bidirectional QRNN encoder."""
def __init__(self,
num_layers,
kwidth,
state_size,
zoneout_probability=0.0,
layerwise_decaying_zoneout=True,
pooling=QUASI_RNN_POOLING_FO,
bottleneck_size=None,
**kwargs):
self.layers = []
zp = zoneout_probability
for idx in range(num_layers):
if layerwise_decaying_zoneout:
zp = (zoneout_probability**(idx + 1))
self.layers.append(
QRNNBidirectional(
kwidth=kwidth,
state_size=state_size,
zoneout_probability=zp,
pooling=pooling,
bottleneck_size=bottleneck_size,
**kwargs))
super(QRNNBidirectionalStack, self).__init__(**kwargs)
def call(self, inputs, maskr3, inverse_normalizer):
return self._apply_qrnn_stack(inputs, maskr3, inverse_normalizer)
def _apply_qrnn_stack(self, inputs, mask3, inverse_normalizer):
if self.parameters.mode not in [base_layers.PREDICT, base_layers.TFLITE]:
inputs = inputs * mask3
for layer in self.layers:
outputs = layer(inputs, mask3, inverse_normalizer)
inputs = outputs
return outputs
class QRNNBidirectionalStackWithSeqLength(QRNNBidirectionalStack):
def call(self, inputs, sequence_length):
mask = tf.sequence_mask(
sequence_length, tf.shape(inputs)[1], dtype=tf.float32)
inverse_normalizer = tf.math.reciprocal(tf.reduce_sum(mask))
maskr3 = tf.expand_dims(mask, 2)
return self._apply_qrnn_stack(inputs, maskr3, inverse_normalizer)
......@@ -20,3 +20,21 @@ py_library(
"//tf_ops:tf_custom_ops_py", # sequence projection
],
)
py_library(
name = "pqrnn",
srcs = ["pqrnn.py"],
srcs_version = "PY3",
deps = [
# package absl/logging
# package tensorflow
"//layers:base_layers", # sequence projection
"//layers:dense_layers", # sequence projection
"//layers:misc_layers", # sequence projection
"//layers:projection_layers", # sequence projection
"//layers:qrnn_layers", # sequence projection
"//layers:quantization_layers", # sequence projection
# "//tf_ops:tf_custom_ops" # sequence projection
"//tf_ops:tf_custom_ops_py", # sequence projection
],
)
# Copyright 2020 The TensorFlow Authors All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Lint as: python3
"""Implementation of pQRNN model."""
from absl import logging
import tensorflow as tf
from layers import base_layers # import seq_flow_lite module
from layers import dense_layers # import seq_flow_lite module
from layers import misc_layers # import seq_flow_lite module
from layers import projection_layers # import seq_flow_lite module
from layers import qrnn_layers # import seq_flow_lite module
from layers import quantization_layers # import seq_flow_lite module
class Encoder(tf.keras.layers.Layer):
"""A pQRNN keras model."""
def __init__(self, config, mode, **kwargs):
super(Encoder, self).__init__(**kwargs)
def _get_params(varname, default_value=None):
value = config[varname] if varname in config else default_value
default = "" if varname in config else " (default)"
logging.info("%s = %s%s", varname, value, default)
setattr(self, varname, value)
_get_params("projection_bottleneck_size")
_get_params("qrnn_state_size")
_get_params("qrnn_kernel_width", 3)
_get_params("qrnn_zoneout_probability")
_get_params("number_qrnn_layers")
_get_params("labels")
_get_params("regularizer_scale")
_get_params("quantize")
self.num_classes = len(self.labels)
self.parameters = base_layers.Parameters(
mode, quantize=self.quantize, regularizer_scale=self.regularizer_scale)
self.bottleneck_layer = dense_layers.BaseQDenseVarLen(
units=self.projection_bottleneck_size,
rank=3,
parameters=self.parameters)
self.qrnn_stack = qrnn_layers.QRNNBidirectionalStack(
parameters=self.parameters,
zoneout_probability=self.qrnn_zoneout_probability,
kwidth=self.qrnn_kernel_width,
state_size=self.qrnn_state_size,
num_layers=self.number_qrnn_layers)
self.attention_pool = misc_layers.AttentionPooling(
parameters=self.parameters)
self.final_fc = dense_layers.BaseQDense(
units=self.num_classes,
rank=2,
parameters=self.parameters,
activation=None)
def call(self, projection, seq_length):
mask = tf.sequence_mask(
seq_length, tf.shape(projection)[1], dtype=tf.float32)
inverse_normalizer = tf.math.reciprocal(tf.reduce_sum(mask))
maskr3 = tf.expand_dims(mask, axis=2)
if self.parameters.mode in [base_layers.TRAIN, base_layers.EVAL]:
projection = projection * maskr3
bottleneck = self.bottleneck_layer(projection, maskr3, inverse_normalizer)
outputs = self.qrnn_stack(bottleneck, maskr3, inverse_normalizer)
pre_logits = self.attention_pool(outputs, maskr3, inverse_normalizer)
return self.final_fc(pre_logits)
class Model(Encoder):
def __init__(self, config, mode, **kwargs):
super(Model, self).__init__(config, mode, **kwargs)
self.projection = projection_layers.ProjectionLayer(config, mode)
def call(self, inputs):
projection, seq_length = self.projection(inputs)
return super(Model, self).call(projection, seq_length)
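# Example (sketch, not part of this change): a hypothetical config dict with
# the keys read by _get_params above (qrnn_kernel_width is omitted to use its
# default of 3), and a forward pass through the Encoder on an already
# projected batch. The values are placeholders, not tuned settings.
def _pqrnn_encoder_sketch():
  config = {
      "projection_bottleneck_size": 64,
      "qrnn_state_size": 128,
      "qrnn_zoneout_probability": 0.5,
      "number_qrnn_layers": 2,
      "labels": ["negative", "positive"],
      "regularizer_scale": 1e-4,
      "quantize": False,
  }
  encoder = Encoder(config, base_layers.TRAIN)
  projection = tf.random.normal([2, 16, 64])          # [batch, time, features]
  return encoder(projection, tf.constant([12, 16]))   # logits, [batch, classes]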
......@@ -90,6 +90,7 @@ py_binary(
main = "run_tflite.py",
python_version = "PY3",
deps = [
":sgnn_projection_op_resolver",
# Expect numpy installed
# package TFLite flex delegate
# package TFLite interpreter
......
......@@ -43,8 +43,6 @@ Hparams = collections.namedtuple(
def preprocess(text):
"""Normalize the text, and return tokens."""
assert len(text.get_shape().as_list()) == 2
assert text.get_shape().as_list()[-1] == 1
text = tf.reshape(text, [-1])
text = tf_text.case_fold_utf8(text)
tokenizer = tflite_text_api.WhitespaceTokenizer()
......
......@@ -69,3 +69,27 @@ REGISTER_OP("LayerNorm")
.Doc(R"doc(
Dummy layer norm op.
)doc");
class PoolingOp : public tensorflow::OpKernel {
public:
explicit PoolingOp(tensorflow::OpKernelConstruction* context)
: tensorflow::OpKernel(context) {}
void Compute(tensorflow::OpKernelContext* ctx) override {}
};
REGISTER_KERNEL_BUILDER(Name("PoolingOp").Device(::tensorflow::DEVICE_CPU),
PoolingOp);
REGISTER_OP("PoolingOp")
.Input("multiplier: float32")
.Input("constant: float32")
.Input("forward: float32")
.Output("state: float32")
.SetShapeFn([](::tensorflow::shape_inference::InferenceContext* c) {
c->set_output(0, c->input(0));
return tensorflow::Status::OK();
})
.Doc(R"doc(
Dummy pooling op.
)doc");
......@@ -80,6 +80,7 @@ def set_output_quantized_for_custom_ops(graph_def, use_mlir=True):
'ExpectedValueOp': [tf.float32.as_datatype_enum],
'LayerNorm': [tf.float32.as_datatype_enum],
'UniformCausalAttn': [tf.float32.as_datatype_enum],
'DynamicUniformCausalAttn': [tf.float32.as_datatype_enum],
'RnnDecoderReadState': [tf.float32.as_datatype_enum],
'RnnDecoderWriteState': [tf.float32.as_datatype_enum],
}
......