Deprecate `network.Classification` from `BertClassifier`.

PiperOrigin-RevId: 364722225

Deprecate `network.Classification` from `BertClassifier`.
PiperOrigin-RevId: 364722225
23ef9155 · Jeremiah Liu · A. Unique TensorFlower · 0674ba0f · 23ef9155 · 23ef9155
Commit 23ef9155 authored Mar 23, 2021 by Jeremiah Liu Committed by A. Unique TensorFlower Mar 23, 2021
3 changed files
--- a/official/nlp/modeling/layers/cls_head.py
+++ b/official/nlp/modeling/layers/cls_head.py
@@ -36,7 +36,8 @@ class ClassificationHead(tf.keras.layers.Layer):
    """Initializes the `ClassificationHead`.
    Args:
-      inner_dim: The dimensionality of inner projection layer.
+      inner_dim: The dimensionality of inner projection layer. If 0 or `None`
+        then only the output projection layer is created.
      num_classes: Number of output classes.
      cls_token_idx: The index inside the sequence to pool.
      activation: Dense layer activation.
@@ -52,19 +53,25 @@ class ClassificationHead(tf.keras.layers.Layer):
    self.initializer = tf.keras.initializers.get(initializer)
    self.cls_token_idx = cls_token_idx
-    self.dense = tf.keras.layers.Dense(
+    if self.inner_dim:
-        units=inner_dim,
+      self.dense = tf.keras.layers.Dense(
-        activation=self.activation,
+          units=self.inner_dim,
-        kernel_initializer=self.initializer,
+          activation=self.activation,
-        name="pooler_dense")
+          kernel_initializer=self.initializer,
-    self.dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
+          name="pooler_dense")
+      self.dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
    self.out_proj = tf.keras.layers.Dense(
        units=num_classes, kernel_initializer=self.initializer, name="logits")
  def call(self, features):
-    x = features[:, self.cls_token_idx, :]  # take <CLS> token.
+    if not self.inner_dim:
-    x = self.dense(x)
+      x = features
-    x = self.dropout(x)
+    else:
+      x = features[:, self.cls_token_idx, :]  # take <CLS> token.
+      x = self.dense(x)
+      x = self.dropout(x)
    x = self.out_proj(x)
    return x
@@ -103,7 +110,8 @@ class MultiClsHeads(tf.keras.layers.Layer):
    """Initializes the `MultiClsHeads`.
    Args:
-      inner_dim: The dimensionality of inner projection layer.
+      inner_dim: The dimensionality of inner projection layer. If 0 or `None`
+        then only the output projection layer is created.
      cls_list: a list of pairs of (classification problem name, number of
        classes).
      cls_token_idx: The index inside the sequence to pool.
@@ -120,12 +128,13 @@ class MultiClsHeads(tf.keras.layers.Layer):
    self.initializer = tf.keras.initializers.get(initializer)
    self.cls_token_idx = cls_token_idx
-    self.dense = tf.keras.layers.Dense(
+    if self.inner_dim:
-        units=inner_dim,
+      self.dense = tf.keras.layers.Dense(
-        activation=self.activation,
+          units=inner_dim,
-        kernel_initializer=self.initializer,
+          activation=self.activation,
-        name="pooler_dense")
+          kernel_initializer=self.initializer,
-    self.dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
+          name="pooler_dense")
+      self.dropout = tf.keras.layers.Dropout(rate=self.dropout_rate)
    self.out_projs = []
    for name, num_classes in cls_list:
      self.out_projs.append(
@@ -134,9 +143,13 @@ class MultiClsHeads(tf.keras.layers.Layer):
              name=name))
  def call(self, features):
-    x = features[:, self.cls_token_idx, :]  # take <CLS> token.
+    if not self.inner_dim:
-    x = self.dense(x)
+      x = features
-    x = self.dropout(x)
+    else:
+      x = features[:, self.cls_token_idx, :]  # take <CLS> token.
+      x = self.dense(x)
+      x = self.dropout(x)
    outputs = {}
    for proj_layer in self.out_projs:
      outputs[proj_layer.name] = proj_layer(x)
@@ -195,7 +208,8 @@ class GaussianProcessClassificationHead(ClassificationHead):
    """Initializes the `GaussianProcessClassificationHead`.
    Args:
-      inner_dim: The dimensionality of inner projection layer.
+      inner_dim: The dimensionality of inner projection layer. If 0 or `None`
+        then only the output projection layer is created.
      num_classes: Number of output classes.
      cls_token_idx: The index inside the sequence to pool.
      activation: Dense layer activation.
@@ -220,8 +234,8 @@ class GaussianProcessClassificationHead(ClassificationHead):
        initializer=initializer,
        **kwargs)
-    # Applies spectral normalization to the pooler layer.
+    # Applies spectral normalization to the dense pooler layer.
-    if use_spec_norm:
+    if self.use_spec_norm and hasattr(self, "dense"):
      self.dense = spectral_normalization.SpectralNormalization(
          self.dense, inhere_layer_name=True, **self.spec_norm_kwargs)

--- a/official/nlp/modeling/layers/cls_head_test.py
+++ b/official/nlp/modeling/layers/cls_head_test.py
@@ -13,13 +13,24 @@
 # limitations under the License.
 """Tests for cls_head."""
+from absl.testing import parameterized
 import tensorflow as tf
 from official.nlp.modeling.layers import cls_head
-class ClassificationHeadTest(tf.test.TestCase):
+class ClassificationHeadTest(tf.test.TestCase, parameterized.TestCase):
+  @parameterized.named_parameters(("no_pooler_layer", 0, 2),
+                                  ("has_pooler_layer", 5, 4))
+  def test_pooler_layer(self, inner_dim, num_weights_expected):
+    test_layer = cls_head.ClassificationHead(inner_dim=inner_dim, num_classes=2)
+    features = tf.zeros(shape=(2, 10, 10), dtype=tf.float32)
+    _ = test_layer(features)
+    num_weights_observed = len(test_layer.get_weights())
+    self.assertEqual(num_weights_observed, num_weights_expected)
  def test_layer_invocation(self):
    test_layer = cls_head.ClassificationHead(inner_dim=5, num_classes=2)
@@ -37,7 +48,18 @@ class ClassificationHeadTest(tf.test.TestCase):
    self.assertAllEqual(layer.get_config(), new_layer.get_config())
-class MultiClsHeadsTest(tf.test.TestCase):
+class MultiClsHeadsTest(tf.test.TestCase, parameterized.TestCase):
+  @parameterized.named_parameters(("no_pooler_layer", 0, 4),
+                                  ("has_pooler_layer", 5, 6))
+  def test_pooler_layer(self, inner_dim, num_weights_expected):
+    cls_list = [("foo", 2), ("bar", 3)]
+    test_layer = cls_head.MultiClsHeads(inner_dim=inner_dim, cls_list=cls_list)
+    features = tf.zeros(shape=(2, 10, 10), dtype=tf.float32)
+    _ = test_layer(features)
+    num_weights_observed = len(test_layer.get_weights())
+    self.assertEqual(num_weights_observed, num_weights_expected)
  def test_layer_invocation(self):
    cls_list = [("foo", 2), ("bar", 3)]
@@ -58,13 +80,31 @@ class MultiClsHeadsTest(tf.test.TestCase):
    self.assertAllEqual(test_layer.get_config(), new_layer.get_config())
-class GaussianProcessClassificationHead(tf.test.TestCase):
+class GaussianProcessClassificationHead(tf.test.TestCase,
+                                        parameterized.TestCase):
  def setUp(self):
    super().setUp()
    self.spec_norm_kwargs = dict(norm_multiplier=1.,)
    self.gp_layer_kwargs = dict(num_inducing=512)
+  @parameterized.named_parameters(("no_pooler_layer", 0, 7),
+                                  ("has_pooler_layer", 5, 11))
+  def test_pooler_layer(self, inner_dim, num_weights_expected):
+    test_layer = cls_head.GaussianProcessClassificationHead(
+        inner_dim=inner_dim,
+        num_classes=2,
+        use_spec_norm=True,
+        use_gp_layer=True,
+        initializer="zeros",
+        **self.spec_norm_kwargs,
+        **self.gp_layer_kwargs)
+    features = tf.zeros(shape=(2, 10, 10), dtype=tf.float32)
+    _ = test_layer(features)
+    num_weights_observed = len(test_layer.get_weights())
+    self.assertEqual(num_weights_observed, num_weights_expected)
  def test_layer_invocation(self):
    test_layer = cls_head.GaussianProcessClassificationHead(
        inner_dim=5,

--- a/official/nlp/modeling/models/bert_classifier.py
+++ b/official/nlp/modeling/models/bert_classifier.py
@@ -18,7 +18,6 @@ import collections
 import tensorflow as tf
 from official.nlp.modeling import layers
-from official.nlp.modeling import networks
 @tf.keras.utils.register_keras_serializable(package='Text')
@@ -46,6 +45,10 @@ class BertClassifier(tf.keras.Model):
    dropout_rate: The dropout probability of the cls head.
    use_encoder_pooler: Whether to use the pooler layer pre-defined inside the
      encoder.
+    cls_head: (Optional) The layer instance to use for the classifier head.
+      It should take in the output from network and produce the final logits.
+      If set, the arguments ('num_classes', 'initializer', 'dropout_rate',
+      'use_encoder_pooler') will be ignored.
  """
  def __init__(self,
@@ -54,7 +57,12 @@ class BertClassifier(tf.keras.Model):
               initializer='glorot_uniform',
               dropout_rate=0.1,
               use_encoder_pooler=True,
+               cls_head=None,
               **kwargs):
+    self.num_classes = num_classes
+    self.initializer = initializer
+    self.use_encoder_pooler = use_encoder_pooler
+    self.cls_head = cls_head
    # We want to use the inputs of the passed network as the inputs to this
    # Model. To do this, we need to keep a handle to the network inputs for use
@@ -66,31 +74,28 @@ class BertClassifier(tf.keras.Model):
      # invoke the Network object with its own input tensors to start the Model.
      outputs = network(inputs)
      if isinstance(outputs, list):
-        cls_output = outputs[1]
+        cls_inputs = outputs[1]
      else:
-        cls_output = outputs['pooled_output']
+        cls_inputs = outputs['pooled_output']
-      cls_output = tf.keras.layers.Dropout(rate=dropout_rate)(cls_output)
+      cls_inputs = tf.keras.layers.Dropout(rate=dropout_rate)(cls_inputs)
-      classifier = networks.Classification(
-          input_width=cls_output.shape[-1],
-          num_classes=num_classes,
-          initializer=initializer,
-          output='logits',
-          name='sentence_prediction')
-      predictions = classifier(cls_output)
    else:
      outputs = network(inputs)
      if isinstance(outputs, list):
-        sequence_output = outputs[0]
+        cls_inputs = outputs[0]
      else:
-        sequence_output = outputs['sequence_output']
+        cls_inputs = outputs['sequence_output']
+    if cls_head:
+      classifier = cls_head
+    else:
      classifier = layers.ClassificationHead(
-          inner_dim=sequence_output.shape[-1],
+          inner_dim=0 if use_encoder_pooler else cls_inputs.shape[-1],
          num_classes=num_classes,
          initializer=initializer,
          dropout_rate=dropout_rate,
          name='sentence_prediction')
-      predictions = classifier(sequence_output)
+    predictions = classifier(cls_inputs)
    # b/164516224
    # Once we've created the network using the Functional API, we call
@@ -102,13 +107,7 @@ class BertClassifier(tf.keras.Model):
    super(BertClassifier, self).__init__(
        inputs=inputs, outputs=predictions, **kwargs)
    self._network = network
-    config_dict = {
+    config_dict = self._make_config_dict()
-        'network': network,
-        'num_classes': num_classes,
-        'initializer': initializer,
-        'use_encoder_pooler': use_encoder_pooler,
-    }
    # We are storing the config dict as a namedtuple here to ensure checkpoint
    # compatibility with an earlier version of this model which did not track
    # the config dict attribute. TF does not track immutable attrs which
@@ -132,3 +131,12 @@ class BertClassifier(tf.keras.Model):
  @classmethod
  def from_config(cls, config, custom_objects=None):
    return cls(**config)
+  def _make_config_dict(self):
+    return {
+        'network': self._network,
+        'num_classes': self.num_classes,
+        'initializer': self.initializer,
+        'use_encoder_pooler': self.use_encoder_pooler,
+        'cls_head': self.cls_head,
+    }