"examples/mxnet/vscode:/vscode.git/clone" did not exist on "12d706300cba4d9ec25cfa1075ab4d2703dd89f0"
Commit f7852565 authored by Sergey Mironov

Make BertPretrainer accept embedding_table explicitly

parent 31da2245
@@ -212,6 +212,7 @@ def pretrain_model(bert_config,
           stddev=bert_config.initializer_range)
   pretrainer_model = models.BertPretrainer(
       network=transformer_encoder,
+      embedding_table=transformer_encoder.get_embedding_table(),
       num_classes=2,  # The next sentence prediction label has two classes.
       num_token_predictions=max_predictions_per_seq,
       initializer=initializer,
...
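For context, a minimal sketch of the updated call site, assuming the `official.nlp.modeling` API of this commit's era; the sizes and hyperparameters below are illustrative stand-ins for values the real code derives from `bert_config`:

```python
import tensorflow as tf
from official.nlp.modeling import models, networks

# Small, illustrative encoder; real training uses bert_config values.
transformer_encoder = networks.TransformerEncoder(
    vocab_size=30522, num_layers=2)

pretrainer_model = models.BertPretrainer(
    network=transformer_encoder,
    # New in this commit: the table is handed over explicitly rather
    # than fetched inside BertPretrainer.
    embedding_table=transformer_encoder.get_embedding_table(),
    num_classes=2,  # next sentence prediction has two classes
    num_token_predictions=20,
    initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
    output='predictions')
```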
...@@ -39,14 +39,15 @@ class BertPretrainer(tf.keras.Model): ...@@ -39,14 +39,15 @@ class BertPretrainer(tf.keras.Model):
Arguments: Arguments:
network: A transformer network. This network should output a sequence output network: A transformer network. This network should output a sequence output
and a classification output. Furthermore, it should expose its embedding and a classification output.
table via a "get_embedding_table" method.
num_classes: Number of classes to predict from the classification network. num_classes: Number of classes to predict from the classification network.
num_token_predictions: Number of tokens to predict from the masked LM. num_token_predictions: Number of tokens to predict from the masked LM.
activation: The activation (if any) to use in the masked LM and activation: The activation (if any) to use in the masked LM and
classification networks. If None, no activation will be used. classification networks. If None, no activation will be used.
initializer: The initializer (if any) to use in the masked LM and initializer: The initializer (if any) to use in the masked LM and
classification networks. Defaults to a Glorot uniform initializer. classification networks. Defaults to a Glorot uniform initializer.
embedding_table: Embedding table of a network. If None, the
"network.get_embedding_table()" is used.
output: The output style for this network. Can be either 'logits' or output: The output style for this network. Can be either 'logits' or
'predictions'. 'predictions'.
""" """
@@ -58,6 +59,7 @@ class BertPretrainer(tf.keras.Model):
                activation=None,
                initializer='glorot_uniform',
                output='logits',
+               embedding_table=None,
                **kwargs):
     self._self_setattr_tracking = False
     self._config = {
@@ -100,6 +102,7 @@ class BertPretrainer(tf.keras.Model):
         num_predictions=num_token_predictions,
         input_width=sequence_output.shape[-1],
         source_network=network,
+        embedding_table=embedding_table,
         activation=activation,
         initializer=initializer,
         output=output,
...
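Since the table is now threaded through BertPretrainer rather than looked up on the network, the masked-LM head only needs a `[vocab_size, hidden_size]` tensor. A hedged sketch of what that permits, reusing the `transformer_encoder` from the sketch above; the standalone variable is purely illustrative:

```python
# Illustrative only: any [vocab_size, hidden_size] variable can now be
# tied to the masked-LM output projection, not just the table the
# encoder exposes via get_embedding_table().
custom_table = tf.Variable(
    tf.random.truncated_normal([30522, 768], stddev=0.02),
    name='custom_word_embeddings')

pretrainer = models.BertPretrainer(
    network=transformer_encoder,  # encoder from the earlier sketch
    embedding_table=custom_table,
    num_classes=2,
    num_token_predictions=20)
```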
@@ -37,6 +37,8 @@ class MaskedLM(network.Network):
     num_predictions: The number of predictions to make per sequence.
     source_network: The network with the embedding layer to use for the
       embedding layer.
+    embedding_table: The embedding table of the source network. If None, the
+      `source_network.get_embedding_table()` method is used.
     activation: The activation, if any, for the dense layer in this network.
     initializer: The initializer for the dense layer in this network. Defaults to
       a Glorot uniform initializer.
@@ -48,12 +50,16 @@ class MaskedLM(network.Network):
                input_width,
                num_predictions,
                source_network,
+               embedding_table=None,
                activation=None,
                initializer='glorot_uniform',
                output='logits',
                **kwargs):
-    embedding_table = source_network.get_embedding_table()
+    if embedding_table is None:
+      embedding_table = source_network.get_embedding_table()
     vocab_size, hidden_size = embedding_table.shape
     sequence_data = tf.keras.layers.Input(
...
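The behavioural core of the patch is the None-default fallback in MaskedLM: an explicitly supplied table wins, and omitting the argument preserves the old behaviour. A self-contained sketch of the same idiom; `SimpleMaskedHead` is a hypothetical stand-in, not the library class:

```python
import tensorflow as tf

class SimpleMaskedHead(tf.keras.layers.Layer):
  """Hypothetical stand-in showing the embedding_table fallback idiom."""

  def __init__(self, source_network, embedding_table=None, **kwargs):
    super().__init__(**kwargs)
    # An explicit table takes precedence; None falls back to the
    # network's accessor, which keeps existing callers working.
    if embedding_table is None:
      embedding_table = source_network.get_embedding_table()
    self.embedding_table = embedding_table

  def call(self, hidden_states):
    # Tie the output projection to the (possibly shared) embedding
    # table: [batch, hidden] x [vocab, hidden]^T -> [batch, vocab].
    return tf.matmul(hidden_states, self.embedding_table, transpose_b=True)
```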