OpenDAS / Megatron-LM / Commits

Commit fd33e930
authored Mar 26, 2020 by Neel Kant

    Create ICTBertModel and update model/__init__.py

parent bcb320ee
Showing 2 changed files, with 45 additions and 1 deletion (+45 -1):

    megatron/model/__init__.py   +1  -1
    megatron/model/bert_model.py +44 -0
megatron/model/__init__.py

@@ -14,6 +14,6 @@
 # limitations under the License.
 
 from .distributed import *
-from .bert_model import BertModel
+from .bert_model import BertModel, ICTBertModel
 from .gpt2_model import GPT2Model
 from .utils import get_params_for_weight_decay_optimization
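With ICTBertModel re-exported from the package namespace, downstream code can import both classes from megatron.model directly. A minimal sketch, assuming the megatron package is on the Python path:

    from megatron.model import BertModel, ICTBertModel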
megatron/model/bert_model.py

@@ -240,3 +240,47 @@ class BertModel(MegatronModule):
             self.ict_head.load_state_dict(state_dict[self._ict_head_key],
                                           strict=strict)
+
+
+class ICTBertModel(MegatronModule):
+    def __init__(self,
+                 num_layers,
+                 vocab_size,
+                 hidden_size,
+                 num_attention_heads,
+                 embedding_dropout_prob,
+                 attention_dropout_prob,
+                 output_dropout_prob,
+                 max_sequence_length,
+                 checkpoint_activations,
+                 ict_head_size,
+                 checkpoint_num_layers=1,
+                 layernorm_epsilon=1.0e-5,
+                 init_method_std=0.02,
+                 num_tokentypes=0,
+                 parallel_output=True,
+                 apply_query_key_layer_scaling=False,
+                 attention_softmax_in_fp32=False):
+        super(ICTBertModel, self).__init__()
+        bert_args = dict(
+            num_layers=num_layers,
+            vocab_size=vocab_size,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            embedding_dropout_prob=embedding_dropout_prob,
+            attention_dropout_prob=attention_dropout_prob,
+            output_dropout_prob=output_dropout_prob,
+            max_sequence_length=max_sequence_length,
+            checkpoint_activations=checkpoint_activations,
+            add_binary_head=False,
+            ict_head_size=ict_head_size,
+            checkpoint_num_layers=checkpoint_num_layers,
+            layernorm_epsilon=layernorm_epsilon,
+            init_method_std=init_method_std,
+            num_tokentypes=num_tokentypes,
+            parallel_output=parallel_output,
+            apply_query_key_layer_scaling=apply_query_key_layer_scaling,
+            attention_softmax_in_fp32=attention_softmax_in_fp32)
+        self.question_model = BertModel(**bert_args)
+        self.evidence_model = BertModel(**bert_args)
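ICTBertModel is a dual encoder: the same BertModel configuration (with add_binary_head=False and an ICT head of width ict_head_size) is instantiated twice, as question_model and evidence_model, so the two towers share hyperparameters but not weights. This commit adds only the constructor; how the towers' outputs are combined is not part of the diff. As an illustrative sketch of the usual dual-encoder retrieval step, the snippet below scores every question embedding against every evidence-block embedding with a dot product; the function name and the random stand-in tensors are hypothetical, not from this commit:

    import torch

    def retrieval_scores(question_embeds, evidence_embeds):
        # question_embeds: [num_questions, ict_head_size]
        # evidence_embeds: [num_blocks, ict_head_size]
        # Score every (question, block) pair with a dot product.
        return torch.matmul(question_embeds, evidence_embeds.t())

    # Random tensors standing in for the outputs of the two towers'
    # ICT heads, e.g. with ict_head_size=128:
    q = torch.randn(4, 128)
    e = torch.randn(10, 128)
    scores = retrieval_scores(q, e)  # shape [4, 10]

Sharing bert_args between the two towers keeps the encoders architecturally symmetric, which is what makes a dot product in the shared ict_head_size-dimensional space a sensible relevance score.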