Commit 5c8238c3 authored by Deepak Narayanan

Fix grad norm computation

parent da1c96e9
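
The `shared` flags introduced in the first hunk below are presumably what the corrected gradient-norm computation consults, so that embedding weights replicated across pipeline stages contribute to the global norm only once. A minimal sketch of such a shared-aware norm, under that assumption (`grad_norm_skipping_duplicates` and the `is_first_stage` flag are illustrative names, not part of this commit):

    import torch

    def grad_norm_skipping_duplicates(parameters, is_first_stage, norm_type=2.0):
        # Accumulate the local contribution to the global gradient norm.
        # Parameters tagged with ``shared = True`` are replicas of weights
        # owned by another pipeline stage, so only one stage (here, the
        # first) lets them contribute; otherwise a reduction across stages
        # would count the same weight multiple times.
        total = 0.0
        for p in parameters:
            if p.grad is None:
                continue
            if getattr(p, 'shared', False) and not is_first_stage:
                continue
            total += p.grad.data.norm(norm_type).item() ** norm_type
        # In the real distributed setting, ``total`` would be all-reduced
        # across model-parallel ranks before taking the root.
        return total ** (1.0 / norm_type)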
......@@ -164,9 +164,12 @@ class Embedding(MegatronModule):
     def zero_parameters(self):
         """Zero out all parameters in embedding."""
         self.word_embeddings.weight.data.fill_(0)
+        self.word_embeddings.weight.shared = True
         self.position_embeddings.weight.data.fill_(0)
+        self.position_embeddings.weight.shared = True
         if self.num_tokentypes > 0:
             self.tokentype_embeddings.weight.data.fill_(0)
+            self.tokentype_embeddings.weight.shared = True
 
     def add_tokentype_embeddings(self, num_tokentypes):
         """Add token-type embedding. This function is provided so we can add
......@@ -331,10 +334,6 @@ class TransformerLanguageModel(MegatronModule):
         # Decoder (usually set to False, True if part of an encoder-decoder
         # architecture and in decoder-only stage).
         if self.add_decoder:
-            # Temporary assertion until we verify correctness of pipeline parallelism
-            # implementation of T5.
-            assert args.pipeline_model_parallel_size == 1, \
-                'pipeline parallelism is not supported in the presence of decoder'
             self.decoder = ParallelTransformer(
                 self.init_method,
                 output_layer_init_method,
......