Address review comments: remove the commented-out LayerNorm1P code and clarify the --apply-layernorm-1p help text

5ed304e4 · Mostofa Patwary · 0760822b
Commit 5ed304e4 authored Apr 05, 2023 by Mostofa Patwary
4 changed files
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -515,7 +515,7 @@ def _add_network_size_args(parser):
    group.add_argument('--layernorm-epsilon', type=float, default=1e-5,
                       help='Layer norm epsilon.')
    group.add_argument('--apply-layernorm-1p', action='store_true',
-                       help='Use layernorm 1p')
+                       help='Weight adjustment centered around zero.')
    group.add_argument('--apply-residual-connection-post-layernorm',
                       action='store_true',
                       help='If set, use original BERT residula connection '

--- a/megatron/model/__init__.py
+++ b/megatron/model/__init__.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

 from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm
-#from .fused_layer_norm import MixedFusedLayerNorm1P as LayerNorm1P

 from .distributed import DistributedDataParallel
 from .bert_model import BertModel

--- a/megatron/model/fused_layer_norm.py
+++ b/megatron/model/fused_layer_norm.py
@@ -126,29 +126,3 @@ class MixedFusedLayerNorm(torch.nn.Module):
                                      keep_graph = True)

        return output
-
-
-
-#class MixedFusedLayerNorm1P(MixedFusedLayerNorm):
-#  def reset_parameters(self):
-#    init.zeros_(self.weight)
-#    init.zeros_(self.bias)
-#
-#  def forward(self, input):
-#
-#    if self.no_persist_layer_norm:
-#        return FusedLayerNormAffineFunction.apply(
-#          input, self.weight + 1, self.bias, self.normalized_shape, self.eps)
-#    else:
-#        output = FastLayerNormFN.apply(
-#          input, self.weight + 1, self.bias, self.eps)
-#
-#        # Apex's fast layer norm function outputs a 'view' tensor (i.e., has
-#        # a populated '_base' field). This will result in schedule.py's
-#        # deallocate_output_tensor() throwing an error, so a viewless tensor is
-#        # created to prevent this.
-#        output = make_viewless_tensor(inp = output,
-#                                      requires_grad = input.requires_grad,
-#                                      keep_graph = True)
-#
-#        return output
--- a/megatron/model/transformer.py
+++ b/megatron/model/transformer.py
@@ -638,9 +638,6 @@ class ParallelTransformerLayer(MegatronModule):
        apply_layernorm_1p = False
        if args.apply_layernorm_1p:
            apply_layernorm_1p = True
-            #from megatron.model import LayerNorm1P as LayerNorm
-        #else:
-        #    from megatron.model import LayerNorm

        # Layernorm on the input data.
        self.input_layernorm = LayerNorm(
@@ -1033,9 +1030,6 @@ class ParallelTransformer(MegatronModule):
        apply_layernorm_1p = False
        if args.apply_layernorm_1p:
            apply_layernorm_1p = True
-            #from megatron.model import LayerNorm1P as LayerNorm
-        #else:
-        #    from megatron.model import LayerNorm

        if self.post_process and self.post_layer_norm:
            # Final layer norm before output.