Commit 5ed304e4 authored by Mostofa Patwary

addressing the comments

parent 0760822b
megatron/arguments.py
@@ -515,7 +515,7 @@ def _add_network_size_args(parser):
     group.add_argument('--layernorm-epsilon', type=float, default=1e-5,
                        help='Layer norm epsilon.')
     group.add_argument('--apply-layernorm-1p', action='store_true',
-                       help='Use layernorm 1p')
+                       help='Weight adjustment centered around zero.')
     group.add_argument('--apply-residual-connection-post-layernorm',
                        action='store_true',
                        help='If set, use original BERT residual connection '
...
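For readers landing on this hunk: "weight adjustment centered around zero" means the layer norm gain is stored as a zero-initialized residual and 1 is added back when the norm is applied (matching the `self.weight + 1` in the draft class deleted below), so regularization such as weight decay pulls the effective gain toward 1 rather than 0. A minimal sketch of that parameterization using plain `torch.nn.functional.layer_norm` rather than the repo's fused Apex kernels; tensor names and sizes here are illustrative, not from the commit:

```python
import torch
import torch.nn.functional as F

hidden = 1024                                     # illustrative size
weight = torch.zeros(hidden, requires_grad=True)  # stored gain starts at 0
bias = torch.zeros(hidden, requires_grad=True)

x = torch.randn(8, hidden)
# Effective gain is weight + 1, so training starts from an identity gain.
y = F.layer_norm(x, (hidden,), weight + 1, bias, eps=1e-5)
```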
megatron/model/__init__.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 from .fused_layer_norm import MixedFusedLayerNorm as LayerNorm
-#from .fused_layer_norm import MixedFusedLayerNorm1P as LayerNorm1P
 from .distributed import DistributedDataParallel
 from .bert_model import BertModel
...
megatron/model/fused_layer_norm.py
@@ -126,29 +126,3 @@ class MixedFusedLayerNorm(torch.nn.Module):
                                           keep_graph = True)
         return output
-
-#class MixedFusedLayerNorm1P(MixedFusedLayerNorm):
-#    def reset_parameters(self):
-#        init.zeros_(self.weight)
-#        init.zeros_(self.bias)
-#
-#    def forward(self, input):
-#
-#        if self.no_persist_layer_norm:
-#            return FusedLayerNormAffineFunction.apply(
-#                input, self.weight + 1, self.bias, self.normalized_shape, self.eps)
-#        else:
-#            output = FastLayerNormFN.apply(
-#                input, self.weight + 1, self.bias, self.eps)
-#
-#            # Apex's fast layer norm function outputs a 'view' tensor (i.e., has
-#            # a populated '_base' field). This will result in schedule.py's
-#            # deallocate_output_tensor() throwing an error, so a viewless tensor is
-#            # created to prevent this.
-#            output = make_viewless_tensor(inp = output,
-#                                          requires_grad = input.requires_grad,
-#                                          keep_graph = True)
-#
-#            return output
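The draft subclass removed here overrode `reset_parameters` to zero the gain and added 1 back at call time. If the same behavior is later folded into a single module behind a flag (as the new `--apply-layernorm-1p` argument suggests), it could look roughly like the sketch below. This uses plain `torch.nn.functional.layer_norm` instead of Apex's `FusedLayerNormAffineFunction`/`FastLayerNormFN` fused kernels, and the class name is an assumption for illustration, not code from this commit:

```python
import torch
from torch.nn import Parameter, init


class LayerNorm1P(torch.nn.Module):
    """Layer norm with a zero-centered stored gain: forward uses weight + 1."""

    def __init__(self, hidden_size, eps=1e-5, apply_layernorm_1p=True):
        super().__init__()
        self.normalized_shape = (hidden_size,)
        self.eps = eps
        self.apply_layernorm_1p = apply_layernorm_1p
        self.weight = Parameter(torch.empty(hidden_size))
        self.bias = Parameter(torch.empty(hidden_size))
        self.reset_parameters()

    def reset_parameters(self):
        if self.apply_layernorm_1p:
            # Stored gain starts at 0; effective gain (weight + 1) starts at 1.
            init.zeros_(self.weight)
        else:
            init.ones_(self.weight)
        init.zeros_(self.bias)

    def forward(self, input):
        weight = self.weight + 1 if self.apply_layernorm_1p else self.weight
        return torch.nn.functional.layer_norm(
            input, self.normalized_shape, weight, self.bias, self.eps)
```

Compared with keeping a separate subclass, a flag keeps call sites such as `ParallelTransformerLayer` on a single `LayerNorm` import, which matches the commented-out import swaps this commit deletes in transformer.py below.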
megatron/model/transformer.py
@@ -638,9 +638,6 @@ class ParallelTransformerLayer(MegatronModule):
         apply_layernorm_1p = False
         if args.apply_layernorm_1p:
             apply_layernorm_1p = True
-            #from megatron.model import LayerNorm1P as LayerNorm
-        #else:
-        #    from megatron.model import LayerNorm

         # Layernorm on the input data.
         self.input_layernorm = LayerNorm(
@@ -1033,9 +1030,6 @@ class ParallelTransformer(MegatronModule):
         apply_layernorm_1p = False
         if args.apply_layernorm_1p:
             apply_layernorm_1p = True
-            #from megatron.model import LayerNorm1P as LayerNorm
-        #else:
-        #    from megatron.model import LayerNorm

         if self.post_process and self.post_layer_norm:
             # Final layer norm before output.
...
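With the import swaps gone, the local `apply_layernorm_1p` boolean is presumably forwarded into the `LayerNorm` construction; the diff truncates before the constructor arguments, so the wiring below is a hedged sketch, reusing the illustrative `LayerNorm1P` class from earlier and mirroring the argparse flags from this commit rather than the repo's actual call:

```python
import argparse
import torch

parser = argparse.ArgumentParser()
parser.add_argument('--layernorm-epsilon', type=float, default=1e-5,
                    help='Layer norm epsilon.')
parser.add_argument('--apply-layernorm-1p', action='store_true',
                    help='Weight adjustment centered around zero.')
args = parser.parse_args(['--apply-layernorm-1p'])

# Mirrors the flag plumbing shown in the hunks above; LayerNorm1P is the
# illustrative module sketched earlier, not the repo's fused implementation.
apply_layernorm_1p = False
if args.apply_layernorm_1p:
    apply_layernorm_1p = True

input_layernorm = LayerNorm1P(1024, eps=args.layernorm_epsilon,
                              apply_layernorm_1p=apply_layernorm_1p)
print(input_layernorm(torch.randn(2, 1024)).shape)  # torch.Size([2, 1024])
```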