Commit 420eec74 authored by Jared Casper

Addressing comments.

parent 2cc3dac7
@@ -140,7 +140,7 @@ class ParallelMLP(MegatronModule):
             assert self.activation_func == F.gelu
             intermediate_parallel = bias_gelu_impl(intermediate_parallel, bias_parallel)
         else:
-            if self.add_bias:
+            if bias_parallel is not None:
                 intermediate_parallel = intermediate_parallel + bias_parallel
             intermediate_parallel = self.activation_func(intermediate_parallel)
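This hunk switches the unfused path from checking the module's add_bias flag to checking whether the preceding linear layer actually returned a bias tensor. A minimal sketch of the resulting control flow, assuming a hypothetical standalone helper (plain PyTorch, not the Megatron code itself):

    import torch
    import torch.nn.functional as F

    def apply_bias_and_activation(intermediate, bias, activation_func=F.gelu):
        # Unfused path: add the bias only when the linear layer produced one;
        # bias is None when biases are disabled for the linear layers.
        if bias is not None:
            intermediate = intermediate + bias
        return activation_func(intermediate)

    # Usage: with biases disabled the linear returns (output, None),
    # and the helper simply skips the addition.
    x = torch.randn(4, 16)
    print(apply_bias_and_activation(x, None).shape)             # torch.Size([4, 16])
    print(apply_bias_and_activation(x, torch.zeros(16)).shape)  # torch.Size([4, 16])

Testing the returned tensor rather than a config flag keeps the forward pass correct even if the flag and the actual layer construction ever disagree.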
@@ -674,7 +674,7 @@ class ParallelTransformerLayer(MegatronModule):
             attention_type=AttnType.self_attn,
             attn_mask_type=self_attn_mask_type)
         self.hidden_dropout = args.hidden_dropout
-        self.bias_dropout_fusion = args.bias_dropout_fusion and args.add_bias_linear
+        self.bias_dropout_fusion = args.bias_dropout_fusion
         self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0.0 else None
         # Layernorm on the attention output
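This hunk stops tying bias_dropout_fusion to add_bias_linear, so the attribute now follows the command-line option directly. For context, a hedged sketch of how an unfused bias-dropout-add step can tolerate a missing bias (hypothetical helper, not the Megatron implementation):

    import torch
    import torch.nn.functional as F

    def bias_dropout_add(x, bias, residual, prob, training=True):
        # Add the bias if one exists, apply dropout, then the residual connection.
        if bias is not None:
            x = x + bias
        return residual + F.dropout(x, p=prob, training=training)

    # Usage: works whether or not the attention/MLP output carries a bias.
    h = torch.randn(2, 8)
    res = torch.randn(2, 8)
    print(bias_dropout_add(h, None, res, prob=0.1).shape)  # torch.Size([2, 8])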