Commit e515f026 authored by hwijeen

fix typo

parent 90e0a0dd
@@ -53,8 +53,7 @@ class ParallelMLP(MegatronModule):
     MLP will take the input with h hidden state, project it to 4*h
     hidden dimension, perform nonlinear transformation, and project the
-    state back into h hidden dimension. At the end, dropout is also
-    applied.
+    state back into h hidden dimension.
     """
     def __init__(self, init_method, output_layer_init_method):
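The docstring touched by the hunk above describes the standard transformer MLP block: project the hidden size h up to 4*h, apply a nonlinearity, and project back down to h. Below is a minimal, non-model-parallel sketch of that pattern for orientation; the class and attribute names are illustrative rather than Megatron-LM's actual ParallelMLP, and GeLU is assumed as the nonlinearity.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class SimpleMLP(nn.Module):
    """Illustrative, single-GPU version of the h -> 4h -> h pattern."""

    def __init__(self, hidden_size):
        super().__init__()
        self.dense_h_to_4h = nn.Linear(hidden_size, 4 * hidden_size)
        self.dense_4h_to_h = nn.Linear(4 * hidden_size, hidden_size)

    def forward(self, hidden_states):
        # [s, b, h] -> [s, b, 4h]
        intermediate = F.gelu(self.dense_h_to_4h(hidden_states))
        # [s, b, 4h] -> [s, b, h]
        return self.dense_4h_to_h(intermediate)
```

In the real ParallelMLP the two projections are ColumnParallelLinear and RowParallelLinear, which split the 4h dimension across tensor-model-parallel ranks; those classes appear in the later hunks.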
@@ -84,7 +83,6 @@ class ParallelMLP(MegatronModule):
             init_method=output_layer_init_method,
             skip_bias_add=True)
     def forward(self, hidden_states):
         # [s, b, 4hp]
......
@@ -256,7 +256,7 @@ class ColumnParallelLinear(torch.nn.Module):
                 device=torch.cuda.current_device(), dtype=args.params_dtype))
             _initialize_affine_weight_gpu(self.weight, init_method,
                                           partition_dim=0, stride=stride)
         if bias:
             if args.use_cpu_initialization:
                 self.bias = Parameter(torch.empty(
@@ -286,7 +286,7 @@ class ColumnParallelLinear(torch.nn.Module):
             # All-gather across the partitions.
             output = gather_from_tensor_model_parallel_region(output_parallel)
         else:
-            output = output_parallel
+            output = output_parallel
         output_bias = self.bias if self.skip_bias_add else None
         return output, output_bias
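This hunk's forward logic shows the two output paths of ColumnParallelLinear: with gather_output, the per-rank partial results are all-gathered across the tensor-model-parallel group; otherwise each rank keeps only its own output slice. The sketch below simulates the idea without a torch.distributed process group (the "ranks" are just list entries), to show why concatenating column-partitioned partial outputs reproduces the full matmul.

```python
import torch

torch.manual_seed(0)
hidden, out_features, world_size = 8, 16, 4
x = torch.randn(3, hidden)
weight = torch.randn(out_features, hidden)

# Reference result with the unpartitioned weight.
full_output = x @ weight.t()

# Each simulated rank owns out_features / world_size output columns.
partitions = weight.chunk(world_size, dim=0)
partial_outputs = [x @ w_part.t() for w_part in partitions]

# gather_from_tensor_model_parallel_region corresponds, conceptually, to
# concatenating the partials along the partitioned (last) dimension.
gathered = torch.cat(partial_outputs, dim=-1)
assert torch.allclose(gathered, full_output, atol=1e-5)
```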
@@ -316,8 +316,8 @@ class RowParallelLinear(torch.nn.Module):
     keep_master_weight_for_test: This was added for testing and should be
                                  set to False. It returns the master weights
                                  used for initialization.
-    skip_bias_add: This was added to enable performance optimations where bias
-                   can be fused with other elementwise operations. we skip
+    skip_bias_add: This was added to enable performance optimization where bias
+                   can be fused with other elementwise operations. We skip
                    adding bias but instead return it.
     """
......
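The skip_bias_add docstring in the last hunk states the motivation: the layer returns the bias unapplied so the caller can fuse the bias add with a later elementwise operation instead of paying for a separate add inside the linear layer. A simplified sketch of that calling pattern follows; the function name and shapes are illustrative, not Megatron-LM's API.

```python
import torch
import torch.nn.functional as F


def linear_skip_bias_add(x, weight, bias):
    # Matmul only; the unapplied bias is handed back to the caller.
    return torch.matmul(x, weight.t()), bias


x = torch.randn(4, 8)
weight = torch.randn(16, 8)
bias = torch.randn(16)

output, returned_bias = linear_skip_bias_add(x, weight, bias)
# The caller performs the bias add together with the activation in one
# expression (in practice this would be a fused bias+GeLU kernel).
activated = F.gelu(output + returned_bias)
```

This matches the separate output and output_bias return values visible in the ColumnParallelLinear forward hunk above.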