Commit 43529f78 authored by mohammad

addressed deepaks comments

parent 242770dd
@@ -103,7 +103,7 @@ class DynamicGradScaler(MegatronGradScaler):
         if found_inf:
             self._growth_tracker = 0
             self._hysteresis_tracker -= 1
-            # Now if we are our of hysteresis count, scale down the loss.
+            # Now if we are out of hysteresis count, scale down the loss.
             if self._hysteresis_tracker <= 0:
                 self._scale = torch.max(self._scale * self.backoff_factor,
                                         self.min_scale)
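For context, here is a minimal, self-contained sketch of the hysteresis-based dynamic loss scaling behavior this hunk touches: an inf/nan only backs the scale off after the hysteresis credits run out, while a long enough clean streak grows the scale again. The class name and default values below are illustrative assumptions, not Megatron's API.

# Toy sketch of hysteresis-based dynamic loss scaling (assumed names/defaults).
class ToyDynamicScaler:
    def __init__(self, initial_scale=2.0**16, min_scale=1.0,
                 growth_factor=2.0, backoff_factor=0.5,
                 growth_interval=1000, hysteresis=2):
        self.scale = initial_scale
        self.min_scale = min_scale
        self.growth_factor = growth_factor
        self.backoff_factor = backoff_factor
        self.growth_interval = growth_interval
        self.hysteresis = hysteresis
        self._growth_tracker = 0
        self._hysteresis_tracker = hysteresis

    def update(self, found_inf):
        if found_inf:
            # An inf/nan resets the growth streak and burns one hysteresis
            # credit; the scale only backs off once the credits are spent.
            self._growth_tracker = 0
            self._hysteresis_tracker -= 1
            if self._hysteresis_tracker <= 0:
                self.scale = max(self.scale * self.backoff_factor,
                                 self.min_scale)
        else:
            # Enough consecutive clean steps refill the hysteresis credits
            # and grow the scale.
            self._growth_tracker += 1
            if self._growth_tracker == self.growth_interval:
                self._growth_tracker = 0
                self._hysteresis_tracker = self.hysteresis
                self.scale *= self.growth_factor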
@@ -375,7 +375,7 @@ class FP32Optimizer(MegatronOptimizer):
     @torch.no_grad()
     def step(self):
         """Clip gradients (if needed) and step the base optimizer.
-        Always return auccessful since there is no overflow."""
+        Always return successful since there is no overflow."""
         # Clip gradients.
         if self.clip_grad > 0.0:
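For context, a rough sketch of why an fp32 optimizer wrapper can unconditionally report success: there is no loss scale and no inf/nan check, so the step is just optional clipping followed by the base optimizer. The wrapper and clip helper below are simplified stand-ins, not Megatron's implementation.

import torch

class ToyFP32Optimizer:
    """Illustrative wrapper: fp32 grads never need an overflow check,
    so step() never skips the update and always returns True."""

    def __init__(self, optimizer, clip_grad=0.0):
        self.optimizer = optimizer
        self.clip_grad = clip_grad

    @torch.no_grad()
    def step(self):
        if self.clip_grad > 0.0:
            params = [p for group in self.optimizer.param_groups
                      for p in group['params']]
            # Stand-in for Megatron's own gradient-clipping helper.
            torch.nn.utils.clip_grad_norm_(params, self.clip_grad)
        self.optimizer.step()
        # No overflow is possible in fp32, so the update always "succeeds".
        return True

Usage would look like ToyFP32Optimizer(torch.optim.Adam(model.parameters()), clip_grad=1.0).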
@@ -183,6 +183,9 @@ def get_model(model_provider_func):
     model = model_provider_func()
     # Set tensor model parallel attributes if not set.
+    # Only parameters that are already tensor model parallel have these
+    # attributes set for them. We should make sure the default attributes
+    # are set for all params so the optimizer can use them.
     for param in model.parameters():
         mpu.set_defaults_if_not_set_tensor_model_parallel_attributes(param)
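For context, a toy illustration of the "set defaults if not set" pattern the new comment describes: parameters created by tensor-parallel layers already carry the attributes, and everything else receives defaults so downstream code (e.g. the optimizer) can read them unconditionally. The attribute names and defaults below follow Megatron's convention but the helper itself is a simplified stand-in for mpu.set_defaults_if_not_set_tensor_model_parallel_attributes.

# Assumed attribute names/defaults, mirroring Megatron's convention.
_TP_ATTRIBUTE_DEFAULTS = {'tensor_model_parallel': False,
                          'partition_dim': -1,
                          'partition_stride': 1}

def set_tp_defaults_if_not_set(param):
    # Only fill in attributes that a tensor-parallel layer did not
    # already set on this parameter.
    for name, default in _TP_ATTRIBUTE_DEFAULTS.items():
        if not hasattr(param, name):
            setattr(param, name, default)

# Usage: apply to every parameter right after the model is built, e.g.
# for param in model.parameters():
#     set_tp_defaults_if_not_set(param)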