Commit 43529f78 authored by mohammad

addressed deepaks comments

parent 242770dd
@@ -103,7 +103,7 @@ class DynamicGradScaler(MegatronGradScaler):
         if found_inf:
             self._growth_tracker = 0
             self._hysteresis_tracker -= 1
-            # Now if we are our of hysteresis count, scale down the loss.
+            # Now if we are out of hysteresis count, scale down the loss.
             if self._hysteresis_tracker <= 0:
                 self._scale = torch.max(self._scale * self.backoff_factor,
                                         self.min_scale)
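For context, here is a minimal, self-contained sketch of the hysteresis-based dynamic loss scaling behavior this hunk touches: an inf/nan only backs the scale off after the hysteresis credits run out, while a long enough clean streak grows the scale again. The class name and default values below are illustrative assumptions, not Megatron's API.

# Toy sketch of hysteresis-based dynamic loss scaling (assumed names/defaults).
class ToyDynamicScaler:
    def __init__(self, initial_scale=2.0**16, min_scale=1.0,
                 growth_factor=2.0, backoff_factor=0.5,
                 growth_interval=1000, hysteresis=2):
        self.scale = initial_scale
        self.min_scale = min_scale
        self.growth_factor = growth_factor
        self.backoff_factor = backoff_factor
        self.growth_interval = growth_interval
        self.hysteresis = hysteresis
        self._growth_tracker = 0
        self._hysteresis_tracker = hysteresis

    def update(self, found_inf):
        if found_inf:
            # An inf/nan resets the growth streak and burns one hysteresis
            # credit; the scale only backs off once the credits are spent.
            self._growth_tracker = 0
            self._hysteresis_tracker -= 1
            if self._hysteresis_tracker <= 0:
                self.scale = max(self.scale * self.backoff_factor,
                                 self.min_scale)
        else:
            # Enough consecutive clean steps refill the hysteresis credits
            # and grow the scale.
            self._growth_tracker += 1
            if self._growth_tracker == self.growth_interval:
                self._growth_tracker = 0
                self._hysteresis_tracker = self.hysteresis
                self.scale *= self.growth_factor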
@@ -375,7 +375,7 @@ class FP32Optimizer(MegatronOptimizer):
     @torch.no_grad()
     def step(self):
         """Clip gradients (if needed) and step the base optimizer.
-        Always return auccessful since there is no overflow."""
+        Always return successful since there is no overflow."""
         # Clip gradients.
         if self.clip_grad > 0.0:
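For context, a rough sketch of why an fp32 optimizer wrapper can unconditionally report success: there is no loss scale and no inf/nan check, so the step is just optional clipping followed by the base optimizer. The wrapper and clip helper below are simplified stand-ins, not Megatron's implementation.

import torch

class ToyFP32Optimizer:
    """Illustrative wrapper: fp32 grads never need an overflow check,
    so step() never skips the update and always returns True."""

    def __init__(self, optimizer, clip_grad=0.0):
        self.optimizer = optimizer
        self.clip_grad = clip_grad

    @torch.no_grad()
    def step(self):
        if self.clip_grad > 0.0:
            params = [p for group in self.optimizer.param_groups
                      for p in group['params']]
            # Stand-in for Megatron's own gradient-clipping helper.
            torch.nn.utils.clip_grad_norm_(params, self.clip_grad)
        self.optimizer.step()
        # No overflow is possible in fp32, so the update always "succeeds".
        return True

Usage would look like ToyFP32Optimizer(torch.optim.Adam(model.parameters()), clip_grad=1.0).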
@@ -183,6 +183,9 @@ def get_model(model_provider_func):
     model = model_provider_func()
     # Set tensor model parallel attributes if not set.
+    # Only parameters that are already tensor model parallel have these
+    # attributes set for them. We should make sure the default attributes
+    # are set for all params so the optimizer can use them.
     for param in model.parameters():
         mpu.set_defaults_if_not_set_tensor_model_parallel_attributes(param)
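For context, a toy illustration of the "set defaults if not set" pattern the new comment describes: parameters created by tensor-parallel layers already carry the attributes, and everything else receives defaults so downstream code (e.g. the optimizer) can read them unconditionally. The attribute names and defaults below follow Megatron's convention but the helper itself is a simplified stand-in for mpu.set_defaults_if_not_set_tensor_model_parallel_attributes.

# Assumed attribute names/defaults, mirroring Megatron's convention.
_TP_ATTRIBUTE_DEFAULTS = {'tensor_model_parallel': False,
                          'partition_dim': -1,
                          'partition_stride': 1}

def set_tp_defaults_if_not_set(param):
    # Only fill in attributes that a tensor-parallel layer did not
    # already set on this parameter.
    for name, default in _TP_ATTRIBUTE_DEFAULTS.items():
        if not hasattr(param, name):
            setattr(param, name, default)

# Usage: apply to every parameter right after the model is built, e.g.
# for param in model.parameters():
#     set_tp_defaults_if_not_set(param)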