OpenDAS / Megatron-LM · Commits · f6811e28
"vscode:/vscode.git/clone" did not exist on "b8fe2b4837c4ccf9f71c65d86ff336a98f45d1c3"
Commit f6811e28, authored Mar 11, 2022 by Lawrence McAfee

guard 'gather_params()' with 'if update_successful:'

Parent: c64098ef
Showing 3 changed files with 33 additions and 2 deletions (+33 -2):

  megatron/optimizer/distrib_optimizer.py   +8  -1
  megatron/optimizer/optimizer.py           +23 -0
  megatron/training.py                      +2  -1
megatron/optimizer/distrib_optimizer.py @ f6811e28

@@ -177,12 +177,19 @@ class DistributedOptimizer(MixedPrecisionOptimizer):

    def allocate_main_param_shards(cls, opt_group_shards):

        # Allocator method.
        # >>>
        allocate_shard = lambda shard_size, dtype : torch.empty(
            (shard_size,),
            dtype = dtype,
            device = torch.cuda.current_device(),
            requires_grad = True)
        # allocate_shard = lambda shard_size, dtype : torch.zeros(
        #     (shard_size,),
        #     dtype = dtype,
        #     device = torch.cuda.current_device(),
        #     requires_grad = True)
        # <<<

        # Allocate each group's param/grad shard.
        for group_index, group_shard in enumerate(opt_group_shards):
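A note on the allocator change above: torch.empty skips the zero-fill that torch.zeros performs, which is only safe if the shard is fully initialized afterwards (presumably by copying values in from the model parameters). The sketch below is illustrative only and not Megatron's code; the allocate_shard helper, the CPU fallback, and the copy step are assumptions.

import torch

# Illustrative allocator in the spirit of the lambda above, with a CPU
# fallback so the sketch runs without a GPU.
device = torch.cuda.current_device() if torch.cuda.is_available() else "cpu"

def allocate_shard(shard_size, dtype):
    # Uninitialized memory: cheaper than torch.zeros since no fill kernel runs.
    return torch.empty((shard_size,), dtype=dtype, device=device,
                       requires_grad=True)

# Hypothetical usage: the shard is defined entirely by the copy below, so the
# garbage values left by torch.empty are never observed.
model_param = torch.randn(1024, device=device)
main_shard = allocate_shard(model_param.numel(), torch.float32)
with torch.no_grad():
    main_shard.copy_(model_param.view(-1).float())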
megatron/optimizer/optimizer.py @ f6811e28

@@ -330,6 +330,25 @@ class MixedPrecisionOptimizer(MegatronOptimizer):

        # Check for nan.
        found_inf_flag = (self.found_inf.item() > 0)

        # >>>
        # if self.grad_scaler.scale <= 131072:
        #     pax(0, {
        #         # "grad_scaler" : self.grad_scaler,
        #         # "found_inf_flag" : found_inf_flag,
        #         "model_params" : [
        #             p
        #             for m in self.models
        #             for p in m.parameters()
        #         ],
        #         "model_grads" : [
        #             p.main_grad
        #             for m in self.models
        #             for p in m.parameters()
        #         ],
        #         # "main_grads" : main_grads,
        #         })
        # <<<

        return found_inf_flag
    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
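For context, found_inf_flag above is the result of unscaling the main gradients and checking them for inf/NaN; the commented-out pax(...) block is just a debug dump of the model params and grads. A minimal, self-contained sketch of the unscale-and-check pattern (the helper name and tensor handling here are assumptions, not Megatron's implementation):

import torch

def unscale_and_check_for_inf(grads, loss_scale):
    # Undo the loss scaling in place and flag any non-finite gradient values.
    found_inf = torch.zeros(1)
    inv_scale = 1.0 / loss_scale
    for grad in grads:
        grad.mul_(inv_scale)
        if not torch.isfinite(grad).all():
            found_inf.fill_(1.0)
    # Same style as the hunk above: a float flag converted to a Python bool.
    return found_inf.item() > 0

grads = [torch.randn(4) * 2**16, torch.tensor([float("inf")])]
print(unscale_and_check_for_inf(grads, loss_scale=2**16))  # True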
@@ -411,6 +430,10 @@ class MixedPrecisionOptimizer(MegatronOptimizer):

            found_inf_flag = self._unscale_main_grads_and_check_for_nan()
            timers('optimizer-unscale-and-check-inf').stop()

            # >>>
            # <<<

            # We are done with scaling gradients
            # so we can update the loss scale.
            self.grad_scaler.update(found_inf_flag)
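The grad_scaler.update(found_inf_flag) call at the end of this hunk drives dynamic loss scaling: a step with inf/NaN gradients backs the scale off, and a long enough run of clean steps grows it again. A toy sketch of that policy (class name, constants, and fields are assumptions, not Megatron's grad scaler):

class ToyDynamicScaler:
    """Minimal dynamic loss scaler illustrating what update(found_inf_flag) does."""

    def __init__(self, scale=2.0**16, growth_interval=1000,
                 backoff_factor=0.5, growth_factor=2.0, min_scale=1.0):
        self.scale = scale
        self.growth_interval = growth_interval
        self.backoff_factor = backoff_factor
        self.growth_factor = growth_factor
        self.min_scale = min_scale
        self._clean_steps = 0

    def update(self, found_inf_flag):
        if found_inf_flag:
            # Overflow: shrink the scale and restart the growth window.
            self.scale = max(self.scale * self.backoff_factor, self.min_scale)
            self._clean_steps = 0
        else:
            # Clean step: grow the scale after growth_interval clean steps.
            self._clean_steps += 1
            if self._clean_steps == self.growth_interval:
                self.scale *= self.growth_factor
                self._clean_steps = 0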
megatron/training.py @ f6811e28

@@ -453,7 +453,8 @@ def train_step(forward_step_func, data_iterator,

     # >>>
     # Gather params.
-    optimizer.gather_model_params(args, timers, ITERATION)
+    if update_successful:
+        optimizer.gather_model_params(args, timers, ITERATION)
     # <<<
     # >>>
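This training.py hunk is the point of the commit: when the mixed-precision optimizer skips its update (update_successful is False, typically because the grad scaler saw inf/NaN gradients), the distributed optimizer has nothing new to gather back into the model, so gather_model_params must be skipped as well. A schematic sketch of the surrounding train_step control flow (the step() return values and the gather signature are simplified assumptions):

def train_step_sketch(optimizer, args, timers, iteration):
    # Schematic only: optimizer.step() reports whether parameters were
    # actually updated; a skipped step (e.g. inf/NaN grads under dynamic
    # loss scaling) leaves them untouched.
    update_successful, grad_norm, num_zeros_in_grad = optimizer.step()

    # Guard added by this commit: only gather updated parameter shards from
    # the distributed optimizer back into the model after a real update.
    if update_successful:
        optimizer.gather_model_params(args, timers, iteration)

    return update_successful, grad_norm, num_zeros_in_grad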