Grad norm 'matches' (approximately equal, not bitwise equal).

6728a780 · Lawrence McAfee · c6d20c05 · 6728a780 · 6728a780 · 6728a780
Commit 6728a780 authored Mar 22, 2022 by Lawrence McAfee
3 changed files
--- a/megatron/optimizer/clip_grads.py
+++ b/megatron/optimizer/clip_grads.py
@@ -25,7 +25,8 @@ from megatron.model.module import param_is_not_shared
 from megatron.mpu.layers import param_is_not_tensor_parallel_duplicate


-def clip_grad_norm_fp32(parameters, max_norm, norm_type=2,
+def clip_grad_norm_fp32(parameters, grads_for_norm,
+                        max_norm, norm_type=2,
                        model_parallel_group=None):
    """Clips gradient norm of an iterable of parameters whose gradients
       are in fp32.
@@ -50,42 +51,26 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2,
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]

-    # Filter parameters based on:
-    #   - grad should not be none
-    #   - parameter should not be shared
-    #   - should not be a replica due to tensor model parallelism
-    grads = []
-    grads_for_norm = []
-    for param in parameters:
-        grad_not_none = param.grad is not None
-        is_not_shared = param_is_not_shared(param)
-        is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
-        if grad_not_none:
-            grad = param.grad.detach()
-        if grad_not_none:
-            # Make sure the grads are in fp32
-            assert param.grad.type() == 'torch.cuda.FloatTensor'
-            grads.append(grad)
-        if grad_not_none and is_not_shared and is_not_tp_duplicate:
-            grads_for_norm.append(grad)
-        # >>>
-        else:
-            # from lutil import pax
-            # pax({"grad": grad})
-            from megatron import get_args
-            args = get_args()
-            for r in range(torch.distributed.get_world_size()):
-                if torch.distributed.get_rank() == r:
-                    print("collect: r %d, dist-op %d, np %d, ne %d, g %s" % (
-                        torch.distributed.get_rank(),
-                        args.use_distributed_optimizer,
-                        len(parameters),
-                        sum(t.nelement() for t in parameters),
-                        str(tuple(grad.shape)),
-                    ))
-                torch.distributed.barrier()
-            exit(0)
-        # <<<
+    # >>>
+    # # Filter parameters based on:
+    # #   - grad should not be none
+    # #   - parameter should not be shared
+    # #   - should not be a replica due to tensor model parallelism
+    # grads = []
+    # grads_for_norm = []
+    # for param in parameters:
+    #     grad_not_none = param.grad is not None
+    #     is_not_shared = param_is_not_shared(param)
+    #     is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
+    #     if grad_not_none:
+    #         grad = param.grad.detach()
+    #     if grad_not_none:
+    #         # Make sure the grads are in fp32
+    #         assert param.grad.type() == 'torch.cuda.FloatTensor'
+    #         grads.append(grad)
+    #     if grad_not_none and is_not_shared and is_not_tp_duplicate:
+    #         grads_for_norm.append(grad)
+    # <<<

    # Norm parameters.
    max_norm = float(max_norm)
@@ -118,30 +103,6 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2,
            # we need the pow(norm-type).
            total_norm = grad_norm ** norm_type

-            # >>>
-            # from megatron import get_args
-            # from lutil import pax
-            # args = get_args()
-            # for r in range(torch.distributed.get_world_size()):
-            #     if torch.distributed.get_rank() == r:
-            #         print("compute: r %d, dist-op %d, gnorm %f ... p %d, g %d, gn %d" % (
-            #             torch.distributed.get_rank(),
-            #             args.use_distributed_optimizer,
-            #             grad_norm.item(),
-            #             sum(t.nelement() for t in parameters),
-            #             sum(t.nelement() for t in grads),
-            #             sum(t.nelement() for t in grads_for_norm),
-            #         ))
-            #     torch.distributed.barrier()
-            # exit(0)
-            # pax(2, {
-            #     "use distrib opt" : args.use_distributed_optimizer,
-            #     "norm_type" : norm_type,
-            #     "grad_norm" : grad_norm.item(),
-            #     "total_norm" : total_norm.item(),
-            # })
-            # <<<
-
        else:
            for grad in grads_for_norm:
                grad_norm = torch.norm(grad, norm_type)
@@ -154,14 +115,14 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2,
        total_norm = total_norm.item() ** (1.0 / norm_type)

        # >>>
-        # from megatron import get_args
-        # from lutil import pax
-        # args = get_args()
-        # pax(0, {
-        #     "use distrib opt" : args.use_distributed_optimizer,
-        #     "norm_type" : norm_type,
-        #     "total_norm" : total_norm,
-        # })
+        from megatron import get_args
+        from lutil import pax
+        args = get_args()
+        pax(0, {
+            "use distrib opt" : args.use_distributed_optimizer,
+            "norm_type" : norm_type,
+            "total_norm" : total_norm,
+        })
        # <<<

    # Scale.

--- a/megatron/optimizer/distrib_optimizer.py
+++ b/megatron/optimizer/distrib_optimizer.py
@@ -198,7 +198,7 @@ class DistributedOptimizer(MixedPrecisionOptimizer):

    # >>>
    @classmethod
-    def get_grad_views_for_grad_norm(cls, opt_group_shards, optimizer):
+    def get_main_grad_views_for_grad_norm(cls, opt_group_shards, optimizer):

        grad_views = []
        # grad_views_SKIPPED = []
@@ -285,7 +285,7 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
        # <<<

        # Params for grad norm.
-        self.grad_views_for_grad_norm = self.get_grad_views_for_grad_norm(
+        self.main_grad_views_for_grad_norm = self.get_main_grad_views_for_grad_norm(
            self.opt_group_shards,
            self.optimizer)

@@ -344,6 +344,11 @@ class DistributedOptimizer(MixedPrecisionOptimizer):
    def get_main_grad(self, group_index):
        return self.get_main_param(group_index).grad

+    # >>>
+    def _get_main_grads_for_grad_norm(self):
+        return self.main_grad_views_for_grad_norm
+    # <<<
+
    def state_dict(self):
        state_dict = {}
        state_dict['optimizer'] = self.optimizer.state_dict()

--- a/megatron/optimizer/optimizer.py
+++ b/megatron/optimizer/optimizer.py
@@ -112,6 +112,12 @@ class MegatronOptimizer(ABC):
                params.append(param)
        return params

+    # >>>
+    @abstractmethod
+    # def get_grads_for_norm(self):
+    def _get_main_grads_for_grad_norm(self):
+        pass
+    # <<<

    def get_model_parallel_group(self):
        '''Default returned here, but the distributed optimizer overrides this.'''
@@ -119,20 +125,10 @@ class MegatronOptimizer(ABC):


    def clip_grad_norm(self, clip_grad):
-
-        # >>>
-        # model_params = [ p for m in self.models for p in m.parameters() ]
-        # optim_params = self.get_parameters()
-        # from lutil import pax
-        # pax(1, {
-        #     "model_params" : get_clippy(model_params),
-        #     "optim_params" : get_clippy(optim_params),
-        # })
-        # <<<
-
        params = self.get_parameters()
+        grads_for_norm = self._get_main_grads_for_grad_norm()
        return clip_grad_norm_fp32(
-            params, clip_grad,
+            params, grads_for_norm, clip_grad,
            model_parallel_group=self.get_model_parallel_group())


@@ -574,6 +570,36 @@ class Float16OptimizerWithFloat16Params(MixedPrecisionOptimizer):
            _zero_grad_group_helper(group, set_to_none)


+    def _get_main_grads_for_grad_norm(self):
+
+        # Filter parameters based on:
+        #   - grad should not be none
+        #   - parameter should not be shared
+        #   - should not be a replica due to tensor model parallelism
+        params = self.get_parameters()
+        # grads = []
+        grads_for_norm = []
+        for param in params:
+            grad = param.grad
+            grad_not_none = grad is not None
+            is_not_shared = param_is_not_shared(param)
+            is_not_tp_duplicate = param_is_not_tensor_parallel_duplicate(param)
+            # if grad_not_none:
+            #     grad = param.grad.detach()
+            # if grad_not_none:
+            #     # Make sure the grads are in fp32
+            #     assert param.grad.type() == 'torch.cuda.FloatTensor'
+            #     grads.append(grad)
+            if grad_not_none and is_not_shared and is_not_tp_duplicate:
+                grads_for_norm.append(grad)
+
+        # pax(0, {"grads_for_norm": [
+        #     str(tuple(g.shape))
+        #     for g in grads_for_norm
+        # ]})
+
+        return grads_for_norm
+
    def _collect_main_grad_data_for_unscaling(self):

        main_grads = []