Merge branch 'main' into checkpoint_util

06fc51ce · Jared Casper · ec561daa · 0ed2f6ac · 06fc51ce · 06fc51ce
Commit 06fc51ce authored Feb 18, 2022 by Jared Casper
20 changed files
--- a/megatron/mpu/layers.py
+++ b/megatron/mpu/layers.py
@@ -242,7 +242,7 @@ class ColumnParallelLinear(torch.nn.Module):
        input_size: first dimension of matrix A.
        output_size: second dimension of matrix A.
        bias: If true, add bias
-        gather_output: If true, call all-gether on output and make Y avaiable
+        gather_output: If true, call all-gather on output and make Y available
                       to all GPUs, otherwise, every GPU will have its output
                       which is Y_i = XA_i
        init_method: method to initialize weights. Note that bias is always set

--- a/megatron/mpu/random.py
+++ b/megatron/mpu/random.py
--- a/megatron/optimizer/__init__.py
+++ b/megatron/optimizer/__init__.py
@@ -23,35 +23,68 @@ from .grad_scaler import ConstantGradScaler, DynamicGradScaler
 from .optimizer import Float16OptimizerWithFloat16Params, FP32Optimizer


-def _get_params_for_weight_decay_optimization(modules):
-    """Divide params into with-weight-decay and without-weight-decay groups.
-    Layernorms and baises will have no weight decay but the rest will.
+def get_param_groups(modules,
+                     no_weight_decay_cond,
+                     scale_lr_cond,
+                     lr_mult):
+    """creates param groups based on weight decay condition (regularized vs non regularized)
+       and learning rate scale condition (args.lr vs lr_mult * args.lr).
+       scale_lr_cond is used during finetuning, where the head of the network requires a
+       scaled version of the base learning rate.
    """
-
-    weight_decay_params = {'params': []}
-    no_weight_decay_params = {'params': [], 'weight_decay': 0.0}
+    wd_no_scale_lr = []
+    wd_scale_lr = []
+    no_wd_no_scale_lr = []
+    no_wd_scale_lr = []
    for module in modules:
-        for module_ in module.modules():
-            if isinstance(module_, LayerNorm):
-                no_weight_decay_params['params'].extend(
-                    [p for p in list(module_._parameters.values())
-                     if p is not None])
+        for name, param in module.named_parameters():
+            if not param.requires_grad:
+                continue
+
+            if no_weight_decay_cond is not None:
+                no_wd = no_weight_decay_cond(name, param)
            else:
-                weight_decay_params['params'].extend(
-                    [p for n, p in list(module_._parameters.items())
-                     if p is not None and n != 'bias'])
-                no_weight_decay_params['params'].extend(
-                    [p for n, p in list(module_._parameters.items())
-                     if p is not None and n == 'bias'])
+                # Do not regularize biases or norm parameters (any 1-D parameter).
+                no_wd = name.endswith(".bias") or len(param.shape) == 1

-    return weight_decay_params, no_weight_decay_params
+            if scale_lr_cond is not None:
+                scale_lr = scale_lr_cond(name, param)
+            else:
+                scale_lr = False

+            if not no_wd and not scale_lr:
+                wd_no_scale_lr.append(param)
+            elif not no_wd and scale_lr:
+                wd_scale_lr.append(param)
+            elif no_wd and not scale_lr:
+                no_wd_no_scale_lr.append(param)
+            else:
+                no_wd_scale_lr.append(param)

-def get_megatron_optimizer(model):
+    param_groups = []
+    if len(wd_no_scale_lr):
+        param_groups.append({'params': wd_no_scale_lr, 'wd_mult': 1.0, 'lr_mult': 1.0})
+    if len(wd_scale_lr):
+        param_groups.append({'params': wd_scale_lr, 'wd_mult': 1.0, 'lr_mult': lr_mult})
+    if len(no_wd_no_scale_lr):
+        param_groups.append({'params': no_wd_no_scale_lr, 'wd_mult': 0.0, 'lr_mult': 1.0})
+    if len(no_wd_scale_lr):
+        param_groups.append({'params': no_wd_scale_lr, 'wd_mult': 0.0, 'lr_mult': lr_mult})
+
+    return param_groups
+
+def get_megatron_optimizer(model,
+                           no_weight_decay_cond=None,
+                           scale_lr_cond=None,
+                           lr_mult=1.0):
    args = get_args()

    # Base optimizer.
-    param_groups = _get_params_for_weight_decay_optimization(model)
+    param_groups = get_param_groups(model,
+                                    no_weight_decay_cond,
+                                    scale_lr_cond,
+                                    lr_mult)
+
    if args.optimizer == 'adam':
        optimizer = Adam(param_groups,
                         lr=args.lr,

--- a/megatron/learning_rates.py
+++ b/megatron/learning_rates.py
--- a/megatron/p2p_communication.py
+++ b/megatron/p2p_communication.py
@@ -142,10 +142,16 @@ def _communicate(tensor_send_next, tensor_send_prev, recv_prev, recv_next,
        if recv_prev:
            tensor_recv_prev = mpu.gather_split_1d_tensor(
                tensor_recv_prev).view(tensor_shape).requires_grad_()
+            tensor_recv_prev = mpu.make_viewless_tensor(tensor_recv_prev,
+                                                        requires_grad = True,
+                                                        keep_graph = False)

        if recv_next:
            tensor_recv_next = mpu.gather_split_1d_tensor(
                tensor_recv_next).view(tensor_shape).requires_grad_()
+            tensor_recv_next = mpu.make_viewless_tensor(tensor_recv_next,
+                                                        requires_grad = True,
+                                                        keep_graph = False)

    return tensor_recv_prev, tensor_recv_next


--- a/megatron/schedules.py
+++ b/megatron/schedules.py
--- a/megatron/static/index.html
+++ b/megatron/static/index.html
--- a/megatron/text_generation/api.py
+++ b/megatron/text_generation/api.py
--- a/megatron/text_generation/generation.py
+++ b/megatron/text_generation/generation.py
--- a/megatron/text_generation_server.py
+++ b/megatron/text_generation_server.py
--- a/megatron/training.py
+++ b/megatron/training.py
--- a/megatron/utils.py
+++ b/megatron/utils.py
--- a/pretrain_vit.py
+++ b/pretrain_vit.py
--- a/pretrain_vision_dino.py
+++ b/pretrain_vision_dino.py
--- a/pretrain_vision_inpaint.py
+++ b/pretrain_vision_inpaint.py
--- a/tasks/finetune_utils.py
+++ b/tasks/finetune_utils.py
--- a/tasks/msdp/README.md
+++ b/tasks/msdp/README.md
--- a/tasks/msdp/evaluate.py
+++ b/tasks/msdp/evaluate.py
--- a/tasks/msdp/main.py
+++ b/tasks/msdp/main.py
--- a/tasks/msdp/metrics.py
+++ b/tasks/msdp/metrics.py