"vscode:/vscode.git/clone" did not exist on "57636ad4f4567065db122bf3716cd2006a46ac77"
Commit 63c340ec authored by Jared Casper

Merge branch 'main' into pipeline_parallel_main

parents 6fae152a ea81d62f
[Megatron](https://arxiv.org/pdf/1909.08053.pdf) is a large, powerful transformer developed by the Applied Deep Learning Research team at NVIDIA. This repository is for ongoing research on training large transformer language models at scale. We developed efficient, model-parallel (tensor and pipeline), and multinode training of [GPT-2](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf) and [BERT](https://arxiv.org/pdf/1810.04805.pdf) using mixed precision.
Using our GPT-2 model we achieve a perplexity of 10.8 on the WikiText-103 dataset (improving SOTA from 15.8) and an accuracy of 66.5% on the LAMBADA dataset. For BERT training, we swapped the position of the layer normalization and the residual connection in the model architecture (similar to the GPT-2 architecture), which allowed the models to continue to improve as they were scaled up. Our BERT model with 3.9 billion parameters reaches a loss of 1.16, a SQuAD 2.0 F1-score of 91.7, and a RACE accuracy of 90.9%.
Our codebase is capable of efficiently training very large (several billion parameter) language models with both model and data parallelism. To demonstrate how the code scales with multiple GPUs we consider the following GPT-2 model sizes. All models use a vocabulary size of 51,200 and a sequence length of 1024.
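
For a rough sense of how these model sizes relate to the configuration, the sketch below applies the standard transformer parameter approximation (about 12·h² weights per layer plus the embedding tables). The helper name and the accounting are illustrative, not the repository's exact numbers.

```python
# Back-of-the-envelope parameter count for a GPT-2-style model. Biases, layer
# norms, etc. are ignored, so this is an approximation, not Megatron's exact
# accounting; `approx_params` is a name chosen here for illustration.
def approx_params(num_layers, hidden_size, vocab_size=51200, seq_length=1024):
    per_layer = 12 * hidden_size ** 2                      # ~4h^2 attention + ~8h^2 MLP
    embeddings = (vocab_size + seq_length) * hidden_size   # token + position tables
    return num_layers * per_layer + embeddings

# A 24-layer model with hidden size 1024 lands near 355M parameters.
print(f'{approx_params(24, 1024) / 1e6:.0f}M')
```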
```diff
@@ -398,8 +398,8 @@ def _add_data_args(parser):
     group.add_argument('--split', type=str, default='969, 30, 1',
                        help='Comma-separated list of proportions for training,'
                        ' validation, and test split. For example the split '
-                       '`90,5,5` will use 90% of data for training, 5% for '
-                       'validation and 5% for test.')
+                       '`90,5,5` will use 90%% of data for training, 5%% for '
+                       'validation and 5%% for test.')
     group.add_argument('--vocab-file', type=str, default=None,
                        help='Path to the vocab file.')
     group.add_argument('--merge-file', type=str, default=None,
```
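
The `%%` escaping in the hunk above matters because argparse runs printf-style formatting over help strings (for example to substitute `%(default)s`), so a bare `%` typically raises a `ValueError` when the help text is rendered. A minimal, self-contained illustration (this is just an example argument, not the full Megatron parser):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--split', type=str, default='969, 30, 1',
                    help='For example `90,5,5` uses 90%% of the data for '
                         'training, 5%% for validation and 5%% for test.')

# format_help() applies %-formatting, so the escaped '%%' is printed as a
# literal '%'; an unescaped '%' here would raise a ValueError instead.
print(parser.format_help())
```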
```diff
@@ -124,7 +124,7 @@ def clip_grad_norm(parameters, max_norm, norm_type=2, parameter_names=None):
         total_norm = 0
         for p in parameters:
             if p.tensor_model_parallel or (get_tensor_model_parallel_rank() == 0):
-                param_norm = p.grad.data.norm(norm_type)
+                param_norm = torch.linalg.norm(p.grad.data.flatten(), norm_type)
                 total_norm += param_norm.item() ** norm_type
         # Sum across all model-parallel GPUs.
         total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
```
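
For context, the changed line above is the per-parameter step of standard L2 gradient clipping: accumulate each gradient's norm, combine them into a total norm, and rescale if that total exceeds `max_norm`. A condensed single-process sketch (the all-reduce of partial norms across model-parallel ranks is omitted, and the function name is ours):

```python
import torch

def clip_grad_norm_sketch(parameters, max_norm, norm_type=2.0):
    grads = [p.grad.detach() for p in parameters if p.grad is not None]
    total_norm = 0.0
    for g in grads:
        # Flattening first lets torch.linalg.norm treat the gradient as a
        # vector, which matches the old Tensor.norm(norm_type) behaviour.
        total_norm += torch.linalg.norm(g.flatten(), norm_type).item() ** norm_type
    total_norm = total_norm ** (1.0 / norm_type)
    clip_coef = max_norm / (total_norm + 1.0e-6)
    if clip_coef < 1.0:
        for g in grads:
            g.mul_(clip_coef)   # scale gradients in place
    return total_norm
```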
```diff
@@ -90,7 +90,7 @@ class _CopyToModelParallelRegion(torch.autograd.Function):
 class _ReduceFromModelParallelRegion(torch.autograd.Function):
-    """All-redcue the input from the model parallel region."""
+    """All-reduce the input from the model parallel region."""
     @staticmethod
     def symbolic(graph, input_):
```
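
`_ReduceFromModelParallelRegion` corresponds to the "g" operator in the Megatron paper: an all-reduce of the partial results in the forward pass, with the backward pass acting as the identity. A stripped-down sketch of that pattern, using the default process group rather than the repository's tensor-model-parallel group:

```python
import torch
import torch.distributed as dist

class _ReduceSketch(torch.autograd.Function):
    """All-reduce in forward, identity in backward (simplified illustration)."""

    @staticmethod
    def forward(ctx, input_):
        if dist.is_available() and dist.is_initialized():
            dist.all_reduce(input_)   # sum the partial outputs held by each rank
        return input_

    @staticmethod
    def backward(ctx, grad_output):
        # The forward op is a sum over ranks, so each rank's gradient is
        # simply the incoming gradient; no communication is needed here.
        return grad_output
```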
```diff
@@ -54,7 +54,7 @@ def init_checkpointed_activations_memory_buffer():
     dtype = torch.half
     if not args.fp16:
         dtype = torch.float
     global _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER
     assert _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER is None, \
         'checkpointed activations memory buffer is already allocated.'
```
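
The global buffer asserted above is allocated once, in half precision when `--fp16` is set and in float32 otherwise, and is then reused for checkpointed activations. A hedged sketch of what such a preallocated buffer can look like; the class name and the `add`/`reset` interface are illustrative, not the repository's exact memory-buffer API:

```python
import torch

class SimpleActivationBuffer:
    """Hand out views into one preallocated region instead of allocating a
    fresh tensor for every checkpointed activation (illustration only)."""

    def __init__(self, numel, dtype, device='cuda'):
        self.buffer = torch.empty(numel, dtype=dtype, device=device)
        self.offset = 0

    def add(self, tensor):
        n = tensor.numel()
        assert self.offset + n <= self.buffer.numel(), 'buffer is full'
        view = self.buffer[self.offset:self.offset + n].view(tensor.shape)
        view.copy_(tensor)   # copy the activation into the shared region
        self.offset += n
        return view          # the caller re-points its .data at this view

    def reset(self):
        # Called once per step so the region can be reused.
        self.offset = 0
```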
```diff
@@ -224,7 +224,7 @@ def model_parallel_cuda_manual_seed(seed):
     # 2718 is just for fun and any POSITIVE value will work.
     offset = seed + 2718
     tensor_model_parallel_seed = offset + get_tensor_model_parallel_rank()
-    # Data parallel gets the original sedd.
+    # Data parallel gets the original seed.
     data_parallel_seed = seed
     if torch.distributed.get_rank() == 0:
```
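
The seed layout above is straightforward: tensor-model-parallel ranks get distinct seeds derived from a fixed offset, while the data-parallel seed stays equal to the user-supplied seed. A tiny helper capturing just that arithmetic (the function name and the explicit rank argument are ours):

```python
def derive_seeds(seed: int, tensor_model_parallel_rank: int):
    # Any positive constant works as the offset; 2718 is "just for fun".
    offset = seed + 2718
    tensor_model_parallel_seed = offset + tensor_model_parallel_rank
    data_parallel_seed = seed   # identical across tensor-model-parallel ranks
    return tensor_model_parallel_seed, data_parallel_seed

# With seed=1234, ranks 0 and 1 draw from 3952 and 3953 respectively, while
# both keep 1234 as the data-parallel seed.
```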
```diff
@@ -268,11 +268,11 @@ class CheckpointFunction(torch.autograd.Function):
             args[0].data = split_tensor_into_1d_equal_chunks(args[0].data)
             args[0].data = _CHECKPOINTED_ACTIVATIONS_MEMORY_BUFFER.add(
                 args[0].data)
         # Store everything.
         ctx.save_for_backward(*args)
         return outputs
     @staticmethod
```
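
`CheckpointFunction` implements activation checkpointing: only the inputs are saved in the forward pass (optionally packed into the memory buffer above), and the forward computation is replayed during backward. A minimal, self-contained version of that pattern, without the RNG-state bookkeeping or the chunked buffer handling:

```python
import torch

class SimpleCheckpoint(torch.autograd.Function):
    @staticmethod
    def forward(ctx, run_function, *args):
        ctx.run_function = run_function
        with torch.no_grad():
            outputs = run_function(*args)   # forward pass without saved activations
        ctx.save_for_backward(*args)        # keep only the inputs
        return outputs

    @staticmethod
    def backward(ctx, *grad_outputs):
        inputs = [t.detach().requires_grad_(t.requires_grad)
                  for t in ctx.saved_tensors]
        with torch.enable_grad():
            outputs = ctx.run_function(*inputs)   # recompute the activations
        if isinstance(outputs, torch.Tensor):
            outputs = (outputs,)
        torch.autograd.backward(outputs, grad_outputs)
        # One gradient per forward input: None for run_function, then the inputs.
        return (None,) + tuple(inp.grad for inp in inputs)
```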
```diff
@@ -67,7 +67,7 @@ def print_params_min_max_norm(optimizer, iteration):
         index += 1
         min_ = param.data.min()
         max_ = param.data.max()
-        norm = param.data.norm()
+        norm = torch.linalg.norm(param.data)
         string += '{:7d}, {:4d}, {:4d}, {:2d}, '.format(
             iteration, rank, index, int(param.tensor_model_parallel))
         string += '{:.6E}, {:.6E}, {:.6E}\n'.format(min_, max_, norm)
```
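
The substitution above is behavior-preserving in the default case: `Tensor.norm()` takes the 2-norm over all elements, and `torch.linalg.norm` with the default `ord` flattens its input and computes the same vector 2-norm. A quick check:

```python
import torch

param = torch.randn(4, 8, 16)
old_norm = param.norm()               # 2-norm over all elements
new_norm = torch.linalg.norm(param)   # default ord: flatten, then vector 2-norm
assert torch.allclose(old_norm, new_norm)
```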