Commit 4635bdf7 authored by Jared Casper

Merge branch 'readme_update_jan_2021' into 'main'

Readme update + change gpt2 to gpt

See merge request ADLR/megatron-lm!206
parents 86eb5bd8 152aab30
@@ -28,11 +28,11 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 8 \
--micro-batch-size 8 \
--checkpoint-activations \
--lr 5.0e-5 \
--lr-decay-style linear \
--warmup 0.065 \
--lr-warmup-fraction 0.065 \
--seq-length 512 \
--max-position-embeddings 512 \
--save-interval 500000 \
......
@@ -28,11 +28,11 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 4 \
--micro-batch-size 4 \
--checkpoint-activations \
--lr 1.0e-5 \
--lr-decay-style linear \
--warmup 0.06 \
--lr-warmup-fraction 0.06 \
--seq-length 512 \
--max-position-embeddings 512 \
--save-interval 100000 \
......
@@ -9,24 +9,24 @@ python pretrain_bert.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 4 \
--micro-batch-size 4 \
--global-batch-size 8 \
--seq-length 512 \
--max-position-embeddings 512 \
--train-iters 2000000 \
--lr-decay-iters 990000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file bert-vocab.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-decay-style linear \
--lr-decay-iters 990000 \
--lr-warmup-fraction .01 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--warmup .01 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
......
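A note on the new flags above: Megatron's global batch is split across data-parallel replicas and, within each replica, accumulated over several micro-batches, so --global-batch-size must be a multiple of --micro-batch-size times the data-parallel size. A minimal sketch of that bookkeeping, using the values from the single-GPU BERT example above (the helper name is ours, not part of this merge request):

def num_microbatches(global_batch_size, micro_batch_size, data_parallel_size):
    """Gradient-accumulation steps implied by the new batch-size flags."""
    samples_per_step = micro_batch_size * data_parallel_size
    assert global_batch_size % samples_per_step == 0, \
        "global batch must be divisible by micro-batch size * data-parallel size"
    return global_batch_size // samples_per_step

# pretrain_bert.sh above: one GPU (data-parallel size 1) accumulates
# two micro-batches of 4 to form a global batch of 8.
print(num_microbatches(global_batch_size=8, micro_batch_size=4, data_parallel_size=1))  # -> 2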
@@ -15,11 +15,11 @@ DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_bert.py \
--tensor-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 4 \
--micro-batch-size 4 \
--global-batch-size 32 \
--seq-length 512 \
--max-position-embeddings 512 \
--train-iters 1000000 \
@@ -36,7 +36,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
--lr-decay-iters 990000 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--warmup .01 \
--lr-warmup-fraction .01 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
......
@@ -20,8 +20,8 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 2 \
--num-microbatches-in-minibatch 2 \
--micro-batch-size 2 \
--global-batch-size 16 \
--seq-length 512 \
--max-position-embeddings 512 \
--train-iters 1000000 \
@@ -38,7 +38,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
--lr-decay-iters 990000 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--warmup .01 \
--lr-warmup-fraction .01 \
--log-interval 100 \
--save-interval 10000 \
--eval-interval 1000 \
......
@@ -9,11 +9,12 @@ DATA_PATH=<Specify path and file prefix>_text_document
CHECKPOINT_PATH=<Specify path>
python pretrain_gpt2.py \
python pretrain_gpt.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 8 \
--micro-batch-size 4 \
--global-batch-size 8 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 500000 \
@@ -31,7 +32,7 @@ python pretrain_gpt2.py \
--lr-decay-style cosine \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--warmup .01 \
--lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
......
#!/bin/bash
#SBATCH <SLURM OPTIONS> --nodes=128 --exclusive --ntasks-per-node=8 --job-name=megatron_gpt3_175b
DIR=`pwd`
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
mkdir -p $DIR/logs
DATASET_1="<PATH TO THE FIRST DATASET>"
DATASET_2="<PATH TO THE SECOND DATASET>"
DATASET_3="<PATH TO THE THIRD DATASET>"
DATASET="0.2 ${DATASET_1} 0.3 ${DATASET_2} 0.5 ${DATASET_3}"
options=" \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 16 \
--num-layers 96 \
--hidden-size 12288 \
--num-attention-heads 96 \
--seq-length 2048 \
--max-position-embeddings 2048 \
--micro-batch-size 1 \
--global-batch-size 1536 \
--rampup-batch-size 16 16 5859375 \
--train-samples 146484375 \
--lr-decay-samples 126953125 \
--lr-warmup-samples 183105 \
--lr 6.0e-5 \
--min-lr 6.0e-6 \
--lr-decay-style cosine \
--log-interval 10 \
--eval-iters 40 \
--eval-interval 1000 \
--data-path ${DATASET} \
--vocab-file <PATH TO gpt-vocab.json> \
--merge-file <PATH TO gpt-merges.txt> \
--save-interval 1000 \
--save <PATH TO CHECKPOINTS DIRECTORY> \
--load <PATH TO CHECKPOINTS DIRECTORY> \
--split 98,2,0 \
--clip-grad 1.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--tensorboard-dir <TENSORBOARD DIRECTORY> \
--fp16 \
--checkpoint-activations "
run_cmd="python -u ${DIR}/pretrain_gpt.py $@ ${options}"
srun -l \
--container-image "nvcr.io/nvidia/pytorch:20.12-py3" \
--container-mounts "<DIRECTORIES TO MOUNT>" \
--output=$DIR/logs/%x_%j_$DATETIME.log sh -c "${run_cmd}"
set +x
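For readers working out the schedule in the script above: --rampup-batch-size 16 16 5859375 starts the global batch at 16 and raises it in steps of 16 over the first 5,859,375 training samples, after which training runs at the full --global-batch-size 1536 until --train-samples 146484375 is consumed. A rough sketch of that ramp (an illustration of the arithmetic only, not Megatron's scheduler code):

def rampup_global_batch_size(consumed_samples,
                             start=16, increment=16, rampup_samples=5_859_375,
                             full_batch_size=1536):
    """Approximate global batch size while the ramp-up is in progress."""
    if consumed_samples >= rampup_samples:
        return full_batch_size
    num_increments = (full_batch_size - start) // increment   # 95 steps of +16
    samples_per_increment = rampup_samples / num_increments   # ~61,678 samples per step
    steps_done = int(consumed_samples / samples_per_increment)
    return start + steps_done * increment

print(rampup_global_batch_size(0))           # 16 at the start of training
print(rampup_global_batch_size(6_000_000))   # 1536 once the ramp-up is finished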
@@ -16,12 +16,12 @@ CHECKPOINT_PATH=<Specify path>
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt2.py \
--tensor-model-parallel-size 1 \
pretrain_gpt.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 8 \
--micro-batch-size 8 \
--global-batch-size 64 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 500000 \
@@ -39,7 +39,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--warmup .01 \
--lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
......
@@ -16,14 +16,14 @@ CHECKPOINT_PATH=<Specify path>
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt2.py \
pretrain_gpt.py \
--tensor-model-parallel-size 2 \
--pipeline-model-parallel-size 2 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 4 \
--num-microbatches-in-minibatch 2 \
--micro-batch-size 4 \
--global-batch-size 16 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 500000 \
@@ -41,7 +41,7 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--warmup .01 \
--lr-warmup-fraction .01 \
--checkpoint-activations \
--log-interval 100 \
--save-interval 10000 \
......
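As a worked example of how the model-parallel flags in this script interact with the new batch-size flags (assuming the usual single 8-GPU node for this example, which the hunk itself does not spell out): tensor-model-parallel-size 2 times pipeline-model-parallel-size 2 means each model replica spans 4 GPUs, leaving a data-parallel size of 2; a global batch of 16 with micro-batches of 4 then works out to 2 micro-batches per data-parallel rank per step.

# Sketch of that bookkeeping; world_size = 8 is our assumption, the rest is taken from the hunk above.
world_size = 8                       # assumed: GPUS_PER_NODE=8, NNODES=1
tensor_mp, pipeline_mp = 2, 2        # --tensor-model-parallel-size / --pipeline-model-parallel-size
micro_batch, global_batch = 4, 16    # --micro-batch-size / --global-batch-size

data_parallel = world_size // (tensor_mp * pipeline_mp)        # 2 replicas
microbatches = global_batch // (micro_batch * data_parallel)   # 2 per step
print(data_parallel, microbatches)   # -> 2 2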
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""GPT2 style dataset."""
"""GPT style dataset."""
import os
import time
@@ -107,7 +107,7 @@ def _build_train_valid_test_datasets(data_prefix, data_impl, splits_string,
if splits[index + 1] > splits[index]:
documents = np.arange(start=splits[index], stop=splits[index + 1],
step=1, dtype=np.int32)
dataset = GPT2Dataset(name, data_prefix,
dataset = GPTDataset(name, data_prefix,
documents, indexed_dataset,
train_valid_test_num_samples[index],
seq_length, seed)
@@ -136,7 +136,7 @@ def get_indexed_dataset_(data_prefix, data_impl, skip_warmup):
return indexed_dataset
class GPT2Dataset(torch.utils.data.Dataset):
class GPTDataset(torch.utils.data.Dataset):
def __init__(self, name, data_prefix, documents, indexed_dataset,
num_samples, seq_length, seed):
......
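The hunk above leaves the document-split logic untouched: the --split weights (for example 949,50,1 in the BERT scripts earlier in this diff) are normalized and converted into contiguous document-index ranges, and each GPTDataset is built from one np.arange over its range. A simplified sketch of that partitioning (illustrative only; the real helper lives in Megatron's dataset utilities):

import numpy as np

def split_documents(weights, num_documents):
    """Turn split weights such as (949, 50, 1) into document-index ranges."""
    weights = np.array(weights, dtype=float)
    bounds = np.rint(np.cumsum(weights) / weights.sum() * num_documents)
    bounds = np.concatenate(([0], bounds)).astype(np.int64)
    return [np.arange(start=bounds[i], stop=bounds[i + 1], dtype=np.int32)
            for i in range(len(weights))]

train_docs, valid_docs, test_docs = split_documents((949, 50, 1), num_documents=10_000)
print(len(train_docs), len(valid_docs), len(test_docs))  # -> 9490 500 10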
@@ -34,10 +34,11 @@ from .bert_model import (BertModel,
BertModelFirstStage,
BertModelIntermediateStage,
BertModelLastStage)
from .gpt2_model import (GPT2Model,
GPT2ModelFirstStage,
GPT2ModelIntermediateStage,
GPT2ModelLastStage)
from .realm_model import ICTBertModel
from .gpt_model import (GPTModel,
GPTModelFirstStage,
GPTModelIntermediateStage,
GPTModelLastStage)
from .language_model import get_language_model
from .module import FP16Module
from .realm_model import ICTBertModel
......
@@ -27,7 +27,7 @@ from .utils import init_method_normal
from .utils import scaled_init_method_normal
def gpt2_attention_mask_func(attention_scores, ltor_mask):
def gpt_attention_mask_func(attention_scores, ltor_mask):
attention_scores.masked_fill_(ltor_mask, -10000.0)
return attention_scores
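The renamed gpt_attention_mask_func is otherwise unchanged: it fills the masked positions of the attention scores with a large negative value so that they vanish after the softmax. A tiny self-contained demo of the same in-place fill (the mask construction below is our illustration; in Megatron the boolean mask comes from get_ltor_masks_and_position_ids):

import torch

seq_len = 4
scores = torch.zeros(seq_len, seq_len)
# True wherever attention is NOT allowed: future positions for a left-to-right model.
ltor_mask = torch.triu(torch.ones(seq_len, seq_len), diagonal=1).bool()

scores.masked_fill_(ltor_mask, -10000.0)   # same fill as gpt_attention_mask_func
probs = torch.softmax(scores, dim=-1)
print(probs[1])                            # row 1 attends only to positions 0 and 1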
@@ -61,18 +61,18 @@ def post_language_model_processing(lm_output, labels, logit_weights,
return loss
class GPT2ModelBase(MegatronModule):
class GPTModelBase(MegatronModule):
"""GPT-2 Language model."""
def __init__(self, num_tokentypes=0, parallel_output=True):
super(GPT2ModelBase, self).__init__()
super(GPTModelBase, self).__init__()
args = get_args()
self.parallel_output = parallel_output
self.fp16_lm_cross_entropy = args.fp16_lm_cross_entropy
self.language_model, self._language_model_key = get_language_model(
attention_mask_func=gpt2_attention_mask_func,
attention_mask_func=gpt_attention_mask_func,
num_tokentypes=num_tokentypes,
add_pooler=False,
init_method=init_method_normal(args.init_method_std),
@@ -81,17 +81,17 @@ class GPT2ModelBase(MegatronModule):
self.initialize_word_embeddings(init_method_normal)
def forward(self, gpt2_model_input, attention_mask, labels=None,
def forward(self, gpt_model_input, attention_mask, labels=None,
tokentype_ids=None, layer_past=None, get_key_value=False,
forward_method_parallel_output=None):
kwargs = {'layer_past': layer_past, 'get_key_value': get_key_value}
if mpu.is_pipeline_first_stage():
(input_ids, position_ids) = gpt2_model_input
(input_ids, position_ids) = gpt_model_input
args = [input_ids, position_ids, attention_mask]
kwargs['tokentype_ids'] = tokentype_ids
else:
args = [gpt2_model_input, attention_mask]
args = [gpt_model_input, attention_mask]
lm_output = self.language_model(*args, **kwargs)
if mpu.is_pipeline_last_stage():
@@ -130,17 +130,17 @@ class GPT2ModelBase(MegatronModule):
self.language_model.load_state_dict(state_dict, strict=strict)
class GPT2Model(GPT2ModelBase):
class GPTModel(GPTModelBase):
def __init__(self, num_tokentypes=0, parallel_output=True):
super(GPT2Model, self).__init__(
super(GPTModel, self).__init__(
num_tokentypes=num_tokentypes,
parallel_output=parallel_output)
def forward(self, input_ids, position_ids, attention_mask, labels=None,
tokentype_ids=None, layer_past=None, get_key_value=False,
forward_method_parallel_output=None):
return super(GPT2Model, self).forward(
return super(GPTModel, self).forward(
(input_ids, position_ids),
attention_mask,
labels=labels,
@@ -150,15 +150,15 @@ class GPT2Model(GPT2ModelBase):
forward_method_parallel_output=forward_method_parallel_output)
class GPT2ModelFirstStage(GPT2ModelBase):
class GPTModelFirstStage(GPTModelBase):
def __init__(self, num_tokentypes=0):
super(GPT2ModelFirstStage, self).__init__(
super(GPTModelFirstStage, self).__init__(
num_tokentypes=num_tokentypes)
def forward(self, input_ids, position_ids, attention_mask,
tokentype_ids=None, layer_past=None, get_key_value=False):
return super(GPT2ModelFirstStage, self).forward(
return super(GPTModelFirstStage, self).forward(
(input_ids, position_ids),
attention_mask,
tokentype_ids=tokentype_ids,
@@ -166,32 +166,32 @@ class GPT2ModelFirstStage(GPT2ModelBase):
get_key_value=get_key_value)
class GPT2ModelIntermediateStage(GPT2ModelBase):
class GPTModelIntermediateStage(GPTModelBase):
def __init__(self, num_tokentypes=0):
super(GPT2ModelIntermediateStage, self).__init__(
super(GPTModelIntermediateStage, self).__init__(
num_tokentypes=num_tokentypes)
def forward(self, hidden_state, attention_mask,
layer_past=None, get_key_value=False):
return super(GPT2ModelIntermediateStage, self).forward(
return super(GPTModelIntermediateStage, self).forward(
hidden_state,
attention_mask,
layer_past=layer_past,
get_key_value=get_key_value)
class GPT2ModelLastStage(GPT2ModelBase):
class GPTModelLastStage(GPTModelBase):
def __init__(self, num_tokentypes=0, parallel_output=True):
super(GPT2ModelLastStage, self).__init__(
super(GPTModelLastStage, self).__init__(
num_tokentypes=num_tokentypes,
parallel_output=parallel_output)
def forward(self, hidden_state, attention_mask, labels=None,
layer_past=None, get_key_value=False,
forward_method_parallel_output=None):
return super(GPT2ModelLastStage, self).forward(
return super(GPTModelLastStage, self).forward(
hidden_state,
attention_mask,
labels=labels,
......
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Pretrain GPT2"""
"""Pretrain GPT"""
import torch
@@ -22,8 +22,11 @@ from megatron import print_rank_0
from megatron import get_timers
from megatron import get_tokenizer
from megatron import mpu
from megatron.data.gpt2_dataset import build_train_valid_test_datasets
from megatron.model import GPT2Model, GPT2ModelFirstStage, GPT2ModelIntermediateStage, GPT2ModelLastStage
from megatron.data.gpt_dataset import build_train_valid_test_datasets
from megatron.model import (GPTModel,
GPTModelFirstStage,
GPTModelIntermediateStage,
GPTModelLastStage)
from megatron.training import pretrain
from megatron.utils import get_ltor_masks_and_position_ids
from megatron.utils import average_losses_across_data_parallel_group
@@ -31,20 +34,20 @@ from megatron.utils import average_losses_across_data_parallel_group
def model_provider():
"""Build the model."""
print_rank_0('building GPT2 model ...')
print_rank_0('building GPT model ...')
args = get_args()
if mpu.get_pipeline_model_parallel_world_size() > 1:
# Determine model based on position of stage in pipeline.
if mpu.is_pipeline_first_stage():
model = GPT2ModelFirstStage(num_tokentypes=0)
model = GPTModelFirstStage(num_tokentypes=0)
elif mpu.is_pipeline_last_stage():
model = GPT2ModelLastStage(
model = GPTModelLastStage(
num_tokentypes=0, parallel_output=True)
else:
model = GPT2ModelIntermediateStage(
model = GPTModelIntermediateStage(
num_tokentypes=0)
else:
model = GPT2Model(num_tokentypes=0, parallel_output=True)
model = GPTModel(num_tokentypes=0, parallel_output=True)
return model
@@ -124,7 +127,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
args = get_args()
print_rank_0('> building train, validation, and test datasets '
'for GPT2 ...')
'for GPT ...')
train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
data_prefix=args.data_path,
data_impl=args.data_impl,
@@ -133,7 +136,7 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
seq_length=args.seq_length,
seed=args.seed,
skip_warmup=(not args.mmap_warmup))
print_rank_0("> finished creating GPT2 datasets ...")
print_rank_0("> finished creating GPT datasets ...")
return train_ds, valid_ds, test_ds
......