Commit 3aca1415 authored by liangjing's avatar liangjing
Browse files

Merge branch 'megatron-lm_dtk24.04' into 'main'

Megatron lm dtk24.04

See merge request !1
parents 0024a5c6 1005e9d3
Pipeline #1806 passed with stage
#!/bin/bash
# This example script is contributed by external user https://github.com/nrailgun
# Launches Megatron-LM GPT pretraining under an OpenMPI launcher,
# one trainer process per local rank (expects 4 local ranks).
set -ex
######################################
#####################################
# Runtime / communication tuning.
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=5
# Rank information provided by the OpenMPI launcher environment.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
export NCCL_IB_TIMEOUT=22
# Change the below configurations here
BASE_PATH=./tmp
DATASET_1="./dataset/my-gpt2_text_document"
DATASET="1 ${DATASET_1}"
CHECKPOINT_PATH=./tmp
TP=4
PP=1
HIDDEN_SIZE=4096
NUM_LAYERS=32
NUM_HEADS=32
SEQ_LENGTH=4096
VOCAB_PATH=./gpt2-vocab.json
MERGE_PATH=./gpt2-merges.txt
MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=60
TRAIN_STEPS=250000
LR=3e-4
MIN_LR=3e-5
LR_WARMUP_STEPS=2000
WEIGHT_DECAY=0.1
GRAD_CLIP=1
# Full training command; ${1} (first script argument) is the master address
# for the TCP rendezvous used by --dist_url.
APP="python3 -u pretrain_gpt.py \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--num-layers $NUM_LAYERS \
--hidden-size $HIDDEN_SIZE \
--num-attention-heads $NUM_HEADS \
--micro-batch-size $MICRO_BATCH_SIZE \
--global-batch-size $GLOBAL_BATCH_SIZE \
--seq-length $SEQ_LENGTH \
--max-position-embeddings $SEQ_LENGTH \
--train-iters $TRAIN_STEPS \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATASET \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr $LR \
--lr-decay-style cosine \
--min-lr $MIN_LR \
--weight-decay $WEIGHT_DECAY \
--clip-grad $GRAD_CLIP \
--lr-warmup-iters $LR_WARMUP_STEPS \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--log-interval 1 \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--tokenizer-type GPT2BPETokenizer \
--save-interval 1000 \
--eval-interval 1000 \
--eval-iters 1000 \
--fp16 \
--recompute-activations \
--disable-bias-linear \
--no-gradient-accumulation-fusion \
--rank ${RANK} \
--world_size ${WORLD_SIZE} \
--dist_url tcp://${1}:34566 \
--num-workers 2 \
"
# The original script had four identical case arms (one per local rank),
# each exporting the same device list and running the same command.
# Collapse them into a single [0-3] arm; local ranks outside 0-3 are
# ignored, exactly as before.
case ${lrank} in
[0-3])
export HIP_VISIBLE_DEVICES=0,1,2,3
${APP}
;;
esac
......@@ -8,6 +8,7 @@ from megatron import get_tokenizer
from megatron.model.classification import Classification
from tasks.eval_utils import accuracy_func_provider
from tasks.finetune_utils import finetune
from megatron.arguments import core_transformer_config_from_args
def glue_classification(num_classes, Dataset,
......@@ -28,10 +29,11 @@ def glue_classification(num_classes, Dataset,
def model_provider(pre_process=True, post_process=True):
    """Build the classification model.

    Args:
        pre_process: whether this pipeline stage runs the embedding/pre-processing.
        post_process: whether this pipeline stage runs the output/post-processing.

    Returns:
        A ``Classification`` model configured from the global Megatron args.
        ``num_classes`` comes from the enclosing ``glue_classification`` scope.
    """
    args = get_args()
    # BUG FIX: core_transformer_config_from_args requires the parsed args
    # object — every other call site in this change set passes get_args()'s
    # result; calling it with no argument raises TypeError.
    config = core_transformer_config_from_args(args)
    print_rank_0('building classification model for {} ...'.format(
        args.task))

    model = Classification(config=config, num_classes=num_classes, num_tokentypes=2,
                           pre_process=pre_process, post_process=post_process)

    return model
......
......@@ -7,7 +7,7 @@ Below we present the steps to run our multi-stage dialogue prompting (MSDP) fram
### Data Preparation
1. Dataset Download: [Wizard of Wikipedia](https://parl.ai/projects/wizard_of_wikipedia/) and [Wizard of Internet](https://parl.ai/projects/sea/)
2. Data Processing: We provide the script to run the [`data processing`](../../examples/msdp/data_processing.sh) of the datatsets.
2. Data Processing: We provide the script to run the [`data processing`](../../examples/msdp/data_processing.sh) of the datasets.
### Stage-1: Prompting for Knowledge Generation
1. We provide the script to perform the [`first-stage prompting`](../../examples/msdp/prompt_knwl_gen.sh) for the knowledge generation.
......
## End-to-End Training of Neural Retrievers for Open-Domain Question Answering
Below we present the steps to run unsupervised and supervised trainining and evaluation of the retriever for [open domain question answering](https://arxiv.org/abs/2101.00408).
Below we present the steps to run unsupervised and supervised training and evaluation of the retriever for [open domain question answering](https://arxiv.org/abs/2101.00408).
## Retriever Training
......
......@@ -9,6 +9,7 @@ from megatron.model.multiple_choice import MultipleChoice
from tasks.eval_utils import accuracy_func_provider
from tasks.finetune_utils import finetune
from tasks.race.data import RaceDataset
from megatron.arguments import core_transformer_config_from_args
def train_valid_datasets_provider():
......@@ -26,9 +27,10 @@ def train_valid_datasets_provider():
def model_provider(pre_process=True, post_process=True):
"""Build the model."""
config = core_transformer_config_from_args(get_args())
print_rank_0('building multichoice model for RACE ...')
model = MultipleChoice(num_tokentypes=2,
model = MultipleChoice(config=config,
num_tokentypes=2,
pre_process=pre_process,
post_process=post_process)
......
......@@ -14,7 +14,8 @@ from megatron.checkpointing import load_checkpoint
from megatron.model import GPTModel
from megatron.training import get_model
from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model
from megatron.p2p_communication import recv_forward, send_forward
from megatron.core.pipeline_parallel.p2p_communication import recv_forward, send_forward
from megatron.arguments import core_transformer_config_from_args
from tasks.finetune_utils import build_data_loader
from .datasets import build_dataset
......@@ -31,6 +32,8 @@ def get_model_provider(eval_metric):
def model_provider(pre_process=True, post_process=True):
"""Build the model."""
config = core_transformer_config_from_args(get_args())
if eval_metric == 'loss':
parallel_output = True
elif eval_metric == 'accuracy':
......@@ -40,7 +43,7 @@ def get_model_provider(eval_metric):
'is not supported.'.format(eval_metric))
print_rank_0('building GPT model ...')
model = GPTModel(num_tokentypes=0, parallel_output=parallel_output,
model = GPTModel(config, num_tokentypes=0, parallel_output=parallel_output,
pre_process=pre_process, post_process=post_process)
return model
......@@ -69,7 +72,7 @@ def process_batch(batch):
return tokens, labels, attention_mask, position_ids, loss_mask
def forward_step(batch, model, eval_metric):
def forward_step(batch, model, eval_metric, config):
"""Forward step."""
# Get the batch.
......@@ -80,7 +83,8 @@ def forward_step(batch, model, eval_metric):
args = get_args()
args.micro_batch_size = len(labels)
input_tensor = recv_forward()
tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
input_tensor = recv_forward(tensor_shape, config)
# Forward pass through the model.
unwrapped_model = unwrap_model(
......@@ -88,7 +92,7 @@ def forward_step(batch, model, eval_metric):
unwrapped_model.set_input_tensor(input_tensor)
output = model(tokens, position_ids, attention_mask)
send_forward(output)
send_forward(output, config)
if parallel_state.is_pipeline_last_stage():
# For loss, return the unreduced loss.
......@@ -115,7 +119,8 @@ def forward_step(batch, model, eval_metric):
def evaluate(data_loader, model, eval_metric):
"""Evaluation."""
args = get_args()
config = core_transformer_config_from_args(args)
# Turn on evaluation mode which disables dropout.
model.eval()
......@@ -126,7 +131,7 @@ def evaluate(data_loader, model, eval_metric):
if iteration % args.log_interval == 0:
print_rank_0('> working on iteration: {}'.format(iteration))
# Forward evaluation.
output = forward_step(batch, model, eval_metric)
output = forward_step(batch, model, eval_metric, config)
# Reduce across processes.
if parallel_state.is_pipeline_last_stage():
......
"""Check if a given slurm job id completed successfully
Usage:
python3 check_slurm_job_completion.py <JOB_ID>
"""
import sys
import subprocess
cmd = f"sacct -j {sys.argv[1]}"
result = subprocess.check_output(cmd, shell=True).decode().split()
assert len(result) > 14, "JOB state not available."
status = result[19]
exit_code = result[20]
assert status == "COMPLETED", f"Job {sys.argv[1]} not completed."
assert exit_code == "0:0", f"Job {sys.argv[1]} did not exit successfully."
import os
os.environ['OPENBLAS_NUM_THREADS'] = '1'
import sys
import json
import shutil
import glob
from tensorboard.backend.event_processing import event_accumulator
......
......@@ -57,12 +57,14 @@ class TestCIPipeline:
print(expected_list)
actual_list = read_tb_logs_as_list(LOGS_DIR, loss_type)
assert actual_list is not None, f"No TensorBoard events file was found in the logs for {loss_type}."
for i, step in enumerate(range(expected["start_step"], expected["end_step"], expected["step_interval"])):
actual_list_sliced = actual_list[expected["start_step"]:expected["end_step"]:expected["step_interval"]]
for i, (expected_val, actual_val) in enumerate(zip(expected_list, actual_list_sliced)):
step = i * expected["step_interval"]
print(f"Checking step {step} against expected {i}")
if test_type == TypeOfTest.APPROX:
assert actual_list[step] == pytest.approx(expected=expected_list[i], rel=self.margin_loss), f"{self.job_name} : The loss at step {step} should be approximately {expected_list[i]} but it is {actual_list[step]}."
assert actual_val == pytest.approx(expected=expected_val, rel=self.margin_loss), f"The loss at step {step} should be approximately {expected_val} but it is {actual_val}."
else:
assert actual_list[step] == expected_list[i], f"The value at step {step} should be {expected_list[i]} but it is {actual_list[step]}."
assert actual_val == expected_val, f"The value at step {step} should be {expected_val} but it is {actual_val}."
@pytest.mark.xfail
def test_lm_loss_deterministic(self):
......
import os
os.environ['OPENBLAS_NUM_THREADS'] = '1'
import sys
import json
import shutil
......
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.50444, 10.49325, 10.4863, 10.48386, 10.49892, 10.46644, 10.41921, 10.30106, 10.16285, 9.97939]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [17438.0, 18815.0, 22912.0, 18568.0, 19900.0, 23810.0, 22918.0]}, "iteration_timing_avg": 0.35970588235294115}
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.50685, 10.49816, 10.47982, 10.48566, 10.49535, 10.46661, 10.42394, 10.30692, 10.15978, 9.96955]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [19029.0, 19175.0, 22073.0, 18475.0, 20839.0, 23168.0, 22721.0]}, "iteration_timing_avg": 0.4121861764705882}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54369, 10.5383, 10.55953, 10.54011, 10.51908, 10.49118, 10.46612, 10.31901, 10.15649, 9.96702]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [21736.0, 20433.0, 27243.0, 23240.0, 22459.0, 20724.0, 23451.0]}, "iteration_timing_avg": 0.8657461764705884}
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54837, 10.54636, 10.55694, 10.54151, 10.53088, 10.48503, 10.46274, 10.31498, 10.17119, 9.97324]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [22611.0, 20670.0, 26082.0, 23626.0, 21993.0, 21751.0, 23179.0]}, "iteration_timing_avg": 0.874113823529412}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.44729, 10.44093, 10.45375, 10.44445, 10.44305, 10.44595, 10.39163, 10.25898, 10.13498, 9.95692]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [27334.0, 20551.0, 28114.0, 24328.0, 24070.0, 20653.0, 21346.0]}, "iteration_timing_avg": 0.6318655882352939}
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.44877, 10.43852, 10.44018, 10.44114, 10.45622, 10.44144, 10.39044, 10.25684, 10.133, 9.95743]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [28181.0, 20629.0, 28241.0, 26287.0, 24057.0, 21016.0, 21238.0]}, "iteration_timing_avg": 0.7704600000000001}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.4978, 10.49775, 10.48021, 10.50638, 10.49624, 10.47018, 10.34494, 10.25536, 10.10244, 9.91938]}, "num-zeros": {"start_step": 0, "end_step": 35, "step_interval": 5, "values": [26168.0, 19042.0, 28718.0, 22408.0, 26377.0, 34320.0, 21873.0]}, "iteration_timing_avg": 1.1249785294117647}
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.48681, 10.48784, 10.4873, 10.50417, 10.49446, 10.47819, 10.41361, 10.28135, 10.14425, 9.94149]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [26852.0, 19803.0, 25492.0, 24594.0, 21586.0, 19658.0, 20766.0]}, "iteration_timing_avg": 1.4250708823529417}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 37, "step_interval": 5, "values": [10.84266, 10.89696, 10.90542, 10.87498, 10.86279, 10.83628, 10.64437, 10.62386]}, "num-zeros": {"start_step": 0, "end_step": 20, "step_interval": 5, "values": [2093.0, 2474.0, 2327.0, 2213.0]}, "iteration_timing_avg": 0.080846}
{"lm loss": {"start_step": 0, "end_step": 44, "step_interval": 5, "values": [10.84008, 10.89053, 10.90905, 10.87934, 10.86562, 10.83752, 10.64582, 10.62397, 10.53554]}, "num-zeros": {"start_step": 0, "end_step": 27, "step_interval": 5, "values": [2078.0, 2320.0, 2519.0, 2248.0, 2127.0, 1987.0]}, "iteration_timing_avg": 0.09863333333333332}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 38, "step_interval": 5, "values": [10.83091, 10.87024, 10.89161, 10.81277, 10.6858, 10.61231, 10.09495, 10.21817]}, "num-zeros": {"start_step": 0, "end_step": 38, "step_interval": 5, "values": [1545.0, 1840.0, 1867.0, 1825.0, 1867.0, 1668.0, 1576.0, 1932.0]}, "iteration_timing_avg": 0.09399846153846156}
{"lm loss": {"start_step": 0, "end_step": 38, "step_interval": 5, "values": [10.84538, 10.87913, 10.90387, 10.8235, 10.67913, 10.60602, 10.06785, 10.19695]}, "num-zeros": {"start_step": 0, "end_step": 38, "step_interval": 5, "values": [1722.0, 2142.0, 2149.0, 1966.0, 2038.0, 1914.0, 1745.0, 1956.0]}, "iteration_timing_avg": 0.10455653846153849}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 49, "step_interval": 5, "values": [10.7947, 10.85294, 10.87058, 10.83388, 10.83025, 10.78755, 10.56419, 10.57339, 10.48735, 10.19553]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2452.0, 2744.0, 2176.0, 2722.0, 2636.0, 2535.0, 2996.0]}, "iteration_timing_avg": 0.1158709090909091}
{"lm loss": {"start_step": 0, "end_step": 47, "step_interval": 5, "values": [10.81244, 10.87437, 10.90132, 10.84946, 10.84826, 10.81872, 10.61811, 10.61053, 10.52823, 10.22961]}, "num-zeros": {"start_step": 0, "end_step": 30, "step_interval": 5, "values": [2356.0, 2601.0, 2778.0, 2282.0, 2350.0, 2782.0]}, "iteration_timing_avg": 0.12793593749999999}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 27, "step_interval": 5, "values": [10.79471, 10.86601, 10.89073, 10.78482, 10.6587, 10.58125]}, "num-zeros": {"start_step": 0, "end_step": 27, "step_interval": 5, "values": [1609.0, 1850.0, 1921.0, 1942.0, 1853.0, 1674.0]}, "iteration_timing_avg": 0.12440000000000001}
{"lm loss": {"start_step": 0, "end_step": 27, "step_interval": 5, "values": [10.79474, 10.86607, 10.8908, 10.7851, 10.65905, 10.58193]}, "num-zeros": {"start_step": 0, "end_step": 27, "step_interval": 5, "values": [1587.0, 1824.0, 2006.0, 1919.0, 1874.0, 1646.0]}, "iteration_timing_avg": 0.12088222222222227}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment