Commit f5cf2e42 authored by Jared Casper

Merge branch 'citest' into 'main'

Testing infrastructure for Megatron core

See merge request ADLR/megatron-lm!514
parents 95f872f5 8e6fa622
.gitlab-ci.yml:

image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/ngc/pytorch:22.12-py3_pytest-cov

stages:
  - test
  - cleanup

variables: &VARS
  SELENE_ADLR_CI_PATH: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron"
  DATA_DIR: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/data"
  PYTORCH_IMAGE: gitlab-master.nvidia.com:5005/adlr/megatron-lm/ngc/pytorch:22.12-py3_pytest-cov
  PYTHON_VIRTUAL_ENV: /lustre/fsw/adlr/adlr-nlp/adlr_ci/cienv/bin/activate
  TESTS_TO_RUN_AFTER_MERGING: L0 # Test levels and/or CI job names, as a space-separated list, to run after merging
  TESTS_TO_RUN_ON_THIS_COMMIT: unit_tests
  TEST_REGEX_ON_THIS_COMMIT: /.*bert.*/ # RE2 syntax (https://github.com/google/re2/wiki/Syntax), e.g. /.*gpt3.*/
  DISPLAY_OUTPUT: "True" # Set to "True" for new tests to copy the logs for creating the golden truth file

unit_tests:
  tags:
    - docker_local_runner
  stage: test
  script:
    - torchrun --nproc_per_node=8 -m pytest --cov-report=term --cov-report=html --cov=megatron/core tests/unit_tests
  coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/'
  artifacts:
    paths:
      - coverage
    expire_in: 30 days
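# Illustrative only (not part of this commit): to run a specific set of tests on a
# commit, the selection variables above can be overridden, e.g.
#   TESTS_TO_RUN_ON_THIS_COMMIT: unit_tests train.gpt3.345m_tp2_pp2_1node_50steps
#   TEST_REGEX_ON_THIS_COMMIT: /.*gpt3.*/
# The rules blocks of the launchers below match $TEST_LEVEL and $CI_JOB_NAME against
# these values to decide which jobs run.
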
.selene_test_resume_checkpoint_launcher: &selene-test-resume-checkpoint-launcher
  tags:
    - ssh_selene_runner
  stage: test
  script: &selene-test-launcher-script
    - echo "Running selene resume from checkpoint test."
    - pwd
    - export BUILD_DIR=`pwd`
    - export RUN_NAME=resume_${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes
    - export TP_SIZE PP_SIZE NUM_NODES MAX_STEPS
    - export DATA_DIR=$DATA_DIR
    - echo "Run name is $RUN_NAME"
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results
    - export BASE_DIR=$SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME
    - export LOGS_DIR=$BASE_DIR/logs
    - export RESULTS_DIR=$BASE_DIR/results
    - export CHECKPOINTS_DIR=$BASE_DIR/checkpoints
    - echo "Submitting job"
    - sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_resume_checkpoint_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,TP_SIZE,PP_SIZE,NUM_NODES`
    - export SLURM_JOBID=$(echo $sbatch_submission | grep 'Submitted batch job' | awk '{ print $4 }');
    - bash $BUILD_DIR/tests/functional_tests/shell_test_utils/jobwait.sh $SLURM_JOBID
    - \[ ! -z ${SLURM_JOBID} \] && echo -e " --------------------------------------------------\n"
      "----------WAITING FOR SLURM JOB TO BEGIN-----------\n"
      "---------------------------------------------------\n"
      "$(scontrol show job=${SLURM_JOBID})\n"
      "---------------------------------------------------\n"
    # GitLab logs collapsible section markers
    - echo -e "\e[0Ksection_end:`date +%s`:slurm_setup\r\e[0K"
    # Follow output of the job
    - echo "Finished job"
    - source $PYTHON_VIRTUAL_ENV
    - pytest $BUILD_DIR/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py
    - echo "Completed the job"
  rules:
    - if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT
      when: always
    - if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
      when: always
  allow_failure: false

.selene_test_launcher: &selene-test-launcher
  tags:
    - ssh_selene_runner
  stage: test
  script: &selene-test-launcher-script
    - echo "Running selene test"
    - pwd
    - export BUILD_DIR=`pwd`
    - export RUN_NAME=${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes_${MAX_STEPS}steps
    - export TP_SIZE PP_SIZE NUM_NODES MAX_STEPS VP_SIZE
    - export DATA_DIR=$DATA_DIR
    - echo "Run name is $RUN_NAME"
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results
    - export BASE_DIR=$SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME
    - export LOGS_DIR=$BASE_DIR/logs
    - export RESULTS_DIR=$BASE_DIR/results
    - export CHECKPOINTS_DIR=$BASE_DIR/checkpoints
    - echo "Submitting job"
    - sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,TP_SIZE,PP_SIZE,NUM_NODES,MAX_STEPS,VP_SIZE`
    - export SLURM_JOBID=$(echo $sbatch_submission | grep 'Submitted batch job' | awk '{ print $4 }');
    - bash $BUILD_DIR/tests/functional_tests/shell_test_utils/jobwait.sh $SLURM_JOBID
    - \[ ! -z ${SLURM_JOBID} \] && echo -e " --------------------------------------------------\n"
      "----------WAITING FOR SLURM JOB TO BEGIN-----------\n"
      "---------------------------------------------------\n"
      "$(scontrol show job=${SLURM_JOBID})\n"
      "---------------------------------------------------\n"
    # GitLab logs collapsible section markers
    - echo -e "\e[0Ksection_end:`date +%s`:slurm_setup\r\e[0K"
    # Follow output of the job
    - echo "Finished job"
    - source $PYTHON_VIRTUAL_ENV
    - |
      if [[ "$DISPLAY_OUTPUT" == "True" ]]; then
        python3 $BUILD_DIR/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $LOGS_DIR $RUN_NAME
      fi
    - echo "Checking against ground truth file"
    - export EXPECTED_METRICS_FILE=$BUILD_DIR/tests/functional_tests/test_results/$RUN_MODEL/$RUN_NAME.json
    - pytest $BUILD_DIR/tests/functional_tests/python_test_utils/test_ci_pipeline.py
    - echo "Completed the job"
  rules:
    - if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT
      when: always
    - if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
      when: always
  allow_failure: false
train.gpt3.345m_tp4_pp1_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: gpt3
    TP_SIZE: 4
    PP_SIZE: 1
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0

train.gpt3.345m_tp2_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: gpt3
    TP_SIZE: 2
    PP_SIZE: 2
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0

train.gpt3.345m_tp1_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: gpt3
    TP_SIZE: 1
    PP_SIZE: 2
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0

resume.checkpoint.gpt3.345m_tp1_pp2_1node:
  <<: *selene-test-resume-checkpoint-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: gpt3
    TP_SIZE: 1
    PP_SIZE: 2
    NUM_NODES: 1
    TIME_LIMIT: "30:00"
    TEST_LEVEL: L0

train.bert.345m_tp4_pp1_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: bert
    TP_SIZE: 4
    PP_SIZE: 1
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0

train.bert.345m_tp2_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: bert
    TP_SIZE: 2
    PP_SIZE: 2
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0

train.bert.345m_tp1_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: bert
    TP_SIZE: 1
    PP_SIZE: 2
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0

train.bert.345m_tp1_pp4_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: bert
    TP_SIZE: 1
    PP_SIZE: 4
    VP_SIZE: 2
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0

resume.checkpoint.bert.345m_tp1_pp2_1node:
  <<: *selene-test-resume-checkpoint-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: bert
    TP_SIZE: 1
    PP_SIZE: 2
    NUM_NODES: 1
    TIME_LIMIT: "30:00"
    TEST_LEVEL: L0
cleanup.selene:
  tags:
    - ssh_selene_runner
  stage: cleanup
  variables:
    <<: [*VARS]
  script:
    - NUM_CLEANUP=`find ${SELENE_ADLR_CI_PATH}/* -type d -ctime +20 | wc -l`
    - find ${SELENE_ADLR_CI_PATH}/* -type d -ctime +20 | xargs rm -rf
    - echo "Finished cleaning $NUM_CLEANUP directories older than 20 days on Selene"
  allow_failure: true
  rules:
    - when: always
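# Note: find's "-ctime +20" matches directories whose status-change time is more than
# 20 days in the past, so each pipeline's working directories under
# $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID are kept for roughly three weeks.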
tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py:

import os
import sys
import json
import shutil
import glob
from tensorboard.backend.event_processing import event_accumulator

def read_tb_logs_as_list(path, summary_name):
    """Reads a TensorBoard Events file from the input path, and returns the
    summary specified as input as a list.

    Arguments:
        path: str, path to the dir where the events file is located.
        summary_name: str, name of the summary to read from the TB logs.
    Output:
        summary_list: list, the values of the requested summary.
    """
    files = glob.glob(f"{path}/events*tfevents*")
    files += glob.glob(f"{path}/results/events*tfevents*")
    files.sort(key=os.path.getmtime)  # glob already returns full paths
    if files:
        event_file = files[0]
        ea = event_accumulator.EventAccumulator(event_file)
        ea.Reload()
        summary = ea.Scalars(summary_name)
        summary_list = [round(x.value, 5) for x in summary]
        print(summary_list)
        return summary_list
    raise FileNotFoundError(f"File not found matching: {path}/events*")

def collect_train_test_metrics(logs_dir, run_name):
    # TODO: Fetch current baseline
    # train loss
    train_loss_list = read_tb_logs_as_list(logs_dir, "lm loss")
    # num zeros
    num_zeros = read_tb_logs_as_list(logs_dir, "num-zeros")
    iteration_time = read_tb_logs_as_list(logs_dir, "iteration-time")
    # The first few iterations might take a little longer, so average over the
    # last two-thirds of the timings only.
    idx = len(iteration_time)//3
    iteration_time_avg = sum(iteration_time[idx:])/len(iteration_time[idx:])
    train_metrics = {
        "lm loss": {
            "start_step": 0,
            "end_step": len(train_loss_list),
            "step_interval": 5,
            "values": train_loss_list[0:len(train_loss_list):5],
        },
        "num-zeros": {
            "start_step": 0,
            "end_step": len(num_zeros),
            "step_interval": 5,
            "values": num_zeros[0:len(num_zeros):5],
        },
        "iteration_timing_avg": iteration_time_avg,
    }
    str_train_metrics = str(train_metrics).replace("'", "\"")
    print(f"\n ----------- Store the following metrics in {run_name}.json ----------")
    print(f"\n {str_train_metrics}", flush=True)

if __name__ == '__main__':
    args = sys.argv[1:]
    logs_dir = args[0]  # e.g. /lustre/fsw/joc/shanmugamr/megatron/logs/
    run_name = args[1]
    collect_train_test_metrics(logs_dir, run_name)
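
# Illustrative usage (not part of this commit): the CI invokes this script when
# DISPLAY_OUTPUT is "True", e.g.
#   python3 get_test_results_from_tensorboard_logs.py $LOGS_DIR $RUN_NAME
# and the printed JSON blob is then stored as the golden truth file
# tests/functional_tests/test_results/$RUN_MODEL/$RUN_NAME.json.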
tests/functional_tests/python_test_utils/test_ci_pipeline.py:

import os
import enum
import json
import sys
import glob
import pytest
from tensorboard.backend.event_processing import event_accumulator

LOGS_DIR = os.getenv('LOGS_DIR')
EXPECTED_METRICS_FILE = os.getenv('EXPECTED_METRICS_FILE')

class TypeOfTest(enum.Enum):
    APPROX = 1
    DETERMINISTIC = 2

def read_tb_logs_as_list(path, summary_name):
    """Reads a TensorBoard Events file from the input path, and returns the
    summary specified as input as a list.

    Arguments:
        path: str, path to the dir where the events file is located.
        summary_name: str, name of the summary to read from the TB logs.
    Output:
        summary_list: list, the values of the requested summary.
    """
    files = glob.glob(f"{path}/events*tfevents*")
    files += glob.glob(f"{path}/results/events*tfevents*")
    files.sort(key=os.path.getmtime)  # glob already returns full paths
    if files:
        event_file = files[0]
        ea = event_accumulator.EventAccumulator(event_file)
        ea.Reload()
        summary = ea.Scalars(summary_name)
        summary_list = [round(x.value, 5) for x in summary]
        print(summary_list)
        return summary_list
    raise FileNotFoundError(f"File not found matching: {path}/events*")

# If we require a variation of tests for any of the other pipelines, we can just
# inherit this class (see the sketch after the class below).
class TestCIPipeline:

    margin_loss, margin_time = 0.05, 0.1
    expected = None
    if EXPECTED_METRICS_FILE and os.path.exists(EXPECTED_METRICS_FILE):
        with open(EXPECTED_METRICS_FILE) as f:
            expected = json.load(f)

    def _test_helper(self, loss_type, test_type):
        if self.expected is None:
            raise FileNotFoundError("Expected data is none")
        expected = self.expected[loss_type]
        expected_list = expected["values"]
        actual_list = read_tb_logs_as_list(LOGS_DIR, loss_type)
        assert actual_list is not None, f"No TensorBoard events file was found in the logs for {loss_type}."
        for i, step in enumerate(range(expected["start_step"], expected["end_step"], expected["step_interval"])):
            if test_type == TypeOfTest.APPROX:
                assert actual_list[step] == pytest.approx(expected=expected_list[i], rel=self.margin_loss), f"The loss at step {step} should be approximately {expected_list[i]} but it is {actual_list[step]}."
            else:
                assert actual_list[step] == expected_list[i], f"The value at step {step} should be {expected_list[i]} but it is {actual_list[step]}."

    @pytest.mark.xfail
    def test_lm_loss_deterministic(self):
        # Expected training loss curve at different global steps.
        self._test_helper("lm loss", TypeOfTest.DETERMINISTIC)

    def test_lm_loss_approx(self):
        # Expected training loss curve at different global steps.
        self._test_helper("lm loss", TypeOfTest.APPROX)

    def test_num_zeros_deterministic(self):
        # Expected num-zeros curve at different global steps.
        self._test_helper("num-zeros", TypeOfTest.DETERMINISTIC)
    def iteration_timing_node(self):
        # The golden truth files store this value under "iteration_timing_avg".
        expected_iteration_timing_avg = self.expected["iteration_timing_avg"]
        iteration_time = read_tb_logs_as_list(LOGS_DIR, "iteration-time")
        # Skip the first third of the timings, since early iterations can run long.
        idx = len(iteration_time)//3
        iteration_time_avg = sum(iteration_time[idx:])/len(iteration_time[idx:])
        assert expected_iteration_timing_avg == pytest.approx(expected=iteration_time_avg, rel=self.margin_time), f"The time per global step must be approximately {expected_iteration_timing_avg} but it is {iteration_time_avg}."
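
# Illustrative only (not part of this commit): the comment above TestCIPipeline notes
# that test variations for other pipelines can simply inherit the class. A minimal
# sketch of such a variant (the subclass name and margins here are hypothetical):
#
# class TestStrictCIPipeline(TestCIPipeline):
#     # Class attributes shadow the parent's, so pytest collects the inherited
#     # test_* methods and runs them with the tighter tolerances.
#     margin_loss, margin_time = 0.01, 0.05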
tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py:

import os
import sys
import json
import shutil
import glob
from tensorboard.backend.event_processing import event_accumulator

LOGS_DIR = os.getenv('LOGS_DIR')

def read_tb_logs_as_list(path, summary_name, index):
    files = glob.glob(f"{path}/events*tfevents*")
    files += glob.glob(f"{path}/results/events*tfevents*")
    files.sort(key=os.path.getmtime)  # glob already returns full paths
    if files:
        event_file = files[index]
        ea = event_accumulator.EventAccumulator(event_file)
        ea.Reload()
        summary = ea.Scalars(summary_name)
        summary_list = [round(x.value, 5) for x in summary]
        print(summary_list)
        return summary_list
    raise FileNotFoundError(f"File not found matching: {path}/events*")

def collect_train_test_metrics(logs_dir, index):
    train_loss_list = read_tb_logs_as_list(logs_dir, "lm loss", index)
    train_loss_list = [round(elem, 3) for elem in train_loss_list]
    train_metrics = {
        "lm loss": train_loss_list[0:len(train_loss_list):5],
    }
    str_train_metrics = str(train_metrics).replace("'", "\"")
    print("\n ----------- The following are the metrics for ----------")
    print(f"\n {str_train_metrics}", flush=True)
    return train_metrics

class TestCIPipeline:

    train_metrics_100 = collect_train_test_metrics(LOGS_DIR, 0)
    train_metrics_50_to_100 = collect_train_test_metrics(LOGS_DIR, 1)

    def _test_helper(self, loss_type):
        expected = self.train_metrics_100[loss_type]
        print('expected : ' + str(expected))
        actual = self.train_metrics_50_to_100[loss_type]
        print('actual : ' + str(actual))
        # NOTE: When the gpt3 model runs from 0 to 100 directly, it produces one extra element,
        # i.e. expected is [10.84266, 10.89696, 10.90542, 10.87498, 10.86265, 10.83608, 10.64368, 10.62319, 10.53908, 10.25005, 10.20907, 9.96542, 9.96802, 9.92436, 9.79086, 9.26718, 9.61784, 9.19018, 9.45986, 9.62168, 9.73772, 8.85732, 9.43185, 9.27912, 9.6832, 9.5127, 9.5419, 9.02549, 8.55077, 8.91355, 8.83375, 9.17722, 9.22436, 9.19436, 9.11323, 9.09711, 9.04421, 9.36795]
        # while actual is [9.73772, 8.85732, 9.43185, 9.27912, 9.6832, 9.5127, 9.5419, 9.02549, 8.55077, 8.91355, 8.83375, 9.17722, 9.22435, 9.19435, 9.11322, 9.09711, 9.04422].
        # That extra element in expected causes misalignment, so we anchor on the first
        # element of actual instead. Need to figure out what is happening.
        start_idx_expected = expected.index(actual[0])  # first element of actual
        # Compare actual against the second half (steps 50-100) of expected.
        for i in range(len(actual)):
            assert actual[i] == expected[start_idx_expected + i], f"The value at step {i} should be {expected[start_idx_expected + i]} but it is {actual[i]}."

    def test_lm_loss_deterministic(self):
        self._test_helper("lm loss")
tests/functional_tests/shell_test_utils/jobwait.sh:

#! /bin/bash
# Usage: jobwait.sh <SLURM job id>

JOBID=$1
echo "Job id : $JOBID"
if [[ -z "$JOBID" ]]; then
    exit 1
fi

sleep 10s
while true; do
    export STATE=`sacct -j $JOBID --format State --parsable2 --noheader |& head -n 1`
    case "${STATE}" in
        PENDING|RUNNING|REQUEUED)
            echo "Job is still in $STATE"
            sleep 15s
            ;;
        *)
            sleep 30s
            echo "Exiting with SLURM job status '${STATE}'"
            exit 0
            ;;
    esac
done
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.50444, 10.49325, 10.4863, 10.48386, 10.49859, 10.46608, 10.41875, 10.30048, 10.16226, 9.97872]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [17438.0, 18790.0, 22561.0, 18532.0, 20246.0, 23670.0, 22788.0]}, "iteration_timing_avg": 0.3469323529411764}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54369, 10.5383, 10.55953, 10.54011, 10.51871, 10.4908, 10.46566, 10.31844, 10.15596, 9.9664]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [21736.0, 20410.0, 27256.0, 23697.0, 22528.0, 21048.0, 23461.0]}, "iteration_timing_avg": 0.8071679411764707}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.44729, 10.44093, 10.45375, 10.44445, 10.44267, 10.44555, 10.39114, 10.25849, 10.1345, 9.9564]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [27334.0, 20715.0, 28442.0, 24309.0, 23479.0, 20540.0, 21108.0]}, "iteration_timing_avg": 0.618779411764706}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.4978, 10.49775, 10.48021, 10.50638, 10.49579, 10.46974, 10.34444, 10.25478, 10.10195, 9.91877]}, "num-zeros": {"start_step": 0, "end_step": 35, "step_interval": 5, "values": [26168.0, 19293.0, 28643.0, 22573.0, 25980.0, 34292.0, 21318.0]}, "iteration_timing_avg": 1.0391188235294118}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 35, "step_interval": 5, "values": [10.84266, 10.89696, 10.90542, 10.87498, 10.86265, 10.83608, 10.64368]}, "num-zeros": {"start_step": 0, "end_step": 17, "step_interval": 5, "values": [2093.0, 2491.0, 2352.0, 2202.0]}, "iteration_timing_avg": 0.07941913043478262}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 47, "step_interval": 5, "values": [10.85716, 10.88973, 10.879, 10.87014, 10.87964, 10.84443, 10.67203, 10.62868, 10.52733, 10.2536]}, "num-zeros": {"start_step": 0, "end_step": 30, "step_interval": 5, "values": [2450.0, 2383.0, 2525.0, 2234.0, 2313.0, 2514.0]}, "iteration_timing_avg": 0.11253562499999999}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86276, 10.88058, 10.87527, 10.88402, 10.89158, 10.84702, 10.6879, 10.62796, 10.53893, 10.26644]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2199.0, 2304.0, 2389.0, 1985.0, 2059.0, 2393.0, 2395.0]}, "iteration_timing_avg": 0.15685176470588238}
\ No newline at end of file
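
Each JSON blob above follows the golden-truth schema that test_ci_pipeline.py consumes:
"values" holds a metric sampled every "step_interval" global steps between "start_step"
and "end_step". A minimal sketch of that mapping (the metric dict is copied from the
first blob above; this snippet is illustrative, not part of the commit):

# Expand a golden-truth entry into (global step, value) pairs, mirroring the
# enumerate(range(...)) loop in TestCIPipeline._test_helper.
metric = {"start_step": 0, "end_step": 50, "step_interval": 5,
          "values": [10.50444, 10.49325, 10.4863, 10.48386, 10.49859,
                     10.46608, 10.41875, 10.30048, 10.16226, 9.97872]}
for i, step in enumerate(range(metric["start_step"], metric["end_step"], metric["step_interval"])):
    print(step, metric["values"][i])  # steps 0, 5, ..., 45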
tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_resume_checkpoint_test.sh:

#! /bin/bash
DATA_PATH=$1
CHECKPOINT_PATH=$2
TENSORBOARD_DIR=$3
TP_SIZE=$4
PP_SIZE=$5
NNODES=$6
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
export CUDA_DEVICE_MAX_CONNECTIONS=1
# Runs the "345M" parameter model
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
# Run for 100 iterations
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_bert.py \
--use-checkpoint-args \
--use-checkpoint-opt_param-scheduler \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--log-params-norm \
--log-num-zeros-in-grad \
--log-validation-ppl-to-tensorboard \
--log-timers-to-tensorboard \
--tensorboard-dir ${TENSORBOARD_DIR} \
--micro-batch-size 4 \
--global-batch-size 128 \
--seq-length 512 \
--max-position-embeddings 512 \
--train-iters 100 \
--timing-log-level 2 \
--lr-decay-iters 990000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file /workspace/data/bert_data/vocab.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-warmup-fraction 0.01 \
--log-interval 1 \
--save-interval 50 \
--eval-interval 1000 \
--eval-iters 10 \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
--no-gradient-accumulation-fusion \
--fp16
echo 50 > $CHECKPOINT_PATH/latest_checkpointed_iteration.txt
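# Megatron picks the checkpoint to load from latest_checkpointed_iteration.txt, so
# overwriting it with 50 forces the second run below to resume from the step-50
# checkpoint even though a step-100 checkpoint was also saved.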
# Resume from 50th iteration ckpt and continue to 100 iterations
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_bert.py \
--use-checkpoint-args \
--use-checkpoint-opt_param-scheduler \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--log-params-norm \
--log-num-zeros-in-grad \
--log-validation-ppl-to-tensorboard \
--log-timers-to-tensorboard \
--tensorboard-dir ${TENSORBOARD_DIR} \
--micro-batch-size 4 \
--global-batch-size 128 \
--seq-length 512 \
--max-position-embeddings 512 \
--train-iters 100 \
--timing-log-level 2 \
--lr-decay-iters 990000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file /workspace/data/bert_data/vocab.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-warmup-fraction 0.01 \
--log-interval 1 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
--no-gradient-accumulation-fusion \
--fp16
tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh:

#! /bin/bash
set -o xtrace
DATA_PATH=$1
CHECKPOINT_PATH=$2
TENSORBOARD_DIR=$3
TP_SIZE=$4
PP_SIZE=$5
NNODES=$6
MAX_STEPS=$7
VP_SIZE=$8
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
export CUDA_DEVICE_MAX_CONNECTIONS=1
# Runs the "345M" parameter model
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
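# Note: the ${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} expression
# in the command below is bash's ":+" expansion; it emits the virtual-pipeline flag
# only when VP_SIZE is set and non-empty, so jobs that do not define VP_SIZE run
# without interleaved pipeline scheduling.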
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_bert.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--log-params-norm \
--log-num-zeros-in-grad \
--log-validation-ppl-to-tensorboard \
--log-timers-to-tensorboard \
--tensorboard-dir ${TENSORBOARD_DIR} \
--micro-batch-size 4 \
--global-batch-size 128 \
--seq-length 512 \
--max-position-embeddings 512 \
--train-iters $MAX_STEPS \
--timing-log-level 2 \
--lr-decay-iters 990000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file /workspace/data/bert_data/vocab.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-warmup-fraction 0.01 \
--log-interval 1 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} \
--no-gradient-accumulation-fusion \
--fp16
tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_resume_checkpoint_test.sh:

#!/bin/bash
# Parameters
#SBATCH --account=adlr
#SBATCH --job-name=adlr-ci:megatron-job
#SBATCH --nodes=1
#SBATCH --partition=luna
DATA_PATH=/workspace/data/bert_data/my-bert_00_text_sentence
CHECKPOINT_PATH=/workspace/checkpoints
TENSORBOARD_DIR=/workspace/logs
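# The --container-image and --container-mounts flags below assume the cluster's
# pyxis/enroot Slurm plugin: the step runs inside the specified PyTorch container with
# the logs, checkpoints, Megatron-LM checkout, and data directory mounted under /workspace.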
srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
ls
cd /workspace/megatron-lm
./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_resume_checkpoint_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES"
tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_test.sh:

#!/bin/bash
# Parameters
#SBATCH --account=adlr
#SBATCH --job-name=adlr-ci:megatron-job
#SBATCH --nodes=1
#SBATCH --partition=luna
DATA_PATH=/workspace/data/bert_data/my-bert_00_text_sentence
CHECKPOINT_PATH=/workspace/checkpoints
TENSORBOARD_DIR=/workspace/logs
srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
ls
cd /workspace/megatron-lm
./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS $VP_SIZE"
GPT3 resume-checkpoint pretrain script (under tests/functional_tests/test_scripts/gpt3/):

#! /bin/bash
DATA_PATH=$1
CHECKPOINT_PATH=$2
TENSORBOARD_DIR=$3
TP_SIZE=$4
PP_SIZE=$5
NNODES=$6
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
export CUDA_DEVICE_MAX_CONNECTIONS=1
# Runs a small GPT model (12 layers, hidden size 512, 8 attention heads)
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
# Run for 100 iterations and save checkpoint at 50
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt.py \
--use-checkpoint-args \
--use-checkpoint-opt_param-scheduler \
--num-layers 12 \
--hidden-size 512 \
--num-attention-heads 8 \
--log-params-norm \
--log-num-zeros-in-grad \
--log-validation-ppl-to-tensorboard \
--log-timers-to-tensorboard \
--tensorboard-dir ${TENSORBOARD_DIR} \
--micro-batch-size 4 \
--global-batch-size 32 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 100 \
--timing-log-level 2 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file /workspace/data/gpt3_data/gpt2-vocab.json \
--merge-file /workspace/data/gpt3_data/gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--log-interval 1 \
--save-interval 50 \
--eval-interval 1000 \
--eval-iters 10 \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
--no-gradient-accumulation-fusion \
--fp16
echo 50 > $CHECKPOINT_PATH/latest_checkpointed_iteration.txt
# Resume from 50th iteration ckpt and continue to 100 iterations
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt.py \
--use-checkpoint-args \
--use-checkpoint-opt_param-scheduler \
--num-layers 12 \
--hidden-size 512 \
--num-attention-heads 8 \
--log-params-norm \
--log-num-zeros-in-grad \
--log-validation-ppl-to-tensorboard \
--log-timers-to-tensorboard \
--tensorboard-dir ${TENSORBOARD_DIR} \
--micro-batch-size 4 \
--global-batch-size 32 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 100 \
--timing-log-level 2 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file /workspace/data/gpt3_data/gpt2-vocab.json \
--merge-file /workspace/data/gpt3_data/gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--log-interval 1 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
--no-gradient-accumulation-fusion \
--fp16
GPT3 pretrain test script (under tests/functional_tests/test_scripts/gpt3/):

#! /bin/bash
DATA_PATH=$1
CHECKPOINT_PATH=$2
TENSORBOARD_DIR=$3
TP_SIZE=$4
PP_SIZE=$5
NNODES=$6
MAX_STEPS=$7
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
export CUDA_DEVICE_MAX_CONNECTIONS=1
# Runs a small GPT model (12 layers, hidden size 512, 8 attention heads)
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt.py \
--num-layers 12 \
--hidden-size 512 \
--num-attention-heads 8 \
--log-params-norm \
--log-num-zeros-in-grad \
--log-validation-ppl-to-tensorboard \
--log-timers-to-tensorboard \
--tensorboard-dir ${TENSORBOARD_DIR} \
--micro-batch-size 4 \
--global-batch-size 32 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters $MAX_STEPS \
--timing-log-level 2 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file /workspace/data/gpt3_data/gpt2-vocab.json \
--merge-file /workspace/data/gpt3_data/gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--log-interval 1 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
--no-gradient-accumulation-fusion \
--fp16