Commit f5cf2e42 authored by Jared Casper

Merge branch 'citest' into 'main'

Testing infrastructure for Megatron core

See merge request ADLR/megatron-lm!514
parents 95f872f5 8e6fa622
.gitlab-ci.yml:

image: gitlab-master.nvidia.com:5005/adlr/megatron-lm/ngc/pytorch:22.12-py3_pytest-cov

stages:
  - test
  - cleanup

variables: &VARS
  SELENE_ADLR_CI_PATH: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron"
  DATA_DIR: "/lustre/fsw/adlr/adlr-nlp/adlr_ci/megatron/data"
  PYTORCH_IMAGE: gitlab-master.nvidia.com:5005/adlr/megatron-lm/ngc/pytorch:22.12-py3_pytest-cov
  PYTHON_VIRTUAL_ENV: /lustre/fsw/adlr/adlr-nlp/adlr_ci/cienv/bin/activate
  TESTS_TO_RUN_AFTER_MERGING: L0 # Test levels and/or CI job names, as a space-separated list, to run after merging
  TESTS_TO_RUN_ON_THIS_COMMIT: unit_tests
  TEST_REGEX_ON_THIS_COMMIT: /.*bert.*/ # RE2 syntax (https://github.com/google/re2/wiki/Syntax), e.g. /.*gpt3.*/
  DISPLAY_OUTPUT: "True" # Set to "True" for new tests to copy the logs for creating the golden truth file

unit_tests:
  tags:
    - docker_local_runner
  stage: test
  script:
    - torchrun --nproc_per_node=8 -m pytest --cov-report=term --cov-report=html --cov=megatron/core tests/unit_tests
  coverage: '/(?i)total.*? (100(?:\.0+)?\%|[1-9]?\d(?:\.\d+)?\%)$/'
  artifacts:
    paths:
      - coverage
    expire_in: 30 days
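# Illustrative only (not part of this commit): to run a specific set of tests on a
# commit, the selection variables above can be overridden, e.g.
#   TESTS_TO_RUN_ON_THIS_COMMIT: unit_tests train.gpt3.345m_tp2_pp2_1node_50steps
#   TEST_REGEX_ON_THIS_COMMIT: /.*gpt3.*/
# The rules blocks of the launchers below match $TEST_LEVEL and $CI_JOB_NAME against
# these values to decide which jobs run.
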
.selene_test_resume_checkpoint_launcher: &selene-test-resume-checkpoint-launcher
  tags:
    - ssh_selene_runner
  stage: test
  script: &selene-test-launcher-script
    - echo "Running selene resume from checkpoint test."
    - pwd
    - export BUILD_DIR=`pwd`
    - export RUN_NAME=resume_${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes
    - export TP_SIZE PP_SIZE NUM_NODES MAX_STEPS
    - export DATA_DIR=$DATA_DIR
    - echo "Run name is $RUN_NAME"
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results
    - export BASE_DIR=$SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME
    - export LOGS_DIR=$BASE_DIR/logs
    - export RESULTS_DIR=$BASE_DIR/results
    - export CHECKPOINTS_DIR=$BASE_DIR/checkpoints
    - echo "Submitting job"
    - sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_resume_checkpoint_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,TP_SIZE,PP_SIZE,NUM_NODES`
    - export SLURM_JOBID=$(echo $sbatch_submission | grep 'Submitted batch job' | awk '{ print $4 }');
    - bash $BUILD_DIR/tests/functional_tests/shell_test_utils/jobwait.sh $SLURM_JOBID
    - \[ ! -z ${SLURM_JOBID} \] && echo -e " --------------------------------------------------\n"
      "----------WAITING FOR SLURM JOB TO BEGIN-----------\n"
      "---------------------------------------------------\n"
      "$(scontrol show job=${SLURM_JOBID})\n"
      "---------------------------------------------------\n"
    # GitLab logs collapsible section markers
    - echo -e "\e[0Ksection_end:`date +%s`:slurm_setup\r\e[0K"
    # Follow output of the job
    - echo "Finished job"
    - source $PYTHON_VIRTUAL_ENV
    - pytest $BUILD_DIR/tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py
    - echo "Completed the job"
  rules:
    - if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT
      when: always
    - if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
      when: always
  allow_failure: false

.selene_test_launcher: &selene-test-launcher
  tags:
    - ssh_selene_runner
  stage: test
  script: &selene-test-launcher-script
    - echo "Running selene test"
    - pwd
    - export BUILD_DIR=`pwd`
    - export RUN_NAME=${RUN_MODEL}_tp${TP_SIZE}_pp${PP_SIZE}_${NUM_NODES}nodes_${MAX_STEPS}steps
    - export TP_SIZE PP_SIZE NUM_NODES MAX_STEPS VP_SIZE
    - export DATA_DIR=$DATA_DIR
    - echo "Run name is $RUN_NAME"
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/checkpoints
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/logs
    - mkdir -p $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME/results
    - export BASE_DIR=$SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID/$RUN_NAME
    - export LOGS_DIR=$BASE_DIR/logs
    - export RESULTS_DIR=$BASE_DIR/results
    - export CHECKPOINTS_DIR=$BASE_DIR/checkpoints
    - echo "Submitting job"
    - sbatch_submission=`sbatch $BUILD_DIR/tests/functional_tests/test_scripts/$RUN_MODEL/sbatch_${RUN_MODEL}_distributed_test.sh --export=BASE_DIR,BUILD_DIR,DATA_DIR,TP_SIZE,PP_SIZE,NUM_NODES,MAX_STEPS,VP_SIZE`
    - export SLURM_JOBID=$(echo $sbatch_submission | grep 'Submitted batch job' | awk '{ print $4 }');
    - bash $BUILD_DIR/tests/functional_tests/shell_test_utils/jobwait.sh $SLURM_JOBID
    - \[ ! -z ${SLURM_JOBID} \] && echo -e " --------------------------------------------------\n"
      "----------WAITING FOR SLURM JOB TO BEGIN-----------\n"
      "---------------------------------------------------\n"
      "$(scontrol show job=${SLURM_JOBID})\n"
      "---------------------------------------------------\n"
    # GitLab logs collapsible section markers
    - echo -e "\e[0Ksection_end:`date +%s`:slurm_setup\r\e[0K"
    # Follow output of the job
    - echo "Finished job"
    - source $PYTHON_VIRTUAL_ENV
    - |
      if [[ "$DISPLAY_OUTPUT" == "True" ]]; then
        python3 $BUILD_DIR/tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py $LOGS_DIR $RUN_NAME
      fi
    - echo "Checking against ground truth file"
    - export EXPECTED_METRICS_FILE=$BUILD_DIR/tests/functional_tests/test_results/$RUN_MODEL/$RUN_NAME.json
    - pytest $BUILD_DIR/tests/functional_tests/python_test_utils/test_ci_pipeline.py
    - echo "Completed the job"
  rules:
    - if: $TEST_LEVEL =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TESTS_TO_RUN_ON_THIS_COMMIT || $CI_JOB_NAME =~ $TEST_REGEX_ON_THIS_COMMIT
      when: always
    - if: '$CI_COMMIT_REF_NAME == $CI_DEFAULT_BRANCH && $TEST_LEVEL =~ $TESTS_TO_RUN_AFTER_MERGING'
      when: always
  allow_failure: false
train.gpt3.345m_tp4_pp1_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: gpt3
    TP_SIZE: 4
    PP_SIZE: 1
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0

train.gpt3.345m_tp2_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: gpt3
    TP_SIZE: 2
    PP_SIZE: 2
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0

train.gpt3.345m_tp1_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: gpt3
    TP_SIZE: 1
    PP_SIZE: 2
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0

resume.checkpoint.gpt3.345m_tp1_pp2_1node:
  <<: *selene-test-resume-checkpoint-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: gpt3
    TP_SIZE: 1
    PP_SIZE: 2
    NUM_NODES: 1
    TIME_LIMIT: "30:00"
    TEST_LEVEL: L0

train.bert.345m_tp4_pp1_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: bert
    TP_SIZE: 4
    PP_SIZE: 1
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0

train.bert.345m_tp2_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: bert
    TP_SIZE: 2
    PP_SIZE: 2
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0

train.bert.345m_tp1_pp2_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: bert
    TP_SIZE: 1
    PP_SIZE: 2
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0

train.bert.345m_tp1_pp4_1node_50steps:
  <<: *selene-test-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: bert
    TP_SIZE: 1
    PP_SIZE: 4
    VP_SIZE: 2
    NUM_NODES: 1
    MAX_STEPS: 50
    TIME_LIMIT: "20:00"
    TEST_LEVEL: L0

resume.checkpoint.bert.345m_tp1_pp2_1node:
  <<: *selene-test-resume-checkpoint-launcher
  variables:
    <<: [*VARS]
    RUN_MODEL: bert
    TP_SIZE: 1
    PP_SIZE: 2
    NUM_NODES: 1
    TIME_LIMIT: "30:00"
    TEST_LEVEL: L0
cleanup.selene:
  tags:
    - ssh_selene_runner
  stage: cleanup
  variables:
    <<: [*VARS]
  script:
    - NUM_CLEANUP=`find ${SELENE_ADLR_CI_PATH}/* -type d -ctime +20 | wc -l`
    - find ${SELENE_ADLR_CI_PATH}/* -type d -ctime +20 | xargs rm -rf
    - echo "Finished cleaning $NUM_CLEANUP directories older than 20 days on Selene"
  allow_failure: true
  rules:
    - when: always
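# Note: find's "-ctime +20" matches directories whose status-change time is more than
# 20 days in the past, so each pipeline's working directories under
# $SELENE_ADLR_CI_PATH/$CI_PIPELINE_ID are kept for roughly three weeks.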
tests/functional_tests/python_test_utils/get_test_results_from_tensorboard_logs.py:

import os
import sys
import json
import shutil
import glob
from tensorboard.backend.event_processing import event_accumulator

def read_tb_logs_as_list(path, summary_name):
    """Reads a TensorBoard Events file from the input path, and returns the
    summary specified as input as a list.

    Arguments:
        path: str, path to the dir where the events file is located.
        summary_name: str, name of the summary to read from the TB logs.
    Output:
        summary_list: list, the values of the requested summary.
    """
    files = glob.glob(f"{path}/events*tfevents*")
    files += glob.glob(f"{path}/results/events*tfevents*")
    files.sort(key=os.path.getmtime)  # glob already returns full paths
    if files:
        event_file = files[0]
        ea = event_accumulator.EventAccumulator(event_file)
        ea.Reload()
        summary = ea.Scalars(summary_name)
        summary_list = [round(x.value, 5) for x in summary]
        print(summary_list)
        return summary_list
    raise FileNotFoundError(f"File not found matching: {path}/events*")

def collect_train_test_metrics(logs_dir, run_name):
    # TODO: Fetch current baseline
    # train loss
    train_loss_list = read_tb_logs_as_list(logs_dir, "lm loss")
    # num zeros
    num_zeros = read_tb_logs_as_list(logs_dir, "num-zeros")
    iteration_time = read_tb_logs_as_list(logs_dir, "iteration-time")
    # The first few iterations might take a little longer, so average over the
    # last two-thirds of the timings only.
    idx = len(iteration_time)//3
    iteration_time_avg = sum(iteration_time[idx:])/len(iteration_time[idx:])
    train_metrics = {
        "lm loss": {
            "start_step": 0,
            "end_step": len(train_loss_list),
            "step_interval": 5,
            "values": train_loss_list[0:len(train_loss_list):5],
        },
        "num-zeros": {
            "start_step": 0,
            "end_step": len(num_zeros),
            "step_interval": 5,
            "values": num_zeros[0:len(num_zeros):5],
        },
        "iteration_timing_avg": iteration_time_avg,
    }
    str_train_metrics = str(train_metrics).replace("'", "\"")
    print(f"\n ----------- Store the following metrics in {run_name}.json ----------")
    print(f"\n {str_train_metrics}", flush=True)

if __name__ == '__main__':
    args = sys.argv[1:]
    logs_dir = args[0]  # e.g. /lustre/fsw/joc/shanmugamr/megatron/logs/
    run_name = args[1]
    collect_train_test_metrics(logs_dir, run_name)
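
# Illustrative usage (not part of this commit): the CI invokes this script when
# DISPLAY_OUTPUT is "True", e.g.
#   python3 get_test_results_from_tensorboard_logs.py $LOGS_DIR $RUN_NAME
# and the printed JSON blob is then stored as the golden truth file
# tests/functional_tests/test_results/$RUN_MODEL/$RUN_NAME.json.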
tests/functional_tests/python_test_utils/test_ci_pipeline.py:

import os
import enum
import json
import sys
import glob
import pytest
from tensorboard.backend.event_processing import event_accumulator

LOGS_DIR = os.getenv('LOGS_DIR')
EXPECTED_METRICS_FILE = os.getenv('EXPECTED_METRICS_FILE')

class TypeOfTest(enum.Enum):
    APPROX = 1
    DETERMINISTIC = 2

def read_tb_logs_as_list(path, summary_name):
    """Reads a TensorBoard Events file from the input path, and returns the
    summary specified as input as a list.

    Arguments:
        path: str, path to the dir where the events file is located.
        summary_name: str, name of the summary to read from the TB logs.
    Output:
        summary_list: list, the values of the requested summary.
    """
    files = glob.glob(f"{path}/events*tfevents*")
    files += glob.glob(f"{path}/results/events*tfevents*")
    files.sort(key=os.path.getmtime)  # glob already returns full paths
    if files:
        event_file = files[0]
        ea = event_accumulator.EventAccumulator(event_file)
        ea.Reload()
        summary = ea.Scalars(summary_name)
        summary_list = [round(x.value, 5) for x in summary]
        print(summary_list)
        return summary_list
    raise FileNotFoundError(f"File not found matching: {path}/events*")

# If we require a variation of tests for any of the other pipelines, we can just
# inherit this class (see the sketch after the class below).
class TestCIPipeline:

    margin_loss, margin_time = 0.05, 0.1
    expected = None
    if EXPECTED_METRICS_FILE and os.path.exists(EXPECTED_METRICS_FILE):
        with open(EXPECTED_METRICS_FILE) as f:
            expected = json.load(f)

    def _test_helper(self, loss_type, test_type):
        if self.expected is None:
            raise FileNotFoundError("Expected data is none")
        expected = self.expected[loss_type]
        expected_list = expected["values"]
        actual_list = read_tb_logs_as_list(LOGS_DIR, loss_type)
        assert actual_list is not None, f"No TensorBoard events file was found in the logs for {loss_type}."
        for i, step in enumerate(range(expected["start_step"], expected["end_step"], expected["step_interval"])):
            if test_type == TypeOfTest.APPROX:
                assert actual_list[step] == pytest.approx(expected=expected_list[i], rel=self.margin_loss), f"The loss at step {step} should be approximately {expected_list[i]} but it is {actual_list[step]}."
            else:
                assert actual_list[step] == expected_list[i], f"The value at step {step} should be {expected_list[i]} but it is {actual_list[step]}."

    @pytest.mark.xfail
    def test_lm_loss_deterministic(self):
        # Expected training loss curve at different global steps.
        self._test_helper("lm loss", TypeOfTest.DETERMINISTIC)

    def test_lm_loss_approx(self):
        # Expected training loss curve at different global steps.
        self._test_helper("lm loss", TypeOfTest.APPROX)

    def test_num_zeros_deterministic(self):
        # Expected num-zeros curve at different global steps.
        self._test_helper("num-zeros", TypeOfTest.DETERMINISTIC)
    def iteration_timing_node(self):
        # The golden truth files store this value under "iteration_timing_avg".
        expected_iteration_timing_avg = self.expected["iteration_timing_avg"]
        iteration_time = read_tb_logs_as_list(LOGS_DIR, "iteration-time")
        # Skip the first third of the timings, since early iterations can run long.
        idx = len(iteration_time)//3
        iteration_time_avg = sum(iteration_time[idx:])/len(iteration_time[idx:])
        assert expected_iteration_timing_avg == pytest.approx(expected=iteration_time_avg, rel=self.margin_time), f"The time per global step must be approximately {expected_iteration_timing_avg} but it is {iteration_time_avg}."
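
# Illustrative only (not part of this commit): the comment above TestCIPipeline notes
# that test variations for other pipelines can simply inherit the class. A minimal
# sketch of such a variant (the subclass name and margins here are hypothetical):
#
# class TestStrictCIPipeline(TestCIPipeline):
#     # Class attributes shadow the parent's, so pytest collects the inherited
#     # test_* methods and runs them with the tighter tolerances.
#     margin_loss, margin_time = 0.01, 0.05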
tests/functional_tests/python_test_utils/test_resume_checkpoint_pipeline.py:

import os
import sys
import json
import shutil
import glob
from tensorboard.backend.event_processing import event_accumulator

LOGS_DIR = os.getenv('LOGS_DIR')

def read_tb_logs_as_list(path, summary_name, index):
    files = glob.glob(f"{path}/events*tfevents*")
    files += glob.glob(f"{path}/results/events*tfevents*")
    files.sort(key=os.path.getmtime)  # glob already returns full paths
    if files:
        event_file = files[index]
        ea = event_accumulator.EventAccumulator(event_file)
        ea.Reload()
        summary = ea.Scalars(summary_name)
        summary_list = [round(x.value, 5) for x in summary]
        print(summary_list)
        return summary_list
    raise FileNotFoundError(f"File not found matching: {path}/events*")

def collect_train_test_metrics(logs_dir, index):
    train_loss_list = read_tb_logs_as_list(logs_dir, "lm loss", index)
    train_loss_list = [round(elem, 3) for elem in train_loss_list]
    train_metrics = {
        "lm loss": train_loss_list[0:len(train_loss_list):5],
    }
    str_train_metrics = str(train_metrics).replace("'", "\"")
    print("\n ----------- The following are the metrics for ----------")
    print(f"\n {str_train_metrics}", flush=True)
    return train_metrics

class TestCIPipeline:

    train_metrics_100 = collect_train_test_metrics(LOGS_DIR, 0)
    train_metrics_50_to_100 = collect_train_test_metrics(LOGS_DIR, 1)

    def _test_helper(self, loss_type):
        expected = self.train_metrics_100[loss_type]
        print('expected : ' + str(expected))
        actual = self.train_metrics_50_to_100[loss_type]
        print('actual : ' + str(actual))
        # NOTE: When the gpt3 model runs from 0 to 100 directly, it produces one extra element,
        # i.e. expected is [10.84266, 10.89696, 10.90542, 10.87498, 10.86265, 10.83608, 10.64368, 10.62319, 10.53908, 10.25005, 10.20907, 9.96542, 9.96802, 9.92436, 9.79086, 9.26718, 9.61784, 9.19018, 9.45986, 9.62168, 9.73772, 8.85732, 9.43185, 9.27912, 9.6832, 9.5127, 9.5419, 9.02549, 8.55077, 8.91355, 8.83375, 9.17722, 9.22436, 9.19436, 9.11323, 9.09711, 9.04421, 9.36795]
        # while actual is [9.73772, 8.85732, 9.43185, 9.27912, 9.6832, 9.5127, 9.5419, 9.02549, 8.55077, 8.91355, 8.83375, 9.17722, 9.22435, 9.19435, 9.11322, 9.09711, 9.04422].
        # That extra element in expected causes misalignment, so we anchor on the first
        # element of actual instead. Need to figure out what is happening.
        start_idx_expected = expected.index(actual[0])  # first element of actual
        # Compare actual against the second half (steps 50-100) of expected.
        for i in range(len(actual)):
            assert actual[i] == expected[start_idx_expected + i], f"The value at step {i} should be {expected[start_idx_expected + i]} but it is {actual[i]}."

    def test_lm_loss_deterministic(self):
        self._test_helper("lm loss")
tests/functional_tests/shell_test_utils/jobwait.sh:

#! /bin/bash
# Usage: jobwait.sh <SLURM job id>

JOBID=$1
echo "Job id : $JOBID"
if [[ -z "$JOBID" ]]; then
    exit 1
fi

sleep 10s
while true; do
    export STATE=`sacct -j $JOBID --format State --parsable2 --noheader |& head -n 1`
    case "${STATE}" in
        PENDING|RUNNING|REQUEUED)
            echo "Job is still in $STATE"
            sleep 15s
            ;;
        *)
            sleep 30s
            echo "Exiting with SLURM job status '${STATE}'"
            exit 0
            ;;
    esac
done
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.50444, 10.49325, 10.4863, 10.48386, 10.49859, 10.46608, 10.41875, 10.30048, 10.16226, 9.97872]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [17438.0, 18790.0, 22561.0, 18532.0, 20246.0, 23670.0, 22788.0]}, "iteration_timing_avg": 0.3469323529411764}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54369, 10.5383, 10.55953, 10.54011, 10.51871, 10.4908, 10.46566, 10.31844, 10.15596, 9.9664]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [21736.0, 20410.0, 27256.0, 23697.0, 22528.0, 21048.0, 23461.0]}, "iteration_timing_avg": 0.8071679411764707}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.44729, 10.44093, 10.45375, 10.44445, 10.44267, 10.44555, 10.39114, 10.25849, 10.1345, 9.9564]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [27334.0, 20715.0, 28442.0, 24309.0, 23479.0, 20540.0, 21108.0]}, "iteration_timing_avg": 0.618779411764706}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.4978, 10.49775, 10.48021, 10.50638, 10.49579, 10.46974, 10.34444, 10.25478, 10.10195, 9.91877]}, "num-zeros": {"start_step": 0, "end_step": 35, "step_interval": 5, "values": [26168.0, 19293.0, 28643.0, 22573.0, 25980.0, 34292.0, 21318.0]}, "iteration_timing_avg": 1.0391188235294118}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 35, "step_interval": 5, "values": [10.84266, 10.89696, 10.90542, 10.87498, 10.86265, 10.83608, 10.64368]}, "num-zeros": {"start_step": 0, "end_step": 17, "step_interval": 5, "values": [2093.0, 2491.0, 2352.0, 2202.0]}, "iteration_timing_avg": 0.07941913043478262}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 47, "step_interval": 5, "values": [10.85716, 10.88973, 10.879, 10.87014, 10.87964, 10.84443, 10.67203, 10.62868, 10.52733, 10.2536]}, "num-zeros": {"start_step": 0, "end_step": 30, "step_interval": 5, "values": [2450.0, 2383.0, 2525.0, 2234.0, 2313.0, 2514.0]}, "iteration_timing_avg": 0.11253562499999999}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.86276, 10.88058, 10.87527, 10.88402, 10.89158, 10.84702, 10.6879, 10.62796, 10.53893, 10.26644]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2199.0, 2304.0, 2389.0, 1985.0, 2059.0, 2393.0, 2395.0]}, "iteration_timing_avg": 0.15685176470588238}
\ No newline at end of file
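
Each JSON blob above follows the golden-truth schema that test_ci_pipeline.py consumes:
"values" holds a metric sampled every "step_interval" global steps between "start_step"
and "end_step". A minimal sketch of that mapping (the metric dict is copied from the
first blob above; this snippet is illustrative, not part of the commit):

# Expand a golden-truth entry into (global step, value) pairs, mirroring the
# enumerate(range(...)) loop in TestCIPipeline._test_helper.
metric = {"start_step": 0, "end_step": 50, "step_interval": 5,
          "values": [10.50444, 10.49325, 10.4863, 10.48386, 10.49859,
                     10.46608, 10.41875, 10.30048, 10.16226, 9.97872]}
for i, step in enumerate(range(metric["start_step"], metric["end_step"], metric["step_interval"])):
    print(step, metric["values"][i])  # steps 0, 5, ..., 45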
tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_resume_checkpoint_test.sh:

#! /bin/bash
DATA_PATH=$1
CHECKPOINT_PATH=$2
TENSORBOARD_DIR=$3
TP_SIZE=$4
PP_SIZE=$5
NNODES=$6
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
export CUDA_DEVICE_MAX_CONNECTIONS=1
# Runs the "345M" parameter model
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
# Run for 100 iterations
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_bert.py \
--use-checkpoint-args \
--use-checkpoint-opt_param-scheduler \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--log-params-norm \
--log-num-zeros-in-grad \
--log-validation-ppl-to-tensorboard \
--log-timers-to-tensorboard \
--tensorboard-dir ${TENSORBOARD_DIR} \
--micro-batch-size 4 \
--global-batch-size 128 \
--seq-length 512 \
--max-position-embeddings 512 \
--train-iters 100 \
--timing-log-level 2 \
--lr-decay-iters 990000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file /workspace/data/bert_data/vocab.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-warmup-fraction 0.01 \
--log-interval 1 \
--save-interval 50 \
--eval-interval 1000 \
--eval-iters 10 \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
--no-gradient-accumulation-fusion \
--fp16
echo 50 > $CHECKPOINT_PATH/latest_checkpointed_iteration.txt
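# Megatron picks the checkpoint to load from latest_checkpointed_iteration.txt, so
# overwriting it with 50 forces the second run below to resume from the step-50
# checkpoint even though a step-100 checkpoint was also saved.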
# Resume from 50th iteration ckpt and continue to 100 iterations
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_bert.py \
--use-checkpoint-args \
--use-checkpoint-opt_param-scheduler \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--log-params-norm \
--log-num-zeros-in-grad \
--log-validation-ppl-to-tensorboard \
--log-timers-to-tensorboard \
--tensorboard-dir ${TENSORBOARD_DIR} \
--micro-batch-size 4 \
--global-batch-size 128 \
--seq-length 512 \
--max-position-embeddings 512 \
--train-iters 100 \
--timing-log-level 2 \
--lr-decay-iters 990000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file /workspace/data/bert_data/vocab.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-warmup-fraction 0.01 \
--log-interval 1 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
--no-gradient-accumulation-fusion \
--fp16
tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh:

#! /bin/bash
set -o xtrace
DATA_PATH=$1
CHECKPOINT_PATH=$2
TENSORBOARD_DIR=$3
TP_SIZE=$4
PP_SIZE=$5
NNODES=$6
MAX_STEPS=$7
VP_SIZE=$8
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
export CUDA_DEVICE_MAX_CONNECTIONS=1
# Runs the "345M" parameter model
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
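# Note: the ${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} expression
# in the command below is bash's ":+" expansion; it emits the virtual-pipeline flag
# only when VP_SIZE is set and non-empty, so jobs that do not define VP_SIZE run
# without interleaved pipeline scheduling.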
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_bert.py \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--log-params-norm \
--log-num-zeros-in-grad \
--log-validation-ppl-to-tensorboard \
--log-timers-to-tensorboard \
--tensorboard-dir ${TENSORBOARD_DIR} \
--micro-batch-size 4 \
--global-batch-size 128 \
--seq-length 512 \
--max-position-embeddings 512 \
--train-iters $MAX_STEPS \
--timing-log-level 2 \
--lr-decay-iters 990000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file /workspace/data/bert_data/vocab.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.0001 \
--min-lr 0.00001 \
--lr-warmup-fraction 0.01 \
--log-interval 1 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
${VP_SIZE:+--num-layers-per-virtual-pipeline-stage "$VP_SIZE"} \
--no-gradient-accumulation-fusion \
--fp16
tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_resume_checkpoint_test.sh:

#!/bin/bash
# Parameters
#SBATCH --account=adlr
#SBATCH --job-name=adlr-ci:megatron-job
#SBATCH --nodes=1
#SBATCH --partition=luna
DATA_PATH=/workspace/data/bert_data/my-bert_00_text_sentence
CHECKPOINT_PATH=/workspace/checkpoints
TENSORBOARD_DIR=/workspace/logs
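# The --container-image and --container-mounts flags below assume the cluster's
# pyxis/enroot Slurm plugin: the step runs inside the specified PyTorch container with
# the logs, checkpoints, Megatron-LM checkout, and data directory mounted under /workspace.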
srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
ls
cd /workspace/megatron-lm
./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_resume_checkpoint_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES"
tests/functional_tests/test_scripts/bert/sbatch_bert_distributed_test.sh:

#!/bin/bash
# Parameters
#SBATCH --account=adlr
#SBATCH --job-name=adlr-ci:megatron-job
#SBATCH --nodes=1
#SBATCH --partition=luna
DATA_PATH=/workspace/data/bert_data/my-bert_00_text_sentence
CHECKPOINT_PATH=/workspace/checkpoints
TENSORBOARD_DIR=/workspace/logs
srun --output $BASE_DIR/results/slurm-%j.out --error $BASE_DIR/results/slurm-%j.out --container-image gitlab-master.nvidia.com/dl/dgx/pytorch:21.12-py3-devel --container-mounts $BASE_DIR/logs:/workspace/logs,$BASE_DIR/checkpoints:/workspace/checkpoints,$BUILD_DIR:/workspace/megatron-lm,$DATA_DIR:/workspace/data --no-container-mount-home bash -c "
ls
cd /workspace/megatron-lm
./tests/functional_tests/test_scripts/bert/pretrain_bert_distributed_test.sh $DATA_PATH $CHECKPOINT_PATH $TENSORBOARD_DIR $TP_SIZE $PP_SIZE $NUM_NODES $MAX_STEPS $VP_SIZE"
GPT3 resume-checkpoint pretrain script (under tests/functional_tests/test_scripts/gpt3/):

#! /bin/bash
DATA_PATH=$1
CHECKPOINT_PATH=$2
TENSORBOARD_DIR=$3
TP_SIZE=$4
PP_SIZE=$5
NNODES=$6
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
export CUDA_DEVICE_MAX_CONNECTIONS=1
# Runs a small GPT model (12 layers, hidden size 512, 8 attention heads)
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
# Run for 100 iterations and save checkpoint at 50
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt.py \
--use-checkpoint-args \
--use-checkpoint-opt_param-scheduler \
--num-layers 12 \
--hidden-size 512 \
--num-attention-heads 8 \
--log-params-norm \
--log-num-zeros-in-grad \
--log-validation-ppl-to-tensorboard \
--log-timers-to-tensorboard \
--tensorboard-dir ${TENSORBOARD_DIR} \
--micro-batch-size 4 \
--global-batch-size 32 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 100 \
--timing-log-level 2 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file /workspace/data/gpt3_data/gpt2-vocab.json \
--merge-file /workspace/data/gpt3_data/gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--log-interval 1 \
--save-interval 50 \
--eval-interval 1000 \
--eval-iters 10 \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
--no-gradient-accumulation-fusion \
--fp16
echo 50 > $CHECKPOINT_PATH/latest_checkpointed_iteration.txt
# Resume from 50th iteration ckpt and continue to 100 iterations
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt.py \
--use-checkpoint-args \
--use-checkpoint-opt_param-scheduler \
--num-layers 12 \
--hidden-size 512 \
--num-attention-heads 8 \
--log-params-norm \
--log-num-zeros-in-grad \
--log-validation-ppl-to-tensorboard \
--log-timers-to-tensorboard \
--tensorboard-dir ${TENSORBOARD_DIR} \
--micro-batch-size 4 \
--global-batch-size 32 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters 100 \
--timing-log-level 2 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file /workspace/data/gpt3_data/gpt2-vocab.json \
--merge-file /workspace/data/gpt3_data/gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--log-interval 1 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
--no-gradient-accumulation-fusion \
--fp16
GPT3 pretrain test script (under tests/functional_tests/test_scripts/gpt3/):

#! /bin/bash
DATA_PATH=$1
CHECKPOINT_PATH=$2
TENSORBOARD_DIR=$3
TP_SIZE=$4
PP_SIZE=$5
NNODES=$6
MAX_STEPS=$7
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
export CUDA_DEVICE_MAX_CONNECTIONS=1
# Runs a small GPT model (12 layers, hidden size 512, 8 attention heads)
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt.py \
--num-layers 12 \
--hidden-size 512 \
--num-attention-heads 8 \
--log-params-norm \
--log-num-zeros-in-grad \
--log-validation-ppl-to-tensorboard \
--log-timers-to-tensorboard \
--tensorboard-dir ${TENSORBOARD_DIR} \
--micro-batch-size 4 \
--global-batch-size 32 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--train-iters $MAX_STEPS \
--timing-log-level 2 \
--lr-decay-iters 320000 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATA_PATH \
--vocab-file /workspace/data/gpt3_data/gpt2-vocab.json \
--merge-file /workspace/data/gpt3_data/gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--log-interval 1 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--tensor-model-parallel-size $TP_SIZE \
--pipeline-model-parallel-size $PP_SIZE \
--no-gradient-accumulation-fusion \
--fp16