Commit 3aca1415 authored by liangjing's avatar liangjing
Browse files

Merge branch 'megatron-lm_dtk24.04' into 'main'

Megatron lm dtk24.04

See merge request !1
parents 0024a5c6 1005e9d3
Pipeline #1806 passed with stage
#!/bin/bash
# This example script is contributed by external user https://github.com/nrailgun
# Launches Megatron-LM GPT pretraining under an OpenMPI launcher,
# one trainer process per local rank (expects 4 local ranks).
set -ex
######################################
#####################################
# Runtime / communication tuning.
export CUDA_DEVICE_MAX_CONNECTIONS=1
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=5
# Rank information provided by the OpenMPI launcher environment.
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
export NCCL_IB_TIMEOUT=22
# Change the below configurations here
BASE_PATH=./tmp
DATASET_1="./dataset/my-gpt2_text_document"
DATASET="1 ${DATASET_1}"
CHECKPOINT_PATH=./tmp
TP=4
PP=1
HIDDEN_SIZE=4096
NUM_LAYERS=32
NUM_HEADS=32
SEQ_LENGTH=4096
VOCAB_PATH=./gpt2-vocab.json
MERGE_PATH=./gpt2-merges.txt
MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=60
TRAIN_STEPS=250000
LR=3e-4
MIN_LR=3e-5
LR_WARMUP_STEPS=2000
WEIGHT_DECAY=0.1
GRAD_CLIP=1
# Full training command; ${1} (first script argument) is the master address
# for the TCP rendezvous used by --dist_url.
APP="python3 -u pretrain_gpt.py \
--tensor-model-parallel-size $TP \
--pipeline-model-parallel-size $PP \
--num-layers $NUM_LAYERS \
--hidden-size $HIDDEN_SIZE \
--num-attention-heads $NUM_HEADS \
--micro-batch-size $MICRO_BATCH_SIZE \
--global-batch-size $GLOBAL_BATCH_SIZE \
--seq-length $SEQ_LENGTH \
--max-position-embeddings $SEQ_LENGTH \
--train-iters $TRAIN_STEPS \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--data-path $DATASET \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr $LR \
--lr-decay-style cosine \
--min-lr $MIN_LR \
--weight-decay $WEIGHT_DECAY \
--clip-grad $GRAD_CLIP \
--lr-warmup-iters $LR_WARMUP_STEPS \
--optimizer adam \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--log-interval 1 \
--vocab-file ${VOCAB_PATH} \
--merge-file ${MERGE_PATH} \
--tokenizer-type GPT2BPETokenizer \
--save-interval 1000 \
--eval-interval 1000 \
--eval-iters 1000 \
--fp16 \
--recompute-activations \
--disable-bias-linear \
--no-gradient-accumulation-fusion \
--rank ${RANK} \
--world_size ${WORLD_SIZE} \
--dist_url tcp://${1}:34566 \
--num-workers 2 \
"
# The original script had four identical case arms (one per local rank),
# each exporting the same device list and running the same command.
# Collapse them into a single [0-3] arm; local ranks outside 0-3 are
# ignored, exactly as before.
case ${lrank} in
[0-3])
export HIP_VISIBLE_DEVICES=0,1,2,3
${APP}
;;
esac
......@@ -8,6 +8,7 @@ from megatron import get_tokenizer
from megatron.model.classification import Classification
from tasks.eval_utils import accuracy_func_provider
from tasks.finetune_utils import finetune
from megatron.arguments import core_transformer_config_from_args
def glue_classification(num_classes, Dataset,
......@@ -28,10 +29,11 @@ def glue_classification(num_classes, Dataset,
def model_provider(pre_process=True, post_process=True):
    """Build the classification model.

    Args:
        pre_process: whether this pipeline stage runs the embedding/pre-processing.
        post_process: whether this pipeline stage runs the output/post-processing.

    Returns:
        A ``Classification`` model configured from the global Megatron args.
        ``num_classes`` comes from the enclosing ``glue_classification`` scope.
    """
    args = get_args()
    # BUG FIX: core_transformer_config_from_args requires the parsed args
    # object — every other call site in this change set passes get_args()'s
    # result; calling it with no argument raises TypeError.
    config = core_transformer_config_from_args(args)
    print_rank_0('building classification model for {} ...'.format(
        args.task))

    model = Classification(config=config, num_classes=num_classes, num_tokentypes=2,
                           pre_process=pre_process, post_process=post_process)

    return model
......
......@@ -7,7 +7,7 @@ Below we present the steps to run our multi-stage dialogue prompting (MSDP) fram
### Data Preparation
1. Dataset Download: [Wizard of Wikipedia](https://parl.ai/projects/wizard_of_wikipedia/) and [Wizard of Internet](https://parl.ai/projects/sea/)
2. Data Processing: We provide the script to run the [`data processing`](../../examples/msdp/data_processing.sh) of the datatsets.
2. Data Processing: We provide the script to run the [`data processing`](../../examples/msdp/data_processing.sh) of the datasets.
### Stage-1: Prompting for Knowledge Generation
1. We provide the script to perform the [`first-stage prompting`](../../examples/msdp/prompt_knwl_gen.sh) for the knowledge generation.
......
## End-to-End Training of Neural Retrievers for Open-Domain Question Answering
Below we present the steps to run unsupervised and supervised trainining and evaluation of the retriever for [open domain question answering](https://arxiv.org/abs/2101.00408).
Below we present the steps to run unsupervised and supervised training and evaluation of the retriever for [open domain question answering](https://arxiv.org/abs/2101.00408).
## Retriever Training
......
......@@ -9,6 +9,7 @@ from megatron.model.multiple_choice import MultipleChoice
from tasks.eval_utils import accuracy_func_provider
from tasks.finetune_utils import finetune
from tasks.race.data import RaceDataset
from megatron.arguments import core_transformer_config_from_args
def train_valid_datasets_provider():
......@@ -26,9 +27,10 @@ def train_valid_datasets_provider():
def model_provider(pre_process=True, post_process=True):
"""Build the model."""
config = core_transformer_config_from_args(get_args())
print_rank_0('building multichoice model for RACE ...')
model = MultipleChoice(num_tokentypes=2,
model = MultipleChoice(config=config,
num_tokentypes=2,
pre_process=pre_process,
post_process=post_process)
......
......@@ -14,7 +14,8 @@ from megatron.checkpointing import load_checkpoint
from megatron.model import GPTModel
from megatron.training import get_model
from megatron.utils import get_ltor_masks_and_position_ids, unwrap_model
from megatron.p2p_communication import recv_forward, send_forward
from megatron.core.pipeline_parallel.p2p_communication import recv_forward, send_forward
from megatron.arguments import core_transformer_config_from_args
from tasks.finetune_utils import build_data_loader
from .datasets import build_dataset
......@@ -31,6 +32,8 @@ def get_model_provider(eval_metric):
def model_provider(pre_process=True, post_process=True):
"""Build the model."""
config = core_transformer_config_from_args(get_args())
if eval_metric == 'loss':
parallel_output = True
elif eval_metric == 'accuracy':
......@@ -40,7 +43,7 @@ def get_model_provider(eval_metric):
'is not supported.'.format(eval_metric))
print_rank_0('building GPT model ...')
model = GPTModel(num_tokentypes=0, parallel_output=parallel_output,
model = GPTModel(config, num_tokentypes=0, parallel_output=parallel_output,
pre_process=pre_process, post_process=post_process)
return model
......@@ -69,7 +72,7 @@ def process_batch(batch):
return tokens, labels, attention_mask, position_ids, loss_mask
def forward_step(batch, model, eval_metric):
def forward_step(batch, model, eval_metric, config):
"""Forward step."""
# Get the batch.
......@@ -80,7 +83,8 @@ def forward_step(batch, model, eval_metric):
args = get_args()
args.micro_batch_size = len(labels)
input_tensor = recv_forward()
tensor_shape = (args.seq_length, args.micro_batch_size, args.hidden_size)
input_tensor = recv_forward(tensor_shape, config)
# Forward pass through the model.
unwrapped_model = unwrap_model(
......@@ -88,7 +92,7 @@ def forward_step(batch, model, eval_metric):
unwrapped_model.set_input_tensor(input_tensor)
output = model(tokens, position_ids, attention_mask)
send_forward(output)
send_forward(output, config)
if parallel_state.is_pipeline_last_stage():
# For loss, return the unreduced loss.
......@@ -115,7 +119,8 @@ def forward_step(batch, model, eval_metric):
def evaluate(data_loader, model, eval_metric):
"""Evaluation."""
args = get_args()
config = core_transformer_config_from_args(args)
# Turn on evaluation mode which disables dropout.
model.eval()
......@@ -126,7 +131,7 @@ def evaluate(data_loader, model, eval_metric):
if iteration % args.log_interval == 0:
print_rank_0('> working on iteration: {}'.format(iteration))
# Forward evaluation.
output = forward_step(batch, model, eval_metric)
output = forward_step(batch, model, eval_metric, config)
# Reduce across processes.
if parallel_state.is_pipeline_last_stage():
......
"""Check if a given slurm job id completed successfully
Usage:
python3 check_slurm_job_completion.py <JOB_ID>
"""
import sys
import subprocess
cmd = f"sacct -j {sys.argv[1]}"
result = subprocess.check_output(cmd, shell=True).decode().split()
assert len(result) > 14, "JOB state not available."
status = result[19]
exit_code = result[20]
assert status == "COMPLETED", f"Job {sys.argv[1]} not completed."
assert exit_code == "0:0", f"Job {sys.argv[1]} did not exit successfully."
import os
os.environ['OPENBLAS_NUM_THREADS'] = '1'
import sys
import json
import shutil
import glob
from tensorboard.backend.event_processing import event_accumulator
......
......@@ -57,12 +57,14 @@ class TestCIPipeline:
print(expected_list)
actual_list = read_tb_logs_as_list(LOGS_DIR, loss_type)
assert actual_list is not None, f"No TensorBoard events file was found in the logs for {loss_type}."
for i, step in enumerate(range(expected["start_step"], expected["end_step"], expected["step_interval"])):
actual_list_sliced = actual_list[expected["start_step"]:expected["end_step"]:expected["step_interval"]]
for i, (expected_val, actual_val) in enumerate(zip(expected_list, actual_list_sliced)):
step = i * expected["step_interval"]
print(f"Checking step {step} against expected {i}")
if test_type == TypeOfTest.APPROX:
assert actual_list[step] == pytest.approx(expected=expected_list[i], rel=self.margin_loss), f"{self.job_name} : The loss at step {step} should be approximately {expected_list[i]} but it is {actual_list[step]}."
assert actual_val == pytest.approx(expected=expected_val, rel=self.margin_loss), f"The loss at step {step} should be approximately {expected_val} but it is {actual_val}."
else:
assert actual_list[step] == expected_list[i], f"The value at step {step} should be {expected_list[i]} but it is {actual_list[step]}."
assert actual_val == expected_val, f"The value at step {step} should be {expected_val} but it is {actual_val}."
@pytest.mark.xfail
def test_lm_loss_deterministic(self):
......
import os
os.environ['OPENBLAS_NUM_THREADS'] = '1'
import sys
import json
import shutil
......
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.50444, 10.49325, 10.4863, 10.48386, 10.49892, 10.46644, 10.41921, 10.30106, 10.16285, 9.97939]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [17438.0, 18815.0, 22912.0, 18568.0, 19900.0, 23810.0, 22918.0]}, "iteration_timing_avg": 0.35970588235294115}
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.50685, 10.49816, 10.47982, 10.48566, 10.49535, 10.46661, 10.42394, 10.30692, 10.15978, 9.96955]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [19029.0, 19175.0, 22073.0, 18475.0, 20839.0, 23168.0, 22721.0]}, "iteration_timing_avg": 0.4121861764705882}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54369, 10.5383, 10.55953, 10.54011, 10.51908, 10.49118, 10.46612, 10.31901, 10.15649, 9.96702]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [21736.0, 20433.0, 27243.0, 23240.0, 22459.0, 20724.0, 23451.0]}, "iteration_timing_avg": 0.8657461764705884}
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.54837, 10.54636, 10.55694, 10.54151, 10.53088, 10.48503, 10.46274, 10.31498, 10.17119, 9.97324]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [22611.0, 20670.0, 26082.0, 23626.0, 21993.0, 21751.0, 23179.0]}, "iteration_timing_avg": 0.874113823529412}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.44729, 10.44093, 10.45375, 10.44445, 10.44305, 10.44595, 10.39163, 10.25898, 10.13498, 9.95692]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [27334.0, 20551.0, 28114.0, 24328.0, 24070.0, 20653.0, 21346.0]}, "iteration_timing_avg": 0.6318655882352939}
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.44877, 10.43852, 10.44018, 10.44114, 10.45622, 10.44144, 10.39044, 10.25684, 10.133, 9.95743]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [28181.0, 20629.0, 28241.0, 26287.0, 24057.0, 21016.0, 21238.0]}, "iteration_timing_avg": 0.7704600000000001}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.4978, 10.49775, 10.48021, 10.50638, 10.49624, 10.47018, 10.34494, 10.25536, 10.10244, 9.91938]}, "num-zeros": {"start_step": 0, "end_step": 35, "step_interval": 5, "values": [26168.0, 19042.0, 28718.0, 22408.0, 26377.0, 34320.0, 21873.0]}, "iteration_timing_avg": 1.1249785294117647}
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.48681, 10.48784, 10.4873, 10.50417, 10.49446, 10.47819, 10.41361, 10.28135, 10.14425, 9.94149]}, "num-zeros": {"start_step": 0, "end_step": 34, "step_interval": 5, "values": [26852.0, 19803.0, 25492.0, 24594.0, 21586.0, 19658.0, 20766.0]}, "iteration_timing_avg": 1.4250708823529417}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 37, "step_interval": 5, "values": [10.84266, 10.89696, 10.90542, 10.87498, 10.86279, 10.83628, 10.64437, 10.62386]}, "num-zeros": {"start_step": 0, "end_step": 20, "step_interval": 5, "values": [2093.0, 2474.0, 2327.0, 2213.0]}, "iteration_timing_avg": 0.080846}
{"lm loss": {"start_step": 0, "end_step": 44, "step_interval": 5, "values": [10.84008, 10.89053, 10.90905, 10.87934, 10.86562, 10.83752, 10.64582, 10.62397, 10.53554]}, "num-zeros": {"start_step": 0, "end_step": 27, "step_interval": 5, "values": [2078.0, 2320.0, 2519.0, 2248.0, 2127.0, 1987.0]}, "iteration_timing_avg": 0.09863333333333332}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 38, "step_interval": 5, "values": [10.83091, 10.87024, 10.89161, 10.81277, 10.6858, 10.61231, 10.09495, 10.21817]}, "num-zeros": {"start_step": 0, "end_step": 38, "step_interval": 5, "values": [1545.0, 1840.0, 1867.0, 1825.0, 1867.0, 1668.0, 1576.0, 1932.0]}, "iteration_timing_avg": 0.09399846153846156}
{"lm loss": {"start_step": 0, "end_step": 38, "step_interval": 5, "values": [10.84538, 10.87913, 10.90387, 10.8235, 10.67913, 10.60602, 10.06785, 10.19695]}, "num-zeros": {"start_step": 0, "end_step": 38, "step_interval": 5, "values": [1722.0, 2142.0, 2149.0, 1966.0, 2038.0, 1914.0, 1745.0, 1956.0]}, "iteration_timing_avg": 0.10455653846153849}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 49, "step_interval": 5, "values": [10.7947, 10.85294, 10.87058, 10.83388, 10.83025, 10.78755, 10.56419, 10.57339, 10.48735, 10.19553]}, "num-zeros": {"start_step": 0, "end_step": 33, "step_interval": 5, "values": [2452.0, 2744.0, 2176.0, 2722.0, 2636.0, 2535.0, 2996.0]}, "iteration_timing_avg": 0.1158709090909091}
{"lm loss": {"start_step": 0, "end_step": 47, "step_interval": 5, "values": [10.81244, 10.87437, 10.90132, 10.84946, 10.84826, 10.81872, 10.61811, 10.61053, 10.52823, 10.22961]}, "num-zeros": {"start_step": 0, "end_step": 30, "step_interval": 5, "values": [2356.0, 2601.0, 2778.0, 2282.0, 2350.0, 2782.0]}, "iteration_timing_avg": 0.12793593749999999}
\ No newline at end of file
{"lm loss": {"start_step": 0, "end_step": 27, "step_interval": 5, "values": [10.79471, 10.86601, 10.89073, 10.78482, 10.6587, 10.58125]}, "num-zeros": {"start_step": 0, "end_step": 27, "step_interval": 5, "values": [1609.0, 1850.0, 1921.0, 1942.0, 1853.0, 1674.0]}, "iteration_timing_avg": 0.12440000000000001}
{"lm loss": {"start_step": 0, "end_step": 27, "step_interval": 5, "values": [10.79474, 10.86607, 10.8908, 10.7851, 10.65905, 10.58193]}, "num-zeros": {"start_step": 0, "end_step": 27, "step_interval": 5, "values": [1587.0, 1824.0, 2006.0, 1919.0, 1874.0, 1646.0]}, "iteration_timing_avg": 0.12088222222222227}
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment