"include/vscode:/vscode.git/clone" did not exist on "a89034c8021ea6e55bf55688eeb99065de9de928"
Unverified commit 4e37499b authored by Kirthi Shankar Sivamani, committed by GitHub

[PyTorch] Distributed testing (#398)



* Initial setup
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Fix testfile
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Fix commit
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Test script
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Fixes
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Fixes
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Add logs
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Add perf summary
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Reviews and improvements
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Generalize GPU count
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Add plots
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Better plot
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

* Get default file name with time
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>

---------
Signed-off-by: Kirthi Shankar Sivamani <ksivamani@nvidia.com>
parent 45a2ac41
# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
set -e
: ${TE_PATH:=/opt/transformerengine}
git clone https://github.com/NVIDIA/Megatron-LM.git
cd Megatron-LM
git checkout f24fac4ed0dcf0522056521a93445d9a82f501a9
pytest -v -s $TE_PATH/tests/pytorch/distributed/test_convergence.py
python $TE_PATH/tests/pytorch/distributed/print_logs.py
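
The driver above pins Megatron-LM to a fixed commit, runs the distributed convergence tests, and prints the collected logs. As a sketch, assuming the driver is saved as test.sh (its filename is not shown on this page), a local run would look like:

TE_PATH=/path/to/TransformerEngine bash test.sh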

tests/pytorch/distributed/print_logs.py

# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
import os
import re
import glob
import datetime

from prettytable import PrettyTable
from matplotlib import pyplot as plt

NUM_MOST_RECENT_RUNS = 100

te_path = os.getenv("TE_PATH", "/opt/transformerengine")
mlm_log_dir = os.path.join(te_path, "ci_logs")
te_ci_log_dir = "/data/transformer_engine_ci_logs"
te_ci_plot_dir = os.path.join(te_ci_log_dir, "plots")

# Raw strings so that regex escapes such as \d are not treated as
# (invalid) string escape sequences.
convergence_pattern = (
    r"validation loss at iteration \d* on validation set | lm loss"
    r" value: ([\d.]*)E\+(\d*) | lm loss PPL: ([\d.]*)E\+(\d*)"
)
perf_pattern = r"elapsed time per iteration \(ms\): ([\d.]*)"


def get_output_file():
    now = datetime.datetime.now()
    default_fname = f"unknown_pipeline_id_{now.month}_{now.day}_{now.year}_{now.hour}_{now.minute}"
    fname = f"{os.getenv('CI_PIPELINE_ID', default_fname)}.txt"
    return os.path.join(te_ci_log_dir, fname)


def get_run_metrics(filename):
    """Return the loss, perplexity, and step time for a given Megatron-LM logfile."""
    with open(filename, "r") as f:
        data = f.read()

    # Loss and PPL.
    convergence_matches = re.findall(convergence_pattern, data)
    loss = round(float(convergence_matches[1][0]) * (10 ** int(convergence_matches[1][1])), 2)
    ppl = round(float(convergence_matches[2][2]) * (10 ** int(convergence_matches[2][3])), 2)

    # Average time per step.
    step_times_str = re.findall(perf_pattern, data)
    step_times = [float(x) for x in step_times_str]
    avg_step_time = round(sum(step_times) / len(step_times), 2)

    return loss, ppl, avg_step_time


def print_run_logs():
    """Pretty-print metrics for all runs and dump them as raw logs for plotting."""
    tables = []
    raw_logs = []
    for model_config in os.listdir(mlm_log_dir):
        model_config_dir = os.path.join(mlm_log_dir, model_config)
        table = PrettyTable()
        table.title = model_config
        table.field_names = ["Config", "Loss", "Perplexity", "Avg time per step (ms)"]
        for exp in os.listdir(model_config_dir):
            filename = os.path.join(model_config_dir, exp)
            loss, ppl, time_per_step = get_run_metrics(filename)
            exp_name = exp[:-4]  # Strip the ".txt" extension.
            table.add_row([exp_name, loss, ppl, time_per_step])
            raw_logs.append(f"{model_config} {exp_name} {loss} {ppl} {time_per_step}\n")
        tables.append(table)

    with open(get_output_file(), "w") as f:
        for raw_log in raw_logs:
            f.write(raw_log)

    for table in tables:
        print(table)
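# Each raw log line written above has the form
#   "<model_config> <exp_name> <loss> <ppl> <time_per_step>"
# and is re-parsed by perf_and_loss_plots() below.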


def save_plot(title, legend, data, filename, ylabel):
    x = list(range(1, len(data[0]) + 1))
    plt.figure()
    for label, y in zip(legend, data):
        plt.plot(x, y, "-o", label=label)
    plt.title(title)
    plt.legend()
    plt.xlabel(f"Last {NUM_MOST_RECENT_RUNS} runs")
    plt.ylabel(ylabel)
    plt.savefig(os.path.join(te_ci_plot_dir, filename))


def perf_and_loss_plots():
    files = glob.glob(os.path.join(te_ci_log_dir, "*.txt"))
    files.sort(key=os.path.getctime)
    files = files[-NUM_MOST_RECENT_RUNS:]

    data = {}
    for filename in files:
        with open(filename) as file:
            for line in file:
                line = line.strip()
                model_config, exp_name, loss, _, time_per_step = line.split(" ")
                if model_config not in data:
                    data[model_config] = {}
                if exp_name not in data[model_config]:
                    data[model_config][exp_name] = {"loss": [], "perf": []}
                data[model_config][exp_name]["loss"].append(float(loss))
                data[model_config][exp_name]["perf"].append(float(time_per_step))

    for model_config, experiments in data.items():
        lm_loss_data = []
        lm_perf_data = []
        legend = []
        for exp_name, lm_data in experiments.items():
            legend.append(exp_name)
            lm_loss_data.append(lm_data["loss"])
            lm_perf_data.append(lm_data["perf"])
        save_plot(
            model_config + " loss",
            legend,
            lm_loss_data,
            model_config + "_loss.png",
            "LM-Loss",
        )
        save_plot(
            model_config + " perf",
            legend,
            lm_perf_data,
            model_config + "_perf.png",
            "Time per step (ms)",
        )


if __name__ == "__main__":
    print_run_logs()
    perf_and_loss_plots()

tests/pytorch/distributed/run_megatron_lm_gpt.sh

#!/bin/bash

# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

# This script allows flexibly running various sizes of
# GPT3 models with named hyperparameters.

# Parse KEY=VALUE keyword arguments and export each as an environment variable.
for ARGUMENT in "$@"; do
    KEY=$(echo "$ARGUMENT" | cut -f1 -d=)
    KEY_LENGTH=${#KEY}
    VALUE="${ARGUMENT:$KEY_LENGTH+1}"
    export "$KEY"="$VALUE"
done
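# For example (hypothetical values), invoking the script as
#   run_megatron_lm_gpt.sh TP_SIZE=2 NUM_LAYERS=24
# exports TP_SIZE=2 and NUM_LAYERS=24, so the defaults below do not apply to them.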
# Set defaults for all arguments.
: ${DP_SIZE:="1"}
: ${TP_SIZE:="1"}
: ${PP_SIZE:="1"}
: ${NUM_LAYERS:="12"}
: ${HIDDEN_SIZE:="768"}
: ${NHEADS:="12"}
: ${SEQLEN:="2048"}
: ${MAX_POSITION_EMBEDDINGS:="2048"}
: ${MBS:="8"}
: ${GBS:="32"}
: ${STEPS:="400"}
: ${LR:="6.0e-4"}
: ${MIN_LR:="6.0e-5"}
: ${SAVE_INTERVAL:="1000"}
: ${SPLIT:="98,2,0"}
: ${CLIP_GRAD:="1.0"}
: ${WEIGHT_DECAY:="0.1"}
: ${ADAM_BETA1:="0.9"}
: ${ADAM_BETA2:="0.95"}
: ${INIT_METHOD_STD:="0.023"}
: ${SP:="False"}
: ${DTYPE:="bf16"}
: ${WGRAD_FUSION:="True"}
: ${FP8:="False"}
: ${FP8_AMAX_HISTORY_LEN:="32"}
: ${TRANSFORMER_IMPL:="transformer_engine"}
: ${FILENAME:="log.txt"}
# Logging.
DIR=`pwd`
TENSORBOARD_DIR="${DIR}/tensorboard"
CHECKPOINT_DIR="${DIR}/checkpoints"
mkdir -p ${TENSORBOARD_DIR}
mkdir -p ${CHECKPOINT_DIR}
# Dataset.
. /data/gpt3/pile-cc1-cc2-shuf/gpt3_blend.sh
# Set GPT3 options.
options=" \
    --exit-duration-in-mins 230 \
    --tensor-model-parallel-size ${TP_SIZE} \
    --pipeline-model-parallel-size ${PP_SIZE} \
    --num-layers ${NUM_LAYERS} \
    --hidden-size ${HIDDEN_SIZE} \
    --num-attention-heads ${NHEADS} \
    --seq-length ${SEQLEN} \
    --max-position-embeddings ${MAX_POSITION_EMBEDDINGS} \
    --micro-batch-size ${MBS} \
    --global-batch-size ${GBS} \
    --train-iters ${STEPS} \
    --lr ${LR} \
    --min-lr ${MIN_LR} \
    --lr-decay-style cosine \
    --log-interval 1 \
    --eval-iters 50 \
    --eval-interval 2000 \
    --data-path ${DATA_BLEND} \
    --vocab-file /data/gpt3/pile-cc1-cc2-shuf/bpe/gpt2-vocab.json \
    --merge-file /data/gpt3/pile-cc1-cc2-shuf/bpe/gpt2-merges.txt \
    --save-interval ${SAVE_INTERVAL} \
    --save ${CHECKPOINT_DIR} \
    --load ${CHECKPOINT_DIR} \
    --split ${SPLIT} \
    --clip-grad ${CLIP_GRAD} \
    --weight-decay ${WEIGHT_DECAY} \
    --adam-beta1 ${ADAM_BETA1} \
    --adam-beta2 ${ADAM_BETA2} \
    --init-method-std ${INIT_METHOD_STD} \
    --log-params-norm \
    --log-num-zeros-in-grad \
    --no-query-key-layer-scaling \
    --DDP-impl local \
    --transformer-impl ${TRANSFORMER_IMPL} \
    --tensorboard-dir ${TENSORBOARD_DIR} \
    --fp8-margin 0 \
    --fp8-interval 1 \
    --fp8-amax-history-len ${FP8_AMAX_HISTORY_LEN} \
    --fp8-amax-compute-algo max"
if [[ "$SP" == "True" ]]; then
options+=" --sequence-parallel"
fi
if [[ "$WGRAD_FUSION" == "False" ]]; then
options+=" --no-gradient-accumulation-fusion"
fi
if [[ "$FP8" != "False" ]]; then
options+=" --fp8-${FP8}"
fi
if [[ "$DTYPE" != "fp32" ]]; then
options+=" --${DTYPE}"
fi

# Run GPT3.
NUM_GPUS=$((${DP_SIZE}*${TP_SIZE}*${PP_SIZE}))
NVTE_TORCH_COMPILE=0 \
NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 \
NVTE_FLASH_ATTN=1 \
NVTE_FWD_LAYERNORM_SM_MARGIN=0 \
NVTE_BWD_LAYERNORM_SM_MARGIN=0 \
CUDA_DEVICE_MAX_CONNECTIONS=1 \
NVTE_BIAS_GELU_NVFUSION=0 \
NVTE_BIAS_DROPOUT_FUSION=0 \
python -m torch.distributed.launch \
    --use_env \
    --nnodes=1 \
    --nproc_per_node=${NUM_GPUS} \
    ${DIR}/pretrain_gpt.py ${options} 2>&1 | tee $FILENAME
# Remove checkpoints.
rm -rf ${CHECKPOINT_DIR}/*
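
As a usage sketch (all values illustrative), an 8-GPU run of this script with FP8 and sequence parallelism enabled would be launched as:

bash run_megatron_lm_gpt.sh DP_SIZE=2 TP_SIZE=2 PP_SIZE=2 FP8=hybrid SP=True FILENAME=my_log.txt

Any argument that is omitted falls back to the defaults set at the top of the script.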

tests/pytorch/distributed/test_convergence.py

# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.
import os
import subprocess
from dataclasses import dataclass, asdict
from functools import lru_cache
from typing import List, Tuple, Union

import pytest
import torch


@dataclass()
class ModelConfigGPT:
    NUM_LAYERS: int = 12
    HIDDEN_SIZE: int = 768
    NHEADS: int = 12
    SEQLEN: int = 2048
    MAX_POSITION_EMBEDDINGS: int = 2048
    LR: float = 6.0e-4
    MIN_LR: float = 6.0e-5
    SPLIT: str = "98,2,0"
    CLIP_GRAD: float = 1.0
    WEIGHT_DECAY: float = 0.1
    ADAM_BETA1: float = 0.9
    ADAM_BETA2: float = 0.95
    INIT_METHOD_STD: float = 0.023


model_configs = {
    "126m": ModelConfigGPT(),
}

dtypes = ["bf16"]
fp8_recipes = [False, "hybrid"]
all_boolean = [True, False]

te_path = os.getenv("TE_PATH", "/opt/transformerengine")
mlm_log_dir = os.path.join(te_path, "ci_logs")


@lru_cache(maxsize=1)
def get_parallel_configs() -> List[Tuple[int, int, int]]:
    """Return all valid (dp, tp, pp) combinations for the available GPUs."""
    sizes = [1, 2, 4]
    num_devices = torch.cuda.device_count()
    parallel_configs = []
    if num_devices > 1:
        for dp in sizes:
            for tp in sizes:
                for pp in sizes:
                    if dp * tp * pp == num_devices:
                        parallel_configs.append((dp, tp, pp))
    return parallel_configs
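# On a node with 8 visible GPUs, for example, this yields
# (1, 2, 4), (1, 4, 2), (2, 1, 4), (2, 2, 2), (2, 4, 1), (4, 1, 2), (4, 2, 1).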


def get_filename(
    model: str, dp: int, tp: int, pp: int, sp: bool, use_te: bool, fp8_recipe: Union[bool, str]
) -> str:
    sp = tp if sp else 1  # Sequence-parallel size equals the TP size when SP is enabled.
    config = f"gpt3_{model}_dp{dp}_tp{tp}_pp{pp}_sp{sp}"
    config_dir = os.path.join(mlm_log_dir, config)
    os.makedirs(config_dir, exist_ok=True)
    fname = (
        f"{'te' if use_te else 'megatron'}"
        + (f"_fp8_{fp8_recipe}" if fp8_recipe else "")
        + ".txt"
    )
    return os.path.join(config_dir, fname)


def get_bash_arguments(filename: str, **kwargs) -> List[str]:
    args = []
    script_path = os.path.join(te_path, "tests/pytorch/distributed/run_megatron_lm_gpt.sh")
    args.append(script_path)
    for k, v in kwargs.items():
        args.append(f"{k}={str(v)}")
    args.append(f"FILENAME={filename}")
    return args
@pytest.mark.parametrize("sp", all_boolean)
@pytest.mark.parametrize("use_te", all_boolean)
@pytest.mark.parametrize("dtype", dtypes)
@pytest.mark.parametrize("fp8_recipe", fp8_recipes)
@pytest.mark.parametrize("dp, tp, pp", get_parallel_configs())
@pytest.mark.parametrize("model", model_configs.keys())
def test_distributed(dtype, fp8_recipe, dp, tp, pp, sp, use_te, model):
if sp and tp == 1:
pytest.skip("No tensor parallel.")
if fp8_recipe and not use_te:
pytest.skip("TransformerEngine needed for FP8.")
subprocess.run(
get_bash_arguments(
get_filename(model, dp, tp, pp, sp, use_te, fp8_recipe),
DTYPE=dtype,
FP8=fp8_recipe,
SP=sp,
DP_SIZE=dp,
TP_SIZE=tp,
PP_SIZE=pp,
TRANSFORMER_IMPL="transformer_engine" if use_te else "local",
**asdict(model_configs[model]),
),
check=True)
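
With the lists defined above (one model, one dtype, two fp8_recipe values, and two values each for sp and use_te), every (dp, tp, pp) combination reported by get_parallel_configs() expands into 8 test cases, some of which are skipped by the guards at the top of test_distributed.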