Commit 7c19b3a8 authored by wangsen
Browse files

Initial commit

parents
Pipeline #1721 failed with stages
in 0 seconds
#!/bin/bash
#
# Quantize a LLaMA2 text 7B Megatron checkpoint with ModelOpt PTQ, export a
# TensorRT-LLM checkpoint, then build and run the TensorRT-LLM engine.
#
# Usage: <this script> [CHECKPOINT_DIR] [QUANT_CFG]
#   CHECKPOINT_DIR  checkpoint directory (default: /checkpoints/llama2-text-7b_v0.2.0)
#   QUANT_CFG       quantization config name (default: int8_sq)
set -euo pipefail

DEFAULT_NAME="/checkpoints/llama2-text-7b_v0.2.0"
NAME="${1:-$DEFAULT_NAME}"

DEFAULT_QUANT_CFG="int8_sq"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"

# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="8"
INFERENCE_TP="${TP}"
DECODER_TYPE="llama"
CHECKPOINT_LOAD_DIR="${NAME}"
TOKENIZER_MODEL="${CHECKPOINT_LOAD_DIR}/hf/tokenizer.model"

# LLaMA2 text 7b has ffn_hidden_size 11008. int4_awq requires a block_size of 128;
# as a result the TP can at most be 2.
if [[ "${QUANT_CFG}" == "int4_awq" ]]; then
  INFERENCE_TP="2"
fi

# Arguments are kept in arrays (not strings) so that paths containing spaces
# or glob characters reach the Python scripts as single, intact arguments.
additional_options=(
  --export-quant-cfg "${QUANT_CFG}"
  --export-legacy-megatron
  --export-te-mcore-model
  --calib-batch-size 8
  --decoder "${DECODER_TYPE}"
  --export-dir /tmp/trtllm_ckpt
  --inference-tensor-parallel "${INFERENCE_TP}"
)

trtllm_options=(
  --tensorrt-llm-checkpoint-dir /tmp/trtllm_ckpt
  --engine-dir /tmp/trtllm_engine
  --tokenizer "${CHECKPOINT_LOAD_DIR}/hf"
  --max-input-len 2048
  --max-output-len 512
  --max-batch-size 8
)

# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
export CUDA_DEVICE_MAX_CONNECTIONS=1

options=(
  --disable-bias-linear
  --swiglu
  --no-rope-fusion
  --untie-embeddings-and-output-weights
  --use-rotary-position-embeddings
  --normalization RMSNorm
  --rotary-percent 1.0
  --no-position-embedding
  --no-masked-softmax-fusion
  --no-bias-gelu-fusion
  --no-bias-dropout-fusion
  --no-async-tensor-model-parallel-allreduce
  --tensor-model-parallel-size "${TP}"
  --pipeline-model-parallel-size 1
  --num-layers 32
  --hidden-size 4096
  --ffn-hidden-size 11008
  --num-attention-heads 32
  --seq-length 4096
  --max-position-embeddings 4096
  --micro-batch-size 1
  --make-vocab-size-divisible-by 1
  --tokenizer-type Llama2Tokenizer
  --tokenizer-model "${TOKENIZER_MODEL}"
  --save-interval 1000000
  --use-dist-ckpt
  --load "${CHECKPOINT_LOAD_DIR}"
  --fp16
)

# Precompile CUDA extensions
python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"

# Launch configuration: one process per tensor-parallel rank.
launch_config=("--nproc_per_node=${TP}")

# Launch multi-process with torchrun
torchrun "${launch_config[@]}" examples/inference/text_generation_ptq.py \
  "${options[@]}" "${additional_options[@]}"

# This script is using mpi4py which will fork multiple processes.
python examples/inference/trtllm_text_generation.py "${trtllm_options[@]}"
#!/bin/bash
#
# Quantize a Nemotron-3 8B Megatron checkpoint with ModelOpt PTQ, export a
# TensorRT-LLM checkpoint, then build and run the TensorRT-LLM engine.
#
# Usage: <this script> [CHECKPOINT_DIR] [QUANT_CFG]
#   CHECKPOINT_DIR  checkpoint directory (default: /checkpoints/nemotron3-8b_v0.3.0)
#   QUANT_CFG       quantization config name (default: fp8)
set -euo pipefail

DEFAULT_NAME="/checkpoints/nemotron3-8b_v0.3.0"
NAME="${1:-$DEFAULT_NAME}"

DEFAULT_QUANT_CFG="fp8"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"

# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="8"
INFERENCE_TP="${TP}"
DECODER_TYPE="gptnext"
CHECKPOINT_LOAD_DIR="${NAME}"
TOKENIZER_MODEL="${CHECKPOINT_LOAD_DIR}/tokenizer.model"

# int4_awq is exported with a single inference tensor-parallel rank.
if [[ "${QUANT_CFG}" == "int4_awq" ]]; then
  INFERENCE_TP="1"
fi

# Arguments are kept in arrays (not strings) so that paths containing spaces
# or glob characters reach the Python scripts as single, intact arguments.
additional_options=(
  --export-quant-cfg "${QUANT_CFG}"
  --export-legacy-megatron
  --export-te-mcore-model
  --calib-batch-size 8
  --decoder "${DECODER_TYPE}"
  --export-dir /tmp/trtllm_ckpt
  --inference-tensor-parallel "${INFERENCE_TP}"
)

trtllm_options=(
  --tensorrt-llm-checkpoint-dir /tmp/trtllm_ckpt
  --engine-dir /tmp/trtllm_engine
  --tokenizer "${TOKENIZER_MODEL}"
  --max-input-len 2048
  --max-output-len 512
  --max-batch-size 8
)

# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
export CUDA_DEVICE_MAX_CONNECTIONS=1

options=(
  --apply-layernorm-1p
  --untie-embeddings-and-output-weights
  --disable-bias-linear
  --no-rope-fusion
  --no-position-embedding
  --use-rotary-position-embeddings
  --rotary-percent 0.5
  --squared-relu
  --attention-dropout 0.0
  --hidden-dropout 0.0
  --tensor-model-parallel-size "${TP}"
  --pipeline-model-parallel-size 1
  --num-layers 32
  --hidden-size 4096
  --num-attention-heads 32
  --seq-length 4096
  --max-position-embeddings 4096
  --micro-batch-size 1
  --tokenizer-type GPTSentencePieceTokenizer
  --tokenizer-model "${TOKENIZER_MODEL}"
  --save-interval 1000000
  --load "${CHECKPOINT_LOAD_DIR}"
  --fp16
  --use-dist-ckpt
)

# Precompile CUDA extensions
python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"

# Launch configuration: one process per tensor-parallel rank.
launch_config=("--nproc_per_node=${TP}")

# Launch multi-process with torchrun
torchrun "${launch_config[@]}" examples/inference/text_generation_ptq.py \
  "${options[@]}" "${additional_options[@]}"

# This script is using mpi4py which will fork multiple processes.
python examples/inference/trtllm_text_generation.py "${trtllm_options[@]}"
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Sample Generate GPT."""
import functools
import os
import sys
from pathlib import Path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
import modelopt.torch.quantization as mtq
import torch
from datasets import load_dataset
from modelopt.torch.utils.distributed import set_data_parallel_group, set_tensor_parallel_group
from tqdm import tqdm
# [ModelOpt]: changing the default model provider to the ModelOpt version
from megatron.core import mpu
from megatron.inference.arguments import add_modelopt_args
from megatron.inference.checkpointing import load_modelopt_checkpoint
from megatron.inference.gpt.model_provider import model_provider
from megatron.inference.text_generation import generate_and_post_process
from megatron.training import get_args, get_model, initialize_megatron
from megatron.training.checkpointing import save_checkpoint
from megatron.training.utils import print_rank_0, unwrap_model
# Maps the value of --export-quant-cfg to the ModelOpt quantization config
# object that is handed to mtq.quantize().
QUANT_CFG_CHOICES = dict(
    int8=mtq.INT8_DEFAULT_CFG,
    int8_sq=mtq.INT8_SMOOTHQUANT_CFG,
    fp8=mtq.FP8_DEFAULT_CFG,
    int4_awq=mtq.INT4_AWQ_CFG,
    w4a8_awq=mtq.W4A8_AWQ_BETA_CFG,
    int4=mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG,
)
def add_trtllm_ckpt_export_args(parser):
    """Register the TensorRT-LLM checkpoint export options on ``parser``.

    Adds ``--export-dir``, ``--decoder`` and ``--inference-tensor-parallel``
    inside an argument group titled "trtllm". The parser is modified in
    place; nothing is returned.
    """
    trtllm_group = parser.add_argument_group(title="trtllm")

    trtllm_group.add_argument(
        "--export-dir",
        type=str,
        help="The output TensorRT-LLM checkpoint.",
    )
    trtllm_group.add_argument(
        "--decoder",
        type=str,
        choices=["gptnext", 'llama'],
        help="The decoder type of the model.",
    )
    trtllm_group.add_argument(
        "--inference-tensor-parallel",
        default=1,
        type=int,
        help="Tensor parallel for the inference time, can be different from the training config.",
    )
def add_text_generate_ptq_args(parser):
    """Register every option used by the ModelOpt PTQ text-generation example.

    Adds the calibration options (``--calib-dataset``, ``--calib-batch-size``,
    ``--calib-size``) and ``--prompts``, then delegates to
    ``add_modelopt_args`` and ``add_trtllm_ckpt_export_args``.

    Returns:
        The same ``parser`` object, after it has been extended.
    """
    default_prompts = (
        "Born in north-east France, Soyer trained as a|Born in California, Soyer trained as a"
    )

    ptq_group = parser.add_argument_group(title='ModelOpt text generation ptq')
    ptq_group.add_argument(
        "--calib-dataset",
        default="cnn_dailymail",
        type=str,
        help="Calibration datasets from HuggingFace datasets.",
    )
    ptq_group.add_argument(
        "--calib-batch-size",
        default=4,
        type=int,
        help="Batch size to use for ptq calibration.",
    )
    ptq_group.add_argument(
        "--calib-size",
        default=512,
        type=int,
        help="Samples to use for ptq calibration.",
    )

    # --prompts is registered on the parser itself rather than on the group.
    parser.add_argument(
        "--prompts",
        default=default_prompts,
        type=str,
        help="Input texts. Please use | to separate different batches.",
    )

    add_modelopt_args(parser)
    add_trtllm_ckpt_export_args(parser)
    return parser
def get_calib_dataloader(
    data="cnn_dailymail", batch_size=4, calib_size=512, max_sequence_length=512
):
    """Yield batches of calibration text from a HuggingFace dataset.

    Args:
        data: Dataset name; one of "pileval", "wikitext" or "cnn_dailymail".
        batch_size: Number of text samples per yielded batch.
        calib_size: Requested number of calibration samples. It is capped at
            the dataset length and raised to at least ``batch_size``.
        max_sequence_length: Each sample is truncated to this many characters.

    Yields:
        A list of ``batch_size`` strings per iteration.

    Raises:
        ValueError: If ``data`` is not a supported dataset name. Because this
            is a generator, the error surfaces on the first iteration.
    """
    if data == "pileval":
        dataset = load_dataset(
            "json", data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", split="train"
        )
        text_column = "text"
    elif data == "wikitext":
        dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")
        text_column = "text"
    elif data == "cnn_dailymail":
        dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train")
        text_column = "article"
    else:
        # Previously an unknown name fell through and failed later with an
        # UnboundLocalError on `dataset`; fail with a clear message instead.
        raise ValueError(
            f"Unsupported calibration dataset {data!r}; "
            "expected one of 'pileval', 'wikitext', 'cnn_dailymail'."
        )

    calib_size = max(min(len(dataset), calib_size), batch_size)
    for i in range(calib_size // batch_size):
        samples = dataset[i * batch_size : (i + 1) * batch_size][text_column]
        # Truncation is by characters, not tokens.
        yield [sample[:max_sequence_length] for sample in samples]
if __name__ == "__main__":
    # Script flow: initialize Megatron, load the checkpoint, optionally
    # quantize with ModelOpt (PTQ), generate from the sample prompts,
    # optionally save the checkpoint, and optionally export a TensorRT-LLM
    # checkpoint.
    initialize_megatron(
        extra_args_provider=add_text_generate_ptq_args,
        args_defaults={
            'tokenizer_type': 'GPT2BPETokenizer',
            'no_load_rng': True,
            'no_load_optim': True,
        },
    )

    args = get_args()
    if args.num_layers_per_virtual_pipeline_stage is not None:
        print_rank_0("Interleaved pipeline schedule is not yet supported for text generation.")
        # sys.exit() instead of the site-provided exit(), which is meant for
        # interactive use and is not guaranteed to exist.
        sys.exit()

    print_rank_0("WARNING: Forcing exit_on_missing_checkpoint to True for text generation.")
    args.exit_on_missing_checkpoint = True

    # Set up model and load checkpoint
    # [ModelOpt]: make sure that output logits are allgathered.
    text_generation_model_provider = functools.partial(model_provider, parallel_output=False)
    model = get_model(text_generation_model_provider, wrap_with_ddp=False)

    if args.load is not None:
        load_modelopt_checkpoint(model, strict=not args.untie_embeddings_and_output_weights)
        print_rank_0("Done loading checkpoint")

    # Removing virtual pipeline parallel and other wrapper
    assert len(model) == 1, "Above condition should have caught this"
    unwrapped_model = unwrap_model(model)

    all_prompts = args.prompts.split("|")

    def custom_prompt_forward_loop_func(model):
        """Generate 128 tokens for each user prompt and print the result."""
        for prompt in tqdm(all_prompts):
            if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
                prompts_plus_generations, _, _, _ = generate_and_post_process(
                    model,
                    prompts=[prompt],
                    tokens_to_generate=128,
                    return_output_log_probs=True,
                    temperature=1.0,
                )
                print_rank_0(prompts_plus_generations)
            else:
                # Other ranks call without prompts.
                generate_and_post_process(model)

    def hf_dataset_forward_loop_func(model):
        """Run the calibration dataset through the model (no new tokens)."""
        dataloader = get_calib_dataloader(
            args.calib_dataset, args.calib_batch_size, args.calib_size
        )
        for prompts in tqdm(dataloader, total=args.calib_size // args.calib_batch_size):
            if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
                generate_and_post_process(
                    model,
                    prompts=prompts,
                    tokens_to_generate=0,
                    return_output_log_probs=True,
                    temperature=1.0,
                )
            else:
                generate_and_post_process(model)

    # Calibrate on the HuggingFace dataset when one is configured, otherwise
    # fall back to the user prompts.
    ptq_forward_loop_func = custom_prompt_forward_loop_func
    if args.calib_dataset is not None:
        ptq_forward_loop_func = hf_dataset_forward_loop_func

    # Setting data parallel and tensor parallel group
    set_data_parallel_group(mpu.get_data_parallel_group())
    set_tensor_parallel_group(mpu.get_tensor_model_parallel_group())

    if args.export_quant_cfg in QUANT_CFG_CHOICES:
        mtq_config = QUANT_CFG_CHOICES[args.export_quant_cfg]
        # Leave the output layer unquantized unless the config says otherwise.
        if "*output_layer*" not in mtq_config["quant_cfg"]:
            mtq_config["quant_cfg"]["*output_layer*"] = {"enable": False}
        if "awq" in args.export_quant_cfg:
            weight_quantizer = mtq_config["quant_cfg"]["*weight_quantizer"]  # type: ignore
            if isinstance(weight_quantizer, list):
                weight_quantizer = weight_quantizer[0]
            # Force the block size stored under key -1 to 128 for AWQ configs.
            weight_quantizer["block_sizes"][-1] = 128
        print_rank_0("Quantizing the model...")
        mtq.quantize(unwrapped_model[0], mtq_config, ptq_forward_loop_func)

    # Sample generation with the user prompts; this runs whether or not the
    # model was quantized above.
    custom_prompt_forward_loop_func(model[0])

    if args.save is not None and args.export_quant_cfg in QUANT_CFG_CHOICES:
        save_checkpoint(1, unwrapped_model, None, None, 0)

    print_rank_0(f"Fake Quantized Model:\n {unwrapped_model[0]}")

    if args.export_dir:
        assert args.decoder in ["gptnext", "llama"], f"Decoder type {args.decoder} not supported."
        Path(args.export_dir).mkdir(parents=True, exist_ok=True)
        print_rank_0("Exporting TensorRT-LLM checkpoints.")

        from modelopt.torch.export import export_tensorrt_llm_checkpoint

        # In TRT LLM, squared relu activation does not support bf16. So we use fp16 by default.
        export_tensorrt_llm_checkpoint(
            unwrapped_model[0],
            args.decoder,
            torch.bfloat16 if args.bf16 else torch.float16,
            export_dir=args.export_dir,
            inference_tensor_parallel=args.inference_tensor_parallel,
            inference_pipeline_parallel=1,
            use_nfs_workspace=True,
        )

        print_rank_0(f"TensorRT-LLM checkpoints saved to {args.export_dir}")
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""An example script to run the tensorrt_llm engine."""
import argparse
from pathlib import Path
import numpy as np
import torch
from modelopt.deploy.llm import LLM, build_tensorrt_llm
from transformers import AutoTokenizer, T5Tokenizer
class CustomSentencePieceTokenizer(T5Tokenizer):
    """GPTSentencePiece tokenizer built as a thin layer over ``T5Tokenizer``.

    Note:
        Only ``encode`` and ``batch_decode`` (the calls used by the
        TensorRT-LLM engine) are adapted, and the change is kept minimal.
        Other functions have not been tested.
    """

    def __init__(self, model):
        """Load the SentencePiece model file ``model`` without extra ids."""
        super().__init__(model, extra_ids=0, bos_token="<s>", pad_token="<pad>")

    def encode(self, text, add_special_tokens: bool = True, **kwargs):
        """Return the token ids of ``text`` wrapped in a ``torch.Tensor``.

        ``add_special_tokens`` and ``kwargs`` are accepted but not used.
        """
        token_ids = self.sp_model.encode_as_ids(text)
        return torch.Tensor(token_ids)

    def batch_encode_plus(
        self, batch_text_or_text_pairs, add_special_tokens: bool = True, **kwargs
    ):
        """Encode a batch and return it under the ``'input_ids'`` key."""
        batch_ids = self.sp_model.encode_as_ids(batch_text_or_text_pairs)
        return {'input_ids': batch_ids}

    def batch_decode(self, sequences, skip_special_tokens: bool = False, **kwargs):
        """Decode a batch of id sequences; arrays and tensors become lists first."""
        if torch.is_tensor(sequences) or isinstance(sequences, np.ndarray):
            sequences = sequences.tolist()
        return self.sp_model.decode(sequences)

    def decode(self, token_ids, skip_special_tokens: bool = False, **kwargs):
        """Decode one id sequence by treating it as a batch of size one."""
        decoded_batch = self.sp_model.decode([token_ids])
        return decoded_batch[0]
def parse_arguments():
    """Parse the command line of the TensorRT-LLM example.

    Returns:
        The ``argparse.Namespace`` holding the tokenizer location, engine
        build limits, checkpoint and engine directories, input texts, beam
        width and profiler output path.
    """
    default_input_texts = (
        "Born in north-east France, Soyer trained as a|Born in California, Soyer trained as a"
    )

    cli = argparse.ArgumentParser()

    # Tokenizer and engine build limits.
    cli.add_argument("--tokenizer", default="", type=str)
    cli.add_argument("--max-input-len", default=4096, type=int)
    cli.add_argument("--max-output-len", default=512, type=int)
    cli.add_argument("--max-batch-size", default=8, type=int)

    # Checkpoint input and engine output locations.
    cli.add_argument("--tensorrt-llm-checkpoint-dir", default=None, type=str)
    cli.add_argument("--engine-dir", default="/tmp/trtllm_engine", type=str)

    cli.add_argument(
        "--input-texts",
        default=default_input_texts,
        type=str,
        help="Input texts. Please use | to separate different batches.",
    )
    cli.add_argument("--max-beam-width", default=1, type=int)
    cli.add_argument("--profiler-output", default="", type=str)

    return cli.parse_args()
def run(args):
    """Optionally build a TensorRT-LLM engine, then generate from the prompts.

    Args:
        args: Parsed arguments. Reads ``tokenizer``, ``input_texts``,
            ``tensorrt_llm_checkpoint_dir``, ``engine_dir``, ``max_input_len``,
            ``max_batch_size`` and ``max_beam_width``.

    Raises:
        ValueError: If ``args.tokenizer`` is empty or is neither an existing
            directory nor an existing file, or if no non-empty input text
            was supplied.
    """
    tokenizer_error = (
        "arg.tokenizer must be a dir to a hf tokenizer checkpoint for llama or a SentencePiece .model file for gptnext"
    )
    # Path("") is the current directory, so an unset --tokenizer would pass
    # the is_dir() check below; reject it explicitly.
    if not args.tokenizer:
        raise ValueError(tokenizer_error)

    tokenizer_path = Path(args.tokenizer)
    if tokenizer_path.is_dir():
        # For llama models, use local HF tokenizer which is a folder.
        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, trust_remote_code=True)
    elif tokenizer_path.is_file():
        # For nextllm and nemotron models, use local Megatron GPTSentencePiece tokenizer which is a model file.
        tokenizer = CustomSentencePieceTokenizer(args.tokenizer)
    else:
        raise ValueError(tokenizer_error)
    print(tokenizer, tokenizer.vocab_size)

    if not hasattr(args, "profiler_output"):
        args.profiler_output = ""

    input_texts = args.input_texts.split("|")
    # str.split never returns an empty list, so the previous
    # `assert input_texts` could not fail; check for real content instead.
    if not any(input_texts):
        raise ValueError("input_text not specified")
    print(input_texts)

    if args.tensorrt_llm_checkpoint_dir is not None:
        print("Building TensorRT-LLM engines.")
        build_tensorrt_llm(
            str(Path(args.tensorrt_llm_checkpoint_dir) / "config.json"),
            args.engine_dir,
            max_input_len=args.max_input_len,
            max_batch_size=args.max_batch_size,
            max_beam_width=args.max_beam_width,
            num_build_workers=1,
        )
        print(f"TensorRT-LLM engines saved to {args.engine_dir}")

    # Sample free GPU memory around engine load + generation so the printed
    # figure reflects the difference in free memory across that span.
    free_memory_before = torch.cuda.mem_get_info()

    # This is a ModelOpt wrapper on top of tensorrt_llm.hlapi.llm.LLM
    llm_engine = LLM(args.engine_dir, tokenizer)

    torch.cuda.cudart().cudaProfilerStart()
    # outputs = llm_engine.generate_text(input_texts, args.max_output_len, args.max_beam_width)
    outputs = llm_engine.generate(input_texts)
    torch.cuda.cudart().cudaProfilerStop()

    free_memory_after = torch.cuda.mem_get_info()
    print(
        f"Used GPU memory: {(free_memory_before[0] - free_memory_after[0]) / 1024 / 1024 / 1024} GB"
    )
    print(outputs)
if __name__ == "__main__":
    # Script entry point: parse the command line and hand it straight to run().
    run(parse_arguments())
#!/bin/bash
# This example will start serving the 345M model.
#
# Required environment variables:
#   CHECKPOINT  path to the checkpoint directory (e.g. /345m)
#   VOCAB_FILE  path to vocab.json (e.g. /gpt2-vocab.json)
#   MERGE_FILE  path to merges.txt (e.g. /gpt2-merges.txt)

DISTRIBUTED_ARGS=(
  --nproc_per_node 1
  --nnodes 1
  --node_rank 0
  --master_addr localhost
  --master_port 6000
)

# The former "<Path to ...>" placeholders were not valid shell; the paths
# are now taken from the environment and the script aborts if one is missing.
: "${CHECKPOINT:?set CHECKPOINT to the checkpoint directory (e.g. /345m)}"
: "${VOCAB_FILE:?set VOCAB_FILE to the path of vocab.json (e.g. /gpt2-vocab.json)}"
: "${MERGE_FILE:?set MERGE_FILE to the path of merges.txt (e.g. /gpt2-merges.txt)}"

export CUDA_DEVICE_MAX_CONNECTIONS=1

pip install flask-restful

torchrun "${DISTRIBUTED_ARGS[@]}" tools/run_text_generation_server.py \
  --tensor-model-parallel-size 1 \
  --pipeline-model-parallel-size 1 \
  --num-layers 24 \
  --hidden-size 1024 \
  --load "${CHECKPOINT}" \
  --num-attention-heads 16 \
  --max-position-embeddings 1024 \
  --tokenizer-type GPT2BPETokenizer \
  --fp16 \
  --micro-batch-size 1 \
  --seq-length 1024 \
  --vocab-file "${VOCAB_FILE}" \
  --merge-file "${MERGE_FILE}" \
  --seed 42
#!/bin/bash
# This example will start serving the 345M model that is partitioned 8 way tensor parallel
#
# Required environment variables:
#   CHECKPOINT  path to the checkpoint directory (e.g. /345m)
#   VOCAB_FILE  path to vocab.json (e.g. /gpt2-vocab.json)
#   MERGE_FILE  path to merges.txt (e.g. /gpt2-merges.txt)

DISTRIBUTED_ARGS=(
  --nproc_per_node 8
  --nnodes 1
  --node_rank 0
  --master_addr localhost
  --master_port 6000
)

# The former "<Path to ...>" placeholders were not valid shell; the paths
# are now taken from the environment and the script aborts if one is missing.
: "${CHECKPOINT:?set CHECKPOINT to the checkpoint directory (e.g. /345m)}"
: "${VOCAB_FILE:?set VOCAB_FILE to the path of vocab.json (e.g. /gpt2-vocab.json)}"
: "${MERGE_FILE:?set MERGE_FILE to the path of merges.txt (e.g. /gpt2-merges.txt)}"

pip install flask-restful

python -m torch.distributed.launch "${DISTRIBUTED_ARGS[@]}" tools/run_text_generation_server.py \
  --tensor-model-parallel-size 8 \
  --pipeline-model-parallel-size 1 \
  --num-layers 24 \
  --hidden-size 1024 \
  --load "${CHECKPOINT}" \
  --num-attention-heads 16 \
  --max-position-embeddings 1024 \
  --tokenizer-type GPT2BPETokenizer \
  --fp16 \
  --micro-batch-size 1 \
  --seq-length 1024 \
  --vocab-file "${VOCAB_FILE}" \
  --merge-file "${MERGE_FILE}" \
  --seed 42
#!/bin/bash
# Pretrains a LLaMA-style GPT model with torchrun (single node by default).
# NOTE(review): the original header said "7B", but the sizes below (40 layers,
# hidden size 5120, FFN hidden size 13824) look like the 13B shape -- confirm.

export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=5
#export HIP_ALLOC_INITIALIZE=0
#export GPU_MAX_HW_QUEUES=20
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=20

export CUDA_DEVICE_MAX_CONNECTIONS=1

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$((GPUS_PER_NODE * NUM_NODES))

CHECKPOINT_PATH=./tmp #$1 #<Specify path>
TENSORBOARD_LOGS_PATH=./tmp #$2 #<Specify path>
#VOCAB_FILE=$3 #<Specify path to file>/gpt2-vocab.json
#MERGE_FILE=$4 #<Specify path to file>/gpt2-merges.txt
DATA_PATH="/root/megatron-llama/dataset/my-llama_text_document" #<Specify path and file prefix>_text_document
TOKENIZER_PATH="/root/megatron-llama/tokenizer.model"

# NODE_RANK was defined but never handed to torchrun, so a multinode setup
# would have started every node with the default rank; pass it explicitly.
DISTRIBUTED_ARGS=(
  --nproc_per_node "${GPUS_PER_NODE}"
  --nnodes "${NUM_NODES}"
  --node_rank "${NODE_RANK}"
  --master_addr "${MASTER_ADDR}"
  --master_port "${MASTER_PORT}"
)

GPT_MODEL_ARGS=(
  --num-layers 40
  --hidden-size 5120
  --num-attention-heads 40
  --ffn-hidden-size 13824
  --seq-length 4096
  --max-position-embeddings 4096
)

TRAINING_ARGS=(
  --transformer-impl local
  --use-legacy-models
  --micro-batch-size 1
  --global-batch-size 60
  --train-iters 5
  --weight-decay 0.1
  --adam-beta1 0.9
  --adam-beta2 0.95
  --init-method-std 0.006
  --clip-grad 1.0
  --bf16
  --use-distributed-optimizer
  --use-flash-attn-triton
  --recompute-activations
  --disable-bias-linear
  --attention-dropout 0
  --hidden-dropout 0
  --ddp-average-in-collective
  --overlap-grad-reduce
  --no-gradient-accumulation-fusion
  --swiglu
  --sequence-parallel
  --lr 3.0e-5
  --lr-decay-style cosine
  --min-lr 3.0e-6
  --lr-warmup-iters 1
)
#--use-flash-attn

MODEL_PARALLEL_ARGS=(
  --tensor-model-parallel-size 2
  --pipeline-model-parallel-size 4
)

DATA_ARGS=(
  --data-path "${DATA_PATH}"
  --split 949,50,1
  --untie-embeddings-and-output-weights
  --use-rotary-position-embeddings
  --normalization RMSNorm
  --no-position-embedding
  --tokenizer-model "${TOKENIZER_PATH}"
  --tokenizer-type Llama2Tokenizer
)

EVAL_AND_LOGGING_ARGS=(
  --log-interval 1
  --save-interval 10000
  --eval-interval 1000
  --save "${CHECKPOINT_PATH}"
  --load "${CHECKPOINT_PATH}"
  --eval-iters 10
  --tensorboard-dir "${TENSORBOARD_LOGS_PATH}"
)

# Quoted "${arr[@]}" keeps each array element a single argument.
torchrun "${DISTRIBUTED_ARGS[@]}" pretrain_gpt.py \
  "${GPT_MODEL_ARGS[@]}" \
  "${TRAINING_ARGS[@]}" \
  "${MODEL_PARALLEL_ARGS[@]}" \
  "${DATA_ARGS[@]}" \
  "${EVAL_AND_LOGGING_ARGS[@]}"
#!/bin/bash
# Per-rank launcher for pretraining a LLaMA-style GPT model under Open MPI.
# Each process reads its rank information from the OMPI_COMM_WORLD_* variables.
#
# Usage: <this script> MASTER_ADDR
#   MASTER_ADDR  host used to build the --dist_url (tcp://MASTER_ADDR:34566)
#
# NOTE(review): the original header said "7B", but the sizes below (40 layers,
# hidden size 5120, FFN hidden size 13824) look like the 13B shape -- confirm.

export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=5
export HIP_ALLOC_INITIALIZE=0
export GPU_MAX_HW_QUEUES=20
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=20

export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_1,mlx5_2
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=0

lrank=${OMPI_COMM_WORLD_LOCAL_RANK:-}
RANK=${OMPI_COMM_WORLD_RANK:-}
WORLD_SIZE=${OMPI_COMM_WORLD_SIZE:-}
export NCCL_IB_TIMEOUT=22

CHECKPOINT_PATH=./tmp #$1 #<Specify path>
TENSORBOARD_LOGS_PATH=./tmp #$2 #<Specify path>
DATA_PATH="/root/megatron-llama/dataset/my-llama_text_document" #<Specify path and file prefix>_text_document
TOKENIZER_PATH="/root/megatron-llama/tokenizer.model"

# The previous DISTRIBUTED_ARGS array was removed: it was never used and was
# built from variables this script does not define. Rendezvous information is
# passed through --rank/--world_size/--dist_url below.

GPT_MODEL_ARGS=(
  --num-layers 40
  --hidden-size 5120
  --num-attention-heads 40
  --ffn-hidden-size 13824
  --seq-length 4096
  --max-position-embeddings 4096
)

TRAINING_ARGS=(
  --transformer-impl local
  --use-legacy-models
  --micro-batch-size 1
  --global-batch-size 60
  --train-iters 5
  --weight-decay 0.1
  --adam-beta1 0.9
  --adam-beta2 0.95
  --init-method-std 0.006
  --clip-grad 1.0
  --bf16
  --use-distributed-optimizer
  --use-flash-attn-v2
  --recompute-activations
  --disable-bias-linear
  --attention-dropout 0
  --hidden-dropout 0
  --ddp-average-in-collective
  --overlap-grad-reduce
  --no-gradient-accumulation-fusion
  --swiglu
  --lr 3.0e-5
  --lr-decay-style cosine
  --min-lr 3.0e-6
  --lr-warmup-iters 1
)
#--use-flash-attn

MODEL_PARALLEL_ARGS=(
  --tensor-model-parallel-size 2
  --pipeline-model-parallel-size 4
)

DATA_ARGS=(
  --data-path "${DATA_PATH}"
  --split 949,50,1
  --untie-embeddings-and-output-weights
  --use-rotary-position-embeddings
  --normalization RMSNorm
  --no-position-embedding
  --tokenizer-model "${TOKENIZER_PATH}"
  --tokenizer-type Llama2Tokenizer
)

EVAL_AND_LOGGING_ARGS=(
  --log-interval 1
  --save-interval 10000
  --eval-interval 1000
  --save "${CHECKPOINT_PATH}"
  --load "${CHECKPOINT_PATH}"
  --eval-iters 10
  --tensorboard-dir "${TENSORBOARD_LOGS_PATH}"
)

# The command is an array. The old string form used `$GPT_MODEL_ARGS[@]`,
# which expands to the first element followed by a literal "[@]", so most of
# the arguments above never reached pretrain_gpt.py.
APP=(
  python3 -u pretrain_gpt.py
  "${GPT_MODEL_ARGS[@]}"
  "${TRAINING_ARGS[@]}"
  "${MODEL_PARALLEL_ARGS[@]}"
  "${DATA_ARGS[@]}"
  "${EVAL_AND_LOGGING_ARGS[@]}"
  --rank "${RANK}"
  --world_size "${WORLD_SIZE}"
  --dist_url "tcp://${1:-}:34566"
)

# Local ranks 0-7 all expose the same eight devices and run the same command.
case "${lrank}" in
  [0-7])
    export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
    "${APP[@]}"
    ;;
  *)
    # As before, any other local rank launches nothing; now it says so.
    printf 'warning: local rank "%s" is not in 0-7; nothing launched\n' "${lrank}" >&2
    ;;
esac
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
checkpoints/
data-cache/
tensorboard/
triton-cache/
# Base image: NVIDIA PyTorch container from nvcr.io, tag 23.12-py3.
FROM nvcr.io/nvidia/pytorch:23.12-py3
# Remove any existing causal-conv1d and triton, then install pinned versions
# of both together with pinned sentencepiece and (unpinned) flask-restful.
RUN pip uninstall -y causal-conv1d triton && \
pip install causal-conv1d==1.2.2.post1 sentencepiece==0.1.99 triton==2.1.0 flask-restful
# Subsequent instructions run from /tmp.
WORKDIR /tmp
# Clone mamba, install the v2.0.3 tag from source, then delete the checkout
# in the same layer.
RUN git clone https://github.com/state-spaces/mamba.git && \
cd mamba && \
git checkout v2.0.3 && \
python setup.py install && \
cd .. && \
rm -rf mamba
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
# Base image: NVIDIA PyTorch container from nvcr.io, tag 24.02-py3.
FROM nvcr.io/nvidia/pytorch:24.02-py3
# Upgrade the installed system packages, then install build tools, Python
# development headers, git, vim and a default Java runtime.
RUN apt update && \
apt -y upgrade && \
apt install -y --no-install-recommends \
software-properties-common \
build-essential \
python3-pip \
python3-dev \
bash \
git \
vim \
python-is-python3 \
default-jre
# Python dependencies, one RUN instruction per group.
RUN pip install --upgrade pip
RUN pip install einops einops-exts sentencepiece braceexpand webdataset
RUN pip install transformers datasets
RUN pip install pytest-cov pytest_mock nltk wrapt
RUN pip install zarr "tensorstore==0.1.45"
# Installed from the main branch of the repository (not pinned to a release).
RUN pip install git+https://github.com/fanshiqing/grouped_gemm@main
RUN pip install black==19.10b0 isort click==8.0.2
RUN pip install pycocoevalcap megatron-energon
RUN pip install git+https://github.com/openai/CLIP.git
# Use --no-deps for the following to avoid outdated and unnecessary dependencies.
RUN pip install mmf --no-deps
RUN pip install open-flamingo[eval] --no-deps
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment