Commit d520d24f authored by silencealiang's avatar silencealiang
Browse files

Merge branch 'main' into 'main'

megatron升级v0.10

See merge request !3
parents 3aca1415 481609bb
Pipeline #2055 failed with stages
in 0 seconds
#!/bin/bash
set -e
DEFAULT_NAME="/checkpoints/llama2-text-7b_v0.2.0"
NAME="${1:-$DEFAULT_NAME}"
DEFAULT_QUANT_CFG="int8_sq"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"
# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="8"
INFERENCE_TP=${TP}
DECODER_TYPE="llama"
CHECKPOINT_LOAD_DIR="${NAME}"
TOKENIZER_MODEL="${CHECKPOINT_LOAD_DIR}/hf/tokenizer.model"
# LLaMA2 text 7b has ffn_hidden_size 11008. int4_awq requires a block_size of 128 as a result the TP can at most be 2
if [ "$QUANT_CFG" = "int4_awq" ]; then
INFERENCE_TP="2"
fi
additional_options=" \
--export-quant-cfg ${QUANT_CFG} \
--export-legacy-megatron \
--export-te-mcore-model \
--calib-batch-size 8 \
--decoder ${DECODER_TYPE} \
--export-dir /tmp/trtllm_ckpt \
--inference-tensor-parallel ${INFERENCE_TP} "
trtllm_options=" \
--tensorrt-llm-checkpoint-dir /tmp/trtllm_ckpt \
--engine-dir /tmp/trtllm_engine \
--tokenizer ${CHECKPOINT_LOAD_DIR}/hf \
--max-input-len 2048 \
--max-output-len 512 \
--max-batch-size 8 "
# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
export CUDA_DEVICE_MAX_CONNECTIONS=1
options=" \
--disable-bias-linear \
--swiglu \
--no-rope-fusion \
--untie-embeddings-and-output-weights \
--use-rotary-position-embeddings \
--normalization RMSNorm \
--rotary-percent 1.0 \
--no-position-embedding \
--no-masked-softmax-fusion \
--no-bias-gelu-fusion \
--no-bias-dropout-fusion \
--no-async-tensor-model-parallel-allreduce \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 11008 \
--num-attention-heads 32 \
--seq-length 4096 \
--max-position-embeddings 4096 \
--micro-batch-size 1 \
--make-vocab-size-divisible-by 1 \
--tokenizer-type Llama2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--save-interval 1000000 \
--use-dist-ckpt \
--load ${CHECKPOINT_LOAD_DIR} \
--fp16"
# Precompile CUDA extentions
python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"
# Acquire launch configuration where variable launch_config will be set
launch_config="--nproc_per_node=${TP}"
# Launch multi-process with torchrun
torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options}
#!/bin/bash
set -e
DEFAULT_NAME="/checkpoints/llama-3_1-8b-nemo_v1.0"
NAME="${1:-$DEFAULT_NAME}"
DEFAULT_QUANT_CFG="int8_sq"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"
# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="1"
INFERENCE_TP=${TP}
DECODER_TYPE="llama"
CHECKPOINT_LOAD_DIR="${NAME}"
# LLaMA2 text 7b has ffn_hidden_size 11008. int4_awq requires a block_size of 128 as a result the TP can at most be 2
if [ "$QUANT_CFG" = "int4_awq" ]; then
INFERENCE_TP="2"
fi
additional_options=" \
--export-quant-cfg ${QUANT_CFG} \
--export-legacy-megatron \
--export-te-mcore-model \
--calib-batch-size 8 \
--decoder ${DECODER_TYPE} \
--export-dir /tmp/trtllm_ckpt \
--inference-tensor-parallel ${INFERENCE_TP} "
# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
export CUDA_DEVICE_MAX_CONNECTIONS=1
options=" \
--disable-bias-linear \
--attention-backend unfused \
--swiglu \
--no-rope-fusion \
--untie-embeddings-and-output-weights \
--use-rotary-position-embeddings \
--normalization RMSNorm \
--rotary-percent 1.0 \
--hidden-dropout 0.0 \
--attention-dropout 0.0 \
--no-bias-gelu-fusion \
--no-bias-dropout-fusion \
--no-async-tensor-model-parallel-allreduce \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--group-query-attention \
--num-query-groups 8 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--seq-length 131072 \
--max-position-embeddings 131072 \
--micro-batch-size 4 \
--make-vocab-size-divisible-by 128 \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model meta-llama/Meta-Llama-3.1-8B \
--save-interval 1000000 \
--use-rope-scaling \
--use-dist-ckpt \
--load ${CHECKPOINT_LOAD_DIR} \
--rotary-base 500000 \
--fp16"
# Precompile CUDA extentions
python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"
# Acquire launch configuration where variable launch_config will be set
launch_config="--nproc_per_node=${TP}"
# Launch multi-process with torchrun
torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options}
#!/bin/bash
set -e
DEFAULT_NAME="/checkpoints/llama-3_1-8b-nemo_v1.0"
NAME="${1:-$DEFAULT_NAME}"
DEFAULT_QUANT_CFG="int8_sq"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"
# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="1"
INFERENCE_TP=${TP}
DECODER_TYPE="llama"
CHECKPOINT_LOAD_DIR="${NAME}"
# LLaMA2 text 7b has ffn_hidden_size 11008. int4_awq requires a block_size of 128 as a result the TP can at most be 2
if [ "$QUANT_CFG" = "int4_awq" ]; then
INFERENCE_TP="2"
fi
additional_options=" \
--export-quant-cfg ${QUANT_CFG} \
--export-legacy-megatron \
--export-te-mcore-model \
--calib-batch-size 8 \
--decoder ${DECODER_TYPE} \
--export-dir /tmp/trtllm_ckpt \
--inference-tensor-parallel ${INFERENCE_TP} "
# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
export CUDA_DEVICE_MAX_CONNECTIONS=1
options=" \
--disable-bias-linear \
--attention-backend unfused \
--swiglu \
--no-rope-fusion \
--untie-embeddings-and-output-weights \
--use-rotary-position-embeddings \
--normalization RMSNorm \
--rotary-percent 1.0 \
--hidden-dropout 0.0 \
--attention-dropout 0.0 \
--no-bias-gelu-fusion \
--no-bias-dropout-fusion \
--no-async-tensor-model-parallel-allreduce \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--group-query-attention \
--num-query-groups 8 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--seq-length 8192 \
--max-position-embeddings 8192 \
--micro-batch-size 4 \
--make-vocab-size-divisible-by 128 \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model meta-llama/Meta-Llama-3-8B \
--save-interval 1000000 \
--use-dist-ckpt \
--load ${CHECKPOINT_LOAD_DIR} \
--rotary-base 500000 \
--fp16"
# Precompile CUDA extentions
python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"
# Acquire launch configuration where variable launch_config will be set
launch_config="--nproc_per_node=${TP}"
# Launch multi-process with torchrun
torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options}
#!/bin/bash
set -e
DEFAULT_NAME="/checkpoints/nemotron3-8b_v0.3.0"
NAME="${1:-$DEFAULT_NAME}"
DEFAULT_QUANT_CFG="fp8"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"
# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="8"
INFERENCE_TP=${TP}
DECODER_TYPE="gptnext"
CHECKPOINT_LOAD_DIR="${NAME}/nemo"
if [ "$QUANT_CFG" = "int4_awq" ]; then
INFERENCE_TP="1"
fi
additional_options=" \
--export-quant-cfg ${QUANT_CFG} \
--export-legacy-megatron \
--export-te-mcore-model \
--calib-batch-size 8 \
--decoder ${DECODER_TYPE} \
--export-dir /tmp/trtllm_ckpt \
--inference-tensor-parallel ${INFERENCE_TP} "
# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
export CUDA_DEVICE_MAX_CONNECTIONS=1
options=" \
--apply-layernorm-1p \
--attn-attention unfused \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--no-rope-fusion \
--no-position-embedding \
--use-rotary-position-embeddings \
--rotary-percent 0.5 \
--squared-relu \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 16384 \
--group-query-attention \
--num-attention-heads 48 \
--kv-channels 128 \
--seq-length 4096 \
--num-query-groups 8 \
--max-position-embeddings 4096 \
--micro-batch-size 4 \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model nvidia/Minitron-8B-Base \
--save-interval 1000000 \
--load ${CHECKPOINT_LOAD_DIR} \
--bf16 \
--use-dist-ckpt"
# Precompile CUDA extentions
python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"
# Acquire launch configuration where variable launch_config will be set
launch_config="--nproc_per_node=${TP}"
# Launch multi-process with torchrun
torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options}
#!/bin/bash
set -e
DEFAULT_NAME="/checkpoints/Mistral-NeMo-12B-Base"
NAME="${1:-$DEFAULT_NAME}"
DEFAULT_QUANT_CFG="fp8"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"
# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="8"
INFERENCE_TP=${TP}
DECODER_TYPE="llama"
CHECKPOINT_LOAD_DIR="${NAME}"
if [ "$QUANT_CFG" = "int4_awq" ]; then
INFERENCE_TP="1"
fi
additional_options=" \
--export-quant-cfg ${QUANT_CFG} \
--export-legacy-megatron \
--export-te-mcore-model \
--calib-batch-size 8 \
--decoder ${DECODER_TYPE} \
--export-dir /tmp/trtllm_ckpt \
--inference-tensor-parallel ${INFERENCE_TP} "
# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
export CUDA_DEVICE_MAX_CONNECTIONS=1
options=" \
--untie-embeddings-and-output-weights \
--attention-backend unfused \
--disable-bias-linear \
--use-rotary-position-embeddings \
--rotary-percent 1.0 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size 1 \
--num-layers 40 \
--hidden-size 5120 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--seq-length 8192 \
--kv-channels 128 \
--normalization RMSNorm \
--swiglu \
--num-query-groups 8 \
--group-query-attention \
--position-embedding-type rope \
--max-position-embeddings 8192 \
--micro-batch-size 1 \
--tokenizer-type HuggingFaceTokenizer \
--tiktoken-pattern v2 \
--tokenizer-model mistralai/Mistral-Nemo-Base-2407 \
--save-interval 1000000 \
--load ${CHECKPOINT_LOAD_DIR} \
--fp16 \
--rotary-base 1000000 \
--use-dist-ckpt"
# Precompile CUDA extentions
python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"
# Acquire launch configuration where variable launch_config will be set
launch_config="--nproc_per_node=${TP}"
# Launch multi-process with torchrun
torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options}
#!/bin/bash
set -e
DEFAULT_NAME="/checkpoints/Mistral-NeMo-12B-Base"
NAME="${1:-$DEFAULT_NAME}"
DEFAULT_QUANT_CFG="fp8"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"
# NOTE: UNFUSED ATTENTION MUST BE USED TO AVOID ADDITIONAL STATE_DICT KEY MISMATCH.
export NVTE_FLASH_ATTN=0
export NVTE_FUSED_ATTN=0
export NVTE_UNFUSED_ATTN=1
# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="8"
INFERENCE_TP=${TP}
DECODER_TYPE="llama"
CHECKPOINT_LOAD_DIR="${NAME}"
if [ "$QUANT_CFG" = "int4_awq" ]; then
INFERENCE_TP="1"
fi
additional_options=" \
--export-quant-cfg ${QUANT_CFG} \
--export-legacy-megatron \
--export-te-mcore-model \
--calib-batch-size 8 \
--decoder ${DECODER_TYPE} \
--export-dir /tmp/trtllm_ckpt \
--inference-tensor-parallel ${INFERENCE_TP} "
# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
export CUDA_DEVICE_MAX_CONNECTIONS=1
options=" \
--untie-embeddings-and-output-weights \
--no-masked-softmax-fusion \
--no-position-embedding \
--use-mcore-models \
--disable-bias-linear \
--rotary-percent 1.0 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--seq-length 4096 \
--kv-channels 128 \
--normalization RMSNorm \
--swiglu \
--num-query-groups 8 \
--num-experts 8 \
--moe-router-topk 2 \
--moe-aux-loss-coeff 1e-2 \
--moe-router-load-balancing-type aux_loss \
--group-query-attention \
--position-embedding-type rope \
--no-rope-fusion \
--max-position-embeddings 32768 \
--micro-batch-size 1 \
--tokenizer-type HuggingFaceTokenizer \
--tiktoken-pattern v2 \
--tokenizer-model mistralai/Mixtral-8x7B-Instruct-v0.1 \
--save-interval 1000000 \
--load ${CHECKPOINT_LOAD_DIR} \
--bf16 \
--rotary-base 1000000 \
--use-dist-ckpt"
# Precompile CUDA extentions
python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"
# Acquire launch configuration where variable launch_config will be set
launch_config="--nproc_per_node=${TP}"
# Launch multi-process with torchrun
torchrun ${launch_config} examples/export/ptq_and_trtllm_export/text_generation_ptq.py ${options} ${additional_options}
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Sample Generate GPT."""
import functools
import os
import sys
from pathlib import Path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))
import modelopt.torch.quantization as mtq
import torch
from datasets import load_dataset
from tqdm import tqdm
# [ModelOpt]: changing the default model provider to the ModelOpt version
from megatron.core import mpu
from megatron.inference.arguments import add_modelopt_args
from megatron.inference.checkpointing import load_modelopt_checkpoint
from megatron.inference.gpt.model_provider import model_provider
from megatron.inference.text_generation import generate_and_post_process
from megatron.training import get_args, get_model, initialize_megatron
from megatron.training.checkpointing import save_checkpoint
from megatron.training.utils import print_rank_0, unwrap_model
QUANT_CFG_CHOICES = {
"int8": mtq.INT8_DEFAULT_CFG,
"int8_sq": mtq.INT8_SMOOTHQUANT_CFG,
"fp8": mtq.FP8_DEFAULT_CFG,
"int4_awq": mtq.INT4_AWQ_CFG,
"w4a8_awq": mtq.W4A8_AWQ_BETA_CFG,
"int4": mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG,
}
def add_trtllm_ckpt_export_args(parser):
"""Add additional arguments for TensorRT-LLM."""
group = parser.add_argument_group(title="trtllm")
group.add_argument(
"--export-dir", type=str, help="The output TensorRT-LLM checkpoint.",
)
group.add_argument(
"--decoder", type=str, choices=["gptnext", 'llama'], help="The decoder type of the model.",
)
group.add_argument(
"--inference-tensor-parallel",
type=int,
help="Tensor parallel for the inference time, can be different from the training config.",
default=1,
)
def add_text_generate_ptq_args(parser):
"""Add additional arguments for ModelOpt text generation PTQ."""
group = parser.add_argument_group(title='ModelOpt text generation ptq')
group.add_argument(
"--calib-dataset",
type=str,
default="cnn_dailymail",
help="Calibration datasets from HuggingFace datasets.",
)
group.add_argument(
"--calib-batch-size", type=int, default=4, help="Batch size to use for ptq calibration."
)
group.add_argument(
"--calib-size", type=int, default=512, help="Samples to use for ptq calibration."
)
parser.add_argument(
"--prompts",
type=str,
default=(
"Born in north-east France, Soyer trained as a|Born in California, Soyer trained as a"
),
help="Input texts. Please use | to separate different batches.",
)
add_modelopt_args(parser)
add_trtllm_ckpt_export_args(parser)
return parser
def get_calib_dataloader(
data="cnn_dailymail", batch_size=4, calib_size=512, max_sequence_length=512
):
if data == "pileval":
dataset = load_dataset(
"json", data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", split="train"
)
text_column = "text"
elif data == "wikitext":
dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")
text_column = "text"
elif data == "cnn_dailymail":
dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train")
text_column = "article"
calib_size = max(min(len(dataset), calib_size), batch_size)
for i in range(calib_size // batch_size):
batch = dataset[i * batch_size : (i + 1) * batch_size][text_column]
for j in range(len(batch)):
batch[j] = batch[j][:max_sequence_length]
yield batch
if __name__ == "__main__":
initialize_megatron(
extra_args_provider=add_text_generate_ptq_args,
args_defaults={
'tokenizer_type': 'GPT2BPETokenizer',
'no_load_rng': True,
'no_load_optim': True,
},
)
args = get_args()
if args.num_layers_per_virtual_pipeline_stage is not None:
print_rank_0("Interleaved pipeline schedule is not yet supported for text generation.")
exit()
print_rank_0("WARNING: Forcing exit_on_missing_checkpoint to True for text generation.")
args.exit_on_missing_checkpoint = True
if hasattr(args, 'moe_grouped_gemm') and args.moe_grouped_gemm == True:
print_rank_0("WARNING: Forcing moe_grouped_gemm to False for PTQ and export.")
args.moe_grouped_gemm = False
# Set up model and load checkpoint
# [ModelOpt]: make sure that output logits are allgathered.
text_generation_model_provider = functools.partial(model_provider, parallel_output=False)
model = get_model(text_generation_model_provider, wrap_with_ddp=False)
if args.load is not None:
load_modelopt_checkpoint(model, strict=not args.untie_embeddings_and_output_weights)
print_rank_0("Done loading checkpoint")
# Removing virtual pipeline parallel and other wrapper
assert len(model) == 1, "Above condition should have caught this"
unwrapped_model = unwrap_model(model)
all_prompts = args.prompts.split("|")
def custom_prompt_forward_loop_func(model):
for prompt in tqdm(all_prompts):
if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
(
prompts_plus_generations,
prompts_plus_generations_segments,
logprobs,
_,
) = generate_and_post_process(
model,
prompts=[prompt],
tokens_to_generate=128,
return_output_log_probs=True,
temperature=1.0,
)
print_rank_0(prompts_plus_generations)
else:
generate_and_post_process(model)
def hf_dataset_forword_loop_func(model):
dataloader = get_calib_dataloader(args.calib_dataset, args.calib_batch_size, args.calib_size)
for prompts in tqdm(dataloader, total=args.calib_size//args.calib_batch_size):
if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
(
prompts_plus_generations,
prompts_plus_generations_segments,
logprobs,
_,
) = generate_and_post_process(
model,
prompts=prompts,
tokens_to_generate=0,
return_output_log_probs=False,
temperature=1.0,
)
else:
generate_and_post_process(model)
ptq_forward_loop_func = custom_prompt_forward_loop_func
if args.calib_dataset is not None:
ptq_forward_loop_func = hf_dataset_forword_loop_func
if args.export_quant_cfg in QUANT_CFG_CHOICES:
mtq_config = QUANT_CFG_CHOICES[args.export_quant_cfg]
if "*output_layer*" not in mtq_config["quant_cfg"]:
mtq_config["quant_cfg"]["*output_layer*"] = {"enable": False}
if "awq" in args.export_quant_cfg:
weight_quantizer = mtq_config["quant_cfg"]["*weight_quantizer"] # type: ignore
if isinstance(weight_quantizer, list):
weight_quantizer = weight_quantizer[0]
weight_quantizer["block_sizes"][-1] = 128
print_rank_0("Quantizing the model...")
mtq.quantize(unwrapped_model[0], mtq_config, ptq_forward_loop_func)
custom_prompt_forward_loop_func(model[0])
if args.save is not None and args.export_quant_cfg in QUANT_CFG_CHOICES:
save_checkpoint(1, unwrapped_model, None, None, 0)
print_rank_0(f"Fake Quantized Model:\n {unwrapped_model[0]}")
if args.export_dir:
assert args.decoder in ["gptnext", "llama"], f"Decoder type {args.decoder} not supported."
Path(args.export_dir).mkdir(parents=True, exist_ok=True)
print_rank_0("Exporting TensorRT-LLM checkpoints.")
from modelopt.torch.export import export_tensorrt_llm_checkpoint
# In TRT LLM, squared relu activation does not support bf16. So we use fp16 by default.
export_tensorrt_llm_checkpoint(
unwrapped_model[0],
args.decoder,
torch.bfloat16 if args.bf16 else torch.float16,
export_dir=args.export_dir,
inference_tensor_parallel=args.inference_tensor_parallel,
inference_pipeline_parallel=1,
use_nfs_workspace=True,
)
print_rank_0(f"TensorRT-LLM checkpoints saved to {args.export_dir}")
torch.distributed.barrier()
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""An example script to run the tensorrt_llm engine."""
import argparse
from pathlib import Path
import subprocess
from typing import Optional, Union
import numpy as np
import torch
from modelopt.deploy.llm import LLM
from tensorrt_llm.models import PretrainedConfig
from transformers import AutoTokenizer, T5Tokenizer
import tensorrt_llm
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("--tokenizer", type=str, default="")
parser.add_argument("--engine-dir", type=str, default="/tmp/trtllm_engine")
parser.add_argument(
"--input-texts",
type=str,
default=(
"Born in north-east France, Soyer trained as a|Born in California, Soyer trained as a"
),
help="Input texts. Please use | to separate different batches.",
)
return parser.parse_args()
def run(args):
try:
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, trust_remote_code=True)
except Exception as e:
raise Exception(f"Failed to load tokenizer: {e}")
print(tokenizer, tokenizer.vocab_size)
input_texts = args.input_texts.split("|")
assert input_texts, "input_text not specified"
print(input_texts)
free_memory_before = torch.cuda.mem_get_info()
# This is a ModelOpt wrapper on top of tensorrt_llm.hlapi.llm.LLM
llm_engine = LLM(args.engine_dir, tokenizer)
torch.cuda.cudart().cudaProfilerStart()
# outputs = llm_engine.generate_text(input_texts, args.max_output_len, args.max_beam_width)
outputs = llm_engine.generate(input_texts)
torch.cuda.cudart().cudaProfilerStop()
free_memory_after = torch.cuda.mem_get_info()
print(
f"Used GPU memory: {(free_memory_before[0] - free_memory_after[0]) / 1024 / 1024 / 1024} GB"
)
print(outputs)
if __name__ == "__main__":
args = parse_arguments()
run(args)
# Megatron Core To TRTLLM Export Documentation
This guide will walk you through how you can use the megatron core export for exporting models to trtllm format
### Contents
- [Megatron Core To TRTLLM Export Documentation](#megatron-core-to-trtllm-export-documentation)
- [Contents](#contents)
- [1. Quick Start](#1-quick-start)
- [1.1 Understanding The Code](#11-understanding-the-code)
- [1.2 Running The Code](#12-running-the-code)
- [2. GPU Export](#2-gpu-export)
- [3. Future work](#4-future-work)
#### 1. Quick Start
This will walk you through the flow of converting an mcore gpt model to trtllm format using single device mode. The file can be found at [gpt_single_device_cpu_export.py](./single_device_export/gpt_single_device_cpu_export.py)
NOTE: For faster performance, if your entire model will fit into gpu memory, pre transfer the model state dict to gpu and then call the get_trtllm_pretrained_config_and_model_weights function.
<br>
##### 1.1 Understanding The Code
***STEP 1 - We initialize model parallel and other default arguments***
We initalize tp and pp to 1 so that we can get the full model state dict on cpu
```python
initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
```
***STEP 2 - We load the model using the model_provider_function***
NOTE: We create a simple gpt model
```python
transformer_config = TransformerConfig(
num_layers=2,
hidden_size=64, # Needs to be atleast 32 times num_attn_heads
num_attention_heads=2,
use_cpu_initialization=True,
pipeline_dtype=torch.float32,
)
gpt_model = GPTModel(
config=transformer_config,
transformer_layer_spec=get_gpt_layer_local_spec(),
vocab_size=100,
max_sequence_length=_SEQUENCE_LENGTH,
)
# Optionally you can also load a model using this code
# sharded_state_dict=gpt_model.sharded_state_dict(prefix='')
# checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
# gpt_model.load_state_dict(checkpoint)
```
***STEP 3 - Instantiate the TRTLLM Helper***
We instantiate the [TRTLLM Helper](../../../megatron/core/export/trtllm/trtllm_helper.py) For the GPT model we instantiate trtllm_helper as shown below.
```python
if hasattr(gpt_model, "rotary_pos_emb"):
seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor
trtllm_helper = TRTLLMHelper(
transformer_config=gpt_model.config,
model_type=ModelType.gpt,
position_embedding_type = gpt_model.position_embedding_type,
max_position_embeddings = gpt_model.max_position_embeddings,
rotary_percentage = gpt_model.rotary_percent,
rotary_base = gpt_model.rotary_base,
moe_tp_mode = 2,
multi_query_mode = False,
activation = "gelu",
seq_len_interpolation_factor = seq_len_interpolation_factor,
share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights
)
```
***STEP 4 - Get the TRTLLM Weights and configs***
To convert model weights to trtllm weights and configs, we use the [single_device_converter](../../../megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py). We pass as inputs the model state dict, and export config. In this example we use inference tp size as 2 for the export.
```python
model_state_dict={}
for key , val in gpt_model.state_dict().items():
# val is non for _extra_state layers . We filter it out
if val is not None:
model_state_dict[key] = val
export_config = ExportConfig(inference_tp_size = 2)
weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
model_state_dict= model_state_dict,
dtype = DataType.bfloat16,
export_config=export_config
)
```
***STEP 5 - Build the TRTLLM Engine***
Following code is used to build the TRTLLM Engine.
```python
for trtllm_model_weights, trtllm_model_config in zip(weight_list, config_list):
trtllm_helper.build_and_save_engine(
max_input_len=256,
max_output_len=256,
max_batch_size=8,
engine_dir='/opt/megatron-lm/engine',
trtllm_model_weights=trtllm_model_weights,
trtllm_model_config=trtllm_model_config,
lora_ckpt_list=None,
use_lora_plugin=None,
max_lora_rank=64,
lora_target_modules=None,
max_prompt_embedding_table_size=0,
paged_kv_cache=True,
remove_input_padding=True,
paged_context_fmha=False,
use_refit=False,
max_num_tokens=None,
max_seq_len=512,
opt_num_tokens=None,
max_beam_width=1,
tokens_per_block=128,
multiple_profiles=False,
gpt_attention_plugin="auto",
gemm_plugin="auto",
)
```
<br>
##### 1.2 Running The Code
An example run script is shown below.
```
# In a workstation
MLM_PATH=/path/to/megatron-lm
CONTAINER_IMAGE=gitlab-master.nvidia.com:5005/dl/joc/nemo-ci/trtllm_0.12/train:pipe.17669124-x86
docker run -it --gpus=all --ipc=host -v $MLM_PATH/:/opt/megatron-lm $CONTAINER_IMAGE bash
# Inside the container run the following.
cd /opt/megatron-lm/
CUDA_VISIBLE_DEVICES=0 torchrun --nproc-per-node 1 examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py
```
<br>
#### 2. GPU Export
You can use the [gpt_distributed_gpu_export.py](./distributed_export/gpt_distributed_gpu_export.py) to run a more optimized on device distributed. version of trtllm export. Internally this uses the [distributed_converter](../../../megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py) to convert model weights on device.
In the single device version you collect all the model weights on CPU/GPU, convert it to trtllm format, and then store the engine back on disk. In the GPU version you load each individual state dict on the gpus, convert it on the device itself and store the engine on disk.
To run the gpu version
```
CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc-per-node 2 examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py
```
<br>
#### 3. Future work
The following are planned for the future releases .
* Pipeline parallellism for export (Work in progress)
* GPU Export for more models (Work in progress for some models)
* Refit functionality
* VLLM Support
\ No newline at end of file
import os
import torch
from megatron.core import parallel_state
from megatron.core import dist_checkpointing
from megatron.core.export.model_type import ModelType
from megatron.core.export.data_type import DataType
from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.models.gpt.gpt_model import GPTModel
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
_SEQUENCE_LENGTH = 64
_VOCAB_SIZE = 256
def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1):
parallel_state.destroy_model_parallel()
# Torch setup for distributed training
rank = int(os.environ['LOCAL_RANK'])
world_size = torch.cuda.device_count()
torch.cuda.set_device(rank)
torch.distributed.init_process_group(world_size=world_size, rank=rank)
# Megatron core distributed training initialization
parallel_state.initialize_model_parallel(tensor_model_parallel_size = tensor_model_parallel_size, pipeline_model_parallel_size=pipeline_model_parallel_size)
def model_provider():
"""Build the model."""
transformer_config = TransformerConfig(
num_layers=2,
hidden_size=64,
num_attention_heads=2,
use_cpu_initialization=True,
pipeline_dtype=torch.float32
)
gpt_model = GPTModel(
config=transformer_config,
transformer_layer_spec=get_gpt_layer_local_spec(),
vocab_size=_VOCAB_SIZE,
max_sequence_length=_SEQUENCE_LENGTH,
)
return gpt_model
def load_distributed_checkpoint(checkpoint_path, gpt_model):
sharded_state_dict=gpt_model.sharded_state_dict(prefix='')
checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
gpt_model.load_state_dict(checkpoint)
return gpt_model
if __name__ == "__main__":
initialize_distributed(tensor_model_parallel_size=2, pipeline_model_parallel_size=1)
model_parallel_cuda_manual_seed(123)
gpt_model = model_provider()
device = torch.device("cuda")
gpt_model.to(device)
# Optionally you can also load a gpt model from ckpt_path using this code below
# gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path)
seq_len_interpolation_factor = None
if hasattr(gpt_model, "rotary_pos_emb"):
seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor
trtllm_helper = TRTLLMHelper(
transformer_config=gpt_model.config,
model_type=ModelType.gpt,
position_embedding_type = gpt_model.position_embedding_type,
max_position_embeddings = gpt_model.max_position_embeddings,
rotary_percentage = gpt_model.rotary_percent,
rotary_base = gpt_model.rotary_base,
moe_tp_mode = 2,
multi_query_mode = False,
activation = "gelu",
seq_len_interpolation_factor = seq_len_interpolation_factor,
share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights
)
trtllm_model_weights, trtllm_model_config = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
model_state_dict= gpt_model.state_dict(),
dtype = DataType.bfloat16,
on_device_distributed_conversion=True,
vocab_size=_VOCAB_SIZE,
gpus_per_node=2,
)
trtllm_helper.build_and_save_engine(
max_input_len=256,
max_output_len=256,
max_batch_size=8,
engine_dir='/opt/megatron-lm/engine',
trtllm_model_weights=trtllm_model_weights[0],
trtllm_model_config=trtllm_model_config[0],
lora_ckpt_list=None,
use_lora_plugin=None,
max_lora_rank=64,
lora_target_modules=None,
max_prompt_embedding_table_size=0,
paged_kv_cache=True,
remove_input_padding=True,
paged_context_fmha=False,
use_refit=False,
max_num_tokens=None,
max_seq_len=512,
opt_num_tokens=None,
max_beam_width=1,
tokens_per_block=128,
multiple_profiles=False,
gpt_attention_plugin="auto",
gemm_plugin="auto",
)
import os
import torch
from megatron.core import parallel_state
from megatron.core import dist_checkpointing
from megatron.core.export.model_type import ModelType
from megatron.core.export.data_type import DataType
from megatron.core.export.export_config import ExportConfig
from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.models.gpt.gpt_model import GPTModel
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
_SEQUENCE_LENGTH = 64
def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1):
parallel_state.destroy_model_parallel()
# Torch setup for distributed training
rank = int(os.environ['LOCAL_RANK'])
world_size = torch.cuda.device_count()
torch.cuda.set_device(rank)
torch.distributed.init_process_group(world_size=world_size, rank=rank)
# Megatron core distributed training initialization
parallel_state.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size)
def model_provider():
"""Build the model."""
transformer_config = TransformerConfig(
num_layers=2,
hidden_size=64, # Needs to be atleast 32 times num_attn_heads
num_attention_heads=2,
use_cpu_initialization=True,
pipeline_dtype=torch.float32,
)
gpt_model = GPTModel(
config=transformer_config,
transformer_layer_spec=get_gpt_layer_local_spec(),
vocab_size=100,
max_sequence_length=_SEQUENCE_LENGTH,
)
return gpt_model
def load_distributed_checkpoint(checkpoint_path, gpt_model):
sharded_state_dict=gpt_model.sharded_state_dict(prefix='')
checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
gpt_model.load_state_dict(checkpoint)
return gpt_model
if __name__ == "__main__":
# Need to use TP1 PP1 for export on single device
initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
model_parallel_cuda_manual_seed(123)
gpt_model = model_provider()
# Optionally you can also load a gpt model from ckpt_path using this code below
# gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path)
seq_len_interpolation_factor = None
if hasattr(gpt_model, "rotary_pos_emb"):
seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor
trtllm_helper = TRTLLMHelper(
transformer_config=gpt_model.config,
model_type=ModelType.gpt,
position_embedding_type = gpt_model.position_embedding_type,
max_position_embeddings = gpt_model.max_position_embeddings,
rotary_percentage = gpt_model.rotary_percent,
rotary_base = gpt_model.rotary_base,
moe_tp_mode = 2,
multi_query_mode = False,
activation = "gelu",
seq_len_interpolation_factor = seq_len_interpolation_factor,
share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights
)
export_config = ExportConfig(inference_tp_size = 2)
# NOTE : For faster performance, if your entire model will fit in gpu memory, transfer model state dict to GPU and then call this api
weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
model_state_dict= gpt_model.state_dict(),
dtype = DataType.bfloat16,
export_config=export_config
)
for trtllm_model_weights, trtllm_model_config in zip(weight_list, config_list):
trtllm_helper.build_and_save_engine(
max_input_len=256,
max_output_len=256,
max_batch_size=8,
engine_dir='/opt/megatron-lm/engine',
trtllm_model_weights=trtllm_model_weights,
trtllm_model_config=trtllm_model_config,
lora_ckpt_list=None,
use_lora_plugin=None,
max_lora_rank=64,
lora_target_modules=None,
max_prompt_embedding_table_size=0,
paged_kv_cache=True,
remove_input_padding=True,
paged_context_fmha=False,
use_refit=False,
max_num_tokens=None,
max_seq_len=512,
opt_num_tokens=None,
max_beam_width=1,
tokens_per_block=128,
multiple_profiles=False,
gpt_attention_plugin="auto",
gemm_plugin="auto",
)
\ No newline at end of file
#!/bin/bash
WORLD_SIZE=8
DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
TRAIN_DATA="data/glue_data/MNLI/train.tsv"
VALID_DATA="data/glue_data/MNLI/dev_matched.tsv \
data/glue_data/MNLI/dev_mismatched.tsv"
PRETRAINED_CHECKPOINT=checkpoints/bert_345m
VOCAB_FILE=bert-vocab.txt
CHECKPOINT_PATH=checkpoints/bert_345m_mnli
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
--task MNLI \
--seed 1234 \
--train-data $TRAIN_DATA \
--valid-data $VALID_DATA \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file $VOCAB_FILE \
--epochs 5 \
--pretrained-checkpoint $PRETRAINED_CHECKPOINT \
--tensor-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 8 \
--lr 5.0e-5 \
--lr-decay-style linear \
--lr-warmup-fraction 0.065 \
--seq-length 512 \
--max-position-embeddings 512 \
--save-interval 500000 \
--save $CHECKPOINT_PATH \
--log-interval 10 \
--eval-interval 100 \
--eval-iters 50 \
--weight-decay 1.0e-1 \
--fp16
#!/bin/bash
WORLD_SIZE=8
DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
TRAIN_DATA="data/RACE/train/middle"
VALID_DATA="data/RACE/dev/middle \
data/RACE/dev/high"
VOCAB_FILE=bert-vocab.txt
PRETRAINED_CHECKPOINT=checkpoints/bert_345m
CHECKPOINT_PATH=checkpoints/bert_345m_race
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
--task RACE \
--seed 1234 \
--train-data $TRAIN_DATA \
--valid-data $VALID_DATA \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file $VOCAB_FILE \
--epochs 3 \
--pretrained-checkpoint $PRETRAINED_CHECKPOINT \
--tensor-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--micro-batch-size 4 \
--lr 1.0e-5 \
--lr-decay-style linear \
--lr-warmup-fraction 0.06 \
--seq-length 512 \
--max-position-embeddings 512 \
--save-interval 100000 \
--save $CHECKPOINT_PATH \
--log-interval 10 \
--eval-interval 100 \
--eval-iters 50 \
--weight-decay 1.0e-1 \
--clip-grad 1.0 \
--hidden-dropout 0.1 \
--attention-dropout 0.1 \
--fp16
#!/bin/bash
# Finetune a BERT or pretrained ICT model using Google natural question data
# Datasets can be downloaded from the following link:
# https://github.com/facebookresearch/DPR/blob/master/data/download_data.py
WORLD_SIZE=8
DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT_PATH=<Specify path for the finetuned retriever model>
# Load either of the below
BERT_LOAD_PATH=<Path of BERT pretrained model>
PRETRAINED_CHECKPOINT=<Path of Pretrained ICT model>
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
--task RET-FINETUNE-NQ \
--train-with-neg \
--train-hard-neg 1 \
--pretrained-checkpoint ${PRETRAINED_CHECKPOINT} \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--tensor-model-parallel-size 1 \
--tokenizer-type BertWordPieceLowerCase \
--train-data nq-train.json \
--valid-data nq-dev.json \
--save ${CHECKPOINT_PATH} \
--load ${CHECKPOINT_PATH} \
--vocab-file bert-vocab.txt \
--bert-load ${BERT_LOAD_PATH} \
--save-interval 5000 \
--log-interval 10 \
--eval-interval 20000 \
--eval-iters 100 \
--indexer-log-interval 1000 \
--faiss-use-gpu \
--DDP-impl torch \
--fp16 \
--retriever-report-topk-accuracies 1 5 10 20 100 \
--seq-length 512 \
--retriever-seq-length 256 \
--max-position-embeddings 512 \
--retriever-score-scaling \
--epochs 80 \
--micro-batch-size 8 \
--eval-micro-batch-size 16 \
--indexer-batch-size 128 \
--lr 2e-5 \
--lr-warmup-fraction 0.01 \
--weight-decay 1e-1
# GPT3 MODEL
## Table of contents
- [1. Training Setup](#1-training-setup)
- [2. Configurations](#2-configurations)
- [3. Training Results](#3-training-results)
## 1. Training setup
<a id="markdown-training-setup" name="training-setup"></a>
To run the model using a docker container run it as follows
```
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
CHECKPOINT_PATH="" #<Specify path>
TENSORBOARD_LOGS_PATH=""#<Specify path>
VOCAB_FILE="" #<Specify path to file>/gpt2-vocab.json
MERGE_FILE="" #<Specify path to file>/gpt2-merges.txt
DATA_PATH="" #<Specify path and file prefix>_text_document
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
megatron-lm nvcr.io/nvidia/pytorch:24.01-py3 \
bash examples/gpt3/train_gpt3_175b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $MERGE_FILE $DATA_PATH "
```
NOTE: Depending on the environment you are running it the above command might like slightly different.
## 2. Configurations
<a id="markdown-configurations" name="configurations"></a>
The example in this folder shows you how to run 175B model. There are other configs you could run as well
### 345M
```
--num-layers 12 \
--hidden-size 512 \
--num-attention-heads 8 \
--seq-length 1024 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
### 857M
```
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
# WARNING: Yaml configs is currently an experimental feature
language_model:
# model architecture
num_layers: 24
hidden_size: 1024
num_attention_heads: 16
num_query_groups: null
ffn_hidden_size: null
kv_channels: null
hidden_dropout: 0.0
attention_dropout: 0.0
fp32_residual_connection: False
apply_residual_connection_post_layernorm: False
layernorm_epsilon: 1.e-5
layernorm_zero_centered_gamma: True
add_bias_linear: False
bias_activation_fusion: False
add_qkv_bias: False
gated_linear_unit: False
activation_func: swiglu
num_moe_experts: null
rotary_interleaved: False
window_size: null
# initialization
init_method: null
init_method_std: 0.02
output_layer_init_method: null
# mixed-precision
apply_query_key_layer_scaling: False
attention_softmax_in_fp32: False
# fusion
bias_swiglu_fusion: True
masked_softmax_fusion: True
persist_layer_norm: False
memory_efficient_layer_norm: False
bias_dropout_fusion: True
apply_rope_fusion: True
# activation recomputation
recompute_granularity: null
recompute_method: null
recompute_num_layers: null
distribute_saved_activations: null
# fp8 related
fp8: null
fp8_margin: 0
fp8_interval: 1
fp8_amax_history_len: 1
fp8_amax_compute_algo: "most_recent"
fp8_wgrad: True
# miscellaneous
clone_scatter_output_in_embedding: True
normalization: "LayerNorm" # alt value supported by TE: "RMSNorm"
# MoE related
moe_router_load_balancing_type: "aux_loss"
moe_router_topk: 2
moe_grouped_gemm: False
moe_aux_loss_coeff: 0 # 1e-2 would be a good start value for load balance loss.
moe_z_loss_coeff: null # 1e-3 would be a good start value for z-loss
moe_input_jitter_eps: null
moe_token_dropping: False
model_parallel:
# Model parallelism
tensor_model_parallel_size: 1
context_parallel_size: 1
pipeline_model_parallel_size: 1
virtual_pipeline_model_parallel_size: null
sequence_parallel: True
expert_model_parallel_size: 1
# Initialization
perform_initialization: True
use_cpu_initialization: null
# Training
fp16: False
bf16: True
params_dtype: null # Set from above arguments for core
timers: null
# Optimizations
gradient_accumulation_fusion: True
async_tensor_model_parallel_allreduce: True
tp_comm_overlap: False
# Debug Options
tp_comm_split_ag: True
tp_comm_atomic_ag: True
tp_comm_split_rs: True
tp_comm_atomic_rs: True
tp_comm_bulk_wgrad: True
tp_comm_bulk_dgrad: True
# Parallelism
finalize_model_grads_func: null
# Pipeline Parallel
pipeline_dtype: null
grad_scale_func: null
enable_autocast: False
autocast_dtype: null
variable_seq_lengths: False
num_microbatches_with_partial_activation_checkpoints: null
overlap_p2p_comm: False
batch_p2p_comm: True
batch_p2p_sync: True
use_ring_exchange_p2p: False
deallocate_pipeline_outputs: False
no_sync_func: null
grad_sync_func: null
param_sync_func: null
pipeline_model_parallel_split_rank: null
# CPU Offloading
cpu_offloading: False
cpu_offloading_num_layers: 0
_cpu_offloading_context: null
cpu_offloading_weights: False
cpu_offloading_activations: True
# Timing
barrier_with_L1_time: True
# training:
use_legacy_models: False
spec: null
micro_batch_size: 2
global_batch_size: 128
rampup_batch_size: [32, 32, 65324160]
check_for_nan_in_loss_and_grad: True
num_layers_per_virtual_pipeline_stage: null
encoder_num_layers: null
decoder_num_layers: null
rotary_seq_len_interpolation_factor: null
add_position_embedding: False
make_vocab_size_divisible_by: 128
group_query_attention: False
exit_signal_handler: False
exit_duration_in_mins: null
exit_interval: null
untie_embeddings_and_output_weights: True
position_embedding_type: rope
rotary_percent: 0.5
openai_gelu: False
squared_relu: False
swiglu: True
onnx_safe: null
bert_binary_head: True
max_position_embeddings: 4096
transformer_impl: local
use_flash_attn: False
seed: 1234
data_parallel_random_init: False
# Optimizer
optimizer: adam
lr: 2.5e-4
lr_decay_style: cosine
lr_decay_iters: null
lr_decay_samples: 255126953
lr_warmup_fraction: null
lr_warmup_iters: 0
lr_warmup_samples: 81381
lr_warmup_init: 0.0
min_lr: 2.5e-5
weight_decay: 0.1
start_weight_decay: null
end_weight_decay: null
weight_decay_incr_style: constant
clip_grad: 1.0
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 1.e-08
sgd_momentum: 0.9
override_opt_param_scheduler: False
use_checkpoint_opt_param_scheduler: False
# checkpointing arguments
save: null
save_interval: 20000
no_save_optim: null
no_save_rng: null
load: null
no_load_optim: null
no_load_rng: null
finetune: False
use_checkpoint_args: False
exit_on_missing_checkpoint: False
# loss arguments
loss_scale: null
initial_loss_scale: 4294967296
min_loss_scale: 1.0
loss_scale_window: 1000
hysteresis: 2
accumulate_allreduce_grads_in_fp32: False
fp16_lm_cross_entropy: False
# distributed arguments
distributed_backend: nccl
distributed_timeout_minutes: 10
overlap_grad_reduce: False
align_grad_reduce: True
overlap_param_gather: False
align_param_gather: False
scatter_gather_tensors_in_pipeline: True
local_rank: null
lazy_mpu_init: null
empty_unused_memory_level: 0
standalone_embedding_stage: False
use_distributed_optimizer: False
nccl_communicator_config_path: null
train_iters: null
eval_iters: 32
eval_interval: 2000
skip_train: False
adlr_autoresume: False
adlr_autoresume_interval: 1000
# garbage collection
manual_gc: False
manual_gc_interval: 0
manual_gc_eval: True
tp_comm_overlap_cfg: null
#data
data_path: null
split: '99,1,0'
train_data_path: null
valid_data_path: null
test_data_path: null
data_cache_path: null
mock_data: False
vocab_size: null
vocab_file: null
merge_file: null
vocab_extra_ids: 0
seq_length: 4096
encoder_seq_length: null
decoder_seq_length: null
retriever_seq_length: 256
sample_rate: 1.0
mask_prob: 0.15
short_seq_prob: 0.1
num_workers: 2
tokenizer_type: GPTSentencePieceTokenizer
tokenizer_model: null
reset_position_ids: False
reset_attention_mask: False
eod_mask_loss: False
train_samples: 268554688
dataloader_type: null
#profile:
profile: False
profile_ranks: [0]
profile_step_end: 12
profile_step_start: 10
#logging:
log_params_norm: True
log_num_zeros_in_grad: True
log_throughput: False
log_progress: False
timing_log_level: 0
timing_log_option: minmax
tensorboard_log_interval: 1
tensorboard_queue_size: 1000
log_timers_to_tensorboard: False
log_validation_ppl_to_tensorboard: False
log_memory_to_tensorboard: False
log_world_size_to_tensorboard: False
log_loss_scale_to_tensorboard: True
wandb_project: ''
wandb_exp_name: ''
wandb_save_dir: ''
enable_one_logger: True
one_logger_project: megatron-lm
one_logger_run_name: null
log_interval: 100
tensorboard_dir: null
#!/bin/bash
# Runs the "175B" parameter model
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
CHECKPOINT_PATH=$1 #<Specify path>
TENSORBOARD_LOGS_PATH=$2 #<Specify path>
VOCAB_FILE=$3 #<Specify path to file>/gpt2-vocab.json
MERGE_FILE=$4 #<Specify path to file>/gpt2-merges.txt
DATA_PATH=$5 #<Specify path and file prefix>_text_document
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
GPT_MODEL_ARGS=(
--num-layers 96
--hidden-size 12288
--num-attention-heads 96
--seq-length 2048
--max-position-embeddings 2048
--attention-backend auto # Can use (flash/fused/unfused/local)
)
TRAINING_ARGS=(
--micro-batch-size 1
--global-batch-size 1536
--rampup-batch-size 16 16 5859375
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--fp16
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 8
--pipeline-model-parallel-size 16
)
DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 100
--save-interval 10000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]}
This diff is collapsed.
This diff is collapsed.
import argparse
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
# Set up argument parsing
parser = argparse.ArgumentParser(description="Script for text generation with a specific model and prompt.")
parser.add_argument('--prompt', type=str, required=True, help="Prompt text to use for text generation")
parser.add_argument('--model-path', type=str, required=True, help="Path to the Huggingface model checkpoint")
# Parse command-line arguments
args = parser.parse_args()
model_path = args.model_path
prompt = args.prompt
config = AutoConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, config=config)
model = AutoModelForCausalLM.from_pretrained(model_path, config=config).cuda()
inputs = tokenizer(prompt, return_tensors="pt")
for key in inputs:
inputs[key] = inputs[key].cuda()
# top_k, top_p and do_sample are set for greedy argmax based sampling
outputs = model.generate(**inputs, max_length=100, do_sample=False, top_p=0, top_k=0, temperature=1.0)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment