#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Tensor-parallel size options = [2, 4, 8, 16, 32].
TP=2
# Batch size (global batch size) options = [32, 128, 512].
GBS=32
# Derive the data-parallel size from the tensor-parallel size (DP = 64 / TP).
DP=$((64/TP))
# Other params.
PP=1
MBS=1
NLS=32
HS=3840
NAH=32
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_15_tensor_parallel_size_${TP}_data_parallel_size_${DP}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Microbatch size options = [1, 2, 4, 8].
MBS=1
# Batch size (global batch size) options = [128, 512].
GBS=128
# Other params.
TP=8
PP=8
NLS=32
HS=15360
NAH=128
DDP=local
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
NNODES=8
# Name of the job.
export JOB_NAME=results_figure_16_microbatch_size_${MBS}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Activation recomputation options = [YES, NO].
ACTIVATION_RECOMPUTATION=YES
# Batch size (global batch size) options = [1, 2, 4, ..., 256].
GBS=1
# Set activation recomputation.
if [ ${ACTIVATION_RECOMPUTATION} == "YES" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${ACTIVATION_RECOMPUTATION} == "NO" ]; then
MEGATRON_EXTRA_PARAMS=""
else
echo "Invalid configuration"
exit 1
fi
# Other params.
TP=8
PP=16
MBS=1
NLS=80
HS=12288
NAH=96
DDP=local
NNODES=16
# Name of the job.
export JOB_NAME=results_figure_17_activation_recomputation_${ACTIVATION_RECOMPUTATION}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# Scatter-gather communication optimization options = [YES, NO].
SCATTER_GATHER=YES
# Batch size (global batch size) options = [12, 24, 36, ..., 60].
GBS=12
# Set scatter-gather communication optimization options.
if [ ${SCATTER_GATHER} == "YES" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
elif [ ${SCATTER_GATHER} == "NO" ]; then
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline "
else
echo "Invalid configuration"
exit 1
fi
# Other params.
TP=8
PP=12
MBS=1
NLS=96
HS=12288
NAH=96
DDP=local
NNODES=12
# Name of the job.
export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
#!/bin/bash
# ================================
# Choose the case to run.
# ================================
# model size options = [1.7B, 3.6B, 7.5B, 18B, 39B, 76B, 145B, 310B, 530B, 1T]
MODEL_SIZE=1.7B
if [ ${MODEL_SIZE} == "1.7B" ]; then
TP=1
PP=1
MBS=16
GBS=512
NLS=24
HS=2304
NAH=24
DDP=torch
NNODES=4
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "3.6B" ]; then
TP=2
PP=1
MBS=16
GBS=512
NLS=30
HS=3072
NAH=32
DDP=torch
NNODES=8
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "7.5B" ]; then
TP=4
PP=1
MBS=16
GBS=512
NLS=36
HS=4096
NAH=32
DDP=torch
NNODES=16
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "18B" ]; then
TP=8
PP=1
MBS=8
GBS=1024
NLS=40
HS=6144
NAH=48
DDP=torch
NNODES=32
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "39B" ]; then
TP=8
PP=2
MBS=4
GBS=1536
NLS=48
HS=8192
NAH=64
DDP=local
NNODES=64
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
elif [ ${MODEL_SIZE} == "76B" ]; then
TP=8
PP=4
MBS=2
GBS=1792
NLS=60
HS=10240
NAH=80
DDP=local
NNODES=128
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5"
elif [ ${MODEL_SIZE} == "145B" ]; then
TP=8
PP=8
MBS=2
GBS=2304
NLS=80
HS=12288
NAH=96
DDP=local
NNODES=192
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 5 "
elif [ ${MODEL_SIZE} == "310B" ]; then
TP=8
PP=16
MBS=1
GBS=2160
NLS=96
HS=16384
NAH=128
DDP=local
NNODES=240
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 3 "
elif [ ${MODEL_SIZE} == "530B" ]; then
TP=8
PP=35
MBS=1
GBS=2520
NLS=105
HS=20480
NAH=128
DDP=local
NNODES=315
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 1 "
elif [ ${MODEL_SIZE} == "1T" ]; then
TP=8
PP=64
MBS=1
GBS=3072
NLS=128
HS=25600
NAH=160
DDP=local
NNODES=384
MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform "
else
echo "Invalid configuration"
exit 1
fi
# Name of the job
export JOB_NAME=results_table_1_model_size_${MODEL_SIZE}
# Import the configs.
. `pwd`/CONFIG.sh
# Submit the job.
. `pwd`/SBATCH.sh
exit 0
# BERT MODEL
## Table of contents
- [1. Training Setup](#1-training-setup)
- [2. Configurations](#2-configurations)
## 1. Training setup
<a id="markdown-training-setup" name="training-setup"></a>
To run the model using a Docker container, run it as follows:
```
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
CHECKPOINT_PATH="" #<Specify path>
TENSORBOARD_LOGS_PATH="" #<Specify path>
VOCAB_FILE="" #<Specify path to file>/bert-vocab.txt
DATA_PATH="" #<Specify path and file prefix>_text_document
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
megatron-lm nvcr.io/nvidia/pytorch:24.01-py3 \
bash examples/bert/train_bert_340m_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $DATA_PATH
```
NOTE: Depending on the environment you are running it in, the above command might look slightly different.
## 2. Configurations
<a id="markdown-configurations" name="configurations"></a>
The example in this folder shows you how to run the 340M (BERT Large) model. There are other configurations you could run as well:
### 4B
```
--num-layers 48 \
--hidden-size 2560 \
--num-attention-heads 32 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
### 20B
```
--num-layers 48 \
--hidden-size 6144 \
--num-attention-heads 96 \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 4 \
```
#!/bin/bash
# Runs the "340M" parameter model (BERT Large)
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
CHECKPOINT_PATH=$1 #<Specify path>
TENSORBOARD_LOGS_PATH=$2 #<Specify path>
VOCAB_FILE=$3 #<Specify path to file>/bert-vocab.json
DATA_PATH=$4 #<Specify path and file prefix>_text_document
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
BERT_MODEL_ARGS=(
--num-layers 24
--hidden-size 1024
--num-attention-heads 16
--seq-length 512
--max-position-embeddings 512
--attention-backend auto # Can use (flash/fused/unfused/local)
)
TRAINING_ARGS=(
--micro-batch-size 4
--global-batch-size 32
--train-iters 1000000
--weight-decay 1e-2
--clip-grad 1.0
--fp16
--lr 0.0001
--lr-decay-iters 990000
--lr-decay-style linear
--min-lr 1.0e-5
--lr-warmup-fraction .01
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 8
--pipeline-model-parallel-size 16
)
DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--split 949,50,1
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 100
--save-interval 10000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_bert.py \
${BERT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]}
# Megatron Core Export
This module is used to export megatron core models to different inference frameworks.
Currently we support TRTLLM export. In the future we will add support for vLLM and other frameworks.
## PTQ AND EXPORT
Follow the examples of [TensorRT Model Optimizer](../post_training/modelopt) to perform post training quantization, followed by an export to a HF-like checkpoint for TensorRT-LLM, vLLM, and SGLang deployment.
# TRTLLM EXPORT
Follow the instructions in [trtllm_export](./trtllm_export/) to export to the TRTLLM checkpoint format alone.
# Megatron Core To TRTLLM Export Documentation
This guide walks you through using the Megatron Core export module to export models to the TRTLLM format.
### Contents
- [Megatron Core To TRTLLM Export Documentation](#megatron-core-to-trtllm-export-documentation)
- [Contents](#contents)
- [1. Quick Start](#1-quick-start)
- [1.1 Understanding The Code](#11-understanding-the-code)
- [1.2 Running The Code](#12-running-the-code)
- [2. GPU Export](#2-gpu-export)
- [3. Future work](#3-future-work)
#### 1. Quick Start
This section walks you through converting an MCore GPT model to the TRTLLM format using single-device mode. The file can be found at [gpt_single_device_cpu_export.py](./single_device_export/gpt_single_device_cpu_export.py).
NOTE: For faster performance, if your entire model fits into GPU memory, transfer the model state dict to the GPU first and then call the get_trtllm_pretrained_config_and_model_weights function, as sketched below.
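A minimal sketch of this, reusing the `gpt_model`, `trtllm_helper`, `DataType`, and `ExportConfig` names from the steps below and assuming a CUDA device is available:
```python
# Hedged sketch: pre-transfer the state dict to GPU before conversion
# (assumes the entire model fits in GPU memory).
gpu_state_dict = {
    key: val.cuda() if torch.is_tensor(val) else val
    for key, val in gpt_model.state_dict().items()
    if val is not None  # _extra_state entries can be None; filter them out as in STEP 4.
}
weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
    model_state_dict=gpu_state_dict,
    dtype=DataType.bfloat16,
    export_config=ExportConfig(inference_tp_size=2),
)
```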
<br>
##### 1.1 Understanding The Code
***STEP 1 - We initialize model parallel and other default arguments***
We initialize TP and PP to 1 so that we can get the full model state dict on the CPU.
```python
initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
```
***STEP 2 - We load the model using the model_provider_function***
NOTE: We create a simple GPT model.
```python
transformer_config = TransformerConfig(
num_layers=2,
hidden_size=64, # Needs to be at least 32 times num_attn_heads
num_attention_heads=2,
use_cpu_initialization=True,
pipeline_dtype=torch.float32,
)
gpt_model = GPTModel(
config=transformer_config,
transformer_layer_spec=get_gpt_layer_local_spec(),
vocab_size=100,
max_sequence_length=_SEQUENCE_LENGTH,
)
# Optionally you can also load a model using this code
# sharded_state_dict=gpt_model.sharded_state_dict(prefix='')
# checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
# gpt_model.load_state_dict(checkpoint)
```
***STEP 3 - Instantiate the TRTLLM Helper***
We instantiate the [TRTLLM Helper](../../../megatron/core/export/trtllm/trtllm_helper.py). For the GPT model, we instantiate trtllm_helper as shown below.
```python
if hasattr(gpt_model, "rotary_pos_emb"):
seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor
trtllm_helper = TRTLLMHelper(
transformer_config=gpt_model.config,
model_type=ModelType.gpt,
position_embedding_type = gpt_model.position_embedding_type,
max_position_embeddings = gpt_model.max_position_embeddings,
rotary_percentage = gpt_model.rotary_percent,
rotary_base = gpt_model.rotary_base,
moe_tp_mode = 2,
multi_query_mode = False,
activation = "gelu",
seq_len_interpolation_factor = seq_len_interpolation_factor,
share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights
)
```
***STEP 4 - Get the TRTLLM Weights and configs***
To convert model weights to TRTLLM weights and configs, we use the [single_device_converter](../../../megatron/core/export/trtllm/trtllm_weights_converter/single_device_trtllm_model_weights_converter.py). We pass the model state dict and the export config as inputs. In this example we use an inference TP size of 2 for the export.
```python
model_state_dict = {}
for key, val in gpt_model.state_dict().items():
    # val is None for _extra_state layers. We filter those out.
    if val is not None:
        model_state_dict[key] = val
export_config = ExportConfig(inference_tp_size = 2)
weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
model_state_dict= model_state_dict,
dtype = DataType.bfloat16,
export_config=export_config
)
```
***STEP 5 - Build the TRTLLM Engine***
The following code is used to build the TRTLLM engine.
```python
for trtllm_model_weights, trtllm_model_config in zip(weight_list, config_list):
trtllm_helper.build_and_save_engine(
max_input_len=256,
max_output_len=256,
max_batch_size=8,
engine_dir='/opt/megatron-lm/engine',
trtllm_model_weights=trtllm_model_weights,
trtllm_model_config=trtllm_model_config,
lora_ckpt_list=None,
use_lora_plugin=None,
max_lora_rank=64,
lora_target_modules=None,
max_prompt_embedding_table_size=0,
paged_kv_cache=True,
remove_input_padding=True,
paged_context_fmha=False,
use_refit=False,
max_num_tokens=None,
max_seq_len=512,
opt_num_tokens=None,
max_beam_width=1,
tokens_per_block=128,
multiple_profiles=False,
gpt_attention_plugin="auto",
gemm_plugin="auto",
)
```
<br>
##### 1.2 Running The Code
An example run script is shown below.
```
# In a workstation
MLM_PATH=/path/to/megatron-lm
CONTAINER_IMAGE=gitlab-master.nvidia.com:5005/dl/joc/nemo-ci/trtllm_0.12/train:pipe.17669124-x86
docker run -it --gpus=all --ipc=host -v $MLM_PATH/:/opt/megatron-lm $CONTAINER_IMAGE bash
# Inside the container run the following.
cd /opt/megatron-lm/
CUDA_VISIBLE_DEVICES=0 torchrun --nproc-per-node 1 examples/export/trtllm_export/single_device_export/gpt_single_device_cpu_export.py
```
<br>
#### 2. GPU Export
You can use [gpt_distributed_gpu_export.py](./distributed_export/gpt_distributed_gpu_export.py) to run a more optimized, on-device distributed version of TRTLLM export. Internally this uses the [distributed_converter](../../../megatron/core/export/trtllm/trtllm_weights_converter/distributed_trtllm_model_weights_converter.py) to convert model weights on device.
In the single-device version you collect all the model weights on CPU/GPU, convert them to the TRTLLM format, and then store the engine on disk. In the GPU version you load each individual state dict on the GPUs, convert it on the device itself, and store the engine on disk.
To run the GPU version:
```
CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc-per-node 2 examples/export/trtllm_export/distributed_export/gpt_distributed_gpu_export.py
```
<br>
#### 3. Future work
The following are planned for future releases.
* Pipeline parallelism for export (Work in progress)
* GPU Export for more models (Work in progress for some models)
* Refit functionality
* VLLM Support
import os
import torch
from megatron.core import parallel_state
from megatron.core import dist_checkpointing
from megatron.core.export.model_type import ModelType
from megatron.core.export.data_type import DataType
from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.models.gpt.gpt_model import GPTModel
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
_SEQUENCE_LENGTH = 64
_VOCAB_SIZE = 256
def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1):
parallel_state.destroy_model_parallel()
# Torch setup for distributed training
rank = int(os.environ['LOCAL_RANK'])
world_size = torch.cuda.device_count()
torch.cuda.set_device(rank)
torch.distributed.init_process_group(world_size=world_size, rank=rank)
# Megatron core distributed training initialization
parallel_state.initialize_model_parallel(tensor_model_parallel_size = tensor_model_parallel_size, pipeline_model_parallel_size=pipeline_model_parallel_size)
def model_provider():
"""Build the model."""
transformer_config = TransformerConfig(
num_layers=2,
hidden_size=64,
num_attention_heads=2,
use_cpu_initialization=True,
pipeline_dtype=torch.float32
)
gpt_model = GPTModel(
config=transformer_config,
transformer_layer_spec=get_gpt_layer_local_spec(),
vocab_size=_VOCAB_SIZE,
max_sequence_length=_SEQUENCE_LENGTH,
)
return gpt_model
def load_distributed_checkpoint(checkpoint_path, gpt_model):
sharded_state_dict=gpt_model.sharded_state_dict(prefix='')
checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
gpt_model.load_state_dict(checkpoint)
return gpt_model
if __name__ == "__main__":
initialize_distributed(tensor_model_parallel_size=2, pipeline_model_parallel_size=1)
model_parallel_cuda_manual_seed(123)
gpt_model = model_provider()
device = torch.device("cuda")
gpt_model.to(device)
# Optionally you can also load a gpt model from ckpt_path using this code below
# gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path)
seq_len_interpolation_factor = None
if hasattr(gpt_model, "rotary_pos_emb"):
seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor
trtllm_helper = TRTLLMHelper(
transformer_config=gpt_model.config,
model_type=ModelType.gpt,
position_embedding_type = gpt_model.position_embedding_type,
max_position_embeddings = gpt_model.max_position_embeddings,
rotary_percentage = gpt_model.rotary_percent,
rotary_base = gpt_model.rotary_base,
moe_tp_mode = 2,
multi_query_mode = False,
activation = "gelu",
seq_len_interpolation_factor = seq_len_interpolation_factor,
share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights
)
trtllm_model_weights, trtllm_model_config = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
model_state_dict= gpt_model.state_dict(),
dtype = DataType.bfloat16,
on_device_distributed_conversion=True,
vocab_size=_VOCAB_SIZE,
gpus_per_node=2,
)
trtllm_helper.build_and_save_engine(
max_input_len=256,
max_output_len=256,
max_batch_size=8,
engine_dir='/opt/megatron-lm/engine',
trtllm_model_weights=trtllm_model_weights[0],
trtllm_model_config=trtllm_model_config[0],
lora_ckpt_list=None,
use_lora_plugin=None,
max_lora_rank=64,
lora_target_modules=None,
max_prompt_embedding_table_size=0,
paged_kv_cache=True,
remove_input_padding=True,
paged_context_fmha=False,
use_refit=False,
max_num_tokens=None,
max_seq_len=512,
opt_num_tokens=None,
max_beam_width=1,
tokens_per_block=128,
multiple_profiles=False,
gpt_attention_plugin="auto",
gemm_plugin="auto",
)
import os
import torch
from megatron.core import parallel_state
from megatron.core import dist_checkpointing
from megatron.core.export.model_type import ModelType
from megatron.core.export.data_type import DataType
from megatron.core.export.export_config import ExportConfig
from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper
from megatron.core.tensor_parallel.random import model_parallel_cuda_manual_seed
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.models.gpt.gpt_model import GPTModel
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_local_spec
_SEQUENCE_LENGTH = 64
def initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1):
parallel_state.destroy_model_parallel()
# Torch setup for distributed training
rank = int(os.environ['LOCAL_RANK'])
world_size = torch.cuda.device_count()
torch.cuda.set_device(rank)
torch.distributed.init_process_group(world_size=world_size, rank=rank)
# Megatron core distributed training initialization
parallel_state.initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size)
def model_provider():
"""Build the model."""
transformer_config = TransformerConfig(
num_layers=2,
hidden_size=64, # Needs to be at least 32 times num_attn_heads
num_attention_heads=2,
use_cpu_initialization=True,
pipeline_dtype=torch.float32,
)
gpt_model = GPTModel(
config=transformer_config,
transformer_layer_spec=get_gpt_layer_local_spec(),
vocab_size=100,
max_sequence_length=_SEQUENCE_LENGTH,
)
return gpt_model
def load_distributed_checkpoint(checkpoint_path, gpt_model):
sharded_state_dict=gpt_model.sharded_state_dict(prefix='')
checkpoint = dist_checkpointing.load(sharded_state_dict=sharded_state_dict, checkpoint_dir=checkpoint_path)
gpt_model.load_state_dict(checkpoint)
return gpt_model
if __name__ == "__main__":
# Need to use TP1 PP1 for export on single device
initialize_distributed(tensor_model_parallel_size=1, pipeline_model_parallel_size=1)
model_parallel_cuda_manual_seed(123)
gpt_model = model_provider()
# Optionally you can also load a gpt model from ckpt_path using this code below
# gpt_model = load_distributed_checkpoint(gpt_model=gpt_model, checkpoint_path=ckpt_path)
seq_len_interpolation_factor = None
if hasattr(gpt_model, "rotary_pos_emb"):
seq_len_interpolation_factor = gpt_model.rotary_pos_emb.seq_len_interpolation_factor
trtllm_helper = TRTLLMHelper(
transformer_config=gpt_model.config,
model_type=ModelType.gpt,
position_embedding_type = gpt_model.position_embedding_type,
max_position_embeddings = gpt_model.max_position_embeddings,
rotary_percentage = gpt_model.rotary_percent,
rotary_base = gpt_model.rotary_base,
moe_tp_mode = 2,
multi_query_mode = False,
activation = "gelu",
seq_len_interpolation_factor = seq_len_interpolation_factor,
share_embeddings_and_output_weights=gpt_model.share_embeddings_and_output_weights
)
export_config = ExportConfig(inference_tp_size = 2)
# NOTE: For faster performance, if your entire model fits in GPU memory, transfer the model state dict to the GPU and then call this API
weight_list, config_list = trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
model_state_dict= gpt_model.state_dict(),
dtype = DataType.bfloat16,
export_config=export_config
)
for trtllm_model_weights, trtllm_model_config in zip(weight_list, config_list):
trtllm_helper.build_and_save_engine(
max_input_len=256,
max_output_len=256,
max_batch_size=8,
engine_dir='/opt/megatron-lm/engine',
trtllm_model_weights=trtllm_model_weights,
trtllm_model_config=trtllm_model_config,
lora_ckpt_list=None,
use_lora_plugin=None,
max_lora_rank=64,
lora_target_modules=None,
max_prompt_embedding_table_size=0,
paged_kv_cache=True,
remove_input_padding=True,
paged_context_fmha=False,
use_refit=False,
max_num_tokens=None,
max_seq_len=512,
opt_num_tokens=None,
max_beam_width=1,
tokens_per_block=128,
multiple_profiles=False,
gpt_attention_plugin="auto",
gemm_plugin="auto",
)
# GPT3 MODEL
## Table of contents
- [1. Training Setup](#1-training-setup)
- [2. Configurations](#2-configurations)
- [3. Training Results](#3-training-results)
## 1. Training setup
<a id="markdown-training-setup" name="training-setup"></a>
To run the model using a Docker container, run it as follows:
```
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
CHECKPOINT_PATH="" #<Specify path>
TENSORBOARD_LOGS_PATH="" #<Specify path>
VOCAB_FILE="" #<Specify path to file>/gpt2-vocab.json
MERGE_FILE="" #<Specify path to file>/gpt2-merges.txt
DATA_PATH="" #<Specify path and file prefix>_text_document
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
megatron-lm nvcr.io/nvidia/pytorch:24.01-py3 \
bash examples/gpt3/train_gpt3_175b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $MERGE_FILE $DATA_PATH
```
NOTE: Depending on the environment you are running it in, the above command might look slightly different.
## 2. Configurations
<a id="markdown-configurations" name="configurations"></a>
The example in this folder shows you how to run the 175B model. There are other configurations you could run as well:
### 345M
```
--num-layers 12 \
--hidden-size 512 \
--num-attention-heads 8 \
--seq-length 1024 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
### 857M
```
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
# WARNING: YAML configs are currently an experimental feature
language_model:
# model architecture
num_layers: 24
hidden_size: 1024
num_attention_heads: 16
num_query_groups: null
ffn_hidden_size: null
kv_channels: null
hidden_dropout: 0.0
attention_dropout: 0.0
fp32_residual_connection: False
apply_residual_connection_post_layernorm: False
layernorm_epsilon: 1.e-5
layernorm_zero_centered_gamma: True
add_bias_linear: False
bias_activation_fusion: False
add_qkv_bias: False
gated_linear_unit: False
activation_func: swiglu
num_moe_experts: null
rotary_interleaved: False
window_size: null
# initialization
init_method: null
init_method_std: 0.02
output_layer_init_method: null
# mixed-precision
apply_query_key_layer_scaling: False
attention_softmax_in_fp32: False
# fusion
bias_swiglu_fusion: True
masked_softmax_fusion: True
persist_layer_norm: False
memory_efficient_layer_norm: False
bias_dropout_fusion: True
apply_rope_fusion: True
# activation recomputation
recompute_granularity: null
recompute_method: null
recompute_num_layers: null
distribute_saved_activations: null
# fp8 related
fp8: null
fp8_margin: 0
fp8_interval: 1
fp8_amax_history_len: 1
fp8_amax_compute_algo: "most_recent"
fp8_wgrad: True
# miscellaneous
clone_scatter_output_in_embedding: True
normalization: "LayerNorm" # alt value supported by TE: "RMSNorm"
# MoE related
moe_router_load_balancing_type: "aux_loss"
moe_router_topk: 2
moe_router_group_topk: null
moe_router_num_groups: null
moe_grouped_gemm: False
moe_aux_loss_coeff: 0 # 1e-2 would be a good start value for load balance loss.
moe_z_loss_coeff: null # 1e-3 would be a good start value for z-loss
moe_input_jitter_eps: null
moe_token_dropping: False
model_parallel:
# Model parallelism
tensor_model_parallel_size: 1
context_parallel_size: 1
pipeline_model_parallel_size: 1
virtual_pipeline_model_parallel_size: null
sequence_parallel: True
expert_model_parallel_size: 1
# Initialization
perform_initialization: True
use_cpu_initialization: null
# Training
fp16: False
bf16: True
params_dtype: null # Set from above arguments for core
timers: null
# Optimizations
gradient_accumulation_fusion: True
async_tensor_model_parallel_allreduce: True
tp_comm_overlap: False
# Debug Options
tp_comm_split_ag: True
tp_comm_atomic_ag: True
tp_comm_split_rs: True
tp_comm_atomic_rs: True
tp_comm_bulk_wgrad: True
tp_comm_bulk_dgrad: True
# Parallelism
finalize_model_grads_func: null
# Pipeline Parallel
pipeline_dtype: null
grad_scale_func: null
enable_autocast: False
autocast_dtype: null
variable_seq_lengths: False
num_microbatches_with_partial_activation_checkpoints: null
overlap_p2p_comm: False
batch_p2p_comm: True
batch_p2p_sync: True
use_ring_exchange_p2p: False
deallocate_pipeline_outputs: False
no_sync_func: null
grad_sync_func: null
param_sync_func: null
pipeline_model_parallel_split_rank: null
# CPU Offloading
cpu_offloading: False
cpu_offloading_num_layers: 0
_cpu_offloading_context: null
cpu_offloading_weights: False
cpu_offloading_activations: True
# Timing
barrier_with_L1_time: True
# training:
use_legacy_models: False
spec: null
micro_batch_size: 2
global_batch_size: 128
rampup_batch_size: [32, 32, 65324160]
check_for_nan_in_loss_and_grad: True
num_layers_per_virtual_pipeline_stage: null
encoder_num_layers: null
decoder_num_layers: null
rotary_seq_len_interpolation_factor: null
add_position_embedding: False
make_vocab_size_divisible_by: 128
group_query_attention: False
exit_signal_handler: False
exit_duration_in_mins: null
exit_interval: null
untie_embeddings_and_output_weights: True
position_embedding_type: rope
rotary_percent: 0.5
openai_gelu: False
squared_relu: False
swiglu: True
onnx_safe: null
bert_binary_head: True
max_position_embeddings: 4096
transformer_impl: local
use_flash_attn: False
seed: 1234
data_parallel_random_init: False
# Optimizer
optimizer: adam
lr: 2.5e-4
lr_decay_style: cosine
lr_decay_iters: null
lr_decay_samples: 255126953
lr_warmup_fraction: null
lr_warmup_iters: 0
lr_warmup_samples: 81381
lr_warmup_init: 0.0
min_lr: 2.5e-5
weight_decay: 0.1
start_weight_decay: null
end_weight_decay: null
weight_decay_incr_style: constant
clip_grad: 1.0
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 1.e-08
sgd_momentum: 0.9
override_opt_param_scheduler: False
use_checkpoint_opt_param_scheduler: False
# checkpointing arguments
save: null
save_interval: 20000
no_save_optim: null
no_save_rng: null
load: null
no_load_optim: null
no_load_rng: null
finetune: False
use_checkpoint_args: False
exit_on_missing_checkpoint: False
# loss arguments
loss_scale: null
initial_loss_scale: 4294967296
min_loss_scale: 1.0
loss_scale_window: 1000
hysteresis: 2
accumulate_allreduce_grads_in_fp32: False
fp16_lm_cross_entropy: False
# distributed arguments
distributed_backend: nccl
distributed_timeout_minutes: 10
overlap_grad_reduce: False
align_grad_reduce: True
overlap_param_gather: False
align_param_gather: False
scatter_gather_tensors_in_pipeline: True
local_rank: null
lazy_mpu_init: null
empty_unused_memory_level: 0
standalone_embedding_stage: False
use_distributed_optimizer: False
nccl_communicator_config_path: null
train_iters: null
eval_iters: 32
eval_interval: 2000
skip_train: False
adlr_autoresume: False
adlr_autoresume_interval: 1000
# garbage collection
manual_gc: False
manual_gc_interval: 0
manual_gc_eval: True
tp_comm_overlap_cfg: null
#data
data_path: null
split: '99,1,0'
train_data_path: null
valid_data_path: null
test_data_path: null
data_cache_path: null
mock_data: False
vocab_size: null
vocab_file: null
merge_file: null
vocab_extra_ids: 0
seq_length: 4096
encoder_seq_length: null
decoder_seq_length: null
retriever_seq_length: 256
sample_rate: 1.0
mask_prob: 0.15
short_seq_prob: 0.1
num_workers: 2
tokenizer_type: GPTSentencePieceTokenizer
tokenizer_model: null
reset_position_ids: False
reset_attention_mask: False
eod_mask_loss: False
train_samples: 268554688
dataloader_type: null
#profile:
profile: False
profile_ranks: [0]
profile_step_end: 12
profile_step_start: 10
#logging:
log_params_norm: True
log_num_zeros_in_grad: True
log_throughput: False
log_progress: False
timing_log_level: 0
timing_log_option: minmax
tensorboard_log_interval: 1
tensorboard_queue_size: 1000
log_timers_to_tensorboard: False
log_validation_ppl_to_tensorboard: False
log_memory_to_tensorboard: False
log_world_size_to_tensorboard: False
log_loss_scale_to_tensorboard: True
wandb_project: ''
wandb_exp_name: ''
wandb_save_dir: ''
enable_one_logger: True
one_logger_project: megatron-lm
one_logger_run_name: null
log_interval: 100
tensorboard_dir: null
#!/bin/bash
# Runs the "175B" parameter model
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
CHECKPOINT_PATH=$1 #<Specify path>
TENSORBOARD_LOGS_PATH=$2 #<Specify path>
VOCAB_FILE=$3 #<Specify path to file>/gpt2-vocab.json
MERGE_FILE=$4 #<Specify path to file>/gpt2-merges.txt
DATA_PATH=$5 #<Specify path and file prefix>_text_document
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
GPT_MODEL_ARGS=(
--num-layers 96
--hidden-size 12288
--num-attention-heads 96
--seq-length 2048
--max-position-embeddings 2048
--attention-backend auto # Can use (flash/fused/unfused/local)
)
TRAINING_ARGS=(
--micro-batch-size 1
--global-batch-size 1536
--rampup-batch-size 16 16 5859375
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--fp16
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 8
--pipeline-model-parallel-size 16
)
DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 100
--save-interval 10000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]}
### Megatron Core Inference Documentation
This guide provides an example of running model inference with Megatron Core.
### Contents
- [Megatron Core Inference Documentation](#megatron-core-inference-documentation)
- [Contents](#contents)
- [1. Quick Start](#1-quick-start)
- [1.1 Understanding The Code](#11-understanding-the-code)
- [1.2 Running The Code](#12-running-the-code)
- [2. Flow of Control In MCore Backend](#2-flow-of-control-in-mcore-backend)
- [3. Customizing The Inference Pipeline](#3-customizing-the-inference-pipeline)
- [3.1. Create Your Own Inference Backend](#31-create-your-own-inference-backend)
- [3.2. Create Your Own Text Generation Controller](#32-create-your-own-text-generation-controller)
- [3.3. Support Other Models](#33-support-other-models)
  - [3.4. Modify Inference Parameters](#34-modify-inference-parameters)
- [4. Future work](#4-future-work)
<br>
#### 1. Quick Start
This example runs statically-batched inference on a model trained using Megatron Core. The entrypoint is [gpt_static_inference.py](./gpt/gpt_static_inference.py). A similar workflow can be adapted for [gpt_dynamic_inference.py](./gpt/gpt_dynamic_inference.py).
<br>
##### 1.1 Understanding The Code
***STEP 1 - Initialize model parallel and other default arguments***
The micro batch size defaults to 1. It is not used in tensor-parallel-only mode, and for pipeline-parallel models it is calculated at runtime.
```python
# Initialize Megatron model using the same model provider from training.
initialize_megatron(
args_defaults={'no_load_rng': True, 'no_load_optim': True, 'micro_batch_size': 1}
)
```
***STEP 2 - Load the model using the model_provider_function***
The model provider function supports both MCore and Legacy models.
```python
# Load the model checkpoint
model = get_model(model_provider, wrap_with_ddp=False)
load_checkpoint(model, None, None)
model.eval()
model = model[0]
```
***STEP 3 - Choose an engine***
Text generation requires an inference engine, which includes a scheduler. The default engine is the [Megatron Core engine](../../megatron/core/inference/engines/mcore_engine.py) with a [text generation controller](../../megatron/core/inference/text_generation_controllers/text_generation_controller.py). TRTLLMEngine support is planned for the future.
```python
# Create an inference wrapper to setup the model.
inference_wrapped_model = GPTInferenceWrapper(model, args)
# Define a sampling loop.
text_generation_controller = TextGenerationController(
inference_wrapped_model=inference_wrapped_model,
tokenizer=tokenizer
)
# Create a static or dynamic inference engine.
inference_engine = StaticInferenceEngine(
text_generation_controller=text_generation_controller,
max_batch_size=args.max_batch_size
)
```
***STEP 4 - Run text generation***
The [SamplingParams](../../megatron/core/inference/sampling_params.py) class uses suggested defaults. Customize this to change top_p, top_k, number of tokens to generate, etc. The result is returned as a list of [InferenceRequests](../../megatron/core/inference/inference_request.py).
```python
results: List[InferenceRequest] = inference_engine.generate(
prompts=args.prompts, sampling_params=sampling_params
)
if torch.distributed.get_rank() == 0:
for idx, result in enumerate(results):
print(f' ------------- RESULT FOR PROMPT {idx} --------------- ')
result = {
'id': result.request_id,
'input_prompt': result.prompt,
'generated_text': result.generated_text,
'generated_tokens' : result.generated_tokens
}
print(result)
```
<br>
##### 1.2 Running The Code
An example Slurm script is shown below. Set the tokenizer paths, inference params, and other settings appropriately.
For a recap on sampling parameters, refer to [this blog](https://ivibudh.medium.com/a-guide-to-controlling-llm-model-output-exploring-top-k-top-p-and-temperature-parameters-ed6a31313910).
```
# Slurm cluster settings
ACCOUNT=<account>
MLM_PATH=/path/to/megatron-lm
GPT_CKPT=/path/to/gpt/ckpt
VOCAB_MERGE_FILE_PATH=/path/to/vocab/and/merge/file
CONTAINER_IMAGE=nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.11
srun --account $ACCOUNT \
--job-name=$ACCOUNT:inference \
--partition=batch \
--time=01:00:00 \
--container-image $CONTAINER_IMAGE \
--container-mounts $MLM_PATH:/workspace/megatron-lm/,$GPT_CKPT:/workspace/mcore_gpt_ckpt,$VOCAB_MERGE_FILE_PATH:/workspace/tokenizer \
--no-container-mount-home \
--pty /bin/bash \
# Inside the container run the following.
cd megatron-lm/
export CUDA_DEVICE_MAX_CONNECTIONS=1
TOKENIZER_ARGS=(
--vocab-file /workspace/tokenizer/gpt2-vocab.json
--merge-file /workspace/tokenizer/gpt2-merges.txt
--tokenizer-type GPT2BPETokenizer
)
MODEL_ARGS=(
--use-checkpoint-args
--use-mcore-models
--load /workspace/mcore_gpt_ckpt
)
INFERENCE_SPECIFIC_ARGS=(
--attention-dropout 0.0
--hidden-dropout 0.0
--num-tokens-to-generate 20
--max-batch-size 4
)
torchrun --nproc-per-node=4 examples/inference/gpt/gpt_static_inference.py \
${TOKENIZER_ARGS[@]} \
${MODEL_ARGS[@]} \
${INFERENCE_SPECIFIC_ARGS[@]} \
--prompts "prompt one " "sample prompt two" "sample prompt 3"
NOTE: Other parameters which can be customized for inference:
--temperature (Sampling temperature)
--top_k (top_k sampling)
--top_p (top_p sampling)
--num-tokens-to-generate (Number of tokens to generate for each prompt)
--inference-batch-times-seqlen-threshold (During inference, if batch-size times sequence-length is smaller than this threshold then we will not use microbatched pipelining.')
--use-dist-ckpt (If using dist checkpoint format for the model)
--use-legacy-models (If using legacy models instead of MCore models)
```
<br>
#### 2. Flow of Control In MCore Backend
An example of inference with static batching is provided in [gpt_static_inference.py](./gpt/gpt_static_inference.py).
* The [mcore_engine](../../megatron/core/inference/engines/mcore_engine.py) **generate()** function is called with the input prompts.
* The `Scheduler` in the engine adds these prompts to the [active requests pool](../../megatron/core/inference/inference_request.py) until the max batch size is hit. Remaining requests are added to the waiting requests pool.
* The engine will run until all requests (waiting + active) are completed.
* The active requests are passed into **generate_all_output_tokens_static_batch()** of the text generation controller.
* This function uses the **prep_model_for_inference()** method of the [model_inference_wrappers](../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) and runs an autoregressive sampling loop
* In the autoregressive loop, the **get_batch_for_context_window()** method of the inference wrapper is called to slice out the input tokens and masks
* Input tokens and masks are passed into the **run_one_forward_step()** method, which calls the model `.forward()` method to get the output logits
* Output logits are synchronized across all pipeline parallel ranks
* The text generation controller obtains the log probabilities and samples tokens based on the strategy defined in the sampling parameters.
* The sampled tokens are then appended to the input prompt tokens for the next iteration
* The **update_generation_status()** method of the text generation controller checks which prompts have finished generating or hit a stop condition
* After the inference loop, the result is detokenized and stored as an attribute of the InferenceRequest. These requests are marked as completed.
* The **update_requests_pool()** method of the scheduler moves completed requests into the completed request pool and waiting requests into the active request pool
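The sketch below summarizes this loop. It is illustrative only: the names follow the scheduler and controller methods described above, but the helper attributes (such as the request pools) are simplified assumptions rather than the exact MCore implementation.
```python
# Illustrative sketch of the static-batch control flow described above (not the actual MCore code).
def generate(engine, prompts, sampling_params):
    for prompt in prompts:
        # The scheduler fills the active pool up to max batch size; the rest go to the waiting pool.
        engine.scheduler.add_request(prompt)
    while engine.scheduler.has_unfinished_requests():
        active_requests = engine.scheduler.active_request_pool
        # Autoregressive loop: slice the context window, run a forward step,
        # sample new tokens, and check stop conditions for each request.
        engine.text_generation_controller.generate_all_output_tokens_static_batch(active_requests)
        # Move completed requests to the completed pool and promote waiting requests to active.
        engine.scheduler.update_requests_pool()
    return engine.scheduler.completed_request_pool
```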
<br>
#### 3. Customizing The Inference Pipeline
The inference pipeline supports four levels of customization:
* **Inference engine** - The MCore Engine supports static and dynamic batching. Modify this to add a new backend.
* **Text generation controller** - The main sampling loop. Customize this to support alternative tokenization or implement a new sampling strategy.
* **Inference Wrapped Model** - Change this to support a new model.
* **Modify Inference Parameters** - Change this to update top_p, top_k, number of tokens to be generated, temperature, and other sampling parameters.
<br>
##### 3.1. Create Your Own Inference Backend
The [abstract_engine.py](../../megatron/core/inference/engines/abstract_engine.py) file contains a `generate` method that can be extended to support a new backend.
```python
class AbstractEngine(ABC):
@staticmethod
def generate(self) -> dict:
"""The abstract backend's generate function.
To define a new backend, implement this method and return the outputs as a dictionary.
"""
```
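For example, a minimal custom backend might look like the sketch below. This is a hypothetical illustration (the class name, constructor, and output format are assumptions), not an existing engine.
```python
# Hypothetical sketch of a custom backend built on the AbstractEngine interface shown above.
class MyCustomEngine(AbstractEngine):
    def __init__(self, text_generation_controller, max_batch_size=8):
        self.controller = text_generation_controller
        self.max_batch_size = max_batch_size

    def generate(self, prompts) -> dict:
        # A real backend would schedule the prompts, run the sampling loop
        # (e.g. via the text generation controller), and collect the outputs.
        results = {}
        for prompt in prompts:
            results[prompt] = "<generated text goes here>"  # placeholder output
        return results
```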
<br>
##### 3.2. Create Your Own Text Generation Controller
The [TextGenerationController](../../megatron/core/inference/text_generation_controllers/text_generation_controller.py) contains the main sampling loop and can be modified to support new tokenization, detokenization, or sampling strategies.
``` python
class TextGenerationController:
def tokenize_prompt(self, prompt: str) -> Tuple[torch.Tensor, torch.Tensor]:
"""Utility to tokenize the input prompts"""
def sample_from_logits(
self,
last_token_logits: torch.Tensor,
sampling_params: SamplingParams,
vocab_size: int,
generation_started : Optional[torch.Tensor] = None,
top_n_logprobs_dict: Dict[int, List[Dict[str, float]]] = None,
) -> torch.Tensor:
"""Samples the logits to generate outputs
Given the logits of the last token, this function samples according to the parameters defined in sampling_params and returns the sampled tokens. If sampling_params.top_n_logprobs > 0, it also updates the top_n_logprobs_dict at each step.
"""
def update_generation_status(
self,
updated_prompts_tokens: torch.Tensor,
generation_started: torch.Tensor,
current_context_end_position: int,
is_generation_done_tensor: torch.Tensor,
generated_sequence_lengths: torch.Tensor,
) -> torch.Tensor:
"""Function to check which prompts have reached an end condition
We check which prompts have reached an end condition and set the corresponding flags of the is_generation_done_tensor to True. The generated sequence lengths increase as we keep generating, until a prompt hits an end-of-document condition. The generation-started status tensor helps us determine which prompts have started generating
"""
def generate_all_output_tokens_static_batch(
self, active_requests: OrderedDict[int, InferenceRequest],
) -> OrderedDict[int, InferenceRequest]:
"""Utility to generate all the output tokens and probabilities for the prompts .
This utility generates the output tokens for a static batch. It runs the forward steps till all prompts complete generation, updates the status of these requests to completed, adds the generated result and returns these requests
"""
def detokenize_generations(self, prompt_tokens_with_generated_tokens: torch.Tensor) -> str:
"""Detokenize the output generations"""
```
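As a concrete illustration, the hedged sketch below overrides `sample_from_logits` to force greedy decoding. The class name is hypothetical, and collapsing the optional arguments into `**kwargs` is a simplification of the signature shown above.
```python
import torch

# Hypothetical sketch: a controller that replaces the sampling strategy with greedy decoding.
# Assumes TextGenerationController has been imported as shown earlier in this guide.
class GreedyTextGenerationController(TextGenerationController):
    def sample_from_logits(self, last_token_logits, sampling_params, vocab_size, **kwargs):
        # Ignore top_k / top_p / temperature and always pick the most likely token.
        return torch.argmax(last_token_logits[:, :vocab_size], dim=-1)
```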
<br>
##### 3.3. Support Other Models
Extend [abstract_model_inference_wrapper.py](./../../megatron/core/inference/model_inference_wrappers/abstract_model_inference_wrapper.py) to support other models. The abstract model wrapper implements:
* A forward method that calls the model `forward` method according to the model parallel settings
* Initialization of the model, putting it in `.eval()` mode
* Setup of the input parameters (max batch size, max sequence length)
The following methods should be implemented:
```python
class AbstractModelInferenceWrapper:
def prep_model_for_inference(self, prompts_tokens: torch.Tensor):
"""A utility function for preparing model for inference
The function gets called once before the autoregressive inference loop. It puts the model in eval mode and gets some model and inference data parameters. Extend this to build position ids, attention masks, etc., so that the required slices can be extracted during the forward pass
"""
@abc.abstractclassmethod
def get_batch_for_context_window(self) -> List:
"""Returns the input data for inference
This function gets called iteratively in the inference loop. It can be used to extract the relevant inputs from the prompt tokens, attention mask, etc. required for each step of inference.
"""
```
Refer to [gpt_inference_wrapper.py](../../megatron/core/inference/model_inference_wrappers/gpt/gpt_inference_wrapper.py) for an example of implementing this for GPTModel.
<br>
##### 3.4. Modify Inference Parameters
We use [sampling params](../../megatron/core/inference/sampling_params.py) for text generation. Customize this to change `top_p`, `top_k`, the number of tokens to generate, etc. Other attributes can be added for the inference loop as shown below.
```
from megatron.core.inference.sampling_params import SamplingParams
c = SamplingParams(temperature=0.5)
c.add_attributes({'min_length':4, 'eod_id':153})
```
<br>
#### 4. Future work
The following features are planned for future releases.
* TRTLLM Engine support
* Continuous batching optimizations
* Speculative decoding
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
import torch
from argparse import ArgumentParser
from collections import defaultdict
from tqdm import tqdm
from typing import List
from megatron.core.inference.contexts.dynamic_context import (
ContextOverflowError,
DynamicInferenceContext,
)
from megatron.core.inference.engines import DynamicInferenceEngine
from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import GPTInferenceWrapper
from megatron.core.inference.sampling_params import SamplingParams
from megatron.core.inference.text_generation_controllers.text_generation_controller import TextGenerationController
from megatron.core.transformer.module import MegatronModule
from megatron.training import (
get_args,
get_model as _get_model,
get_tokenizer,
initialize_megatron,
)
from megatron.training.checkpointing import load_checkpoint
from pretrain_gpt import model_provider
from .utils import add_common_inference_args, build_requests, get_curr_time, Request
def add_dynamic_inference_args(parser: ArgumentParser) -> ArgumentParser:
"""Dynamic inference arguments."""
add_common_inference_args(parser)
group = parser.add_argument_group(title='Dynamic inference')
group.add_argument("--inference-ckpt-non-strict", action="store_true",
help="Load checkpoint with `strict=False`.")
return parser
def get_model() -> MegatronModule:
"""Initialize model and load checkpoint."""
args = get_args()
# Build model.
model = _get_model(model_provider, wrap_with_ddp=False)
# Load checkpoint.
assert args.load is not None
args.exit_on_missing_checkpoint = True
load_checkpoint(
ddp_model=model,
optimizer=None,
opt_param_scheduler=None,
strict=not args.inference_ckpt_non_strict,
)
# No virtual PP.
assert len(model) == 1, "Above condition should have caught this"
model = model[0]
# Eval mode.
model.eval()
return model
def get_inference_context(
requests: List[Request],
sampling_params: SamplingParams,
):
"""The inference context manages the KV cache and other inference state."""
args = get_args()
# Max sequence length.
max_gen_length = sampling_params.num_tokens_to_generate
max_context_length = max(len(r.prompt_tokens) for r in requests)
max_sequence_length = max_context_length + max_gen_length
# Inference context.
context = DynamicInferenceContext(
params_dtype=args.params_dtype,
num_layers=args.num_layers,
kv_channels=args.kv_channels,
num_attention_heads=args.num_query_groups if args.group_query_attention else args.num_attention_heads,
max_sequence_length=max_sequence_length,
buffer_size_gb=args.inference_dynamic_batching_buffer_size_gb,
buffer_guaranteed_fraction=args.inference_dynamic_batching_buffer_guaranteed_fraction,
chunk_size_tokens=args.inference_dynamic_batching_chunk_size,
buffer_overflow_factor=args.inference_dynamic_batching_buffer_overflow_factor,
max_requests_override=args.inference_dynamic_batching_max_requests_override,
max_tokens_override=args.inference_dynamic_batching_max_tokens_override,
tensor_model_parallel_size=args.tensor_model_parallel_size,
)
return context
def get_inference_controller(
model: MegatronModule,
context: DynamicInferenceContext,
) -> TextGenerationController:
"""Buid text generation controller, which manages the model inference context.
Args:
model (MegatronModule): Megatron GPT model.
context (DynamicInferenceContext): Context for managing KV cache.
Return:
(TextGenerationController) Inference text generation controller.
"""
args = get_args()
tokenizer = get_tokenizer()
# Wrap model in inference wrapper.
model = GPTInferenceWrapper(model, args, context)
# Note: the following is taken from AbstractModelInferenceWrapper.prep_model_for_inference().
from megatron.core import parallel_state
model.model_is_pipeline_parallel = not (
parallel_state.is_pipeline_first_stage() and
parallel_state.is_pipeline_last_stage()
)
# Text generation controller.
controller = TextGenerationController(model, tokenizer)
return controller
def run_inference(
requests: List[Request],
sampling_params: SamplingParams,
engine: DynamicInferenceEngine,
) -> None:
"""Add requests to engine and generate tokens.
Args:
requests (List[Request]): Requests that are to be added and processed.
sampling_params (SamplingParams): Sampling params for the logits.
engine (DynamicInferenceEngine): Inference engine that manages generating tokens.
Return:
None.
"""
# Initialize request arrival times.
base_arrival_time = get_curr_time()
for request in requests:
request.time_arrival = request.time_offset + base_arrival_time
# Add and process requests.
num_requests_total = len(requests)
num_requests_added = 0
num_requests_finished = 0
step_id = 0
step_times = {"prefill": [], "decode": []}
add_times = []
output_times = []
tbar = tqdm(total=num_requests_total)
while True:
curr_time = get_curr_time()
# Add requests with 'earlier' arrival time.
add_start = get_curr_time()
while num_requests_added < num_requests_total:
request = requests[num_requests_added]
if request.time_arrival > curr_time:
break
try:
# Using `prompt_text` instead of `prompt_tokens` for fair comparison.
engine.add_request(num_requests_added, request.prompt_text)
request.time_start = get_curr_time()
request.state = "started"
num_requests_added += 1
tbar.update(1)
except ContextOverflowError:
break
add_times.append(get_curr_time() - add_start)
# Step inference engine (i.e., generate a token for each active request).
is_decode_only = engine.context.is_decode_only()
finished_requests, step_time = engine.step(sampling_params, verbose=True)
step_id += 1
if len(finished_requests) > 0:
output_start = get_curr_time()
if is_decode_only:
step_times["decode"].append(step_time)
else:
step_times["prefill"].append(step_time)
# Append output tokens.
for finished_request in finished_requests:
request = requests[finished_request.request_id]
request.output_tokens = finished_request.generated_tokens
request.time_end = get_curr_time()
request.output_text = finished_request.generated_text
request.state = "finished"
num_requests_finished += 1
output_times.append(get_curr_time() - output_start)
# Check if all requests are finished.
if not (engine.has_unfinished_requests() or
num_requests_added < num_requests_total):
break
return step_times, add_times, output_times
if __name__ == "__main__":
# Initialize Megatron.
initialize_megatron(
extra_args_provider=add_dynamic_inference_args,
args_defaults={'no_load_rng': True,
'no_load_optim': True},
)
args = get_args()
tokenizer = get_tokenizer()
# Sampling params.
sampling_params = SamplingParams(
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p,
return_log_probs=args.return_log_probs,
num_tokens_to_generate=args.num_tokens_to_generate,
)
# Requests, context, controller.
model = get_model()
requests = build_requests(args, tokenizer)
context = get_inference_context(requests, sampling_params)
controller = get_inference_controller(model, context)
# Inference engine.
engine = DynamicInferenceEngine(controller,
context,
termination_id=tokenizer.eod,
enable_cuda_graph=args.enable_cuda_graph,
random_seed=args.seed)
# Print setup.
setup_prefix = "dynamic | cg %d | %s | bf %.0f, flw %.1f [r %d, t %d], gtd %.2f [r %d] ... reqs %d" % (
args.enable_cuda_graph,
(
f"<user prompts, n {len(args.prompts)}>"
if args.prompts else
"<auto prompts> %s, %d, %.1e, %.1e" % (
"(%s)" % " ".join(map(str, args.num_tokens_to_prompt)),
args.num_tokens_to_generate,
args.incoming_requests_duration,
args.incoming_requests_per_sec,
)
),
args.inference_dynamic_batching_buffer_size_gb,
args.inference_dynamic_batching_buffer_overflow_factor,
context.max_requests,
context.max_tokens,
args.inference_dynamic_batching_buffer_guaranteed_fraction,
context.gtd_request_count,
len(requests),
)
print("~~~")
print(setup_prefix)
print("~~~")
# Run and time test.
t = get_curr_time()
step_times, add_times, output_times = run_inference(requests, sampling_params, engine)
total_time = get_curr_time() - t
# Validate all requests finished.
for request in requests:
assert request.state == "finished"
# Print unique prompts + outputs.
if torch.distributed.get_rank() == 0:
print("~~~~ Unique prompts + outputs. ~~~~")
# Map requests by their prompt.
unique_prompt_map = defaultdict(list)
for request_idx, request in enumerate(requests):
unique_prompt_map[request.prompt_text].append(request_idx)
# Print unique prompts + outputs.
for unique_idx, (prompt_text, request_idxs) in enumerate(unique_prompt_map.items()):
request_idx = request_idxs[0]
request = requests[request_idx]
print(f"{unique_idx}/{len(unique_prompt_map)} [{len(request_idxs)}]. {prompt_text} ... %s" % request.output_text.replace("\n", "\\n"))
# Timing results.
stats = torch.cuda.memory_stats()
print("~~~")
print("%s ... mem %.1f/%.1f ... total time: %.3f ... step time: total %.3f [ p %.3f, d %.3f ], mean [ p %.3f, d %.3f ], count [ p %d, d %d ] ... add time: %.3f, output time: %.3f." % (
setup_prefix,
stats["allocated_bytes.all.peak"] / (1024**3),
stats["reserved_bytes.all.peak"] / (1024**3),
sum(step_times["prefill"]) + sum(step_times["decode"]) + sum(add_times),
sum(step_times["prefill"]) + sum(step_times["decode"]),
sum(step_times["prefill"]),
sum(step_times["decode"]),
sum(step_times["prefill"]) / len(step_times["prefill"]),
sum(step_times["decode"]) / len(step_times["decode"]),
len(step_times["prefill"]),
len(step_times["decode"]),
sum(add_times),
sum(output_times),
))
print("~~~")
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# Run dynamic batching inference on the 12B GPT model.
set -u
pip install simpy
pip install sentencepiece
pip install tiktoken
export CUDA_DEVICE_MAX_CONNECTIONS=1
: ${CHECKPOINT_DIR:?"CHECKPOINT_DIR is not set"}
: ${TOKENIZER_MODEL:?"TOKENIZER_MODEL is not set"}
: ${NUM_TOKENS_TO_PROMPT="8 32"}
: ${NUM_TOKENS_TO_GENERATE=256}
: ${INCOMING_REQUESTS_DURATION=10.}
: ${INCOMING_REQUESTS_PER_SEC=100.}
: ${INFERENCE_DYNAMIC_BATCHING_BUFFER_SIZE_GB=50.}
: ${INFERENCE_DYNAMIC_BATCHING_BUFFER_OVERFLOW_FACTOR=1.}
: ${INFERENCE_DYNAMIC_BATCHING_BUFFER_GUARANTEED_FRACTION=0.05}
: ${ENGINE=dynamic}
: ${EXTRA_ARGS=""}
# NSIGHT_PREFIX=/path/to/nsight/profile
# --inference-rng-tracker \ # ... re-add after bugfix.
ARGS=" \
--no-persist-layer-norm \
--apply-layernorm-1p \
--no-position-embedding \
--group-query-attention \
--num-query-groups 8 \
--load ${CHECKPOINT_DIR} \
--use-checkpoint-args \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--use-rotary-position-embeddings \
--position-embedding-type rope \
--rotary-base 1000000 \
--rotary-percent 1.0 \
--swiglu \
--normalization RMSNorm \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--exit-duration-in-mins 5740 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 40 \
--hidden-size 5120 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--kv-channels 128 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--micro-batch-size 64 \
--bf16 \
--tokenizer-type TikTokenizer \
--tiktoken-pattern v2 \
--tokenizer-model ${TOKENIZER_MODEL} \
--distributed-timeout-minutes 2400 \
--transformer-impl local \
--use-flash-attn \
\
--inference-dynamic-batching \
--inference-dynamic-batching-buffer-size-gb ${INFERENCE_DYNAMIC_BATCHING_BUFFER_SIZE_GB} \
--inference-dynamic-batching-buffer-overflow-factor ${INFERENCE_DYNAMIC_BATCHING_BUFFER_OVERFLOW_FACTOR} \
--inference-dynamic-batching-buffer-guaranteed-fraction ${INFERENCE_DYNAMIC_BATCHING_BUFFER_GUARANTEED_FRACTION} \
\
--enable-cuda-graph \
${EXTRA_ARGS} \
"
if [[ -v PROMPTS ]]; then
ARGS+=" --prompts ${PROMPTS}"
else
ARGS+=" \
--num-tokens-to-prompt ${NUM_TOKENS_TO_PROMPT} \
--num-tokens-to-generate ${NUM_TOKENS_TO_GENERATE} \
--incoming-requests-duration ${INCOMING_REQUESTS_DURATION} \
--incoming-requests-per-sec ${INCOMING_REQUESTS_PER_SEC} \
"
fi
CMD="python -m examples.inference.gpt.gpt_${ENGINE}_inference ${ARGS}"
if [[ -v NSIGHT_PREFIX ]]; then
CMD="nsys profile -t cuda,nvtx,mpi -s none --wait=primary --show-output=true --force-overwrite=true --export=sqlite -o ${NSIGHT_PREFIX} ${CMD}"
fi
eval ${CMD}
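#
# Example invocation (a sketch; the paths are hypothetical and the script
# filename is assumed, so adjust both to your environment):
#
#   CHECKPOINT_DIR=/path/to/gpt_12b_checkpoint \
#   TOKENIZER_MODEL=/path/to/tokenizer.model \
#   NUM_TOKENS_TO_GENERATE=128 \
#   bash ./gpt_dynamic_inference_12b.sh
#
# Any of the `: ${VAR=default}` variables defined above (e.g.
# INCOMING_REQUESTS_PER_SEC or ENGINE) can be overridden the same way.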
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
# Run dynamic batching inference on the 357M GPT model.
set -u
pip install simpy
pip install sentencepiece
pip install tiktoken
export CUDA_DEVICE_MAX_CONNECTIONS=1
: ${CHECKPOINT_DIR:?"CHECKPOINT_DIR is not set"}
: ${VOCAB_FILE:?"VOCAB_FILE is not set"}
: ${MERGE_FILE:?"MERGE_FILE is not set"}
: ${NUM_TOKENS_TO_PROMPT="8 32"}
: ${NUM_TOKENS_TO_GENERATE=256}
: ${INCOMING_REQUESTS_DURATION=10.}
: ${INCOMING_REQUESTS_PER_SEC=100.}
: ${INFERENCE_DYNAMIC_BATCHING_BUFFER_SIZE_GB=50.}
: ${INFERENCE_DYNAMIC_BATCHING_BUFFER_OVERFLOW_FACTOR=1.}
: ${INFERENCE_DYNAMIC_BATCHING_BUFFER_GUARANTEED_FRACTION=0.05}
: ${ENGINE=dynamic}
: ${EXTRA_ARGS=""}
# NSIGHT_PREFIX=/path/to/nsight/profile
# --inference-rng-tracker \ # ... re-add after bugfix.
ARGS=" \
--exit-on-missing-checkpoint \
--transformer-impl local \
--load ${CHECKPOINT_DIR} \
--tokenizer-type GPT2BPETokenizer \
--vocab-file ${VOCAB_FILE} \
--merge-file ${MERGE_FILE} \
--max-position-embeddings 2048 \
--seq-length 2048 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--num-attention-heads 16 \
--hidden-size 1024 \
--bf16 \
--micro-batch-size 1 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--seed 42 \
--use-flash-attn \
\
--inference-dynamic-batching \
--inference-dynamic-batching-buffer-size-gb ${INFERENCE_DYNAMIC_BATCHING_BUFFER_SIZE_GB} \
--inference-dynamic-batching-buffer-overflow-factor ${INFERENCE_DYNAMIC_BATCHING_BUFFER_OVERFLOW_FACTOR} \
--inference-dynamic-batching-buffer-guaranteed-fraction ${INFERENCE_DYNAMIC_BATCHING_BUFFER_GUARANTEED_FRACTION} \
\
--enable-cuda-graph \
${EXTRA_ARGS} \
"
if [[ -v PROMPTS ]]; then
ARGS+=" --prompts ${PROMPTS}"
else
ARGS+=" \
--num-tokens-to-prompt ${NUM_TOKENS_TO_PROMPT} \
--num-tokens-to-generate ${NUM_TOKENS_TO_GENERATE} \
--incoming-requests-duration ${INCOMING_REQUESTS_DURATION} \
--incoming-requests-per-sec ${INCOMING_REQUESTS_PER_SEC} \
"
fi
CMD="python -m examples.inference.gpt.gpt_${ENGINE}_inference ${ARGS}"
if [[ -v NSIGHT_PREFIX ]]; then
CMD="nsys profile -t cuda,nvtx,mpi -s none --wait=primary --show-output=true --force-overwrite=true --export=sqlite -o ${NSIGHT_PREFIX} ${CMD}"
fi
eval ${CMD}
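#
# Example invocation (a sketch; paths are hypothetical, script filename assumed):
#
#   CHECKPOINT_DIR=/path/to/gpt_357m_checkpoint \
#   VOCAB_FILE=/path/to/gpt2-vocab.json \
#   MERGE_FILE=/path/to/gpt2-merges.txt \
#   bash ./gpt_dynamic_inference_357m.sh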
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
import os
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
InferenceWrapperConfig,
)
from pretrain_mamba import model_provider as mamba_model_provider
from pretrain_gpt import model_provider as gpt_model_provider
import torch
import sys
import time
import tqdm
import warnings
from argparse import Namespace
from megatron.core.inference.contexts import StaticInferenceContext
from megatron.core.inference.engines import StaticInferenceEngine
from megatron.core.inference.sampling_params import SamplingParams
from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
GPTInferenceWrapper,
)
from megatron.core.inference.inference_request import InferenceRequest
from megatron.core.inference.text_generation_controllers.text_generation_controller import (
TextGenerationController,
)
from megatron.core.transformer.module import MegatronModule
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
)
from megatron.training import get_args, get_tokenizer, print_rank_0
from megatron.training.checkpointing import load_checkpoint
from megatron.core import mpu
import json
from megatron.training.initialize import initialize_megatron
from megatron.training import get_model
import asyncio
from typing import AsyncIterator, List
from examples.inference.gpt.utils import add_common_inference_args, build_requests
def add_static_inference_args(parser):
"""Static inference arguments."""
add_common_inference_args(parser)
group = parser.add_argument_group(title='Static inference')
group.add_argument(
"--max-batch-size",
type=int,
default=None,
dest="max_batch_size",
help='Deprecated, use `--inference-max-requests` instead',
)
group.add_argument("--stream", action="store_true", default=False, help="Stream output tokens")
group.add_argument(
"--output-path", type=str, default=None, help="Path to save generations as JSON"
)
return parser
def get_inference_engine(args: Namespace, model: MegatronModule) -> StaticInferenceEngine:
"""Utility to get the relevant backend for running inference
This function will automatically choose the TRTLLMBackend when possible, and if not revert to Mcore backend if the user does not specify any backends. TRT LLM Backend is not implmented yet.
Args:
args (Namespace): The user arguments parsed from command line
model (MegatronModule): The megatron model .
Returns:
AbstractBackend: The chosen backend
"""
tokenizer = get_tokenizer()
inference_wrapper_config = InferenceWrapperConfig(
hidden_size=args.hidden_size,
inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
fp32_residual_connection=args.fp32_residual_connection,
params_dtype=args.params_dtype,
padded_vocab_size=args.padded_vocab_size,
inference_max_requests=args.inference_max_batch_size,
inference_max_seq_length=args.inference_max_seq_length,
nccl_all_reduce_for_prefill=args.nccl_all_reduce_for_prefill,
)
inference_context = StaticInferenceContext.from_config(inference_wrapper_config)
inference_wrapped_model = GPTInferenceWrapper(
model, inference_wrapper_config, inference_context
)
text_generation_controller = TextGenerationController(
inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer
)
return StaticInferenceEngine(text_generation_controller=text_generation_controller)
async def generate(
inference_engine: StaticInferenceEngine, sampling_params: SamplingParams, prompts: List[str]
) -> List[InferenceRequest]:
async def collect_stream(prompt, request_id, stream_generator):
print(f"Request {request_id}: {prompt}", end="", flush=True)
prev_idx = 0
async for output in stream_generator:
print(output.generated_text[prev_idx:], end="", flush=True)
prev_idx = len(output.generated_text)
print()
request_ids: List[str] = [
inference_engine.add_request(prompt=prompt, sampling_params=sampling_params, streaming=True)
for prompt in prompts
]
stream_generators = [
inference_engine.get_stream_generator(request_id) for request_id in request_ids
]
tasks = [
asyncio.create_task(collect_stream(prompt, request_id, stream_generator))
for (prompt, request_id, stream_generator) in zip(prompts, request_ids, stream_generators)
]
await inference_engine.run_engine_async()
await asyncio.gather(*tasks)
results: List[InferenceRequest] = [
inference_engine.scheduler.completed_request_pool[request_id] for request_id in request_ids
]
return results
def main():
"""Main program."""
# Note: the defaults passed via `args_defaults` below can be overridden with the corresponding command-line arguments (see arguments.py).
# The micro batch size does not need to be set by the user; it is derived from the --inference-batch-times-seqlen-threshold argument.
initialize_megatron(
extra_args_provider=add_static_inference_args,
args_defaults={
'no_load_rng': True,
'no_load_optim': True,
'micro_batch_size': 1,
'exit_on_missing_checkpoint': True,
},
)
args = get_args()
if args.max_batch_size is not None:
warnings.warn(
f"`--max-batch-size` has been deprecated in favor of `--inference-max-requests`."
)
args.inference_max_batch_size = max(args.max_batch_size, args.inference_max_batch_size)
# Set up model and load checkpoint
if args.model_provider == "gpt":
model_provider = gpt_model_provider
elif args.model_provider == "mamba":
model_provider = mamba_model_provider
else:
raise ValueError(f"Invalid model provider {args.model_provider}")
model = get_model(model_provider, wrap_with_ddp=False)
load_checkpoint(model, None, None, strict=False)
model = model[0]
inference_engine = get_inference_engine(args, model)
sampling_params = SamplingParams(
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p,
return_log_probs=args.return_log_probs,
num_tokens_to_generate=args.num_tokens_to_generate,
top_n_logprobs=args.top_n_logprobs,
)
requests = build_requests(args, get_tokenizer())
prompts = [r.prompt_text for r in requests]
if args.enable_cuda_graph:
print(f"Running warmup for CUDA graphs...")
inference_engine.generate(
prompts=["warmup"], sampling_params=SamplingParams(num_tokens_to_generate=10)
)
start_time = time.perf_counter()
if args.stream:
results: List[InferenceRequest] = asyncio.run(
generate(inference_engine, sampling_params, prompts)
)
else:
results: List[InferenceRequest] = inference_engine.generate(
prompts=prompts, sampling_params=sampling_params
)
end_time = time.perf_counter()
latency = end_time - start_time
if torch.distributed.get_rank() == 0:
for idx, result in enumerate(results):
print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ')
result_dict = {
'id': result.request_id,
'input_prompt': result.prompt,
'generated_text': result.generated_text,
'generated_tokens': result.generated_tokens,
'latency': latency,
}
if sampling_params.top_n_logprobs > 0:
result_dict['generated_top_n_logprobs'] = result.generated_top_n_logprobs
if sampling_params.return_log_probs:
response_logprobs = result.prompt_log_probs + result.generated_log_probs
result_dict["logprobs"] = response_logprobs
# Write results to JSON. Primarily used for functional testing.
if args.output_path:
# Tensors cannot be serialized, so move them to the CPU first.
result_dict['generated_tokens'] = result_dict['generated_tokens'].cpu().numpy().tolist()
with open(args.output_path, 'w') as f:
json.dump(result_dict, f)
# Print unique prompts + outputs.
if torch.distributed.get_rank() == 0:
print("~~~~ Unique prompts + outputs. ~~~~")
# Map results by their prompt.
from collections import defaultdict
unique_prompt_map = defaultdict(list)
for result_idx, result in enumerate(results):
unique_prompt_map[result.prompt].append(result_idx)
# Print unique prompts + outputs.
for unique_idx, (prompt_text, result_idxs) in enumerate(unique_prompt_map.items()):
result_idx = result_idxs[0]
result = results[result_idx]
generated_text = result.generated_text.replace("\n", "\\n")
print(
f"{unique_idx}/{len(unique_prompt_map)} [{len(result_idxs)}]. {prompt_text} "
f"... {generated_text}"
)
stats = torch.cuda.memory_stats()
print_rank_0(
"static | cg %d | %s | reqs %d [ batch %d ] ... mem %.1f/%.1f ... time %.3f."
% (
args.enable_cuda_graph,
(
f"<user prompts>"
if args.prompts
else "<auto prompts> %s, %d, %.1e, %.1e"
% (
"(%s)" % " ".join(map(str, args.num_tokens_to_prompt)),
args.num_tokens_to_generate,
args.incoming_requests_duration,
args.incoming_requests_per_sec,
)
),
len(requests),
args.inference_max_batch_size,
stats["allocated_bytes.all.peak"] / (1024**3),
stats["reserved_bytes.all.peak"] / (1024**3),
latency,
)
)
torch.distributed.destroy_process_group()
if __name__ == "__main__":
main()
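# This module is what the shell scripts above invoke when ENGINE=static, i.e.
# `python -m examples.inference.gpt.gpt_static_inference <args>`; the dynamic
# scripts default to ENGINE=dynamic and use the corresponding dynamic module.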
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
import random
import time
import torch
from argparse import ArgumentParser, Namespace
from typing import Any, List
def add_common_inference_args(parser: ArgumentParser) -> ArgumentParser:
"""Common inference arguments."""
group = parser.add_argument_group(title='Common inference')
group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.')
group.add_argument("--top_k", type=int, default=1, help='Top k sampling.')
group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.')
group.add_argument(
"--return-log-probs",
action='store_true',
default=False,
help='Return the log probabilities of the final output tokens',
)
group.add_argument(
"--prompts",
metavar='N',
type=str,
nargs='+',
help='Input prompts, each enclosed in quotes and separated by spaces',
)
group.add_argument(
"--num-tokens-to-prompt",
type=int,
nargs="+",
default=[64, 1024],
help='Number of tokens to use for simulated prompts. This should be a '
'space-separated pair of integers, and the generated prompt lengths will '
'be uniformly sampled within this range.',
)
group.add_argument(
"--num-tokens-to-generate",
type=int,
default=30,
help='Number of tokens to generate for each prompt',
)
group.add_argument(
"--top-n-logprobs",
type=int,
default=0,
help='Return the top-n log probabilities and their corresponding tokens for each generated token, as a dictionary',
)
group.add_argument(
"--incoming-requests-per-sec",
type=float,
default=100.0,
help="Simulated number of requests per second.",
)
group.add_argument(
"--incoming-requests-duration",
type=float,
default=10.0,
help="Total amount of time to simulate that requests are "
"arriving. Multiply this value with "
"`--incoming-requests-per-sec` to get the approximate "
"total number of requests.",
)
group.add_argument(
"--model-provider", choices=["mamba", "gpt"], default="gpt", help="Model provider"
)
return parser
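# Worked example (hypothetical values): with --incoming-requests-per-sec 100 and
# --incoming-requests-duration 10, roughly 100 * 10 = 1000 simulated requests are
# generated, each with a prompt length drawn uniformly from the
# --num-tokens-to-prompt range (64 to 1024 tokens by default).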
def get_curr_time() -> float:
"""Get synchronized time across ranks."""
curr_time = torch.cuda.LongTensor([time.time_ns()])
if torch.distributed.is_initialized():
torch.distributed.broadcast(curr_time, src=0)
return curr_time.item() / 10**9
class Request:
"""Class to hold attributes for a single request.
A request is initialized with its prompt text. As it is added, processed,
and completed through the inference engine, the request is populated with its
start time, end time, and output tokens.
Args:
prompt_text (str): Prompt text.
time_offset (float): Artificial time offset for simulating incoming
requests. This value is later added to the `base_arrival_time` to
simulate the request's arrival time.
tokenizer (Any): Tokenizer for tokenizing the prompt.
"""
def __init__(self, prompt_text: str, time_offset: float, tokenizer: Any):
self.prompt_text = prompt_text
self.prompt_tokens = tokenizer.tokenize(prompt_text)
self.output_text = None
self.output_tokens = []
self.time_offset = time_offset
self.time_arrival = None
self.time_start = None
self.time_end = None
self.state = "not-started"
def __str__(self) -> str:
return "state '%s'; prompt len %d; output len %d; '%s'" % (
self.state,
len(self.prompt_tokens),
len(self.output_tokens),
self.prompt_text,
)
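# Illustration of the timing fields (values are hypothetical): if the engine
# records base_arrival_time = t0 when it starts accepting requests, a request
# constructed with time_offset = 0.37 is treated as arriving at t0 + 0.37
# seconds; time_start and time_end are filled in when processing begins and
# completes.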
def get_user_requests(args: Namespace, tokenizer: Any) -> List[Request]:
"""Build requests directly from the user-provided `--prompts`."""
requests = [Request(p, -1.0, tokenizer) for p in args.prompts]
return requests
def get_auto_requests(args: Namespace, tokenizer: Any) -> List[Request]:
"""Get example requests."""
import simpy  # Imported locally so tests that do not use auto requests can run without simpy.
random.seed(args.seed)
# Generate random time offsets.
def arrival(r):
while True:
yield env.timeout(random.expovariate(r))
time_offsets.append(env.now)
time_offsets = []
env = simpy.Environment()
env.process(arrival(args.incoming_requests_per_sec))
env.run(args.incoming_requests_duration)
# Ensure at least a single request.
if len(time_offsets) == 0:
time_offsets = [0.0]
# Initialize requests.
requests = [
Request("hi " * random.randint(*args.num_tokens_to_prompt), t, tokenizer)
for t in time_offsets
]
return requests
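# The simpy process above simulates a Poisson arrival stream: inter-arrival
# times are drawn from an exponential distribution with rate
# `incoming_requests_per_sec`. Below is a minimal equivalent sketch without
# simpy (the helper name `poisson_arrival_offsets` is illustrative only and is
# not used elsewhere in this module):
def poisson_arrival_offsets(rate_per_sec: float, duration_sec: float) -> List[float]:
    """Arrival-time offsets of a Poisson process, truncated at `duration_sec`."""
    offsets = []
    t = random.expovariate(rate_per_sec)
    while t < duration_sec:
        offsets.append(t)
        t += random.expovariate(rate_per_sec)
    # Ensure at least a single request, matching get_auto_requests above.
    return offsets or [0.0]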
def build_requests(args: Namespace, tokenizer: Any) -> List[Request]:
"""Return user-provided requests if `--prompts` is given, otherwise simulated requests."""
if args.prompts:
return get_user_requests(args, tokenizer)
else:
return get_auto_requests(args, tokenizer)
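# Example wiring (a sketch; `args` comes from the inference scripts above and
# `get_tokenizer` from megatron.training, as used in the main programs):
#   tokenizer = get_tokenizer()
#   requests = build_requests(args, tokenizer)    # user prompts or simulated ones
#   prompts = [r.prompt_text for r in requests]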