#!/bin/bash
# Runs the "340M" parameter model (Bert - Large)
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
CHECKPOINT_PATH=$1 #<Specify path>
TENSORBOARD_LOGS_PATH=$2 #<Specify path>
VOCAB_FILE=$3 #<Specify path to file>/bert-vocab.json
DATA_PATH=$4 #<Specify path and file prefix>_text_document
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
BERT_MODEL_ARGS=(
--num-layers 24
--hidden-size 1024
--num-attention-heads 16
--seq-length 512
--max-position-embeddings 512
)
TRAINING_ARGS=(
--micro-batch-size 4
--global-batch-size 32
--train-iters 1000000
--weight-decay 1e-2
--clip-grad 1.0
--fp16
--lr 0.0001
--lr-decay-iters 990000
--lr-decay-style linear
--min-lr 1.0e-5
--lr-warmup-fraction .01
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 8
--pipeline-model-parallel-size 16
)
DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--split 949,50,1
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 100
--save-interval 10000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_bert.py \
${BERT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]}
# GPT3 MODEL
## Table of contents
- [1. Training Setup](#1-training-setup)
- [2. Configurations](#2-configurations)
- [3. Training Results](#3-training-results)
## 1. Training setup
<a id="markdown-training-setup" name="training-setup"></a>
To run the model in a Docker container, launch it as follows:
```
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
CHECKPOINT_PATH="" #<Specify path>
TENSORBOARD_LOGS_PATH=""#<Specify path>
VOCAB_FILE="" #<Specify path to file>/gpt2-vocab.json
MERGE_FILE="" #<Specify path to file>/gpt2-merges.txt
DATA_PATH="" #<Specify path and file prefix>_text_document
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
$PYTORCH_IMAGE \
bash examples/gpt3/train_gpt3_175b_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $MERGE_FILE $DATA_PATH
```
NOTE: Depending on the environment you are running in, the above command may look slightly different.
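If you prefer to run outside of a container, the same script can be launched directly; it takes the same five positional arguments (a minimal sketch, assuming Megatron-LM's Python dependencies are already installed on the host):
```
bash examples/gpt3/train_gpt3_175b_distributed.sh \
  $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $MERGE_FILE $DATA_PATH
```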
## 2. Configurations
<a id="markdown-configurations" name="configurations"></a>
The example in this folder shows how to run the 175B model. There are other configurations you can run as well; a sketch of applying one of them follows the listings below.
### 345M
```
--num-layers 12 \
--hidden-size 512 \
--num-attention-heads 8 \
--seq-length 1024 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
### 857M
```
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 2048 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
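To train one of the smaller variants, a reasonable approach is to edit `GPT_MODEL_ARGS` and `MODEL_PARALLEL_ARGS` in `train_gpt3_175b_distributed.sh` so they carry the values above; for the 345M configuration that would look roughly like this (a sketch; `--max-position-embeddings` is assumed to match `--seq-length`):
```
GPT_MODEL_ARGS=(
    --num-layers 12
    --hidden-size 512
    --num-attention-heads 8
    --seq-length 1024
    --max-position-embeddings 1024  # assumed equal to --seq-length
)
MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 1
    --pipeline-model-parallel-size 1
)
```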
# WARNING: YAML configs are currently an experimental feature
language_model:
# model architecture
num_layers: 24
hidden_size: 1024
num_attention_heads: 16
num_query_groups: null
ffn_hidden_size: null
kv_channels: null
hidden_dropout: 0.0
attention_dropout: 0.0
fp32_residual_connection: False
apply_residual_connection_post_layernorm: False
layernorm_epsilon: 1.e-5
layernorm_zero_centered_gamma: True
add_bias_linear: False
bias_activation_fusion: False
add_qkv_bias: False
gated_linear_unit: False
activation_func: swiglu
num_moe_experts: null
rotary_interleaved: False
window_size: null
# initialization
init_method: null
init_method_std: 0.02
output_layer_init_method: null
# mixed-precision
apply_query_key_layer_scaling: False
attention_softmax_in_fp32: False
# fusion
bias_swiglu_fusion: True
masked_softmax_fusion: True
persist_layer_norm: False
memory_efficient_layer_norm: False
bias_dropout_fusion: True
apply_rope_fusion: True
# activation recomputation
recompute_granularity: null
recompute_method: null
recompute_num_layers: null
distribute_saved_activations: null
# fp8 related
fp8: null
fp8_margin: 0
fp8_interval: 1
fp8_amax_history_len: 1
fp8_amax_compute_algo: "most_recent"
fp8_wgrad: True
# miscellaneous
clone_scatter_output_in_embedding: True
normalization: "LayerNorm" # alt value supported by TE: "RMSNorm"
# MoE related
moe_router_load_balancing_type: "aux_loss"
moe_router_topk: 2
moe_grouped_gemm: False
moe_aux_loss_coeff: 0 # 1e-2 would be a good start value for load balance loss.
moe_z_loss_coeff: null # 1e-3 would be a good start value for z-loss
moe_input_jitter_eps: null
moe_token_dropping: False
model_parallel:
# Model parallelism
tensor_model_parallel_size: 1
context_parallel_size: 1
pipeline_model_parallel_size: 1
virtual_pipeline_model_parallel_size: null
sequence_parallel: True
expert_model_parallel_size: 1
# Initialization
perform_initialization: True
use_cpu_initialization: null
# Training
fp16: False
bf16: True
params_dtype: null # Set from above arguments for core
timers: null
# Optimizations
gradient_accumulation_fusion: True
async_tensor_model_parallel_allreduce: True
tp_comm_overlap: False
# Debug Options
tp_comm_split_ag: True
tp_comm_atomic_ag: True
tp_comm_split_rs: True
tp_comm_atomic_rs: True
tp_comm_bulk_wgrad: True
tp_comm_bulk_dgrad: True
# Parallelism
finalize_model_grads_func: null
# Pipeline Parallel
pipeline_dtype: null
grad_scale_func: null
enable_autocast: False
autocast_dtype: null
variable_seq_lengths: False
num_microbatches_with_partial_activation_checkpoints: null
overlap_p2p_comm: False
batch_p2p_comm: True
batch_p2p_sync: True
use_ring_exchange_p2p: False
deallocate_pipeline_outputs: False
no_sync_func: null
grad_sync_func: null
param_sync_func: null
pipeline_model_parallel_split_rank: null
# CPU Offloading
cpu_offloading: False
cpu_offloading_num_layers: 0
_cpu_offloading_context: null
cpu_offloading_weights: False
cpu_offloading_activations: True
# Timing
barrier_with_L1_time: True
# training:
use_legacy_models: False
spec: null
micro_batch_size: 2
global_batch_size: 128
rampup_batch_size: [32, 32, 65324160]
check_for_nan_in_loss_and_grad: True
num_layers_per_virtual_pipeline_stage: null
encoder_num_layers: null
decoder_num_layers: null
rotary_seq_len_interpolation_factor: null
add_position_embedding: False
make_vocab_size_divisible_by: 128
group_query_attention: False
exit_signal_handler: False
exit_duration_in_mins: null
exit_interval: null
untie_embeddings_and_output_weights: True
position_embedding_type: rope
rotary_percent: 0.5
openai_gelu: False
squared_relu: False
swiglu: True
onnx_safe: null
bert_binary_head: True
max_position_embeddings: 4096
transformer_impl: local
use_flash_attn: False
seed: 1234
data_parallel_random_init: False
# Optimizer
optimizer: adam
lr: 2.5e-4
lr_decay_style: cosine
lr_decay_iters: null
lr_decay_samples: 255126953
lr_warmup_fraction: null
lr_warmup_iters: 0
lr_warmup_samples: 81381
lr_warmup_init: 0.0
min_lr: 2.5e-5
weight_decay: 0.1
start_weight_decay: null
end_weight_decay: null
weight_decay_incr_style: constant
clip_grad: 1.0
adam_beta1: 0.9
adam_beta2: 0.95
adam_eps: 1.e-08
sgd_momentum: 0.9
override_opt_param_scheduler: False
use_checkpoint_opt_param_scheduler: False
# checkpointing arguments
save: null
save_interval: 20000
no_save_optim: null
no_save_rng: null
load: null
no_load_optim: null
no_load_rng: null
finetune: False
use_checkpoint_args: False
exit_on_missing_checkpoint: False
# loss arguments
loss_scale: null
initial_loss_scale: 4294967296
min_loss_scale: 1.0
loss_scale_window: 1000
hysteresis: 2
accumulate_allreduce_grads_in_fp32: False
fp16_lm_cross_entropy: False
# distributed arguments
distributed_backend: nccl
distributed_timeout_minutes: 10
overlap_grad_reduce: False
delay_grad_reduce: True
overlap_param_gather: False
delay_param_gather: False
scatter_gather_tensors_in_pipeline: True
local_rank: null
lazy_mpu_init: null
empty_unused_memory_level: 0
standalone_embedding_stage: False
use_distributed_optimizer: False
nccl_communicator_config_path: null
train_iters: null
eval_iters: 32
eval_interval: 2000
skip_train: False
adlr_autoresume: False
adlr_autoresume_interval: 1000
# garbage collection
manual_gc: False
manual_gc_interval: 0
manual_gc_eval: True
tp_comm_overlap_cfg: null
#data
data_path: null
split: '99,1,0'
train_data_path: null
valid_data_path: null
test_data_path: null
data_cache_path: null
mock_data: False
vocab_size: null
vocab_file: null
merge_file: null
vocab_extra_ids: 0
seq_length: 4096
encoder_seq_length: null
decoder_seq_length: null
retriever_seq_length: 256
sample_rate: 1.0
mask_prob: 0.15
short_seq_prob: 0.1
num_workers: 2
tokenizer_type: GPTSentencePieceTokenizer
tokenizer_model: null
reset_position_ids: False
reset_attention_mask: False
eod_mask_loss: False
train_samples: 268554688
dataloader_type: null
#profile:
profile: False
profile_ranks: [0]
profile_step_end: 12
profile_step_start: 10
#logging:
log_params_norm: True
log_num_zeros_in_grad: True
log_throughput: False
log_progress: False
timing_log_level: 0
timing_log_option: minmax
tensorboard_log_interval: 1
tensorboard_queue_size: 1000
log_timers_to_tensorboard: False
log_batch_size_to_tensorboard: False
log_learning_rate_to_tensorboard: True
log_validation_ppl_to_tensorboard: False
log_memory_to_tensorboard: False
log_world_size_to_tensorboard: False
log_loss_scale_to_tensorboard: True
wandb_project: ''
wandb_exp_name: ''
wandb_save_dir: ''
enable_one_logger: False
one_logger_project: e2e-tracking
one_logger_entity: hwinf_dcm
one_logger_run_name: null
log_interval: 100
tensorboard_dir: null
#!/bin/bash
# Runs the "175B" parameter model
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
CHECKPOINT_PATH=$1 #<Specify path>
TENSORBOARD_LOGS_PATH=$2 #<Specify path>
VOCAB_FILE=$3 #<Specify path to file>/gpt2-vocab.json
MERGE_FILE=$4 #<Specify path to file>/gpt2-merges.txt
DATA_PATH=$5 #<Specify path and file prefix>_text_document
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
GPT_MODEL_ARGS=(
--num-layers 96
--hidden-size 12288
--num-attention-heads 96
--seq-length 2048
--max-position-embeddings 2048
)
TRAINING_ARGS=(
--micro-batch-size 1
--global-batch-size 1536
--rampup-batch-size 16 16 5859375
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--fp16
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 8
--pipeline-model-parallel-size 16
)
DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 100
--save-interval 10000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]}
#!/bin/bash
# Runs the "175B" parameter model
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=1 #8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
CHECKPOINT_PATH=./tmp #$1 #<Specify path>
TENSORBOARD_LOGS_PATH=./tmp #$2 #<Specify path>
#VOCAB_FILE=$3 #<Specify path to file>/gpt2-vocab.json
#MERGE_FILE=$4 #<Specify path to file>/gpt2-merges.txt
DATA_PATH="/root/megatron-llama/dataset/my-llama_text_document" #<Specify path and file prefix>_text_document
TOKENIZER_PATH="/root/megatron-llama/tokenizer.model"
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
GPT_MODEL_ARGS=(
--num-layers 12
--hidden-size 512
--num-attention-heads 8
--seq-length 2048
--max-position-embeddings 2048
)
TRAINING_ARGS=(
--transformer-impl local
--use-legacy-models
--micro-batch-size 1
--global-batch-size 60
--train-iters 50
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--fp16
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 20
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 1
)
DATA_ARGS=(
--data-path $DATA_PATH
--split 949,50,1
--untie-embeddings-and-output-weights
--position-embedding-type rope
--tokenizer-model $TOKENIZER_PATH
--tokenizer-type GPTSentencePieceTokenizer
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 1
--save-interval 10000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]}
# Megatron Model Optimization and Deployment
## Installation
We recommend that users follow TensorRT-LLM's official installation guide to build it from source
and proceed with a containerized environment (`docker.io/tensorrt_llm/release:latest`):
```sh
git clone https://github.com/NVIDIA/TensorRT-LLM.git
cd TensorRT-LLM
git checkout v0.10.0
make -C docker release_build
```
> **TROUBLESHOOTING:** rather than copying each folder separately in `docker/Dockerfile.multi`,
> you may need to copy the entire directory with `COPY ./ /src/tensorrt_llm`, since `git submodule`
> is invoked later and requires the `.git` directory to continue.
Once the container is built, install `nvidia-modelopt` and additional dependencies for sharded checkpoint support:
```sh
pip install "nvidia-modelopt[all]~=0.13.0" --extra-index-url https://pypi.nvidia.com
pip install zarr tensorstore==0.1.45
```
TensorRT-LLM quantization functionalities are currently packaged in `nvidia-modelopt`.
You can find more documentation about `nvidia-modelopt` [here](https://nvidia.github.io/TensorRT-Model-Optimizer/).
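A quick way to confirm the packages are visible inside the container (a simple check, not part of the official instructions) is shown below; the `modelopt.torch.quantization` import is the same one used by the PTQ script later in this document:
```sh
pip show nvidia-modelopt
python -c "import modelopt.torch.quantization as mtq; print(mtq.__name__)"
```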
## Support Matrix
The following matrix shows the current support for the PTQ + TensorRT-LLM export flow.
| model | fp16 | int8_sq | fp8 | int4_awq |
|-----------------------------|------|---------| ----| -------- |
| nextllm-2b | x | x | x | |
| nemotron3-8b | x | | x | |
| nemotron3-15b | x | | x | |
| llama2-text-7b | x | x | x | TP2 |
| llama2-chat-70b | x | x | x | TP4 |
Our PTQ + TensorRT-LLM flow has native support for MCore `GPTModel` with a mixed layer spec (native `ParallelLinear`
and Transformer-Engine `TENorm`). Note that this is not the default MCore GPT spec. You can still load the
following checkpoint formats with some remedy:
| GPTModel | sharded | remedy arguments |
|-----------------------------------|---------|---------------------------------------------|
| megatron.legacy.model | | `--export-legacy-megatron` |
| TE-Fused (default mcore gpt spec) | | `--export-te-mcore-model` |
| TE-Fused (default mcore gpt spec) | x | |
> **TROUBLESHOOTING:** If you are trying to load an unpacked `.nemo` sharded checkpoint, you will typically
> need to add `additional_sharded_prefix="model."` to the `load_modelopt_checkpoint()` call, since NeMo adds an
> extra `model.` wrapper on top of the `GPTModel`.
> **NOTE:** flag `--export-legacy-megatron` may not work on all legacy checkpoint versions.
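In the bundled PTQ scripts these remedy flags are simply appended to `additional_options`; for a legacy Megatron checkpoint that part of the script would look roughly like this (a sketch based on the example scripts below):
```sh
additional_options=" \
    --export-quant-cfg ${QUANT_CFG} \
    --export-legacy-megatron \
    --export-dir /tmp/trtllm_ckpt \
    --inference-tensor-parallel ${INFERENCE_TP} "
```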
## Examples
> **NOTE:** we only provide a simple text generation script to test the generated TensorRT-LLM engines. For
> a production-level API server or enterprise support, see [NeMo](https://github.com/NVIDIA/NeMo) and TensorRT-LLM's
> backend for [NVIDIA Triton Inference Server](https://developer.nvidia.com/nvidia-triton-inference-server).
### nemotron3-8B FP8 Quantization and TensorRT-LLM Deployment
First download the Nemotron checkpoint from https://huggingface.co/nvidia/nemotron-3-8b-base-4k, extract the
sharded checkpoint from the `.nemo` tarball, and fix the tokenizer file name.
> **NOTE:** The following cloning method uses `ssh` and assumes you have registered your `ssh-key` with Hugging Face.
> If you want to clone over `https`, run `git clone https://huggingface.co/nvidia/nemotron-3-8b-base-4k` with an access token.
```sh
git lfs install
git clone git@hf.co:nvidia/nemotron-3-8b-base-4k
cd nemotron-3-8b-base-4k
tar -xvf Nemotron-3-8B-Base-4k.nemo
mv 586f3f51a9cf43bc9369bd53fa08868c_a934dc7c3e1e46a6838bb63379916563_3feba89c944047c19d5a1d0c07a85c32_mt_nlg_plus_multilingual_ja_zh_the_stack_frac_015_256k.model tokenizer.model
cd ..
```
Now launch the PTQ + TensorRT-LLM export script:
```sh
bash examples/inference/ptq_trtllm_nemotron3_8b.sh ./nemotron-3-8b-base-4k None
```
By default, `cnn_dailymail` is used for calibration. The `GPTModel` will have quantizers attached to simulate the
quantization effect. The checkpoint can optionally be saved (with the quantizers as additional state) and
restored for further evaluation. The TensorRT-LLM checkpoint is exported to `/tmp/trtllm_ckpt` and the engine is
built in `/tmp/trtllm_engine` by default.
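Once the engine is built, generation can be re-run against it for a quick sanity check using the bundled script (a sketch; the flags mirror the `trtllm_options` used inside the PTQ script, and the paths follow the defaults mentioned above):
```sh
python examples/inference/trtllm_text_generation.py \
    --engine-dir /tmp/trtllm_engine \
    --tokenizer ./nemotron-3-8b-base-4k/tokenizer.model \
    --max-output-len 128
```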
The script expects `${CHECKPOINT_DIR}` (`./nemotron-3-8b-base-4k`) to have the following structure:
```
├── model_weights
│ ├── common.pt
│ ...
├── model_config.yaml
├── mt_nlg_plus_multilingual_ja_zh_the_stack_frac_015_256k.model
```
> **NOTE:** The script is using `TP=8`. Change `$TP` in the script if your checkpoint has a different tensor
> model parallelism.
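For example, if your checkpoint was saved with 4-way tensor model parallelism, the corresponding edit near the top of the script would be (a sketch):
```sh
TP="4"
INFERENCE_TP=${TP}
```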
> **KNOWN ISSUES:** The `mt_nlg_plus_multilingual_ja_zh_the_stack_frac_015_256k.model` in the checkpoint is for
> Megatron-LM's `GPTSentencePiece` tokenizer.
> For TensorRT-LLM, we load this tokenizer as a Hugging Face `T5Tokenizer`, with some special tokens, `encode`, and
> `batch_decode` modified. As a result, the tokenizer behavior in the TensorRT-LLM engine may
> not match exactly.
### llama2-text-7b INT8 SmoothQuant and TensorRT-LLM Deployment
> **NOTE:** Due to licensing restrictions, we do not provide an MCore checkpoint to download. Users can follow
> the instructions in `docs/llama2.md` to convert the checkpoint to the Megatron legacy `GPTModel` format and
> use the `--export-legacy-megatron` flag, which remaps the checkpoint to the MCore `GPTModel` spec
> that we support.
```sh
bash examples/inference/ptq_trtllm_llama_7b.sh ${CHECKPOINT_DIR}
```
The script expects `${CHECKPOINT_DIR}` to have the following structure:
```
├── hf
│ ├── tokenizer.config
│ ├── tokenizer.model
│ ...
├── iter_0000001
│ ├── mp_rank_00
│ ...
├── latest_checkpointed_iteration.txt
```
In short, alongside the converted Llama Megatron checkpoint, also place the Hugging Face checkpoint inside the
directory as the source of the tokenizer.
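A minimal sketch of assembling that layout, assuming the converted Megatron checkpoint is already in `${CHECKPOINT_DIR}` and the Hugging Face Llama-2 checkout sits at a hypothetical `./llama2-7b-hf`:
```sh
mkdir -p ${CHECKPOINT_DIR}/hf
# Copy the tokenizer files from the Hugging Face checkpoint (hypothetical source path).
cp ./llama2-7b-hf/tokenizer* ${CHECKPOINT_DIR}/hf/
```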
#!/bin/bash
set -e
DEFAULT_NAME="/checkpoints/llama2-text-7b_v0.2.0"
NAME="${1:-$DEFAULT_NAME}"
DEFAULT_QUANT_CFG="int8_sq"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"
# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="8"
INFERENCE_TP=${TP}
DECODER_TYPE="llama"
CHECKPOINT_LOAD_DIR="${NAME}"
TOKENIZER_MODEL="${CHECKPOINT_LOAD_DIR}/hf/tokenizer.model"
# Llama-2 text 7B has ffn_hidden_size 11008. int4_awq requires a block_size of 128, so TP can be at most 2.
if [ "$QUANT_CFG" = "int4_awq" ]; then
INFERENCE_TP="2"
fi
additional_options=" \
--export-quant-cfg ${QUANT_CFG} \
--export-legacy-megatron \
--export-te-mcore-model \
--calib-batch-size 8 \
--decoder ${DECODER_TYPE} \
--export-dir /tmp/trtllm_ckpt \
--inference-tensor-parallel ${INFERENCE_TP} "
trtllm_options=" \
--tensorrt-llm-checkpoint-dir /tmp/trtllm_ckpt \
--engine-dir /tmp/trtllm_engine \
--tokenizer ${CHECKPOINT_LOAD_DIR}/hf \
--max-input-len 2048 \
--max-output-len 512 \
--max-batch-size 8 "
# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
export CUDA_DEVICE_MAX_CONNECTIONS=1
options=" \
--disable-bias-linear \
--swiglu \
--no-rope-fusion \
--untie-embeddings-and-output-weights \
--use-rotary-position-embeddings \
--normalization RMSNorm \
--rotary-percent 1.0 \
--no-position-embedding \
--no-masked-softmax-fusion \
--no-bias-gelu-fusion \
--no-bias-dropout-fusion \
--no-async-tensor-model-parallel-allreduce \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 11008 \
--num-attention-heads 32 \
--seq-length 4096 \
--max-position-embeddings 4096 \
--micro-batch-size 1 \
--make-vocab-size-divisible-by 1 \
--tokenizer-type Llama2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--save-interval 1000000 \
--use-dist-ckpt \
--load ${CHECKPOINT_LOAD_DIR} \
--fp16"
# Precompile CUDA extensions
python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"
# Set the torchrun launch configuration (launch_config is consumed below)
launch_config="--nproc_per_node=${TP}"
# Launch multi-process with torchrun
torchrun ${launch_config} examples/inference/text_generation_ptq.py ${options} ${additional_options}
# This script is using mpi4py which will fork multiple processes.
python examples/inference/trtllm_text_generation.py ${trtllm_options}
#!/bin/bash
set -e
DEFAULT_NAME="/checkpoints/nemotron3-8b_v0.3.0"
NAME="${1:-$DEFAULT_NAME}"
DEFAULT_QUANT_CFG="fp8"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"
# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="8"
INFERENCE_TP=${TP}
DECODER_TYPE="gptnext"
CHECKPOINT_LOAD_DIR="${NAME}"
TOKENIZER_MODEL="${CHECKPOINT_LOAD_DIR}/tokenizer.model"
if [ "$QUANT_CFG" = "int4_awq" ]; then
INFERENCE_TP="1"
fi
additional_options=" \
--export-quant-cfg ${QUANT_CFG} \
--export-legacy-megatron \
--export-te-mcore-model \
--calib-batch-size 8 \
--decoder ${DECODER_TYPE} \
--export-dir /tmp/trtllm_ckpt \
--inference-tensor-parallel ${INFERENCE_TP} "
trtllm_options=" \
--tensorrt-llm-checkpoint-dir /tmp/trtllm_ckpt \
--engine-dir /tmp/trtllm_engine \
--tokenizer ${TOKENIZER_MODEL} \
--max-input-len 2048 \
--max-output-len 512 \
--max-batch-size 8 "
# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
export CUDA_DEVICE_MAX_CONNECTIONS=1
options=" \
--apply-layernorm-1p \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--no-rope-fusion \
--no-position-embedding \
--use-rotary-position-embeddings \
--rotary-percent 0.5 \
--squared-relu \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--num-attention-heads 32 \
--seq-length 4096 \
--max-position-embeddings 4096 \
--micro-batch-size 1 \
--tokenizer-type GPTSentencePieceTokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--save-interval 1000000 \
--load ${CHECKPOINT_LOAD_DIR} \
--fp16 \
--use-dist-ckpt"
# Precompile CUDA extensions
python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"
# Set the torchrun launch configuration (launch_config is consumed below)
launch_config="--nproc_per_node=${TP}"
# Launch multi-process with torchrun
torchrun ${launch_config} examples/inference/text_generation_ptq.py ${options} ${additional_options}
# This script is using mpi4py which will fork multiple processes.
python examples/inference/trtllm_text_generation.py ${trtllm_options}
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Sample Generate GPT."""
import functools
import os
import sys
from pathlib import Path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
import modelopt.torch.quantization as mtq
import torch
from datasets import load_dataset
from modelopt.torch.utils.distributed import set_data_parallel_group, set_tensor_parallel_group
from tqdm import tqdm
# [ModelOpt]: changing the default model provider to the ModelOpt version
from megatron.core import mpu
from megatron.inference.arguments import add_modelopt_args
from megatron.inference.checkpointing import load_modelopt_checkpoint
from megatron.inference.gpt.model_provider import model_provider
from megatron.inference.text_generation import generate_and_post_process
from megatron.training import get_args, get_model, initialize_megatron
from megatron.training.checkpointing import save_checkpoint
from megatron.training.utils import print_rank_0, unwrap_model
QUANT_CFG_CHOICES = {
"int8": mtq.INT8_DEFAULT_CFG,
"int8_sq": mtq.INT8_SMOOTHQUANT_CFG,
"fp8": mtq.FP8_DEFAULT_CFG,
"int4_awq": mtq.INT4_AWQ_CFG,
"w4a8_awq": mtq.W4A8_AWQ_BETA_CFG,
"int4": mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG,
}
def add_trtllm_ckpt_export_args(parser):
"""Add additional arguments for TensorRT-LLM."""
group = parser.add_argument_group(title="trtllm")
group.add_argument(
"--export-dir", type=str, help="The output TensorRT-LLM checkpoint.",
)
group.add_argument(
"--decoder", type=str, choices=["gptnext", 'llama'], help="The decoder type of the model.",
)
group.add_argument(
"--inference-tensor-parallel",
type=int,
help="Tensor parallel for the inference time, can be different from the training config.",
default=1,
)
def add_text_generate_ptq_args(parser):
"""Add additional arguments for ModelOpt text generation PTQ."""
group = parser.add_argument_group(title='ModelOpt text generation ptq')
group.add_argument(
"--calib-dataset",
type=str,
default="cnn_dailymail",
help="Calibration datasets from HuggingFace datasets.",
)
group.add_argument(
"--calib-batch-size", type=int, default=4, help="Batch size to use for ptq calibration."
)
group.add_argument(
"--calib-size", type=int, default=512, help="Samples to use for ptq calibration."
)
parser.add_argument(
"--prompts",
type=str,
default=(
"Born in north-east France, Soyer trained as a|Born in California, Soyer trained as a"
),
help="Input texts. Please use | to separate different batches.",
)
add_modelopt_args(parser)
add_trtllm_ckpt_export_args(parser)
return parser
def get_calib_dataloader(
data="cnn_dailymail", batch_size=4, calib_size=512, max_sequence_length=512
):
if data == "pileval":
dataset = load_dataset(
"json", data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", split="train"
)
text_column = "text"
elif data == "wikitext":
dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")
text_column = "text"
elif data == "cnn_dailymail":
dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train")
text_column = "article"
calib_size = max(min(len(dataset), calib_size), batch_size)
for i in range(calib_size // batch_size):
batch = dataset[i * batch_size : (i + 1) * batch_size][text_column]
for j in range(len(batch)):
batch[j] = batch[j][:max_sequence_length]
yield batch
if __name__ == "__main__":
initialize_megatron(
extra_args_provider=add_text_generate_ptq_args,
args_defaults={
'tokenizer_type': 'GPT2BPETokenizer',
'no_load_rng': True,
'no_load_optim': True,
},
)
args = get_args()
if args.num_layers_per_virtual_pipeline_stage is not None:
print_rank_0("Interleaved pipeline schedule is not yet supported for text generation.")
exit()
print_rank_0("WARNING: Forcing exit_on_missing_checkpoint to True for text generation.")
args.exit_on_missing_checkpoint = True
# Set up model and load checkpoint
# [ModelOpt]: make sure that output logits are allgathered.
text_generation_model_provider = functools.partial(model_provider, parallel_output=False)
model = get_model(text_generation_model_provider, wrap_with_ddp=False)
if args.load is not None:
load_modelopt_checkpoint(model, strict=not args.untie_embeddings_and_output_weights)
print_rank_0("Done loading checkpoint")
# Removing virtual pipeline parallel and other wrapper
assert len(model) == 1, "Above condition should have caught this"
unwrapped_model = unwrap_model(model)
all_prompts = args.prompts.split("|")
def custom_prompt_forward_loop_func(model):
for prompt in tqdm(all_prompts):
if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
(
prompts_plus_generations,
prompts_plus_generations_segments,
logprobs,
_,
) = generate_and_post_process(
model,
prompts=[prompt],
tokens_to_generate=128,
return_output_log_probs=True,
temperature=1.0,
)
print_rank_0(prompts_plus_generations)
else:
generate_and_post_process(model)
def hf_dataset_forword_loop_func(model):
dataloader = get_calib_dataloader(args.calib_dataset, args.calib_batch_size, args.calib_size)
for prompts in tqdm(dataloader, total=args.calib_size//args.calib_batch_size):
if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
(
prompts_plus_generations,
prompts_plus_generations_segments,
logprobs,
_,
) = generate_and_post_process(
model,
prompts=prompts,
tokens_to_generate=0,
return_output_log_probs=True,
temperature=1.0,
)
else:
generate_and_post_process(model)
ptq_forward_loop_func = custom_prompt_forward_loop_func
if args.calib_dataset is not None:
ptq_forward_loop_func = hf_dataset_forword_loop_func
# Setting data parallel and tensor parallel group
set_data_parallel_group(mpu.get_data_parallel_group())
set_tensor_parallel_group(mpu.get_tensor_model_parallel_group())
if args.export_quant_cfg in QUANT_CFG_CHOICES:
mtq_config = QUANT_CFG_CHOICES[args.export_quant_cfg]
if "*output_layer*" not in mtq_config["quant_cfg"]:
mtq_config["quant_cfg"]["*output_layer*"] = {"enable": False}
if "awq" in args.export_quant_cfg:
weight_quantizer = mtq_config["quant_cfg"]["*weight_quantizer"] # type: ignore
if isinstance(weight_quantizer, list):
weight_quantizer = weight_quantizer[0]
weight_quantizer["block_sizes"][-1] = 128
print_rank_0("Quantizing the model...")
mtq.quantize(unwrapped_model[0], mtq_config, ptq_forward_loop_func)
custom_prompt_forward_loop_func(model[0])
if args.save is not None and args.export_quant_cfg in QUANT_CFG_CHOICES:
save_checkpoint(1, unwrapped_model, None, None, 0)
print_rank_0(f"Fake Quantized Model:\n {unwrapped_model[0]}")
if args.export_dir:
assert args.decoder in ["gptnext", "llama"], f"Decoder type {args.decoder} not supported."
Path(args.export_dir).mkdir(parents=True, exist_ok=True)
print_rank_0("Exporting TensorRT-LLM checkpoints.")
from modelopt.torch.export import export_tensorrt_llm_checkpoint
# In TRT LLM, squared relu activation does not support bf16. So we use fp16 by default.
export_tensorrt_llm_checkpoint(
unwrapped_model[0],
args.decoder,
torch.bfloat16 if args.bf16 else torch.float16,
export_dir=args.export_dir,
inference_tensor_parallel=args.inference_tensor_parallel,
inference_pipeline_parallel=1,
use_nfs_workspace=True,
)
print_rank_0(f"TensorRT-LLM checkpoints saved to {args.export_dir}")
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""An example script to run the tensorrt_llm engine."""
import argparse
from pathlib import Path
import numpy as np
import torch
from modelopt.deploy.llm import LLM, build_tensorrt_llm
from transformers import AutoTokenizer, T5Tokenizer
class CustomSentencePieceTokenizer(T5Tokenizer):
"""This is a custom GPTSentencePiece Tokenizer modified from the T5Tokenizer.
Note:
The modification is kept minimal to make `encode` and `batch_decode` working
properly (used in TensorRT-LLM engine). Other functions have not been tested.
"""
def __init__(self, model):
super().__init__(model, extra_ids=0, bos_token="<s>", pad_token="<pad>")
def encode(self, text, add_special_tokens: bool = True, **kwargs):
return torch.Tensor(self.sp_model.encode_as_ids(text))
def batch_encode_plus(
self, batch_text_or_text_pairs, add_special_tokens: bool = True, **kwargs
):
return {'input_ids': self.sp_model.encode_as_ids(batch_text_or_text_pairs)}
def batch_decode(self, sequences, skip_special_tokens: bool = False, **kwargs):
if isinstance(sequences, np.ndarray) or torch.is_tensor(sequences):
sequences = sequences.tolist()
return self.sp_model.decode(sequences)
def decode(self, token_ids, skip_special_tokens: bool = False, **kwargs):
return self.sp_model.decode([token_ids])[0]
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("--tokenizer", type=str, default="")
parser.add_argument("--max-input-len", type=int, default=4096)
parser.add_argument("--max-output-len", type=int, default=512)
parser.add_argument("--max-batch-size", type=int, default=8)
parser.add_argument("--tensorrt-llm-checkpoint-dir", type=str, default=None)
parser.add_argument("--engine-dir", type=str, default="/tmp/trtllm_engine")
parser.add_argument(
"--input-texts",
type=str,
default=(
"Born in north-east France, Soyer trained as a|Born in California, Soyer trained as a"
),
help="Input texts. Please use | to separate different batches.",
)
parser.add_argument("--max-beam-width", type=int, default=1)
parser.add_argument("--profiler-output", type=str, default="")
return parser.parse_args()
def run(args):
tokenizer_path = Path(args.tokenizer)
if tokenizer_path.is_dir():
# For llama models, use local HF tokenizer which is a folder.
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, trust_remote_code=True)
elif tokenizer_path.is_file():
# For nextllm and nemotron models, use local Megatron GPTSentencePiece tokenizer which is a model file.
tokenizer = CustomSentencePieceTokenizer(args.tokenizer)
else:
raise ValueError(
"arg.tokenizer must be a dir to a hf tokenizer checkpoint for llama or a SentencePiece .model file for gptnext"
)
print(tokenizer, tokenizer.vocab_size)
if not hasattr(args, "profiler_output"):
args.profiler_output = ""
input_texts = args.input_texts.split("|")
assert input_texts, "input_text not specified"
print(input_texts)
if args.tensorrt_llm_checkpoint_dir is not None:
print("Building TensorRT-LLM engines.")
build_tensorrt_llm(
args.tensorrt_llm_checkpoint_dir + "/config.json",
args.engine_dir,
max_input_len=args.max_input_len,
max_batch_size=args.max_batch_size,
max_beam_width=args.max_beam_width,
num_build_workers=1,
)
print(f"TensorRT-LLM engines saved to {args.engine_dir}")
free_memory_before = torch.cuda.mem_get_info()
# This is a ModelOpt wrapper on top of tensorrt_llm.hlapi.llm.LLM
llm_engine = LLM(args.engine_dir, tokenizer)
torch.cuda.cudart().cudaProfilerStart()
# outputs = llm_engine.generate_text(input_texts, args.max_output_len, args.max_beam_width)
outputs = llm_engine.generate(input_texts)
torch.cuda.cudart().cudaProfilerStop()
free_memory_after = torch.cuda.mem_get_info()
print(
f"Used GPU memory: {(free_memory_before[0] - free_memory_after[0]) / 1024 / 1024 / 1024} GB"
)
print(outputs)
if __name__ == "__main__":
args = parse_arguments()
run(args)
#!/bin/bash
# This example will start serving the 345M model.
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
export CUDA_DEVICE_MAX_CONNECTIONS=1
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--load ${CHECKPOINT} \
--num-attention-heads 16 \
--max-position-embeddings 1024 \
--tokenizer-type GPT2BPETokenizer \
--fp16 \
--micro-batch-size 1 \
--seq-length 1024 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--seed 42
#!/bin/bash
# This example will start serving the 345M model partitioned with 8-way tensor parallelism.
DISTRIBUTED_ARGS="--nproc_per_node 8 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT=<Path to checkpoint (e.g /345m)>
VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
pip install flask-restful
python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--load ${CHECKPOINT} \
--num-attention-heads 16 \
--max-position-embeddings 1024 \
--tokenizer-type GPT2BPETokenizer \
--fp16 \
--micro-batch-size 1 \
--seq-length 1024 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--seed 42
#!/bin/bash
# Runs the "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=5
#export HIP_ALLOC_INITIALIZE=0
#export GPU_MAX_HW_QUEUES=20
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=20
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
CHECKPOINT_PATH=./tmp #$1 #<Specify path>
TENSORBOARD_LOGS_PATH=./tmp #$2 #<Specify path>
#VOCAB_FILE=$3 #<Specify path to file>/gpt2-vocab.json
#MERGE_FILE=$4 #<Specify path to file>/gpt2-merges.txt
DATA_PATH="/root/megatron-llama/dataset/my-llama_text_document" #<Specify path and file prefix>_text_document
TOKENIZER_PATH="/root/megatron-llama/tokenizer.model"
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
GPT_MODEL_ARGS=(
--num-layers 40
--hidden-size 5120
--num-attention-heads 40
--ffn-hidden-size 13824
--seq-length 4096
--max-position-embeddings 4096
)
TRAINING_ARGS=(
--transformer-impl local
--use-legacy-models
--micro-batch-size 1
--global-batch-size 60
--train-iters 5
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
--use-distributed-optimizer
--use-flash-attn-triton
--recompute-activations
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
--ddp-average-in-collective
--overlap-grad-reduce
--no-gradient-accumulation-fusion
--swiglu
--sequence-parallel
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
)
#--use-flash-attn
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 4
)
DATA_ARGS=(
--data-path $DATA_PATH
--split 949,50,1
--untie-embeddings-and-output-weights
--use-rotary-position-embeddings
--normalization RMSNorm
--no-position-embedding
--tokenizer-model $TOKENIZER_PATH
--tokenizer-type Llama2Tokenizer
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 1
--save-interval 10000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]}
#!/bin/bash
# Runs the "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=5
export HIP_ALLOC_INITIALIZE=0
export GPU_MAX_HW_QUEUES=20
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=20
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_HCA=mlx5_1,mlx5_2
export NCCL_NET_GDR_LEVEL=SYS
export NCCL_NET_GDR_READ=0
lrank=$OMPI_COMM_WORLD_LOCAL_RANK
RANK=$OMPI_COMM_WORLD_RANK
WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
export NCCL_IB_TIMEOUT=22
CHECKPOINT_PATH=./tmp #$1 #<Specify path>
TENSORBOARD_LOGS_PATH=./tmp #$2 #<Specify path>
DATA_PATH="/root/megatron-llama/dataset/my-llama_text_document" #<Specify path and file prefix>_text_document
TOKENIZER_PATH="/root/megatron-llama/tokenizer.model"
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
GPT_MODEL_ARGS=(
--num-layers 40
--hidden-size 5120
--num-attention-heads 40
--ffn-hidden-size 13824
--seq-length 4096
--max-position-embeddings 4096
)
TRAINING_ARGS=(
--transformer-impl local
--use-legacy-models
--micro-batch-size 1
--global-batch-size 60
--train-iters 5
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
--use-distributed-optimizer
--use-flash-attn-v2
--recompute-activations
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
--ddp-average-in-collective
--overlap-grad-reduce
--no-gradient-accumulation-fusion
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
)
#--use-flash-attn
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 4
)
DATA_ARGS=(
--data-path $DATA_PATH
--split 949,50,1
--untie-embeddings-and-output-weights
--use-rotary-position-embeddings
--normalization RMSNorm
--no-position-embedding
--tokenizer-model $TOKENIZER_PATH
--tokenizer-type Llama2Tokenizer
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 1
--save-interval 10000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
APP="python3 -u pretrain_gpt.py \
$GPT_MODEL_ARGS[@] \
$TRAINING_ARGS[@] \
$MODEL_PARALLEL_ARGS[@] \
$DATA_ARGS[@] \
$EVAL_AND_LOGGING_ARGS[@]
--rank ${RANK} \
--world_size ${WORLD_SIZE} \
--dist_url tcp://${1}:34566 \
"
case ${lrank} in
[0-7])
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
${APP}
;;
esac
#!/bin/bash
# Runs the "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
#export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=5
#export HIP_ALLOC_INITIALIZE=0
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=20
#export NCCL_PROTO=LL
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
CHECKPOINT_PATH=./tmp #$1 #<Specify path>
TENSORBOARD_LOGS_PATH=./tmp #$2 #<Specify path>
#VOCAB_FILE=$3 #<Specify path to file>/gpt2-vocab.json
#MERGE_FILE=$4 #<Specify path to file>/gpt2-merges.txt
DATA_PATH="/root/megatron-llama/dataset/my-llama_text_document" #<Specify path and file prefix>_text_document
TOKENIZER_PATH="/root/megatron-llama/tokenizer.model"
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
GPT_MODEL_ARGS=(
--num-layers 32
--hidden-size 4096
--num-attention-heads 32
--ffn-hidden-size 11008
--seq-length 4096
--max-position-embeddings 4096
)
TRAINING_ARGS=(
--transformer-impl local
--use-legacy-models
--micro-batch-size 1
--global-batch-size 240
--train-iters 5
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
--use-distributed-optimizer
--use-flash-attn-triton
--recompute-activations
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
--no-gradient-accumulation-fusion
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 2
)
DATA_ARGS=(
--data-path $DATA_PATH
--split 949,50,1
--untie-embeddings-and-output-weights
--use-rotary-position-embeddings
--normalization RMSNorm
--no-position-embedding
--tokenizer-model $TOKENIZER_PATH
--tokenizer-type Llama2Tokenizer
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 1
--save-interval 10000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]}
#!/bin/bash
# Runs the "7B" parameter model
export HSA_FORCE_FINE_GRAIN_PCIE=1
#export OMP_NUM_THREADS=1
export NCCL_P2P_LEVEL=5
export NCCL_ALGO=Ring
export NCCL_NCHANNELS_PER_PEER=16
export NCCL_MIN_NCHANNELS=20
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=1 #4
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
CHECKPOINT_PATH=./tmp #$1 #<Specify path>
TENSORBOARD_LOGS_PATH=./tmp #$2 #<Specify path>
#VOCAB_FILE=$3 #<Specify path to file>/gpt2-vocab.json
#MERGE_FILE=$4 #<Specify path to file>/gpt2-merges.txt
DATA_PATH="/root/megatron-llama/dataset/my-llama_text_document" #<Specify path and file prefix>_text_document
TOKENIZER_PATH="/root/megatron-llama/tokenizer.model"
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
GPT_MODEL_ARGS=(
--num-layers 16
--hidden-size 1024
--num-attention-heads 16
--ffn-hidden-size 4096
--seq-length 2048
--max-position-embeddings 2048
)
TRAINING_ARGS=(
--transformer-impl local
--sequence-parallel
--micro-batch-size 1
--global-batch-size 10
--train-iters 5
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--bf16
--use-distributed-optimizer
--use-flash-attn-triton
--recompute-activations
--disable-bias-linear
--attention-dropout 0
--hidden-dropout 0
--no-gradient-accumulation-fusion
--swiglu
--lr 3.0e-5
--lr-decay-style cosine
--min-lr 3.0e-6
--lr-warmup-iters 1
)
#--use-legacy-models
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 1
)
DATA_ARGS=(
--data-path $DATA_PATH
--split 949,50,1
--untie-embeddings-and-output-weights
--use-rotary-position-embeddings
--normalization RMSNorm
--no-position-embedding
--tokenizer-model $TOKENIZER_PATH
--tokenizer-type Llama2Tokenizer
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 1
--save-interval 10000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]}
#!/bin/bash
# Runs the "175B" parameter model
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
CHECKPOINT_PATH=$1 #<Specify path>
TENSORBOARD_LOGS_PATH=$2 #<Specify path>
VOCAB_FILE=$3 #<Specify path to file>/gpt2-vocab.json
MERGE_FILE=$4 #<Specify path to file>/gpt2-merges.txt
DATA_PATH=$5 #<Specify path and file prefix>_text_document
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
GPT_MODEL_ARGS=(
--num-layers 96
--hidden-size 12288
--num-attention-heads 96
--seq-length 2048
--max-position-embeddings 2048
)
TRAINING_ARGS=(
--micro-batch-size 1
--global-batch-size 1536
--rampup-batch-size 16 16 5859375
--train-iters 500000
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
--init-method-std 0.006
--clip-grad 1.0
--fp16
--lr 6.0e-5
--lr-decay-style cosine
--min-lr 6.0e-6
--lr-warmup-fraction .001
--lr-decay-iters 430000
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 8
--pipeline-model-parallel-size 16
)
DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--merge-file $MERGE_FILE
--split 949,50,1
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 100
--save-interval 10000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
${GPT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]}
checkpoints/
data-cache/
tensorboard/
triton-cache/
FROM nvcr.io/nvidia/pytorch:23.12-py3
RUN pip uninstall -y causal-conv1d triton && \
pip install causal-conv1d==1.2.2.post1 sentencepiece==0.1.99 triton==2.1.0 flask-restful
WORKDIR /tmp
RUN git clone https://github.com/state-spaces/mamba.git && \
cd mamba && \
git checkout v2.0.3 && \
python setup.py install && \
cd .. && \
rm -rf mamba
# Mamba-based Language Models
## Introduction
This document is an entrypoint into the code used for
<em>[An Empirical Study of Mamba-based Language Models](https://arxiv.org/abs/2406.07887)</em>.
We are releasing the parameters for some of the models described in that
technical report via
[HuggingFace](https://huggingface.co/collections/nvidia/ssms-666a362c5c3bb7e4a6bcfb9c).
## Installation
Create and run a Docker container using the [Dockerfile](./Dockerfile).
```
docker build -t your_image_name:your_tag .
docker run --gpus all -it --rm \
-v /path/to/megatron:/workspace/megatron \
-v /path/to/dataset:/workspace/dataset \
-v /path/to/checkpoints:/workspace/checkpoints \
-w /workspace/megatron/examples/mamba \
your_image_name:your_tag
```
## Train
[`train.sh`](./train.sh) is an example pretraining script, showing how to run on
a single node. Select between 800M-scale and 8B-scale models by setting the
`MODEL_SCALE` variable. The 8B-scale hybrid model architecture is the same as
the one described in the technical report.
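For example (a sketch, assuming `train.sh` honors an externally set `MODEL_SCALE`; if the script hardcodes the variable, edit it at the top of the script instead):
```
MODEL_SCALE="800M" bash train.sh
```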
## Text Generation
Use [`run_text_gen_server_8b.sh`](./run_text_gen_server_8b.sh) to start a text
generation server using an 8B hybrid checkpoint. This is configured to run the
8B hybrid model described in the technical report, with tensor model parallel
set to 1.
The arguments in the script will need to be changed if using a checkpoint with a
different model parallel configuration or other differences, such as model
architecture. For example, to run the 8B pure Mamba-2 model, change
`--hybrid-attention-ratio` and `--hybrid-mlp-ratio` to 0.0, or remove them.
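For instance, the relevant flag values for the pure Mamba-2 variant would be (a sketch of the flags only, not a complete command):
```
--hybrid-attention-ratio 0.0 \
--hybrid-mlp-ratio 0.0
```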
Use [`run_text_gen_server_8b_gpt3.sh`](./run_text_gen_server_8b_gpt3.sh) to start
a text generation server using the 8B reference Transformer checkpoint.
## Checkpoint Formats
For inference, the model must be configured to match the checkpoint file used,
including the hybrid layer configuration and model parallel configuration.
If you need to convert a hybrid checkpoint file to a different tensor parallel
or pipeline parallel size, use
[the hybrid conversion script](../../tools/checkpoint/hybrid_conversion.py).
There is an example run command at the end of that file.
Before running that script, you will need to set `PYTHONPATH` to include the
root directory of your Megatron-LM repository clone.
```
export PYTHONPATH=<path-to-megatron>:$PYTHONPATH
```
## Hybrid Options
`--hybrid-attention-ratio ATT` specifies a target ratio of attention layers
to total layers. For example, 4 attention layers out of 48 total layers is
specified by `--hybrid-attention-ratio 0.08`.
`--hybrid-mlp-ratio MLP` specifies a target ratio of MLP layers to total
layers. For example, 24 MLP layers out of 48 total layers is specified by
`--hybrid-mlp-ratio 0.5`.
* (`ATT` + `MLP`) must be less than or equal to 1.0.
* (1.0 - `ATT` - `MLP`) is the hybrid mamba ratio, the ratio of mamba layers to
total layers.
* `ATT` = `MLP` = 0 is a pure Mamba model.
* `ATT` = `MLP` = 0.5 is a transformer model.
If either `ATT` or `MLP` is greater than 0.0 or if `--hybrid-override-pattern`
is specified, the logfile will include information about the hybrid layer
pattern used. `--hybrid-override-pattern` can be used to specify a different
pattern than the default, algorithmically-generated one.
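Putting the numbers above together for a 48-layer model with 4 attention and 24 MLP layers (the remaining 1.0 - 0.08 - 0.5 = 0.42 of layers, roughly 20, become Mamba layers):
```
--hybrid-attention-ratio 0.08 \
--hybrid-mlp-ratio 0.5
```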
## Mamba vs Mamba-2
This codebase currently only supports Mamba-2, and not the original version of
Mamba. However, the
[fixed snapshot of the code used for the technical report](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba)
can be configured to run the original version of Mamba.