Commit 4b097dee authored by liangjing

update to core_v0.9

parent 3aca1415
import argparse
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
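# Example invocation (paths are illustrative):
#   python <this_script>.py --prompt "Hello, my name is" --model-path /path/to/hf_checkpoint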
# Set up argument parsing
parser = argparse.ArgumentParser(description="Script for text generation with a specific model and prompt.")
parser.add_argument('--prompt', type=str, required=True, help="Prompt text to use for text generation")
parser.add_argument('--model-path', type=str, required=True, help="Path to the Huggingface model checkpoint")
# Parse command-line arguments
args = parser.parse_args()
model_path = args.model_path
prompt = args.prompt
config = AutoConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, config=config)
model = AutoModelForCausalLM.from_pretrained(model_path, config=config).cuda()
inputs = tokenizer(prompt, return_tensors="pt")
for key in inputs:
    inputs[key] = inputs[key].cuda()
# With do_sample=False and top_k/top_p disabled, generation is greedy (argmax) decoding
outputs = model.generate(**inputs, max_length=100, do_sample=False, top_p=0, top_k=0, temperature=1.0)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
#!/bin/bash
# This example will start serving the Llama3-8B model
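# Usage: bash <this-script> /path/to/checkpoint /path/to/tokenizer_model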
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr 0.0.0.0 \
--master_port 6000"
# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
if [ -z "$1" ] || [ -z "$2" ]; then
echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments."
echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model"
exit 1
fi
# Assign command-line arguments to variables
CHECKPOINT=$1
TOKENIZER_MODEL=$2
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--use-checkpoint-args \
--disable-bias-linear \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--transformer-impl transformer_engine \
--normalization RMSNorm \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--untie-embeddings-and-output-weights \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 500000 \
--use-rotary-position-embeddings \
--swiglu \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--load ${CHECKPOINT} \
--num-attention-heads 32 \
--max-position-embeddings 8192 \
--bf16 \
--micro-batch-size 1 \
--seq-length 8192
#!/bin/bash
# This example will start serving the Mistral-7B-v0.3 model
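# Usage: bash <this-script> /path/to/checkpoint /path/to/tokenizer_model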
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr 0.0.0.0 \
--master_port 6000"
# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
if [ -z "$1" ] || [ -z "$2" ]; then
echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments."
echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model"
exit 1
fi
# Assign command-line arguments to variables
CHECKPOINT=$1
TOKENIZER_MODEL=$2
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--use-checkpoint-args \
--apply-layernorm-1p \
--transformer-impl transformer_engine \
--normalization RMSNorm \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--use-flash-attn \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--ffn-hidden-size 14336 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--load ${CHECKPOINT} \
--num-attention-heads 32 \
--max-position-embeddings 4096 \
--bf16 \
--micro-batch-size 1 \
--seq-length 4096 \
--seed 101
# Megatron Model Optimization and Deployment
## Installation
We recommend that users follow TensorRT-LLM's official installation guide to build it from source
and proceed with a containerized environment (`docker.io/tensorrt_llm/release:latest`):
```sh
git clone https://github.com/NVIDIA/TensorRT-LLM.git
cd TensorRT-LLM
git checkout v0.10.0
make -C docker release_build
```
> **TROUBLESHOOTING:** rather than copying each folder separately in `docker/Dockerfile.multi`,
> you may need to copy the entire directory with `COPY ./ /src/tensorrt_llm`, since a later
> `git submodule` step requires the `.git` directory to be present.
Once the container is built, install `nvidia-modelopt` and additional dependencies for sharded checkpoint support:
```sh
pip install "nvidia-modelopt[all]~=0.13.0" --extra-index-url https://pypi.nvidia.com
pip install zarr tensorstore==0.1.45
```
TensorRT-LLM quantization functionalities are currently packaged in `nvidia-modelopt`.
You can find more documentation about `nvidia-modelopt` [here](https://nvidia.github.io/TensorRT-Model-Optimizer/).
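For orientation, every PTQ script in this repo ultimately goes through `modelopt.torch.quantization.quantize`. Below is a minimal, self-contained sketch of that call using a toy model and random calibration data (placeholders only, not this repo's Megatron flow, which is shown in `text_generation_ptq.py` later in this commit):
```python
# Minimal sketch of the modelopt PTQ API with a toy model and random calibration
# data; the real Megatron flow lives in examples/inference/quantization/text_generation_ptq.py.
import torch
import modelopt.torch.quantization as mtq

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.ReLU(), torch.nn.Linear(64, 64))
calib_batches = [torch.randn(8, 64) for _ in range(4)]

def forward_loop(model):
    # Run calibration data through the model so the quantizers can collect statistics.
    with torch.no_grad():
        for batch in calib_batches:
            model(batch)

# Pick a config such as mtq.FP8_DEFAULT_CFG, mtq.INT8_SMOOTHQUANT_CFG, or mtq.INT4_AWQ_CFG.
model = mtq.quantize(model, mtq.INT8_SMOOTHQUANT_CFG, forward_loop)
```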
## Support Matrix
The following matrix shows the current support for the PTQ + TensorRT-LLM export flow.
| model | fp16 | int8_sq | fp8 | int4_awq |
|-----------------------------|------|---------| ----| -------- |
| nextllm-2b | x | x | x | |
| nemotron3-8b | x | | x | |
| nemotron3-15b | x | | x | |
| llama2-text-7b | x | x | x | TP2 |
| llama2-chat-70b | x | x | x | TP4 |
Our PTQ + TensorRT-LLM flow has native support on MCore `GPTModel` with a mixed layer spec (native `ParallelLinear`
and Transformer-Engine `TENorm`). Note that this is not the default MCore GPT spec. You can still load the
following checkpoint formats with some remedy:
| GPTModel | sharded | remedy arguments |
|-----------------------------------|---------|---------------------------------------------|
| megatron.legacy.model | | `--export-legacy-megatron` |
| TE-Fused (default mcore gpt spec) | | `--export-te-mcore-model` |
| TE-Fused (default mcore gpt spec) | x | |
> **TROUBLESHOOTING:** If you are trying to load an unpacked `.nemo` sharded checkpoint, you will typically
> need to add `additional_sharded_prefix="model."` to the `modelopt_load_checkpoint()` call, since NeMo has an additional
> `model.` wrapper on top of the `GPTModel`.
> **NOTE:** The `--export-legacy-megatron` flag may not work on all legacy checkpoint versions.
## Examples
> **NOTE:** we only provide a simple text generation script to test the generated TensorRT-LLM engines. For
> a production-level API server or enterprise support, see [NeMo](https://github.com/NVIDIA/NeMo) and TensorRT-LLM's
> backend for [NVIDIA Triton Inference Server](https://developer.nvidia.com/nvidia-triton-inference-server).
### Minitron-8B FP8 Quantization and TensorRT-LLM Deployment
First download the Minitron checkpoint from https://huggingface.co/nvidia/Minitron-8B-Base, extract the
sharded checkpoint from the `.nemo` tarball, and fix the tokenizer file name.
> **NOTE:** The following cloning method uses `ssh` and assumes you have registered your `ssh-key` with Hugging Face.
> If you want to clone over `https`, run `git clone https://huggingface.co/nvidia/Minitron-8B-Base` with an access token.
```sh
git lfs install
git clone git@hf.co:nvidia/Minitron-8B-Base
cd Minitron-8B-Base/nemo
tar -xvf minitron-8b-base.nemo
cd ../..
```
Now launch the PTQ + TensorRT-LLM export script:
```sh
bash examples/inference/quantization/ptq_trtllm_minitron_8b.sh ./Minitron-8B-Base None
```
By default, `cnn_dailymail` is used for calibration. The `GPTModel` will carry quantizers that simulate the
quantization effect. The checkpoint can optionally be saved (with the quantizer states included) and
restored later for further evaluation or quantization-aware training. By default, the TensorRT-LLM checkpoint is
exported to `/tmp/trtllm_ckpt` and the engine is built in `/tmp/trtllm_engine`.
The script expects `${CHECKPOINT_DIR}` (`./Minitron-8B-Base/nemo`) to have the following structure:
> **NOTE:** The extracted `.nemo` checkpoints (including those in the examples below) should all have the following structure.
```
├── model_weights
│ ├── common.pt
│ ...
├── model_config.yaml
│...
```
> **NOTE:** The script uses `TP=8`. Change `$TP` in the script if your checkpoint uses a different tensor
> model parallelism.
Then build the TensorRT engine and run the text generation example using the newly built engine:
```sh
export trtllm_options=" \
--checkpoint_dir /tmp/trtllm_ckpt \
--output_dir /tmp/trtllm_engine \
--max_input_len 2048 \
--max_output_len 512 \
--max_batch_size 8 "
trtllm-build ${trtllm_options}
python examples/inference/quantization/trtllm_text_generation.py --tokenizer nvidia/Minitron-8B-Base
```
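Under the hood, `trtllm_text_generation.py` (included later in this commit) wraps the built engine with `modelopt.deploy.llm.LLM`. A stripped-down sketch of that usage, with the default paths from the commands above, looks like this:
```python
# Stripped-down sketch of trtllm_text_generation.py; engine path and prompt are illustrative.
from modelopt.deploy.llm import LLM
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("nvidia/Minitron-8B-Base")
# ModelOpt wrapper on top of tensorrt_llm's LLM API (see the full script below).
llm_engine = LLM("/tmp/trtllm_engine", tokenizer)
print(llm_engine.generate(["Born in north-east France, Soyer trained as a"]))
```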
### Mistral-NeMo-12B FP8 Quantization and TensorRT-LLM Deployment
First download the Mistral-NeMo checkpoint from https://huggingface.co/nvidia/Mistral-NeMo-12B-Base and extract the
sharded checkpoint from the `.nemo` tarball.
> **NOTE:** The following cloning method uses `ssh` and assumes you have registered your `ssh-key` with Hugging Face.
> If you want to clone over `https`, run `git clone https://huggingface.co/nvidia/Mistral-NeMo-12B-Base` with an access token.
```sh
git lfs install
git clone git@hf.co:nvidia/Mistral-NeMo-12B-Base
cd Mistral-NeMo-12B-Base
tar -xvf Mistral-NeMo-12B-Base.nemo
cd ..
```
Then log in to Hugging Face so that you can access the model.
> **NOTE:** You need a token generated from huggingface.co/settings/tokens and access to mistralai/Mistral-Nemo-Base-2407 on Hugging Face.
```sh
pip install -U "huggingface_hub[cli]"
huggingface-cli login
```
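If you prefer to log in non-interactively (for example inside the container), the `huggingface_hub` Python API provides an equivalent call; the token below is a placeholder:
```python
# Non-interactive alternative to `huggingface-cli login`; replace the placeholder token
# with one generated at https://huggingface.co/settings/tokens.
from huggingface_hub import login

login(token="hf_xxx")
```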
Now launch the PTQ + TensorRT-LLM checkpoint export script:
```sh
bash examples/inference/quantization/ptq_trtllm_mistral_12b.sh ./Mistral-NeMo-12B-Base None
```
Then build the TensorRT engine and run the text generation example using the newly built engine:
```sh
export trtllm_options=" \
--checkpoint_dir /tmp/trtllm_ckpt \
--output_dir /tmp/trtllm_engine \
--max_input_len 2048 \
--max_output_len 512 \
--max_batch_size 8 "
trtllm-build ${trtllm_options}
python examples/inference/quantization/trtllm_text_generation.py --tokenizer mistralai/Mistral-Nemo-Base-2407
```
### llama2-text-7b INT8 SmoothQuant and TensorRT-LLM Deployment
> **NOTE:** Due to licensing restrictions, we do not provide an MCore checkpoint to download. Users can follow
> the instructions in `docs/llama2.md` to convert the checkpoint to the Megatron legacy `GPTModel` format and
> use the `--export-legacy-megatron` flag, which remaps the checkpoint to the MCore `GPTModel` spec
> that we support.
```sh
bash examples/inference/quantization/ptq_trtllm_llama_7b.sh ${CHECKPOINT_DIR}
```
The script expects `${CHECKPOINT_DIR}` to have the following structure:
```
├── hf
│ ├── tokenizer.config
│ ├── tokenizer.model
│ ...
├── iter_0000001
│ ├── mp_rank_00
│ ...
├── latest_checkpointed_iteration.txt
```
In short, in addition to the converted Llama Megatron checkpoint, place the Hugging Face checkpoint inside the
directory as the source of the tokenizer.
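As a quick sanity check before launching, you can verify the expected layout with a short snippet like the one below (the path and file list are illustrative, based on the structure above):
```python
# Illustrative check that ${CHECKPOINT_DIR} follows the layout described above.
from pathlib import Path

ckpt_dir = Path("/path/to/CHECKPOINT_DIR")  # placeholder
for rel in ["hf/tokenizer.model", "iter_0000001/mp_rank_00", "latest_checkpointed_iteration.txt"]:
    print(rel, "OK" if (ckpt_dir / rel).exists() else "MISSING")
```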
### llama3-8b / llama3.1-8b INT8 SmoothQuant and TensorRT-LLM Deployment
> **NOTE:** For llama3.1, the missing rope_scaling parameter will be fixed in modelopt-0.17 and trtllm-0.12.
> **NOTE:** There are two ways to acquire the checkpoint. Users can follow
> the instructions in `docs/llama2.md` to convert the checkpoint to the Megatron legacy `GPTModel` format and
> use the `--export-legacy-megatron` flag, which remaps the checkpoint to the MCore `GPTModel` spec
> that we support.
> Alternatively, users can download the [NeMo model](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/llama38bnemo) from NGC and extract the sharded checkpoint from the `.nemo` tarball.
If users choose to download the model from NGC, first extract the sharded checkpoint from the `.nemo` tarball.
```sh
tar -xvf 8b_pre_trained_bf16.nemo
```
Now launch the PTQ + TensorRT-LLM checkpoint export script for llama-3:
```sh
bash examples/inference/quantization/ptq_trtllm_llama3_8b.sh ./llama-3-8b-nemo_v1.0 None
```
or for llama-3.1:
```sh
bash examples/inference/quantization/ptq_trtllm_llama3_1_8b.sh ./llama-3_1-8b-nemo_v1.0 None
```
Then build the TensorRT engine and run the text generation example using the newly built engine:
```sh
export trtllm_options=" \
--checkpoint_dir /tmp/trtllm_ckpt \
--output_dir /tmp/trtllm_engine \
--max_input_len 2048 \
--max_output_len 512 \
--max_batch_size 8 "
trtllm-build ${trtllm_options}
# For llama-3
python examples/inference/quantization/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3-8B
# For llama-3.1
python examples/inference/quantization/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3.1-8B
```
#!/bin/bash
set -e
DEFAULT_NAME="/checkpoints/llama2-text-7b_v0.2.0"
NAME="${1:-$DEFAULT_NAME}"
DEFAULT_QUANT_CFG="int8_sq"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"
# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="8"
INFERENCE_TP=${TP}
DECODER_TYPE="llama"
CHECKPOINT_LOAD_DIR="${NAME}"
TOKENIZER_MODEL="${CHECKPOINT_LOAD_DIR}/hf/tokenizer.model"
# LLaMA2-text-7b has ffn_hidden_size 11008; int4_awq requires a block_size of 128, so TP can be at most 2.
if [ "$QUANT_CFG" = "int4_awq" ]; then
INFERENCE_TP="2"
fi
additional_options=" \
--export-quant-cfg ${QUANT_CFG} \
--export-legacy-megatron \
--export-te-mcore-model \
--calib-batch-size 8 \
--decoder ${DECODER_TYPE} \
--export-dir /tmp/trtllm_ckpt \
--inference-tensor-parallel ${INFERENCE_TP} "
trtllm_options=" \
--tensorrt-llm-checkpoint-dir /tmp/trtllm_ckpt \
--engine-dir /tmp/trtllm_engine \
--tokenizer ${CHECKPOINT_LOAD_DIR}/hf \
--max-input-len 2048 \
--max-output-len 512 \
--max-batch-size 8 "
# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
export CUDA_DEVICE_MAX_CONNECTIONS=1
options=" \
--disable-bias-linear \
--swiglu \
--no-rope-fusion \
--untie-embeddings-and-output-weights \
--use-rotary-position-embeddings \
--normalization RMSNorm \
--rotary-percent 1.0 \
--no-position-embedding \
--no-masked-softmax-fusion \
--no-bias-gelu-fusion \
--no-bias-dropout-fusion \
--no-async-tensor-model-parallel-allreduce \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 11008 \
--num-attention-heads 32 \
--seq-length 4096 \
--max-position-embeddings 4096 \
--micro-batch-size 1 \
--make-vocab-size-divisible-by 1 \
--tokenizer-type Llama2Tokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--save-interval 1000000 \
--use-dist-ckpt \
--load ${CHECKPOINT_LOAD_DIR}
--fp16"
# Precompile CUDA extensions
python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"
# Acquire launch configuration where variable launch_config will be set
launch_config="--nproc_per_node=${TP}"
# Launch multi-process with torchrun
torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options}
# This script uses mpi4py, which forks multiple processes.
python examples/inference/quantization/trtllm_text_generation.py ${trtllm_options}
#!/bin/bash
set -e
DEFAULT_NAME="/checkpoints/llama-3_1-8b-nemo_v1.0"
NAME="${1:-$DEFAULT_NAME}"
DEFAULT_QUANT_CFG="int8_sq"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"
# NOTE: UNFUSED ATTENTION MUST BE USED TO AVOID ADDITIONAL STATE_DICT KEY MISMATCH.
export NVTE_FLASH_ATTN=0
export NVTE_FUSED_ATTN=0
export NVTE_UNFUSED_ATTN=1
# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="1"
INFERENCE_TP=${TP}
DECODER_TYPE="llama"
CHECKPOINT_LOAD_DIR="${NAME}"
# int4_awq uses a weight block_size of 128; override the inference tensor parallelism below if needed.
if [ "$QUANT_CFG" = "int4_awq" ]; then
INFERENCE_TP="2"
fi
additional_options=" \
--export-quant-cfg ${QUANT_CFG} \
--export-legacy-megatron \
--export-te-mcore-model \
--calib-batch-size 8 \
--decoder ${DECODER_TYPE} \
--export-dir /tmp/trtllm_ckpt \
--inference-tensor-parallel ${INFERENCE_TP} "
# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
export CUDA_DEVICE_MAX_CONNECTIONS=1
options=" \
--disable-bias-linear \
--swiglu \
--no-rope-fusion \
--untie-embeddings-and-output-weights \
--use-rotary-position-embeddings \
--normalization RMSNorm \
--rotary-percent 1.0 \
--hidden-dropout 0.0 \
--attention-dropout 0.0 \
--no-bias-gelu-fusion \
--no-bias-dropout-fusion \
--no-async-tensor-model-parallel-allreduce \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--group-query-attention \
--num-query-groups 8 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--seq-length 131072 \
--max-position-embeddings 131072 \
--micro-batch-size 4 \
--make-vocab-size-divisible-by 128 \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model meta-llama/Meta-Llama-3.1-8B \
--save-interval 1000000 \
--use-dist-ckpt \
--load ${CHECKPOINT_LOAD_DIR}
--rotary-base 500000
--fp16"
# Precompile CUDA extensions
python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"
# Acquire launch configuration where variable launch_config will be set
launch_config="--nproc_per_node=${TP}"
# Launch multi-process with torchrun
torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options}
#!/bin/bash
set -e
DEFAULT_NAME="/checkpoints/llama-3_1-8b-nemo_v1.0"
NAME="${1:-$DEFAULT_NAME}"
DEFAULT_QUANT_CFG="int8_sq"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"
# NOTE: UNFUSED ATTENTION MUST BE USED TO AVOID ADDITIONAL STATE_DICT KEY MISMATCH.
export NVTE_FLASH_ATTN=0
export NVTE_FUSED_ATTN=0
export NVTE_UNFUSED_ATTN=1
# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="1"
INFERENCE_TP=${TP}
DECODER_TYPE="llama"
CHECKPOINT_LOAD_DIR="${NAME}"
# int4_awq uses a weight block_size of 128; override the inference tensor parallelism below if needed.
if [ "$QUANT_CFG" = "int4_awq" ]; then
INFERENCE_TP="2"
fi
additional_options=" \
--export-quant-cfg ${QUANT_CFG} \
--export-legacy-megatron \
--export-te-mcore-model \
--calib-batch-size 8 \
--decoder ${DECODER_TYPE} \
--export-dir /tmp/trtllm_ckpt \
--inference-tensor-parallel ${INFERENCE_TP} "
# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
export CUDA_DEVICE_MAX_CONNECTIONS=1
options=" \
--disable-bias-linear \
--swiglu \
--no-rope-fusion \
--untie-embeddings-and-output-weights \
--use-rotary-position-embeddings \
--normalization RMSNorm \
--rotary-percent 1.0 \
--hidden-dropout 0.0 \
--attention-dropout 0.0 \
--no-bias-gelu-fusion \
--no-bias-dropout-fusion \
--no-async-tensor-model-parallel-allreduce \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--group-query-attention \
--num-query-groups 8 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--seq-length 8192 \
--max-position-embeddings 8192 \
--micro-batch-size 4 \
--make-vocab-size-divisible-by 128 \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model meta-llama/Meta-Llama-3-8B \
--save-interval 1000000 \
--use-dist-ckpt \
--load ${CHECKPOINT_LOAD_DIR}
--rotary-base 500000
--fp16"
# Precompile CUDA extensions
python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"
# Acquire launch configuration where variable launch_config will be set
launch_config="--nproc_per_node=${TP}"
# Launch multi-process with torchrun
torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options}
#!/bin/bash
set -e
DEFAULT_NAME="/checkpoints/nemotron3-8b_v0.3.0"
NAME="${1:-$DEFAULT_NAME}"
DEFAULT_QUANT_CFG="fp8"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"
# NOTE: UNFUSED ATTENTION MUST BE USED TO AVOID ADDITIONAL STATE_DICT KEY MISMATCH.
export NVTE_FLASH_ATTN=0
export NVTE_FUSED_ATTN=0
export NVTE_UNFUSED_ATTN=1
# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="8"
INFERENCE_TP=${TP}
DECODER_TYPE="gptnext"
CHECKPOINT_LOAD_DIR="${NAME}/nemo"
if [ "$QUANT_CFG" = "int4_awq" ]; then
INFERENCE_TP="1"
fi
additional_options=" \
--export-quant-cfg ${QUANT_CFG} \
--export-legacy-megatron \
--export-te-mcore-model \
--calib-batch-size 8 \
--decoder ${DECODER_TYPE} \
--export-dir /tmp/trtllm_ckpt \
--inference-tensor-parallel ${INFERENCE_TP} "
# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
export CUDA_DEVICE_MAX_CONNECTIONS=1
options=" \
--apply-layernorm-1p \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--no-rope-fusion \
--no-position-embedding \
--use-rotary-position-embeddings \
--rotary-percent 0.5 \
--squared-relu \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 16384 \
--group-query-attention \
--num-attention-heads 48 \
--kv-channels 128 \
--seq-length 4096 \
--num-query-groups 8 \
--max-position-embeddings 4096 \
--micro-batch-size 4 \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model nvidia/Minitron-8B-Base \
--save-interval 1000000 \
--load ${CHECKPOINT_LOAD_DIR} \
--bf16 \
--use-dist-ckpt"
# Precompile CUDA extensions
python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"
# Acquire launch configuration where variable launch_config will be set
launch_config="--nproc_per_node=${TP}"
# Launch multi-process with torchrun
torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options}
#!/bin/bash
set -e
DEFAULT_NAME="/checkpoints/Mistral-NeMo-12B-Base"
NAME="${1:-$DEFAULT_NAME}"
DEFAULT_QUANT_CFG="fp8"
QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"
# NOTE: UNFUSED ATTENTION MUST BE USED TO AVOID ADDITIONAL STATE_DICT KEY MISMATCH.
export NVTE_FLASH_ATTN=0
export NVTE_FUSED_ATTN=0
export NVTE_UNFUSED_ATTN=1
# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
TP="8"
INFERENCE_TP=${TP}
DECODER_TYPE="llama"
CHECKPOINT_LOAD_DIR="${NAME}"
if [ "$QUANT_CFG" = "int4_awq" ]; then
INFERENCE_TP="1"
fi
additional_options=" \
--export-quant-cfg ${QUANT_CFG} \
--export-legacy-megatron \
--export-te-mcore-model \
--calib-batch-size 8 \
--decoder ${DECODER_TYPE} \
--export-dir /tmp/trtllm_ckpt \
--inference-tensor-parallel ${INFERENCE_TP} "
# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
export CUDA_DEVICE_MAX_CONNECTIONS=1
options=" \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--use-rotary-position-embeddings \
--rotary-percent 1.0 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size 1 \
--num-layers 40 \
--hidden-size 5120 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--seq-length 8192 \
--kv-channels 128 \
--normalization RMSNorm \
--swiglu \
--num-query-groups 8 \
--group-query-attention \
--position-embedding-type rope \
--max-position-embeddings 8192 \
--micro-batch-size 1 \
--tokenizer-type HuggingFaceTokenizer \
--tiktoken-pattern v2 \
--tokenizer-model mistralai/Mistral-Nemo-Base-2407 \
--save-interval 1000000 \
--load ${CHECKPOINT_LOAD_DIR} \
--fp16 \
--rotary-base 1000000 \
--use-dist-ckpt"
# Precompile CUDA extensions
python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"
# Acquire launch configuration where variable launch_config will be set
launch_config="--nproc_per_node=${TP}"
# Launch multi-process with torchrun
torchrun ${launch_config} examples/inference/quantization/text_generation_ptq.py ${options} ${additional_options}
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Sample Generate GPT."""
import functools
import os
import sys
from pathlib import Path
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
import modelopt.torch.quantization as mtq
import torch
from datasets import load_dataset
from modelopt.torch.utils.distributed import set_data_parallel_group, set_tensor_parallel_group
from tqdm import tqdm
# [ModelOpt]: changing the default model provider to the ModelOpt version
from megatron.core import mpu
from megatron.inference.arguments import add_modelopt_args
from megatron.inference.checkpointing import load_modelopt_checkpoint
from megatron.inference.gpt.model_provider import model_provider
from megatron.inference.text_generation import generate_and_post_process
from megatron.training import get_args, get_model, initialize_megatron
from megatron.training.checkpointing import save_checkpoint
from megatron.training.utils import print_rank_0, unwrap_model
QUANT_CFG_CHOICES = {
"int8": mtq.INT8_DEFAULT_CFG,
"int8_sq": mtq.INT8_SMOOTHQUANT_CFG,
"fp8": mtq.FP8_DEFAULT_CFG,
"int4_awq": mtq.INT4_AWQ_CFG,
"w4a8_awq": mtq.W4A8_AWQ_BETA_CFG,
"int4": mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG,
}
def add_trtllm_ckpt_export_args(parser):
"""Add additional arguments for TensorRT-LLM."""
group = parser.add_argument_group(title="trtllm")
group.add_argument(
"--export-dir", type=str, help="The output TensorRT-LLM checkpoint.",
)
group.add_argument(
"--decoder", type=str, choices=["gptnext", 'llama'], help="The decoder type of the model.",
)
group.add_argument(
"--inference-tensor-parallel",
type=int,
help="Tensor parallel for the inference time, can be different from the training config.",
default=1,
)
def add_text_generate_ptq_args(parser):
"""Add additional arguments for ModelOpt text generation PTQ."""
group = parser.add_argument_group(title='ModelOpt text generation ptq')
group.add_argument(
"--calib-dataset",
type=str,
default="cnn_dailymail",
help="Calibration datasets from HuggingFace datasets.",
)
group.add_argument(
"--calib-batch-size", type=int, default=4, help="Batch size to use for ptq calibration."
)
group.add_argument(
"--calib-size", type=int, default=512, help="Samples to use for ptq calibration."
)
parser.add_argument(
"--prompts",
type=str,
default=(
"Born in north-east France, Soyer trained as a|Born in California, Soyer trained as a"
),
help="Input texts. Please use | to separate different batches.",
)
add_modelopt_args(parser)
add_trtllm_ckpt_export_args(parser)
return parser
def get_calib_dataloader(
data="cnn_dailymail", batch_size=4, calib_size=512, max_sequence_length=512
):
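    # Yields batches of raw calibration text from a HuggingFace dataset; each sample is
    # truncated to max_sequence_length characters (tokenization happens downstream).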
if data == "pileval":
dataset = load_dataset(
"json", data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", split="train"
)
text_column = "text"
elif data == "wikitext":
dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")
text_column = "text"
elif data == "cnn_dailymail":
dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train")
text_column = "article"
calib_size = max(min(len(dataset), calib_size), batch_size)
for i in range(calib_size // batch_size):
batch = dataset[i * batch_size : (i + 1) * batch_size][text_column]
for j in range(len(batch)):
batch[j] = batch[j][:max_sequence_length]
yield batch
if __name__ == "__main__":
initialize_megatron(
extra_args_provider=add_text_generate_ptq_args,
args_defaults={
'tokenizer_type': 'GPT2BPETokenizer',
'no_load_rng': True,
'no_load_optim': True,
},
)
args = get_args()
if args.num_layers_per_virtual_pipeline_stage is not None:
print_rank_0("Interleaved pipeline schedule is not yet supported for text generation.")
exit()
print_rank_0("WARNING: Forcing exit_on_missing_checkpoint to True for text generation.")
args.exit_on_missing_checkpoint = True
# Set up model and load checkpoint
# [ModelOpt]: make sure that output logits are allgathered.
text_generation_model_provider = functools.partial(model_provider, parallel_output=False)
model = get_model(text_generation_model_provider, wrap_with_ddp=False)
if args.load is not None:
load_modelopt_checkpoint(model, strict=not args.untie_embeddings_and_output_weights)
print_rank_0("Done loading checkpoint")
# Removing virtual pipeline parallel and other wrapper
assert len(model) == 1, "Above condition should have caught this"
unwrapped_model = unwrap_model(model)
all_prompts = args.prompts.split("|")
def custom_prompt_forward_loop_func(model):
for prompt in tqdm(all_prompts):
if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
(
prompts_plus_generations,
prompts_plus_generations_segments,
logprobs,
_,
) = generate_and_post_process(
model,
prompts=[prompt],
tokens_to_generate=128,
return_output_log_probs=True,
temperature=1.0,
)
print_rank_0(prompts_plus_generations)
else:
generate_and_post_process(model)
def hf_dataset_forword_loop_func(model):
dataloader = get_calib_dataloader(args.calib_dataset, args.calib_batch_size, args.calib_size)
for prompts in tqdm(dataloader, total=args.calib_size//args.calib_batch_size):
if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
(
prompts_plus_generations,
prompts_plus_generations_segments,
logprobs,
_,
) = generate_and_post_process(
model,
prompts=prompts,
tokens_to_generate=0,
return_output_log_probs=True,
temperature=1.0,
)
else:
generate_and_post_process(model)
ptq_forward_loop_func = custom_prompt_forward_loop_func
if args.calib_dataset is not None:
ptq_forward_loop_func = hf_dataset_forword_loop_func
# Setting data parallel and tensor parallel group
set_data_parallel_group(mpu.get_data_parallel_group())
set_tensor_parallel_group(mpu.get_tensor_model_parallel_group())
if args.export_quant_cfg in QUANT_CFG_CHOICES:
mtq_config = QUANT_CFG_CHOICES[args.export_quant_cfg]
if "*output_layer*" not in mtq_config["quant_cfg"]:
mtq_config["quant_cfg"]["*output_layer*"] = {"enable": False}
if "awq" in args.export_quant_cfg:
weight_quantizer = mtq_config["quant_cfg"]["*weight_quantizer"] # type: ignore
if isinstance(weight_quantizer, list):
weight_quantizer = weight_quantizer[0]
weight_quantizer["block_sizes"][-1] = 128
print_rank_0("Quantizing the model...")
mtq.quantize(unwrapped_model[0], mtq_config, ptq_forward_loop_func)
custom_prompt_forward_loop_func(model[0])
if args.save is not None and args.export_quant_cfg in QUANT_CFG_CHOICES:
save_checkpoint(1, unwrapped_model, None, None, 0)
print_rank_0(f"Fake Quantized Model:\n {unwrapped_model[0]}")
if args.export_dir:
assert args.decoder in ["gptnext", "llama"], f"Decoder type {args.decoder} not supported."
Path(args.export_dir).mkdir(parents=True, exist_ok=True)
print_rank_0("Exporting TensorRT-LLM checkpoints.")
from modelopt.torch.export import export_tensorrt_llm_checkpoint
# In TRT LLM, squared relu activation does not support bf16. So we use fp16 by default.
export_tensorrt_llm_checkpoint(
unwrapped_model[0],
args.decoder,
torch.bfloat16 if args.bf16 else torch.float16,
export_dir=args.export_dir,
inference_tensor_parallel=args.inference_tensor_parallel,
inference_pipeline_parallel=1,
use_nfs_workspace=True,
)
print_rank_0(f"TensorRT-LLM checkpoints saved to {args.export_dir}")
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""An example script to run the tensorrt_llm engine."""
import argparse
from pathlib import Path
import subprocess
from typing import Optional, Union
import numpy as np
import torch
from modelopt.deploy.llm import LLM
from tensorrt_llm.models import PretrainedConfig
from transformers import AutoTokenizer, T5Tokenizer
import tensorrt_llm
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("--tokenizer", type=str, default="")
parser.add_argument("--engine-dir", type=str, default="/tmp/trtllm_engine")
parser.add_argument(
"--input-texts",
type=str,
default=(
"Born in north-east France, Soyer trained as a|Born in California, Soyer trained as a"
),
help="Input texts. Please use | to separate different batches.",
)
return parser.parse_args()
def run(args):
try:
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, trust_remote_code=True)
except Exception as e:
raise Exception(f"Failed to load tokenizer: {e}")
print(tokenizer, tokenizer.vocab_size)
input_texts = args.input_texts.split("|")
assert input_texts, "input_text not specified"
print(input_texts)
free_memory_before = torch.cuda.mem_get_info()
# This is a ModelOpt wrapper on top of tensorrt_llm.hlapi.llm.LLM
llm_engine = LLM(args.engine_dir, tokenizer)
torch.cuda.cudart().cudaProfilerStart()
# outputs = llm_engine.generate_text(input_texts, args.max_output_len, args.max_beam_width)
outputs = llm_engine.generate(input_texts)
torch.cuda.cudart().cudaProfilerStop()
free_memory_after = torch.cuda.mem_get_info()
print(
f"Used GPU memory: {(free_memory_before[0] - free_memory_after[0]) / 1024 / 1024 / 1024} GB"
)
print(outputs)
if __name__ == "__main__":
args = parse_arguments()
run(args)
@@ -26,9 +26,6 @@ torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--fp16 \
--micro-batch-size 1 \
--seq-length 1024 \
--out-seq-length 1024 \
--temperature 1.0 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--top_p 0.9 \
--seed 42
@@ -24,9 +24,6 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_s
--fp16 \
--micro-batch-size 1 \
--seq-length 1024 \
--out-seq-length 1024 \
--temperature 1.0 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--top_p 0.9 \
--seed 42
checkpoints/
data-cache/
tensorboard/
triton-cache/
FROM nvcr.io/nvidia/pytorch:24.01-py3
RUN pip uninstall -y triton && \
pip install triton==2.1.0 sentencepiece==0.1.99 flask-restful
# The causal-conv1d and mamba-ssm packages below are built from scratch here
# (which takes significant time) because there are no wheels available on PyPI
# for these relatively newer versions of the packages that are compatible with
# the older NGC-variant PyTorch version (e.g. version 2.2.0.dev231106) that we
# are using (in the NGC base container). Generally, if the package is not
# compatible with the PyTorch version, then it will generate a Python import
# error. The package authors tend to only release wheels for new versions of
# these packages which are compatible with the versions of regular PyTorch and
# NGC-variant PyTorch that are newer at the time of release. So, to use newer
# versions of these packages with relatively older versions of the NGC PyTorch
# container, we tend to have to build the packages from scratch.
RUN cd /tmp && \
git clone https://github.com/Dao-AILab/causal-conv1d.git && \
cd causal-conv1d && \
git checkout v1.2.2.post1 && \
CAUSAL_CONV1D_FORCE_BUILD=TRUE pip install . && \
cd .. && \
rm -rf causal-conv1d
RUN cd /tmp && \
git clone https://github.com/state-spaces/mamba.git && \
cd mamba && \
git checkout v2.0.3 && \
MAMBA_FORCE_BUILD=TRUE pip install . && \
cd .. && \
rm -rf mamba
# Mamba-based Language Models
## Introduction
This document is an entrypoint into the code used for
<em>[An Empirical Study of Mamba-based Language Models](https://arxiv.org/abs/2406.07887)</em>.
We are releasing the parameters for some of the models described in that
technical report via
[HuggingFace](https://huggingface.co/collections/nvidia/ssms-666a362c5c3bb7e4a6bcfb9c).
The code in the `main` branch is no longer compatible with the `Mamba2-*`
checkpoints. You can load them using the
[fixed snapshot of the code used for the technical report](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba).
## Installation
Create and run a Docker container using the [Dockerfile](./Dockerfile).
```
docker build -t your_image_name:your_tag .
docker run --gpus all -it --rm \
-v /path/to/megatron:/workspace/megatron \
-v /path/to/dataset:/workspace/dataset \
-v /path/to/checkpoints:/workspace/checkpoints \
-w /workspace/megatron/examples/mamba \
your_image_name:your_tag
```
## Train
[`train.sh`](./train.sh) is an example pretraining script, showing how to run on
a single node. Select between 800M-scale and 8B-scale models by setting the
`MODEL_SCALE` variable. The 8B-scale hybrid model architecture is the same as
the one described in the technical report.
## Text Generation
Use [`run_text_gen_server_8b.sh`](./run_text_gen_server_8b.sh) to start a text
generation server using an 8B hybrid checkpoint. This is configured to run the
8B hybrid model described in the technical report, with tensor model parallel
set to 1.
The arguments in the script will need to be changed if using a checkpoint with a
different model parallel configuration or other differences, such as model
architecture. For example, to run the 8B pure Mamba-2 model, change
`--hybrid-attention-ratio` and `--hybrid-mlp-ratio` to 0.0, or remove them.
Use [`run_text_gen_server_8b_gpt3.sh`](./run_text_gen_server_8b_gpt3.sh) to start
a text generation server using the 8B reference Transformer checkpoint.
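Once a server is running, you can query it with `tools/text_generation_cli.py` (as noted in the scripts) or directly over HTTP. The sketch below assumes the default Flask endpoint (`PUT /api` on port 5000) that the CLI client talks to:
```python
# Minimal HTTP client sketch for the text generation server; host, port, and payload
# fields assume the default Megatron REST endpoint used by tools/text_generation_cli.py.
import json
import requests

resp = requests.put(
    "http://localhost:5000/api",
    data=json.dumps({"prompts": ["The quick brown fox"], "tokens_to_generate": 64}),
    headers={"Content-Type": "application/json"},
)
print(resp.json())
```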
## Checkpoint Formats
For inference, the model must be configured to match the checkpoint file used,
including the hybrid layer configuration and model parallel configuration.
If you need to convert a hybrid checkpoint file to a different tensor parallel
or pipeline parallel size, use
[the hybrid conversion script](../../tools/checkpoint/hybrid_conversion.py).
There is an example run command at the end of that file.
Before running that script, you will need to set `PYTHONPATH` to include the
root directory of your Megatron-LM repository clone.
```
export PYTHONPATH=<path-to-megatron>:$PYTHONPATH
```
## Hybrid Options
`--hybrid-attention-ratio ATT` specifies a target ratio of attention layers
to total layers. For example, 4 attention layers out of 48 total layers is
specified by `--hybrid-attention-ratio 0.08`.
`--hybrid-mlp-ratio MLP` specifies a target ratio of MLP layers to total
layers. For example, 24 MLP layers out of 48 total layers is specified by
`--hybrid-mlp-ratio 0.5`.
* (`ATT` + `MLP`) must be less than or equal to 1.0.
* (1.0 - `ATT` - `MLP`) is the hybrid mamba ratio, the ratio of mamba layers to
total layers.
* `ATT` = `MLP` = 0 is a pure Mamba model.
* `ATT` = `MLP` = 0.5 is a transformer model.
If either `ATT` or `MLP` is greater than 0.0 or if `--hybrid-override-pattern`
is specified, the logfile will include information about the hybrid layer
pattern used. `--hybrid-override-pattern` can be used to specify a different
pattern than the default, algorithmically-generated one.
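To make the arithmetic concrete, here is a rough back-of-the-envelope sketch of the resulting layer counts (not the exact allocation algorithm, which Megatron computes and logs at startup):
```python
# Rough layer counts implied by the hybrid ratios; the real allocation is computed by Megatron.
def hybrid_layer_counts(total_layers: int, att_ratio: float, mlp_ratio: float):
    assert att_ratio + mlp_ratio <= 1.0
    num_attention = round(att_ratio * total_layers)
    num_mlp = round(mlp_ratio * total_layers)
    num_mamba = total_layers - num_attention - num_mlp
    return num_attention, num_mlp, num_mamba

# 48 total layers with ATT=0.08 and MLP=0.5 -> roughly (4, 24, 20).
print(hybrid_layer_counts(48, 0.08, 0.5))
```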
## Mamba vs Mamba-2
This codebase currently only supports Mamba-2, and not the original version of
Mamba. However, the
[fixed snapshot of the code used for the technical report](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba)
can be configured to run the original version of Mamba.
#!/bin/bash
# Use: ./run_text_gen_server_8b.sh <checkpoint-path> <tokenizer-path>
# To launch the client: python ../../tools/text_generation_cli.py <URL-provided-by-server>
CHECKPOINT_PATH=$1
TOKENIZER_PATH=$2
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_TIMEOUT=19
export NCCL_IB_QPS_PER_CONNECTION=4
export TRITON_CACHE_DIR="./triton-cache/"
export TRITON_CACHE_MANAGER="megatron.core.ssm.triton_cache_manager:ParallelFileCacheManager"
torchrun $DISTRIBUTED_ARGS ../../tools/run_mamba_text_generation_server.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--untie-embeddings-and-output-weights \
--num-layers 56 \
--hidden-size 4096 \
--load ${CHECKPOINT_PATH} \
--num-attention-heads 32 \
--group-query-attention \
--num-query-groups 8 \
--hybrid-attention-ratio 0.08 \
--hybrid-mlp-ratio 0.5 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--disable-bias-linear \
--normalization RMSNorm \
--seq-length 4096 \
--max-position-embeddings 4096 \
--position-embedding-type none \
--tokenizer-type GPTSentencePieceTokenizer \
--tokenizer-model ${TOKENIZER_PATH} \
--distributed-backend nccl \
--distributed-timeout-minutes 1440 \
--bf16 \
--micro-batch-size 1 \
--use-mcore-models \
--spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
--seed 42
#!/bin/bash
# Use: ./run_text_gen_server_8b_gpt3.sh <checkpoint-path> <tokenizer-path>
# To launch the client: python ../../tools/text_generation_cli.py <URL-provided-by-server>
CHECKPOINT_PATH=$1
TOKENIZER_PATH=$2
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_TIMEOUT=19
export NCCL_IB_QPS_PER_CONNECTION=4
torchrun $DISTRIBUTED_ARGS ../../tools/run_text_generation_server.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--use-flash-attn \
--apply-layernorm-1p \
--untie-embeddings-and-output-weights \
--num-layers 32 \
--hidden-size 4096 \
--load ${CHECKPOINT_PATH} \
--num-attention-heads 32 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--disable-bias-linear \
--seq-length 4096 \
--max-position-embeddings 4096 \
--position-embedding-type rope \
--rotary-percent 0.5 \
--squared-relu \
--tokenizer-type GPTSentencePieceTokenizer \
--tokenizer-model ${TOKENIZER_PATH} \
--distributed-backend nccl \
--distributed-timeout-minutes 1440 \
--bf16 \
--micro-batch-size 1 \
--use-mcore-models \
--transformer-impl local \
--seed 42
#!/bin/bash
# Use: ./train.sh <data-path> <tokenizer-path>
MODEL_SCALE="800M" # or "8B"
case "${MODEL_SCALE}" in
"800M")
TENSOR_MODEL_PARALLEL_SIZE=1
NUM_LAYERS=48
HIDDEN_SIZE=1024
NUM_ATTENTION_HEADS=16
GLOBAL_BATCH_SIZE=32
;;
"8B")
TENSOR_MODEL_PARALLEL_SIZE=4
NUM_LAYERS=56
HIDDEN_SIZE=4096
NUM_ATTENTION_HEADS=32
GLOBAL_BATCH_SIZE=8
;;
*)
echo "Invalid version specified"
exit 1
;;
esac
DATA_PATH=$1
TOKENIZER_PATH=$2
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_TIMEOUT=19
export NCCL_IB_QPS_PER_CONNECTION=4
CHECKPOINT_DIR="./checkpoints"
DATACACHE_DIR="./data-cache"
TENSORBOARD_DIR="./tensorboard"
mkdir -p ${CHECKPOINT_DIR}
mkdir -p ${DATACACHE_DIR}
mkdir -p ${TENSORBOARD_DIR}
export TRITON_CACHE_DIR="./triton-cache/"
export TRITON_CACHE_MANAGER="megatron.core.ssm.triton_cache_manager:ParallelFileCacheManager"
SEQ_LEN=4096
TRAIN_SAMPLES=73242188 # 300B tokens / 4096
LR_WARMUP_SAMPLES=50000
LR_DECAY_SAMPLES=73192188 # TRAIN_SAMPLES - LR_WARMUP_SAMPLES
options=" \
--tensor-model-parallel-size ${TENSOR_MODEL_PARALLEL_SIZE} \
--sequence-parallel \
--pipeline-model-parallel-size 1 \
--use-distributed-optimizer \
--overlap-param-gather \
--overlap-grad-reduce \
--untie-embeddings-and-output-weights \
--init-method-std 0.02 \
--position-embedding-type none \
--num-layers ${NUM_LAYERS} \
--hidden-size ${HIDDEN_SIZE} \
--num-attention-heads ${NUM_ATTENTION_HEADS} \
--group-query-attention \
--num-query-groups 8 \
--hybrid-attention-ratio 0.08 \
--hybrid-mlp-ratio 0.5 \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${SEQ_LEN} \
--train-samples ${TRAIN_SAMPLES} \
--lr-warmup-samples ${LR_WARMUP_SAMPLES} \
--lr-decay-samples ${LR_DECAY_SAMPLES} \
--save ${CHECKPOINT_DIR} \
--load ${CHECKPOINT_DIR} \
--data-path ${DATA_PATH} \
--data-cache-path ${DATACACHE_DIR} \
--split 99,1,0 \
--tokenizer-type GPTSentencePieceTokenizer \
--tokenizer-model ${TOKENIZER_PATH} \
--distributed-backend nccl \
--micro-batch-size 4 \
--global-batch-size ${GLOBAL_BATCH_SIZE} \
--lr 2.5e-4 \
--min-lr 2.5e-5 \
--lr-decay-style cosine \
--weight-decay 0.1 \
--clip-grad 1.0 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--disable-bias-linear \
--normalization RMSNorm \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--log-interval 10 \
--save-interval 2000 \
--eval-interval 2000 \
--eval-iters 32 \
--bf16 \
--use-mcore-models \
--spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
--no-create-attention-mask-in-dataloader \
--tensorboard-dir ${TENSORBOARD_DIR}"
torchrun --nproc_per_node 8 ../../pretrain_mamba.py ${options}
#!/bin/bash
TENSOR_MODEL_PARALLEL_SIZE=2
VOCAB_FILE=bert-vocab.txt
CHECKPOINT_PATH=checkpoints/bert_345m
WORLD_SIZE=$TENSOR_MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \
--model-type BERT \
--tensor-model-parallel-size $TENSOR_MODEL_PARALLEL_SIZE \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file $VOCAB_FILE \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 512 \
--max-position-embeddings 512 \
--load $CHECKPOINT_PATH