Commit d520d24f authored by silencealiang's avatar silencealiang
Browse files

Merge branch 'main' into 'main'

megatron升级v0.10

See merge request !3
parents 3aca1415 481609bb
Pipeline #2055 failed with stages
in 0 seconds
#!/bin/bash
# This example will start serving the Llama3.1-8B model
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr 0.0.0.0 \
--master_port 6000"
# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
if [ -z "$1" ] || [ -z "$2" ]; then
echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments."
echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model"
exit 1
fi
# Assign command-line arguments to variables
CHECKPOINT=$1
TOKENIZER_MODEL=$2
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--use-checkpoint-args \
--disable-bias-linear \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--transformer-impl transformer_engine \
--normalization RMSNorm \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--untie-embeddings-and-output-weights \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 500000 \
--use-rope-scaling \
--use-rotary-position-embeddings \
--swiglu \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--load ${CHECKPOINT} \
--num-attention-heads 32 \
--max-position-embeddings 131072 \
--bf16 \
--micro-batch-size 1 \
--seq-length 8192
#!/bin/bash
# This example will start serving the Llama3-8B model
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr 0.0.0.0 \
--master_port 6000"
# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
if [ -z "$1" ] || [ -z "$2" ]; then
echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments."
echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model"
exit 1
fi
# Assign command-line arguments to variables
CHECKPOINT=$1
TOKENIZER_MODEL=$2
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--use-checkpoint-args \
--disable-bias-linear \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--transformer-impl transformer_engine \
--normalization RMSNorm \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--untie-embeddings-and-output-weights \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 500000 \
--use-rotary-position-embeddings \
--swiglu \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--load ${CHECKPOINT} \
--num-attention-heads 32 \
--max-position-embeddings 8192 \
--bf16 \
--micro-batch-size 1 \
--seq-length 8192
#!/bin/bash
# This example will start serving the Mistral-7B-v0.3 model
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr 0.0.0.0 \
--master_port 6000"
# Ensure CHECKPOINT and TOKENIZER_MODEL are provided
if [ -z "$1" ] || [ -z "$2" ]; then
echo "Error: You must provide CHECKPOINT and TOKENIZER_MODEL as command-line arguments."
echo "Usage: $0 /path/to/checkpoint /path/to/tokenizer_model"
exit 1
fi
# Assign command-line arguments to variables
CHECKPOINT=$1
TOKENIZER_MODEL=$2
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tokenizer-type HuggingFaceTokenizer \
--tokenizer-model ${TOKENIZER_MODEL} \
--use-checkpoint-args \
--apply-layernorm-1p \
--transformer-impl transformer_engine \
--normalization RMSNorm \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--use-flash-attn \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--ffn-hidden-size 14336 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--load ${CHECKPOINT} \
--num-attention-heads 32 \
--max-position-embeddings 4096 \
--bf16 \
--micro-batch-size 1 \
--seq-length 4096 \
--seed 101
......@@ -26,9 +26,6 @@ torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--fp16 \
--micro-batch-size 1 \
--seq-length 1024 \
--out-seq-length 1024 \
--temperature 1.0 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--top_p 0.9 \
--seed 42
......@@ -24,9 +24,6 @@ python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_s
--fp16 \
--micro-batch-size 1 \
--seq-length 1024 \
--out-seq-length 1024 \
--temperature 1.0 \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--top_p 0.9 \
--seed 42
import os
import sys
from argparse import Namespace
import torch
import pretrain_t5
from megatron.core.inference.common_inference_params import CommonInferenceParams
from megatron.core.inference.engines.abstract_engine import AbstractEngine
from megatron.core.inference.engines.mcore_engine import MCoreEngine
from megatron.core.inference.inference_request import InferenceRequest
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
InferenceWrapperConfig,
)
from megatron.core.inference.model_inference_wrappers.t5.t5_inference_wrapper import (
T5InferenceWrapper,
)
from megatron.core.inference.text_generation_controllers.encoder_decoder_text_generation_controller import (
EncoderDecoderTextGenerationController,
)
from megatron.core.transformer.module import MegatronModule
from pretrain_t5 import model_provider
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
)
from typing import List
from megatron.core import mpu
from megatron.training import get_args, get_model, get_tokenizer
from megatron.training.checkpointing import load_checkpoint
from megatron.training.initialize import initialize_megatron
def add_text_generate_args(parser):
"""Text generation arguments."""
group = parser.add_argument_group(title='text generation')
group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.')
group.add_argument("--top_k", type=int, default=1, help='Top k sampling.')
group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.')
group.add_argument(
"--return-log-probs",
action='store_true',
default=False,
help='Return the log probabilities of the final output tokens',
)
group.add_argument(
"--num-tokens-to-generate",
type=int,
default=30,
help='Number of tokens to generate for each prompt',
)
group.add_argument(
"--encoder-prompts",
metavar='N',
type=str,
nargs='+',
help='Encoder input prompts with each prompt within quotes and seperated by space',
)
group.add_argument(
"--max-batch-size", type=int, default=1, help='Max number of prompts to process at once'
)
return parser
def get_inference_engine(args: Namespace, model: MegatronModule) -> AbstractEngine:
"""Utility to get the relevant backend for running inference
This function will automatically chose the TRTLLMBackend when possible, and if not revert to Mcore backend if the user does not specify any backends. TRT LLM Backend is not implmented yet.
Args:
args (Namespace): The user arguments parsed from command line
model (MegatronModule): The megatron model .
Returns:
AbstractBackend: The chosen backend
"""
tokenizer = get_tokenizer()
inference_wrapper_config = InferenceWrapperConfig(
hidden_size=args.hidden_size,
inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
fp32_residual_connection=args.fp32_residual_connection,
params_dtype=args.params_dtype,
padded_vocab_size=args.padded_vocab_size,
)
inference_wrapped_model = T5InferenceWrapper(model, inference_wrapper_config)
text_generation_controller = EncoderDecoderTextGenerationController(
inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer
)
return MCoreEngine(
text_generation_controller=text_generation_controller, max_batch_size=args.max_batch_size
)
def main():
"""Main program."""
# Note: The default args passed here can be overwritten by using appropriate params (check arguments.py file)
# Micro batch size is not needed to be set by user. (It is calculated based on inference-batch-times-seqlen-threshold argument)
initialize_megatron(
extra_args_provider=add_text_generate_args,
args_defaults={
'no_load_rng': True,
'no_load_optim': True,
'micro_batch_size': 1,
'exit_on_missing_checkpoint': True,
},
)
# Set up model and load checkpoint
model = get_model(model_provider, wrap_with_ddp=False)
load_checkpoint(model, None, None)
model = model[0]
args = get_args()
inference_engine = get_inference_engine(args, model)
common_inference_params = CommonInferenceParams(
temperature=args.temperature,
top_k=args.top_k,
top_p=args.top_p,
return_log_probs=args.return_log_probs,
num_tokens_to_generate=args.num_tokens_to_generate,
)
tokenizer = get_tokenizer()
decoder_prompts = [""] * len(
args.encoder_prompts
) # for T5, the prompt is provided as encoder input, hence decoder_prompts is empty
args.prompts = decoder_prompts
results: List[InferenceRequest] = inference_engine.generate(
prompts=args.prompts,
add_BOS=True,
encoder_prompts=args.encoder_prompts,
common_inference_params=common_inference_params,
)
if torch.distributed.get_rank() == 0:
for idx, result in enumerate(results):
print(f' \n------------- RESULT FOR PROMPT {idx} --------------- ')
result = {
'id': result.request_id,
'input_prompt': result.prompt,
'generated_text': result.generated_text,
'generated_tokens': result.generated_tokens,
}
print(result)
if __name__ == "__main__":
main()
checkpoints/
data-cache/
tensorboard/
triton-cache/
FROM nvcr.io/nvidia/pytorch:24.01-py3
RUN pip uninstall -y triton && \
pip install triton==2.1.0 sentencepiece==0.1.99 flask-restful
# The causal-conv1d and mamba-ssm packages below are built from scratch here
# (which takes significant time) because there are no wheels available on PyPI
# for these relatively newer versions of the packages that are compatible with
# the older NGC-variant PyTorch version (e.g. version 2.2.0.dev231106) that we
# are using (in the NGC base container). Generally, if the package is not
# compatible with the PyTorch version, then it will generate a Python import
# error. The package authors tend to only release wheels for new versions of
# these pacakges which are compatible with the versions of regular PyTorch and
# NGC-variant PyTorch that are newer at the time of release. So, to use newer
# versions of these packages with relatively older versions of the NGC PyTorch
# container, we tend to have to build the packages from scratch.
RUN cd /tmp && \
git clone https://github.com/Dao-AILab/causal-conv1d.git && \
cd causal-conv1d && \
git checkout v1.2.2.post1 && \
CAUSAL_CONV1D_FORCE_BUILD=TRUE pip install . && \
cd .. && \
rm -rf causal-conv1d
RUN cd /tmp && \
git clone https://github.com/state-spaces/mamba.git && \
cd mamba && \
git checkout v2.0.3 && \
MAMBA_FORCE_BUILD=TRUE pip install . && \
cd .. && \
rm -rf mamba
# Mamba-based Language Models
## Introduction
This document is an entrypoint into the code used for
<em>[An Empirical Study of Mamba-based Language Models](https://arxiv.org/abs/2406.07887)</em>.
We are releasing the parameters for some of the models described in that
technical report via
[HuggingFace](https://huggingface.co/collections/nvidia/ssms-666a362c5c3bb7e4a6bcfb9c).
The code in the `main` branch is no longer compatible with the `Mamba2-*`
checkpoints. You can load them using the
[fixed snapshot of the code used for the technical report](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba).
## Installation
Create and run a Docker container using the [Dockerfile](./Dockerfile).
```
docker build -t your_image_name:your_tag .
docker run --gpus all -it --rm \
-v /path/to/megatron:/workspace/megatron \
-v /path/to/dataset:/workspace/dataset \
-v /path/to/checkpoints:/workspace/checkpoints \
-w /workspace/megatron/examples/mamba \
your_image_name:your_tag
```
## Train
[`train.sh`](./train.sh) is an example pretraining script, showing how to run on
a single node. Select between 800M-scale and 8B-scale models by setting the
`MODEL_SCALE` variable. The 8B-scale hybrid model architecture is the same as
the one described in the technical report.
## Text Generation
Use [`run_text_gen_server_8b.sh`](./run_text_gen_server_8b.sh) to start a text
generation server using an 8B hybrid checkpoint. This is configured to run the
8B hybrid model described in the technical report, with tensor model parallel
set to 1.
The arguments in the script will need to be changed if using a checkpoint with a
different model parallel configuration or other differences, such as model
architecture. For example, to run the 8B pure Mamba-2 model, change
`--hybrid-attention-ratio` and `--hybrid-mlp-ratio` to 0.0, or remove them.
Use [`run_text_gen_server_8b_gpt3.sh`](./run_text_gen_server_8b_gpt3.sh) to start
a text generation server using the 8B reference Transformer checkpoint.
## Checkpoint Formats
For inference, the model must be configured to match the checkpoint file used,
including the hybrid layer configuration and model parallel configuration.
If you need to convert a hybrid checkpoint file to a different tensor parallel
or pipeline parallel size, use
[the hybrid conversion script](../../tools/checkpoint/hybrid_conversion.py).
There is an example run command at the end of that file.
Before running that script, you will need to set `PYTHONPATH` to include the
root directory of your Megatron-LM repository clone.
```
export PYTHONPATH=<path-to-megatron>:PYTHONPATH
```
## Hybrid Options
`--hybrid-attention-ratio ATT` specifies a target ratio of attention layers
to total layers. For example, 4 attention layers out of 48 total layers is
specified by `--hybrid-attention-ratio 0.08`.
`--hybrid-mlp-ratio MLP` specifies a target ratio of MLP layers to total
layers. For example, 24 MLP layers out of 48 total layers is specified by
`--hybrid-mlp-ratio 0.5`.
* (`ATT` + `MLP`) must be less than or equal to 1.0.
* (1.0 - `ATT` - `MLP`) is the hybrid mamba ratio, the ratio of mamba layers to
total layers.
* `ATT` = `MLP` = 0 is a pure Mamba model.
* `ATT` = `MLP` = 0.5 is a transfomer model.
If either `ATT` or `MLP` is greater than 0.0 or if `--hybrid-override-pattern`
is specified, the logfile will include information about the hybrid layer
pattern used. `--hybrid-override-pattern` can be used to specify a different
pattern than the default, algorithmically-generated one.
## Mamba vs Mamba-2
This codebase currently only supports Mamba-2, and not the original version of
Mamba. However, the
[fixed snapshot of the code used for the technical report](https://github.com/NVIDIA/Megatron-LM/tree/ssm/examples/mamba)
can be configured to run the original version of Mamba.
#!/bin/bash
# Use: ./run_text_gen_server_8b.sh <checkpoint-path> <tokenizer-path>
# To launch the client: python ../../tools/text_generation_cli.py <URL-provided-by-server>
CHECKPOINT_PATH=$1
TOKENIZER_PATH=$2
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_TIMEOUT=19
export NCCL_IB_QPS_PER_CONNECTION=4
export TRITON_CACHE_DIR="./triton-cache/"
export TRITON_CACHE_MANAGER="megatron.core.ssm.triton_cache_manager:ParallelFileCacheManager"
torchrun $DISTRIBUTED_ARGS ../../tools/run_mamba_text_generation_server.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--untie-embeddings-and-output-weights \
--num-layers 56 \
--hidden-size 4096 \
--load ${CHECKPOINT_PATH} \
--num-attention-heads 32 \
--group-query-attention \
--num-query-groups 8 \
--hybrid-attention-ratio 0.08 \
--hybrid-mlp-ratio 0.5 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--disable-bias-linear \
--normalization RMSNorm \
--seq-length 4096 \
--max-position-embeddings 4096 \
--position-embedding-type none \
--tokenizer-type GPTSentencePieceTokenizer \
--tokenizer-model ${TOKENIZER_PATH} \
--distributed-backend nccl \
--distributed-timeout-minutes 1440 \
--bf16 \
--micro-batch-size 1 \
--use-mcore-models \
--spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
--seed 42
#!/bin/bash
# Use: ./run_text_gen_server_8b_gpt3.sh <checkpoint-path> <tokenizer-path>
# To launch the client: python ../../tools/text_generation_cli.py <URL-provided-by-server>
CHECKPOINT_PATH=$1
TOKENIZER_PATH=$2
DISTRIBUTED_ARGS="--nproc_per_node 1 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_TIMEOUT=19
export NCCL_IB_QPS_PER_CONNECTION=4
torchrun $DISTRIBUTED_ARGS ../../tools/run_text_generation_server.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
--use-flash-attn \
--apply-layernorm-1p \
--untie-embeddings-and-output-weights \
--num-layers 32 \
--hidden-size 4096 \
--load ${CHECKPOINT_PATH} \
--num-attention-heads 32 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--disable-bias-linear \
--seq-length 4096 \
--max-position-embeddings 4096 \
--position-embedding-type rope \
--rotary-percent 0.5 \
--squared-relu \
--tokenizer-type GPTSentencePieceTokenizer \
--tokenizer-model ${TOKENIZER_PATH} \
--distributed-backend nccl \
--distributed-timeout-minutes 1440 \
--bf16 \
--micro-batch-size 1 \
--use-mcore-models \
--transformer-impl local \
--seed 42
#!/bin/bash
# Use: ./train.sh <data-path> <tokenizer-path>
MODEL_SCALE="800M" # or "8B"
case "${MODEL_SCALE}" in
"800M")
TENSOR_MODEL_PARALLEL_SIZE=1
NUM_LAYERS=48
HIDDEN_SIZE=1024
NUM_ATTENTION_HEADS=16
GLOBAL_BATCH_SIZE=32
;;
"8B")
TENSOR_MODEL_PARALLEL_SIZE=4
NUM_LAYERS=56
HIDDEN_SIZE=4096
NUM_ATTENTION_HEADS=32
GLOBAL_BATCH_SIZE=8
;;
*)
echo "Invalid version specified"
exit 1
;;
esac
DATA_PATH=$1
TOKENIZER_PATH=$2
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_IB_TIMEOUT=19
export NCCL_IB_QPS_PER_CONNECTION=4
CHECKPOINT_DIR="./checkpoints"
DATACACHE_DIR="./data-cache"
TENSORBOARD_DIR="./tensorboard"
mkdir -p ${CHECKPOINT_DIR}
mkdir -p ${DATACACHE_DIR}
mkdir -p ${TENSORBOARD_DIR}
export TRITON_CACHE_DIR="./triton-cache/"
export TRITON_CACHE_MANAGER="megatron.core.ssm.triton_cache_manager:ParallelFileCacheManager"
SEQ_LEN=4096
TRAIN_SAMPLES=73242188 # 300B tokens / 4096
LR_WARMUP_SAMPLES=50000
LR_DECAY_SAMPLES=73192188 # TRAIN_SAMPLES - LR_WARMUP_SAMPLES
options=" \
--tensor-model-parallel-size ${TENSOR_MODEL_PARALLEL_SIZE} \
--sequence-parallel \
--pipeline-model-parallel-size 1 \
--use-distributed-optimizer \
--overlap-param-gather \
--overlap-grad-reduce \
--untie-embeddings-and-output-weights \
--init-method-std 0.02 \
--position-embedding-type none \
--num-layers ${NUM_LAYERS} \
--hidden-size ${HIDDEN_SIZE} \
--num-attention-heads ${NUM_ATTENTION_HEADS} \
--group-query-attention \
--num-query-groups 8 \
--hybrid-attention-ratio 0.08 \
--hybrid-mlp-ratio 0.5 \
--seq-length ${SEQ_LEN} \
--max-position-embeddings ${SEQ_LEN} \
--train-samples ${TRAIN_SAMPLES} \
--lr-warmup-samples ${LR_WARMUP_SAMPLES} \
--lr-decay-samples ${LR_DECAY_SAMPLES} \
--save ${CHECKPOINT_DIR} \
--load ${CHECKPOINT_DIR} \
--data-path ${DATA_PATH} \
--data-cache-path ${DATACACHE_DIR} \
--split 99,1,0 \
--tokenizer-type GPTSentencePieceTokenizer \
--tokenizer-model ${TOKENIZER_PATH} \
--distributed-backend nccl \
--micro-batch-size 4 \
--global-batch-size ${GLOBAL_BATCH_SIZE} \
--lr 2.5e-4 \
--min-lr 2.5e-5 \
--lr-decay-style cosine \
--weight-decay 0.1 \
--clip-grad 1.0 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--disable-bias-linear \
--normalization RMSNorm \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--log-interval 10 \
--save-interval 2000 \
--eval-interval 2000 \
--eval-iters 32 \
--bf16 \
--use-mcore-models \
--spec megatron.core.models.mamba.mamba_layer_specs mamba_stack_spec \
--no-create-attention-mask-in-dataloader \
--tensorboard-dir ${TENSORBOARD_DIR}"
torchrun --nproc_per_node 8 ../../pretrain_mamba.py ${options}
#!/bin/bash
TENSOR_MODEL_PARALLEL_SIZE=2
VOCAB_FILE=bert-vocab.txt
CHECKPOINT_PATH=checkpoints/bert_345m
WORLD_SIZE=$TENSOR_MODEL_PARALLEL_SIZE python tools/merge_mp_partitions.py \
--model-type BERT \
--tensor-model-parallel-size $TENSOR_MODEL_PARALLEL_SIZE \
--tokenizer-type BertWordPieceLowerCase \
--vocab-file $VOCAB_FILE \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--seq-length 512 \
--max-position-embeddings 512 \
--load $CHECKPOINT_PATH
# Mixtral 8x7B Model Inference and Finetuning
## Download Mixtral 8x7B Checkpoints
Download Mixtral 8x7B HF format checkpoint from [HF-hub](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1/)
Or you can simply run this following script to download Mixtral 8x7B into a specific folder.
```python
from huggingface_hub import snapshot_download
SAVED_DIR = "" # Specify the saved directory
# Download HF checkpoints
snapshot_download(repo_id="mistralai/Mixtral-8x7B-v0.1", ignore_patterns=["*.pt"], local_dir=SAVED_DIR, local_dir_use_symlinks=False)
```
## Convert Mixtral 8x7B checkpoints from HF to MCore
The HF checkpoints can be converted to Megatron format by using the provided checkpoint converter for HF format.
The target model parallel size(e.g. TP,PP,EP) should be specified.
Currently the converter doesn't support distributed checkpointing yet, so each different parallel config requires a specific checkpoint.
- For training, the recommended model parallel config is TP1EP8PP4
- For inference, the recommended model parallel config is TP1EP1PP2
```
TOKENIZER_MODEL=/workspace/checkpoints/mixtral-hf/tokenizer.model
MEGATRON_PATH="/workspace/megatron-lm"
export PYTHONPATH=$MEGATRON_PATH:$PYTHONPATH
export CUDA_DEVICE_MAX_CONNECTIONS=1
TARGET_TP_SIZE=""
TARGET_EP_SIZE=""
TARGET_PP_SIZE=""
HF_FORMAT_DIR=/workspace/checkpoints/mixtral-hf
MEGATRON_FORMAT_DIR=/workspace/checkpoints/mixtral-mcore-TP${TARGET_TP_SIZE}PP${TARGET_PP_SIZE}EP${TARGET_EP_SIZE}
python tools/checkpoint/convert.py \
--model-type GPT \
--loader loader_mixtral_hf \
--saver mcore \
--target-tensor-parallel-size ${TARGET_TP_SIZE} \
--target-pipeline-parallel-size ${TARGET_PP_SIZE} \
--target-expert-parallel-size ${TARGET_EP_SIZE} \
--load-dir ${HF_FORMAT_DIR} \
--save-dir ${MEGATRON_FORMAT_DIR} \
--tokenizer-model ${TOKENIZER_MODEL}
```
## Text generation with Mixtral 8x7B
Inference with Mixtral 8x7B requires at least 2 GPUS, such that a distributed checkpoint with EP>=2 or PP>=2 converted with above script is needed.
The Megatron-LM have included a simple REST server to use for text generation in `tools/run_text_generation_server.py`, launch it with the following script:
```
#!/bin/bash
# This example will start serving the Mixtral 8x7B model.
DISTRIBUTED_ARGS="--nproc_per_node 2 \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
CHECKPOINT=<Path to checkpoint>
TOKENIZER_MODEL=<Path to tokenizer (e.g. /tokenizer.model)>
export CUDA_DEVICE_MAX_CONNECTIONS=1
pip install flask-restful
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 2 \
--expert-model-parallel-size 1 \
--load ${CHECKPOINT} \
--tokenizer-type Llama2Tokenizer \
--tokenizer-model $TOKENIZER_MODEL \
--use-mcore-models \
--max-position-embeddings 32768 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--normalization RMSNorm \
--disable-bias-linear \
--position-embedding-type rope \
--no-position-embedding \
--swiglu \
--untie-embeddings-and-output-weights \
--group-query-attention \
--num-query-groups 8 \
--bf16 \
--micro-batch-size 1 \
--seq-length 1024 \
--seed 42 \
--num-experts 8 \
--moe-router-topk 2 \
--moe-token-dispatcher-type alltoall \
--moe-grouped-gemm \
--mock-data \
--rotary-base 1000000
```
Once the server is running you can use `tools/text_generation_cli.py` to query it, it takes one argument which is the host the server is running on.
```
python tools/text_generation_cli.py localhost:5000
```
## Finetuning from pretrained Mixtral 8x7B
To finetuning pretrained Mixtral 8x7B, use the following scripts:
```bash
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.04-py3
CHECKPOINT_PATH="" # Speicfy path to checkpoint dir
TOKENIZER_MODEL="" # Specify path to tokenizer.model
DATA_PATH="" # Specify path to data
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
$PYTORCH_IMAGE \
bash examples/mixtral/train_mixtral_8x7b_distributed.sh $CHECKPOINT_PATH $TOKENIZER_MODEL $DATA_PATH
```
The above functionality also applys to Mixtral 8x22B actually, you should set the model config (including hidden_size/head_num/num_layers/ffn_hidden_size) properly according to the original [config](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1/blob/main/config.json).
## Acknowledgements
Contributors outside NVIDIA for the huggingface converter and example of Mixtral models in Megatron-Core:
- Peng Li <jerry.lp@alibaba-inc.com>
- Jun Huang <huangjun.hj@alibaba-inc.com>
#!/bin/bash
# Runs Mixtral 8x7B model
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=${MASTER_ADDR:-"localhost"}
MASTER_PORT=${MASTER_PORT:-"6000"}
NNODES=${SLURM_NNODES:-"1"}
NODE_RANK=${RANK:-"0"}
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
CHECKPOINT_PATH=$1
TOKENIZER_MODEL=$2
DATA_PATH=$3
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NNODES
--node_rank $NODE_RANK
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
MODEL_ARGS=(
--use-mcore-models
--disable-bias-linear
--seq-length 4096
--max-position-embeddings 32768
--num-layers 32
--hidden-size 4096
--ffn-hidden-size 14336
--num-attention-heads 32
--init-method-std 0.01
--attention-dropout 0.0
--hidden-dropout 0.0
--normalization RMSNorm
--position-embedding-type rope
--swiglu
--untie-embeddings-and-output-weights
--group-query-attention
--num-query-groups 8
--no-masked-softmax-fusion
--no-position-embedding
--rotary-base 1000000
)
MOE_ARGS=(
--num-experts 8
--moe-router-topk 2
--moe-router-load-balancing-type aux_loss
--moe-aux-loss-coeff 1e-2
--moe-grouped-gemm
--moe-token-dispatcher-type alltoall
--overlap-param-gather
--overlap-grad-reduce
)
DATA_ARGS=(
--tokenizer-type Llama2Tokenizer
--tokenizer-model ${TOKENIZER_MODEL}
--data-path $DATA_PATH
--split 99990,8,2
)
TRAINING_ARGS=(
--micro-batch-size 1
--global-batch-size 256
--lr 1e-4
--train-iters 500000
--lr-decay-iters 320000
--lr-decay-style cosine
--min-lr 1.0e-5
--weight-decay 0.1
--lr-warmup-iters 500
--clip-grad 1.0
--bf16
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 1
--pipeline-model-parallel-size 4
--expert-model-parallel-size 8
--use-distributed-optimizer
--sequence-parallel
)
LOGGING_ARGS=(
--log-interval 1 \
--save-interval 10000 \
--eval-interval 1000 \
--eval-iters 10 \
--save $CHECKPOINT_PATH \
--load $CHECKPOINT_PATH \
--tensorboard-dir "${CHECKPOINT_PATH}/tensorboard" \
--no-load-optim \
--no-load-rng
)
if [ -n "${WANDB_API_KEY}" ]; then
LOGGING_ARGS+=(
--wandb-project ${WANDB_PROJECT:-"Mixtral"}
--wandb-exp-name ${WANDB_NAME:-"Mixtral_8x7B"}
)
fi
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
${MODEL_ARGS[@]} \
${MOE_ARGS[@]} \
${DATA_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${LOGGING_ARGS[@]}
FROM nvcr.io/nvidia/pytorch:24.02-py3
RUN apt update && \
apt -y upgrade && \
apt install -y --no-install-recommends \
software-properties-common \
build-essential \
python3-pip \
python3-dev \
bash \
git \
vim \
tmux \
python-is-python3 \
default-jre
RUN pip install --upgrade pip
RUN pip install einops einops-exts sentencepiece braceexpand webdataset packaging
RUN pip install transformers datasets accelerate timm
RUN pip install pytest-cov pytest_mock nltk wrapt
RUN pip install zarr "tensorstore==0.1.45"
RUN pip install black isort click==8.0.2
RUN pip install pycocoevalcap megatron-energon mistral-common tiktoken
RUN pip install git+https://github.com/openai/CLIP.git
# Use --no-deps for the following to avoid outdated and unnecessary dependencies.
RUN pip install open_clip_torch open-flamingo[eval] --no-deps
# Multimodal Example
*NOTE: This example is under active development and is expected change.*
The following walks through all the steps required to pretrain and instruction tune a llava architecture vision-language model (VLM). It is important to precisely follow all steps to obtain the benchmark scores at the end.
This example has been tested on an A100 based DGX cluster. Pretraining and instruction tuning took approximately 1 day and 11 hours respectively on 64 GPUs using four way tensor parallelism (tp=4). Training speed will scale approximately linearly with number of GPUs available.
Multimodal support in megatron is still under active development. This example is not intended to produce state-of-the-art model quality (that would require more data and model refinements), it is merely intended to demonstrate the multimodal functionality in megatron. If you hit any problems, please open a github issue.
## Setup
### Docker container
You can build a docker container using `examples/multimodal/Dockerfile` to run this example.
### Language model
Follow the instructions in [Mistral](../../docs/llama_mistral.md#mistral-7b) to download weights for Mistral-7B-Instruct-v0.3 (Base or Instruct) from HuggingFace and convert to mcore format with tensor parallel size 4.
Please use the tokenizer from HuggingFace.
### Vision model
This example uses the OpenAI CLIP `ViT-L/14@336px` Vision model. To download the weights from OpenAI and convert them to a format that can be loaded in megatron, please run the following:
```
python examples/multimodal/model_converter/clip_converter.py --download-root /some/download/folder --output /some/output/folder --tensor-parallel-size 4 --use-te
```
### Combined model checkpoint
Update the paths to point to the mcore converted CLIP and Mistral models and run the following script to combine the Mistral and CLIP models into a single multimodal checkpoint folder:
```
examples/multimodal/combine_lm_vision_checkpoints.sh /path/to/mistral/model /path/to/clip/model /output/dir
```
## Training
### Pretraining
1. Download the LLavA-Pretrain dataset from Hugging Face and unzip the images folder (NOTE: 79GB of disk space required):
```
git clone https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain
cd LLaVA-Pretrain
unzip images.zip
```
3. Run the following script to convert the data to webdataset format:
```
cd <megatron-lm dir>
python examples/multimodal/convert_llava_pretrain_to_wds.py
```
4. Run the following command to convert to megatron-energon format:
```
cd <LLaVA-Pretrain dir>/wds
energon prepare ./
```
select the following values for the presented options:
```
> Please enter a desired train/val/test split like "0.5, 0.2, 0.3" or "8,1,1": 9,1,0
> Do you want to create a dataset.yaml interactively? [Y/n]: Y
> Please enter a number to choose a class: 10 (VQAWebdataset)
> Do you want to set a simple field_map[Y] (or write your own sample_loader [n])? [Y/n]: Y
> Please enter a webdataset field name for 'image' (<class 'torch.Tensor'>): jpg
> Please enter a webdataset field name for 'context' (<class 'str'>): json[0][value]
> Please enter a webdataset field name for 'answers' (typing.Optional[typing.List[str]], default: None): json[1][value]
> Please enter a webdataset field name for 'answer_weights' (typing.Optional[torch.Tensor], default: None):
```
5. Update `pretrain_dataset.yaml` so that both `path` variables point to `LLaVA-Pretrain/wds`
6. Run the following script to pretrain a llava model for image captioning:
```
cd <megatron-lm dir>
examples/multimodal/pretrain_mistral_clip.sh
```
All being well you should observe training and validation loss curves similar to the following:
<img src="assets/pretrain_curves.png" alt="Pretraining loss curves" width="600"/>
These curves were obtained with global batch size of 256. Changing this value will likely change the curves. For pretraining and instruction tuning llava models we have found that loss curves are an unreliable predictor of downstream task performance. Therefore it is necessary to run test generation and evaluation on a range of metrics to understand model quality. We intend to add training time zero-shot evaluation in a future update.
You can execute the pretraining script multiple times to resume training. On resuming, the latest model, optimizer, and dataloader state are loaded.
### SFT
1. Prepare an instruction tuning dataset such in [megatron-energon format](https://nvidia.github.io/Megatron-Energon/data_prep.html#). NOTE: we do not provide instructions for this.
2. Update `sft_dataset.yaml` so that both `path` variables point to the train and val splits of your instruction tuning dataset.
Run the following script to instruction tune the pre-trained llava model:
```
examples/multimodal/sft_mistral_clip.sh
```
You can execute the SFT script multiple times to resume training. On resuming, the latest model, optimizer, and dataloader state are loaded.
## Evaluation
### Generation
Run the following script:
```
examples/multimodal/text_generation_mistral_clip.sh --input-image-path /path/to/input/images --output-path /some/output/directory \
--model-path /path/to/model.pt --tokenizer-path /path/to/tokenizer/ --gt-path /path/to/groundtruth/file --task generation-task-name
```
where `--task generation-task-name` is the name of the evaluation benchmark such as `captioning` or `MMMU`.
### After pretraining
#### COCO captioning
1. Download the COCO 2014 test image set:
```wget http://images.cocodataset.org/zips/test2014.zip```
2. Download COCO test image annotations:
```https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json```
3. First, run text generation using `--task captioning`.
4. Run the following command:
```
python examples/multimodal/evaluate_coco.py --input-path /output/directory/from/generation --groundtruth-path /path/to/groundtruth/file
```
For the mistral-7b-instruct plus clip llava model you should obtain a COCO CIDer score of approximately 94.
### After SFT
#### MMMU
The official MMMU repository is not pip installable currently so please clone their code in `examples/multimodal` by running `git clone https://github.com/MMMU-Benchmark/MMMU.git`.
The MMMU dataset is loaded from HuggingFace automatically as part of the code.
Run text generation using `--task MMMU`. Then, run the following command:
```
python examples/multimodal/evaluate_mmmu.py --input-path /output/directory/from/generation
```
For the mistral-7b-instruct plus clip instruction tuned llava model you should obtain a MMMU score of approximately 38.
#/bin/bash
MCORE_LM=$1 # <path_to_mcore_lm_model_folder>
MCORE_VISION=$2 # <path_to_mcore_vision_model_folder>
OUTPUT_DIR=$3 # <path_to_output_folder_for_combined_checkpoint>
MODEL_TYPE=$4 # Model type. Default: Mistral CLIP example.
if [[ $MODEL_TYPE == "nvlm" ]]; then
# NVLM TP=8
python examples/multimodal/combine_state_dicts.py \
--input \
${MCORE_LM}/iter_0000001/mp_rank_00/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_00/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_01/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_01/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_02/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_02/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_03/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_03/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_04/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_04/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_05/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_05/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_06/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_06/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_07/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_07/model_optim_rng.pt \
--prefixes language_model vision_model language_model vision_model language_model vision_model language_model vision_model language_model vision_model language_model vision_model language_model vision_model language_model vision_model \
--output \
${OUTPUT_DIR}/iter_0000001/mp_rank_00/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_01/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_02/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_03/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_04/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_05/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_06/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_07/model_optim_rng.pt
else
# Mistral CLIP example TP=4.
python examples/multimodal/combine_state_dicts.py \
--input \
${MCORE_LM}/iter_0000001/mp_rank_00/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_00/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_01/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_01/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_02/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_02/model_optim_rng.pt \
${MCORE_LM}/iter_0000001/mp_rank_03/model_optim_rng.pt \
${MCORE_VISION}/iter_0000001/mp_rank_03/model_optim_rng.pt \
--prefixes language_model vision_model language_model vision_model language_model vision_model language_model vision_model \
--output \
${OUTPUT_DIR}/iter_0000001/mp_rank_00/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_01/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_02/model_optim_rng.pt \
${OUTPUT_DIR}/iter_0000001/mp_rank_03/model_optim_rng.pt
fi
echo 1 > ${OUTPUT_DIR}/latest_checkpointed_iteration.txt
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import argparse
import os
import sys
import torch
# Add megatron to the path.
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
)
def combine(input_files, module_prefixes, output_files):
num_inputs_per_output = int(len(input_files) / len(output_files))
for output_idx, output_file in enumerate(output_files):
combined_state_dict = None
lb = output_idx * num_inputs_per_output
ub = (output_idx + 1) * num_inputs_per_output
current_input_files = input_files[lb:ub]
current_module_prefixes = module_prefixes[lb:ub]
for i, (input_file, module_prefix) in enumerate(
zip(current_input_files, current_module_prefixes)
):
# initialize the combined state dict using the first provided input file
current_state_dict = torch.load(input_file)
if i == 0:
combined_state_dict = current_state_dict.copy()
combined_state_dict["model"] = dict()
# copy model state dict and prefix names with the given module keys.
for k, v in current_state_dict["model"].items():
combined_state_dict["model"]["%s.%s" % (module_prefix, k)] = v
output_dir = os.path.dirname(output_file)
if not os.path.exists(output_dir):
os.makedirs(output_dir, exist_ok=True)
torch.save(combined_state_dict, output_file)
print("saved:", output_file)
print("done.")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="""
Combine multiple state dicts into a single state dict.
The combined state dict is first initialized by taking a copy of the first provided input state dict.
To avoid conflicts in model parameter names, a prefix must be provided for each input file.
Model parameter names will be renamed from <original name> to <model prefix>.<original name>.
Example usage:
python combine_state_dicts.py --input language_model.pt vision_model.pt --prefixes language_model vision_model --output multimodal.pt
""",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument("--input", nargs="*", required=True, help="paths to input state dict files")
parser.add_argument(
"--prefixes",
nargs="*",
required=True,
help="prefixes to use with each input model's parameters",
)
parser.add_argument(
"--output", nargs="*", required=True, help="path(s) to output state dict file"
)
args = parser.parse_args()
assert len(args.input) > 1, "must provide more than 1 input model to combine"
assert len(args.input) == len(args.prefixes), "each input model must have a corresponding key"
assert (
len(args.input) % len(args.output) == 0
), "each output file must use the same number of input files"
combine(args.input, args.prefixes, args.output)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment