Initial commit

7c19b3a8 · wangsen · 7c19b3a8 · 7c19b3a8 · 7c19b3a8 · 7c19b3a8
Commit 7c19b3a8 authored Sep 26, 2024 by wangsen
20 changed files
--- a/examples/inference/modelopt/ptq_trtllm_llama_7b.sh
+++ b/examples/inference/modelopt/ptq_trtllm_llama_7b.sh
+#!/bin/bash
+set -e
+DEFAULT_NAME="/checkpoints/llama2-text-7b_v0.2.0"
+NAME="${1:-$DEFAULT_NAME}"
+DEFAULT_QUANT_CFG="int8_sq"
+QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"
+# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
+TP="8"
+INFERENCE_TP=${TP}
+DECODER_TYPE="llama"
+CHECKPOINT_LOAD_DIR="${NAME}"
+TOKENIZER_MODEL="${CHECKPOINT_LOAD_DIR}/hf/tokenizer.model"
+# LLaMA2 text 7b has ffn_hidden_size 11008. int4_awq requires a block_size of 128 as a result the TP can at most be 2
+if [ "$QUANT_CFG" = "int4_awq" ]; then
+    INFERENCE_TP="2"
+fi
+additional_options=" \
+    --export-quant-cfg ${QUANT_CFG} \
+    --export-legacy-megatron \
+    --export-te-mcore-model \
+    --calib-batch-size 8 \
+    --decoder ${DECODER_TYPE} \
+    --export-dir /tmp/trtllm_ckpt \
+    --inference-tensor-parallel ${INFERENCE_TP} "
+trtllm_options=" \
+    --tensorrt-llm-checkpoint-dir /tmp/trtllm_ckpt \
+    --engine-dir /tmp/trtllm_engine \
+    --tokenizer ${CHECKPOINT_LOAD_DIR}/hf \
+    --max-input-len 2048 \
+    --max-output-len 512 \
+    --max-batch-size 8 "
+# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+options=" \
+    --disable-bias-linear \
+    --swiglu \
+    --no-rope-fusion \
+    --untie-embeddings-and-output-weights \
+    --use-rotary-position-embeddings \
+    --normalization RMSNorm \
+    --rotary-percent 1.0 \
+    --no-position-embedding \
+    --no-masked-softmax-fusion \
+    --no-bias-gelu-fusion \
+    --no-bias-dropout-fusion \
+    --no-async-tensor-model-parallel-allreduce \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size 1 \
+    --num-layers 32 \
+    --hidden-size 4096 \
+    --ffn-hidden-size 11008 \
+    --num-attention-heads 32 \
+    --seq-length 4096 \
+    --max-position-embeddings 4096 \
+    --micro-batch-size 1 \
+    --make-vocab-size-divisible-by 1 \
+    --tokenizer-type Llama2Tokenizer \
+    --tokenizer-model ${TOKENIZER_MODEL} \
+    --save-interval 1000000 \
+    --use-dist-ckpt \
+    --load ${CHECKPOINT_LOAD_DIR}
+    --fp16"
+# Precompile CUDA extentions
+python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"
+# Acquire launch configuration where variable launch_config will be set
+launch_config="--nproc_per_node=${TP}"
+# Launch multi-process with torchrun
+torchrun ${launch_config} examples/inference/text_generation_ptq.py ${options} ${additional_options}
+# This script is using mpi4py which will fork multiple processes.
+python examples/inference/trtllm_text_generation.py ${trtllm_options}
--- a/examples/inference/modelopt/ptq_trtllm_nemotron3_8b.sh
+++ b/examples/inference/modelopt/ptq_trtllm_nemotron3_8b.sh
+#!/bin/bash
+set -e
+DEFAULT_NAME="/checkpoints/nemotron3-8b_v0.3.0"
+NAME="${1:-$DEFAULT_NAME}"
+DEFAULT_QUANT_CFG="fp8"
+QUANT_CFG="${2:-$DEFAULT_QUANT_CFG}"
+# CHANGE THE FOLLOWING IF YOU MOUNT YOUR DATA AND CHECKPOINTS DIFFERENTLY IN THE CONTAINER.
+TP="8"
+INFERENCE_TP=${TP}
+DECODER_TYPE="gptnext"
+CHECKPOINT_LOAD_DIR="${NAME}"
+TOKENIZER_MODEL="${CHECKPOINT_LOAD_DIR}/tokenizer.model"
+if [ "$QUANT_CFG" = "int4_awq" ]; then
+    INFERENCE_TP="1"
+fi
+additional_options=" \
+    --export-quant-cfg ${QUANT_CFG} \
+    --export-legacy-megatron \
+    --export-te-mcore-model \
+    --calib-batch-size 8 \
+    --decoder ${DECODER_TYPE} \
+    --export-dir /tmp/trtllm_ckpt \
+    --inference-tensor-parallel ${INFERENCE_TP} "
+trtllm_options=" \
+    --tensorrt-llm-checkpoint-dir /tmp/trtllm_ckpt \
+    --engine-dir /tmp/trtllm_engine \
+    --tokenizer ${TOKENIZER_MODEL} \
+    --max-input-len 2048 \
+    --max-output-len 512 \
+    --max-batch-size 8 "
+# DO NOT CHANGE THE SETTING BELOW UNLESS YOU KNOW WHAT YOU ARE DOING!!!
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+options=" \
+    --apply-layernorm-1p \
+    --untie-embeddings-and-output-weights \
+    --disable-bias-linear \
+    --no-rope-fusion \
+    --no-position-embedding \
+    --use-rotary-position-embeddings \
+    --rotary-percent 0.5 \
+    --squared-relu \
+    --attention-dropout 0.0 \
+    --hidden-dropout 0.0 \
+    --tensor-model-parallel-size ${TP} \
+    --pipeline-model-parallel-size 1 \
+    --num-layers 32 \
+    --hidden-size 4096 \
+    --num-attention-heads 32 \
+    --seq-length 4096 \
+    --max-position-embeddings 4096 \
+    --micro-batch-size 1 \
+    --tokenizer-type GPTSentencePieceTokenizer \
+    --tokenizer-model ${TOKENIZER_MODEL} \
+    --save-interval 1000000 \
+    --load ${CHECKPOINT_LOAD_DIR} \
+    --fp16 \
+    --use-dist-ckpt"
+# Precompile CUDA extentions
+python -c "import modelopt.torch.quantization.extensions as ext; print(ext.cuda_ext); print(ext.cuda_ext_fp8)"
+# Acquire launch configuration where variable launch_config will be set
+launch_config="--nproc_per_node=${TP}"
+# Launch multi-process with torchrun
+torchrun ${launch_config} examples/inference/text_generation_ptq.py ${options} ${additional_options}
+# This script is using mpi4py which will fork multiple processes.
+python examples/inference/trtllm_text_generation.py ${trtllm_options}
--- a/examples/inference/modelopt/text_generation_ptq.py
+++ b/examples/inference/modelopt/text_generation_ptq.py
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+"""Sample Generate GPT."""
+import functools
+import os
+import sys
+from pathlib import Path
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../")))
+import modelopt.torch.quantization as mtq
+import torch
+from datasets import load_dataset
+from modelopt.torch.utils.distributed import set_data_parallel_group, set_tensor_parallel_group
+from tqdm import tqdm
+# [ModelOpt]: changing the default model provider to the ModelOpt version
+from megatron.core import mpu
+from megatron.inference.arguments import add_modelopt_args
+from megatron.inference.checkpointing import load_modelopt_checkpoint
+from megatron.inference.gpt.model_provider import model_provider
+from megatron.inference.text_generation import generate_and_post_process
+from megatron.training import get_args, get_model, initialize_megatron
+from megatron.training.checkpointing import save_checkpoint
+from megatron.training.utils import print_rank_0, unwrap_model
+QUANT_CFG_CHOICES = {
+    "int8": mtq.INT8_DEFAULT_CFG,
+    "int8_sq": mtq.INT8_SMOOTHQUANT_CFG,
+    "fp8": mtq.FP8_DEFAULT_CFG,
+    "int4_awq": mtq.INT4_AWQ_CFG,
+    "w4a8_awq": mtq.W4A8_AWQ_BETA_CFG,
+    "int4": mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG,
+}
+def add_trtllm_ckpt_export_args(parser):
+    """Add additional arguments for TensorRT-LLM."""
+    group = parser.add_argument_group(title="trtllm")
+    group.add_argument(
+        "--export-dir", type=str, help="The output TensorRT-LLM checkpoint.",
+    )
+    group.add_argument(
+        "--decoder", type=str, choices=["gptnext", 'llama'], help="The decoder type of the model.",
+    )
+    group.add_argument(
+        "--inference-tensor-parallel",
+        type=int,
+        help="Tensor parallel for the inference time, can be different from the training config.",
+        default=1,
+    )
+def add_text_generate_ptq_args(parser):
+    """Add additional arguments for ModelOpt text generation PTQ."""
+    group = parser.add_argument_group(title='ModelOpt text generation ptq')
+    group.add_argument(
+        "--calib-dataset",
+        type=str,
+        default="cnn_dailymail",
+        help="Calibration datasets from HuggingFace datasets.",
+    )
+    group.add_argument(
+        "--calib-batch-size", type=int, default=4, help="Batch size to use for ptq calibration."
+    )
+    group.add_argument(
+        "--calib-size", type=int, default=512, help="Samples to use for ptq calibration."
+    )
+    parser.add_argument(
+        "--prompts",
+        type=str,
+        default=(
+            "Born in north-east France, Soyer trained as a|Born in California, Soyer trained as a"
+        ),
+        help="Input texts. Please use | to separate different batches.",
+    )
+    add_modelopt_args(parser)
+    add_trtllm_ckpt_export_args(parser)
+    return parser
+def get_calib_dataloader(
+    data="cnn_dailymail", batch_size=4, calib_size=512, max_sequence_length=512
+):
+    if data == "pileval":
+        dataset = load_dataset(
+            "json", data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", split="train"
+        )
+        text_column = "text"
+    elif data == "wikitext":
+        dataset = load_dataset("wikitext", "wikitext-103-v1", split="train")
+        text_column = "text"
+    elif data == "cnn_dailymail":
+        dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train")
+        text_column = "article"
+    calib_size = max(min(len(dataset), calib_size), batch_size)
+    for i in range(calib_size // batch_size):
+        batch = dataset[i * batch_size : (i + 1) * batch_size][text_column]
+        for j in range(len(batch)):
+            batch[j] = batch[j][:max_sequence_length]
+        yield batch
+if __name__ == "__main__":
+    initialize_megatron(
+        extra_args_provider=add_text_generate_ptq_args,
+        args_defaults={
+            'tokenizer_type': 'GPT2BPETokenizer',
+            'no_load_rng': True,
+            'no_load_optim': True,
+        },
+    )
+    args = get_args()
+    if args.num_layers_per_virtual_pipeline_stage is not None:
+        print_rank_0("Interleaved pipeline schedule is not yet supported for text generation.")
+        exit()
+    print_rank_0("WARNING: Forcing exit_on_missing_checkpoint to True for text generation.")
+    args.exit_on_missing_checkpoint = True
+    # Set up model and load checkpoint
+    # [ModelOpt]: make sure that output logits are allgathered.
+    text_generation_model_provider = functools.partial(model_provider, parallel_output=False)
+    model = get_model(text_generation_model_provider, wrap_with_ddp=False)
+    if args.load is not None:
+        load_modelopt_checkpoint(model, strict=not args.untie_embeddings_and_output_weights)
+        print_rank_0("Done loading checkpoint")
+    # Removing virtual pipeline parallel and other wrapper
+    assert len(model) == 1, "Above condition should have caught this"
+    unwrapped_model = unwrap_model(model)
+    all_prompts = args.prompts.split("|")
+    def custom_prompt_forward_loop_func(model):
+        for prompt in tqdm(all_prompts):
+            if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
+                (
+                    prompts_plus_generations,
+                    prompts_plus_generations_segments,
+                    logprobs,
+                    _,
+                ) = generate_and_post_process(
+                    model,
+                    prompts=[prompt],
+                    tokens_to_generate=128,
+                    return_output_log_probs=True,
+                    temperature=1.0,
+                )
+                print_rank_0(prompts_plus_generations)
+            else:
+                generate_and_post_process(model)
+    def hf_dataset_forword_loop_func(model):
+        dataloader = get_calib_dataloader(args.calib_dataset, args.calib_batch_size, args.calib_size)
+        for prompts in tqdm(dataloader, total=args.calib_size//args.calib_batch_size):
+            if mpu.is_pipeline_first_stage() and mpu.get_tensor_model_parallel_rank() == 0:
+                (
+                    prompts_plus_generations,
+                    prompts_plus_generations_segments,
+                    logprobs,
+                    _,
+                ) = generate_and_post_process(
+                    model,
+                    prompts=prompts,
+                    tokens_to_generate=0,
+                    return_output_log_probs=True,
+                    temperature=1.0,
+                )
+            else:
+                generate_and_post_process(model)
+    ptq_forward_loop_func = custom_prompt_forward_loop_func
+    if args.calib_dataset is not None:
+        ptq_forward_loop_func = hf_dataset_forword_loop_func
+    # Setting data parallel and tensor parallel group
+    set_data_parallel_group(mpu.get_data_parallel_group())
+    set_tensor_parallel_group(mpu.get_tensor_model_parallel_group())
+    if args.export_quant_cfg in QUANT_CFG_CHOICES:
+        mtq_config = QUANT_CFG_CHOICES[args.export_quant_cfg]
+        if "*output_layer*" not in mtq_config["quant_cfg"]:
+            mtq_config["quant_cfg"]["*output_layer*"] = {"enable": False}
+        if "awq" in args.export_quant_cfg:
+            weight_quantizer = mtq_config["quant_cfg"]["*weight_quantizer"]  # type: ignore
+            if isinstance(weight_quantizer, list):
+                weight_quantizer = weight_quantizer[0]
+            weight_quantizer["block_sizes"][-1] = 128
+        print_rank_0("Quantizing the model...")
+        mtq.quantize(unwrapped_model[0], mtq_config, ptq_forward_loop_func)
+    custom_prompt_forward_loop_func(model[0])
+    if args.save is not None and args.export_quant_cfg in QUANT_CFG_CHOICES:
+        save_checkpoint(1, unwrapped_model, None, None, 0)
+    print_rank_0(f"Fake Quantized Model:\n {unwrapped_model[0]}")
+    if args.export_dir:
+        assert args.decoder in ["gptnext", "llama"], f"Decoder type {args.decoder} not supported."
+        Path(args.export_dir).mkdir(parents=True, exist_ok=True)
+        print_rank_0("Exporting TensorRT-LLM checkpoints.")
+        from modelopt.torch.export import export_tensorrt_llm_checkpoint
+        # In TRT LLM, squared relu activation does not support bf16. So we use fp16 by default.
+        export_tensorrt_llm_checkpoint(
+            unwrapped_model[0],
+            args.decoder,
+            torch.bfloat16 if args.bf16 else torch.float16,
+            export_dir=args.export_dir,
+            inference_tensor_parallel=args.inference_tensor_parallel,
+            inference_pipeline_parallel=1,
+            use_nfs_workspace=True,
+        )
+        print_rank_0(f"TensorRT-LLM checkpoints saved to {args.export_dir}")
--- a/examples/inference/modelopt/trtllm_text_generation.py
+++ b/examples/inference/modelopt/trtllm_text_generation.py
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+"""An example script to run the tensorrt_llm engine."""
+import argparse
+from pathlib import Path
+import numpy as np
+import torch
+from modelopt.deploy.llm import LLM, build_tensorrt_llm
+from transformers import AutoTokenizer, T5Tokenizer
+class CustomSentencePieceTokenizer(T5Tokenizer):
+    """This is a custom GPTSentencePiece Tokenizer modified from the T5Tokenizer.
+    Note:
+        The modification is kept minimal to make `encode` and `batch_decode` working
+        properly (used in TensorRT-LLM engine). Other functions have not been tested.
+    """
+    def __init__(self, model):
+        super().__init__(model, extra_ids=0, bos_token="<s>", pad_token="<pad>")
+    def encode(self, text, add_special_tokens: bool = True, **kwargs):
+        return torch.Tensor(self.sp_model.encode_as_ids(text))
+    def batch_encode_plus(
+        self, batch_text_or_text_pairs, add_special_tokens: bool = True, **kwargs
+    ):
+        return {'input_ids': self.sp_model.encode_as_ids(batch_text_or_text_pairs)}
+    def batch_decode(self, sequences, skip_special_tokens: bool = False, **kwargs):
+        if isinstance(sequences, np.ndarray) or torch.is_tensor(sequences):
+            sequences = sequences.tolist()
+        return self.sp_model.decode(sequences)
+    def decode(self, token_ids, skip_special_tokens: bool = False, **kwargs):
+        return self.sp_model.decode([token_ids])[0]
+def parse_arguments():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--tokenizer", type=str, default="")
+    parser.add_argument("--max-input-len", type=int, default=4096)
+    parser.add_argument("--max-output-len", type=int, default=512)
+    parser.add_argument("--max-batch-size", type=int, default=8)
+    parser.add_argument("--tensorrt-llm-checkpoint-dir", type=str, default=None)
+    parser.add_argument("--engine-dir", type=str, default="/tmp/trtllm_engine")
+    parser.add_argument(
+        "--input-texts",
+        type=str,
+        default=(
+            "Born in north-east France, Soyer trained as a|Born in California, Soyer trained as a"
+        ),
+        help="Input texts. Please use | to separate different batches.",
+    )
+    parser.add_argument("--max-beam-width", type=int, default=1)
+    parser.add_argument("--profiler-output", type=str, default="")
+    return parser.parse_args()
+def run(args):
+    tokenizer_path = Path(args.tokenizer)
+    if tokenizer_path.is_dir():
+        # For llama models, use local HF tokenizer which is a folder.
+        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer, trust_remote_code=True)
+    elif tokenizer_path.is_file():
+        # For nextllm and nemotron models, use local Megatron GPTSentencePiece tokenizer which is a model file.
+        tokenizer = CustomSentencePieceTokenizer(args.tokenizer)
+    else:
+        raise ValueError(
+            "arg.tokenizer must be a dir to a hf tokenizer checkpoint for llama or a SentencePiece .model file for gptnext"
+        )
+    print(tokenizer, tokenizer.vocab_size)
+    if not hasattr(args, "profiler_output"):
+        args.profiler_output = ""
+    input_texts = args.input_texts.split("|")
+    assert input_texts, "input_text not specified"
+    print(input_texts)
+    if args.tensorrt_llm_checkpoint_dir is not None:
+        print("Building TensorRT-LLM engines.")
+        build_tensorrt_llm(
+            args.tensorrt_llm_checkpoint_dir + "/config.json",
+            args.engine_dir,
+            max_input_len=args.max_input_len,
+            max_batch_size=args.max_batch_size,
+            max_beam_width=args.max_beam_width,
+            num_build_workers=1,
+        )
+        print(f"TensorRT-LLM engines saved to {args.engine_dir}")
+    free_memory_before = torch.cuda.mem_get_info()
+    # This is a ModelOpt wrapper on top of tensorrt_llm.hlapi.llm.LLM
+    llm_engine = LLM(args.engine_dir, tokenizer)
+    torch.cuda.cudart().cudaProfilerStart()
+    # outputs = llm_engine.generate_text(input_texts, args.max_output_len, args.max_beam_width)
+    outputs = llm_engine.generate(input_texts)
+    torch.cuda.cudart().cudaProfilerStop()
+    free_memory_after = torch.cuda.mem_get_info()
+    print(
+        f"Used GPU memory: {(free_memory_before[0] - free_memory_after[0]) / 1024 / 1024 / 1024} GB"
+    )
+    print(outputs)
+if __name__ == "__main__":
+    args = parse_arguments()
+    run(args)
--- a/examples/inference/run_text_generation_server_345M.sh
+++ b/examples/inference/run_text_generation_server_345M.sh
+#!/bin/bash
+# This example will start serving the 345M model.
+DISTRIBUTED_ARGS="--nproc_per_node 1 \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+CHECKPOINT=<Path to checkpoint (e.g /345m)>
+VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
+MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+pip install flask-restful
+torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py   \
+       --tensor-model-parallel-size 1  \
+       --pipeline-model-parallel-size 1  \
+       --num-layers 24  \
+       --hidden-size 1024  \
+       --load ${CHECKPOINT}  \
+       --num-attention-heads 16  \
+       --max-position-embeddings 1024  \
+       --tokenizer-type GPT2BPETokenizer  \
+       --fp16  \
+       --micro-batch-size 1  \
+       --seq-length 1024  \
+       --vocab-file $VOCAB_FILE  \
+       --merge-file $MERGE_FILE  \
+       --seed 42
--- a/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh
+++ b/examples/inference/run_text_generation_server_345M_8_tensor_parallel.sh
+#!/bin/bash
+# This example will start serving the 345M model that is partitioned 8 way tensor parallel
+DISTRIBUTED_ARGS="--nproc_per_node 8 \
+                  --nnodes 1 \
+                  --node_rank 0 \
+                  --master_addr localhost \
+                  --master_port 6000"
+CHECKPOINT=<Path to checkpoint (e.g /345m)>
+VOCAB_FILE=<Path to vocab.json (e.g. /gpt2-vocab.json)>
+MERGE_FILE=<Path to merges.txt (e.g. /gpt2-merges.txt)>
+pip install flask-restful
+python -m torch.distributed.launch $DISTRIBUTED_ARGS tools/run_text_generation_server.py   \
+       --tensor-model-parallel-size 8  \
+       --pipeline-model-parallel-size 1  \
+       --num-layers 24  \
+       --hidden-size 1024  \
+       --load ${CHECKPOINT}  \
+       --num-attention-heads 16  \
+       --max-position-embeddings 1024  \
+       --tokenizer-type GPT2BPETokenizer  \
+       --fp16  \
+       --micro-batch-size 1  \
+       --seq-length 1024  \
+       --vocab-file $VOCAB_FILE  \
+       --merge-file $MERGE_FILE  \
+       --seed 42
--- a/examples/llama2/llama2_13b.sh
+++ b/examples/llama2/llama2_13b.sh
+#!/bin/bash
+# Runs the "7B" parameter model
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export OMP_NUM_THREADS=1
+export NCCL_P2P_LEVEL=5
+#export HIP_ALLOC_INITIALIZE=0
+#export GPU_MAX_HW_QUEUES=20
+export NCCL_ALGO=Ring
+export NCCL_NCHANNELS_PER_PEER=16
+export NCCL_MIN_NCHANNELS=20
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+GPUS_PER_NODE=8
+# Change for multinode config
+MASTER_ADDR=localhost
+MASTER_PORT=6000
+NUM_NODES=1
+NODE_RANK=0
+WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
+CHECKPOINT_PATH=./tmp #$1 #<Specify path>
+TENSORBOARD_LOGS_PATH=./tmp  #$2 #<Specify path>
+#VOCAB_FILE=$3 #<Specify path to file>/gpt2-vocab.json
+#MERGE_FILE=$4 #<Specify path to file>/gpt2-merges.txt
+DATA_PATH="/root/megatron-llama/dataset/my-llama_text_document" #<Specify path and file prefix>_text_document
+TOKENIZER_PATH="/root/megatron-llama/tokenizer.model"
+DISTRIBUTED_ARGS=(
+    --nproc_per_node $GPUS_PER_NODE 
+    --nnodes $NUM_NODES 
+    --master_addr $MASTER_ADDR 
+    --master_port $MASTER_PORT
+)
+GPT_MODEL_ARGS=(
+    --num-layers 40 
+    --hidden-size 5120
+    --num-attention-heads 40
+    --ffn-hidden-size 13824
+    --seq-length 4096 
+    --max-position-embeddings 4096
+)
+TRAINING_ARGS=(
+    --transformer-impl local
+    --use-legacy-models 
+    --micro-batch-size 1 
+    --global-batch-size 60
+    --train-iters 5 
+    --weight-decay 0.1 
+    --adam-beta1 0.9 
+    --adam-beta2 0.95 
+    --init-method-std 0.006 
+    --clip-grad 1.0 
+    --bf16
+    --use-distributed-optimizer 
+    --use-flash-attn-triton
+    --recompute-activations
+    --disable-bias-linear
+    --attention-dropout 0
+    --hidden-dropout 0
+    --ddp-average-in-collective
+    --overlap-grad-reduce
+    --no-gradient-accumulation-fusion
+    --swiglu
+    --sequence-parallel
+    --lr 3.0e-5 
+    --lr-decay-style cosine 
+    --min-lr 3.0e-6
+    --lr-warmup-iters 1
+)
+#--use-flash-attn
+MODEL_PARALLEL_ARGS=(
+	--tensor-model-parallel-size 2 
+	--pipeline-model-parallel-size 4
+)
+DATA_ARGS=(
+    --data-path $DATA_PATH 
+    --split 949,50,1
+    --untie-embeddings-and-output-weights
+    --use-rotary-position-embeddings 
+    --normalization RMSNorm 
+    --no-position-embedding 
+    --tokenizer-model $TOKENIZER_PATH 
+    --tokenizer-type Llama2Tokenizer
+)
+EVAL_AND_LOGGING_ARGS=(
+    --log-interval 1
+    --save-interval 10000 
+    --eval-interval 1000 
+    --save $CHECKPOINT_PATH 
+    --load $CHECKPOINT_PATH 
+    --eval-iters 10
+    --tensorboard-dir $TENSORBOARD_LOGS_PATH 
+)
+torchrun ${DISTRIBUTED_ARGS[@]} pretrain_gpt.py \
+    ${GPT_MODEL_ARGS[@]} \
+    ${TRAINING_ARGS[@]} \
+    ${MODEL_PARALLEL_ARGS[@]} \
+    ${DATA_ARGS[@]} \
+    ${EVAL_AND_LOGGING_ARGS[@]}
--- a/examples/llama2/llama2_70b.sh
+++ b/examples/llama2/llama2_70b.sh
+#!/bin/bash
+# Runs the "7B" parameter model
+export HSA_FORCE_FINE_GRAIN_PCIE=1
+export OMP_NUM_THREADS=1
+export NCCL_P2P_LEVEL=5
+export HIP_ALLOC_INITIALIZE=0
+export GPU_MAX_HW_QUEUES=20
+export NCCL_ALGO=Ring
+export NCCL_NCHANNELS_PER_PEER=16
+export NCCL_MIN_NCHANNELS=20
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export NCCL_IB_HCA=mlx5_1,mlx5_2
+export NCCL_NET_GDR_LEVEL=SYS
+export NCCL_NET_GDR_READ=0
+lrank=$OMPI_COMM_WORLD_LOCAL_RANK
+RANK=$OMPI_COMM_WORLD_RANK
+WORLD_SIZE=$OMPI_COMM_WORLD_SIZE
+export NCCL_IB_TIMEOUT=22
+CHECKPOINT_PATH=./tmp #$1 #<Specify path>
+TENSORBOARD_LOGS_PATH=./tmp  #$2 #<Specify path>
+DATA_PATH="/root/megatron-llama/dataset/my-llama_text_document" #<Specify path and file prefix>_text_document
+TOKENIZER_PATH="/root/megatron-llama/tokenizer.model"
+DISTRIBUTED_ARGS=(
+    --nproc_per_node $GPUS_PER_NODE 
+    --nnodes $NUM_NODES 
+    --master_addr $MASTER_ADDR 
+    --master_port $MASTER_PORT
+)
+GPT_MODEL_ARGS=(
+    --num-layers 40 
+    --hidden-size 5120
+    --num-attention-heads 40
+    --ffn-hidden-size 13824
+    --seq-length 4096 
+    --max-position-embeddings 4096
+)
+TRAINING_ARGS=(
+    --transformer-impl local
+    --use-legacy-models 
+    --micro-batch-size 1 
+    --global-batch-size 60 
+    --train-iters 5 
+    --weight-decay 0.1 
+    --adam-beta1 0.9 
+    --adam-beta2 0.95 
+    --init-method-std 0.006 
+    --clip-grad 1.0 
+    --bf16
+    --use-distributed-optimizer 
+    --use-flash-attn-v2
+    --recompute-activations
+    --disable-bias-linear
+    --attention-dropout 0
+    --hidden-dropout 0
+    --ddp-average-in-collective
+    --overlap-grad-reduce
+    --no-gradient-accumulation-fusion
+    --swiglu
+    --lr 3.0e-5 
+    --lr-decay-style cosine 
+    --min-lr 3.0e-6
+    --lr-warmup-iters 1
+)
+#--use-flash-attn
+MODEL_PARALLEL_ARGS=(
+	--tensor-model-parallel-size 2 
+	--pipeline-model-parallel-size 4 
+)
+DATA_ARGS=(
+    --data-path $DATA_PATH 
+    --split 949,50,1
+    --untie-embeddings-and-output-weights
+    --use-rotary-position-embeddings 
+    --normalization RMSNorm 
+    --no-position-embedding 
+    --tokenizer-model $TOKENIZER_PATH 
+    --tokenizer-type Llama2Tokenizer
+)
+EVAL_AND_LOGGING_ARGS=(
+    --log-interval 1
+    --save-interval 10000 
+    --eval-interval 1000 
+    --save $CHECKPOINT_PATH 
+    --load $CHECKPOINT_PATH 
+    --eval-iters 10
+    --tensorboard-dir $TENSORBOARD_LOGS_PATH 
+)
+APP="python3  -u pretrain_gpt.py \
+     $GPT_MODEL_ARGS[@] \
+     $TRAINING_ARGS[@] \
+     $MODEL_PARALLEL_ARGS[@] \
+     $DATA_ARGS[@] \
+     $EVAL_AND_LOGGING_ARGS[@]
+     --rank ${RANK} \
+     --world_size ${WORLD_SIZE} \
+     --dist_url tcp://${1}:34566 \
+    "
+case ${lrank} in
+[0])
+  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  ;;
+[1])
+  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  ;;
+[2])
+  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  ;;
+[3])
+  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  ;;
+[4])
+  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  ;;
+[5])
+  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  ;;
+[6])
+  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  ;;
+[7])
+  export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+  ${APP}
+  ;;
+esac
--- a/examples/llama2/llama2_7b.sh
+++ b/examples/llama2/llama2_7b.sh
--- a/examples/llama2/test.sh
+++ b/examples/llama2/test.sh
--- a/examples/llama2/train_gpt3_175b_distributed.sh
+++ b/examples/llama2/train_gpt3_175b_distributed.sh
--- a/examples/mamba/.gitignore
+++ b/examples/mamba/.gitignore
+checkpoints/
+data-cache/
+tensorboard/
+triton-cache/
--- a/examples/mamba/Dockerfile
+++ b/examples/mamba/Dockerfile
+FROM nvcr.io/nvidia/pytorch:23.12-py3
+RUN pip uninstall -y causal-conv1d triton && \
+    pip install causal-conv1d==1.2.2.post1 sentencepiece==0.1.99 triton==2.1.0 flask-restful
+WORKDIR /tmp
+RUN git clone https://github.com/state-spaces/mamba.git && \
+    cd mamba && \
+    git checkout v2.0.3 && \
+    python setup.py install && \
+    cd .. && \
+    rm -rf mamba
--- a/examples/mamba/README.md
+++ b/examples/mamba/README.md
--- a/examples/mamba/run_text_gen_server_8b.sh
+++ b/examples/mamba/run_text_gen_server_8b.sh
--- a/examples/mamba/run_text_gen_server_8b_gpt3.sh
+++ b/examples/mamba/run_text_gen_server_8b_gpt3.sh
--- a/examples/mamba/train.sh
+++ b/examples/mamba/train.sh
--- a/examples/multimodal/Dockerfile
+++ b/examples/multimodal/Dockerfile
+FROM nvcr.io/nvidia/pytorch:24.02-py3
+RUN apt update && \
+    apt -y upgrade && \
+    apt install -y --no-install-recommends \
+        software-properties-common \
+        build-essential \
+        python3-pip \
+        python3-dev \
+        bash \
+        git \
+        vim \
+        python-is-python3 \
+        default-jre
+RUN pip install --upgrade pip
+RUN pip install einops einops-exts sentencepiece braceexpand webdataset
+RUN pip install transformers datasets
+RUN pip install pytest-cov pytest_mock nltk wrapt
+RUN pip install zarr "tensorstore==0.1.45"
+RUN pip install git+https://github.com/fanshiqing/grouped_gemm@main
+RUN pip install black==19.10b0 isort click==8.0.2
+RUN pip install pycocoevalcap megatron-energon
+RUN pip install git+https://github.com/openai/CLIP.git
+# Use --no-deps for the following to avoid outdated and unnecessary dependencies.
+RUN pip install mmf --no-deps
+RUN pip install open-flamingo[eval] --no-deps
--- a/examples/multimodal/README.md
+++ b/examples/multimodal/README.md
--- a/examples/multimodal/clip_converter.py
+++ b/examples/multimodal/clip_converter.py