# BERT MODEL
## Table of contents
- [1. Training Setup](#1-training-setup)
- [2. Configurations](#2-configurations)
## 1. Training setup
<a id="markdown-training-setup" name="training-setup"></a>
To run the model using a Docker container, run it as follows:
```
PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:24.01-py3
CHECKPOINT_PATH="" #<Specify path>
TENSORBOARD_LOGS_PATH="" #<Specify path>
VOCAB_FILE="" #<Specify path to file>/bert-vocab.txt
DATA_PATH="" #<Specify path and file prefix>_text_document
docker run \
--gpus=all \
--ipc=host \
--workdir /workspace/megatron-lm \
-v /path/to/data:/path/to/data \
-v /path/to/megatron-lm:/workspace/megatron-lm \
$PYTORCH_IMAGE \
bash examples/bert/train_bert_340m_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $DATA_PATH
```
NOTE: Depending on the environment you are running it in, the above command may look slightly different.
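If you already have a working Megatron-LM environment, you can also skip Docker and invoke the training script directly (a sketch, assuming the same placeholder paths defined above):
```
bash examples/bert/train_bert_340m_distributed.sh $CHECKPOINT_PATH $TENSORBOARD_LOGS_PATH $VOCAB_FILE $DATA_PATH
```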
## 2. Configurations
<a id="markdown-configurations" name="configurations"></a>
The example in this folder shows you how to run the 340M parameter model. Other configurations you can run are listed below; see the sketch after the 20B configuration for how to apply these flags.
### 4B
```
--num-layers 48 \
--hidden-size 2560 \
--num-attention-heads 32 \
--tensor-model-parallel-size 1 \
--pipeline-model-parallel-size 1 \
```
### 20B
```
--num-layers 48 \
--hidden-size 6144 \
--num-attention-heads 96 \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 4 \
```
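These flags are not passed on the command line; they replace the corresponding entries in the `BERT_MODEL_ARGS` and `MODEL_PARALLEL_ARGS` arrays inside `examples/bert/train_bert_340m_distributed.sh`. Below is a minimal sketch of the 20B variant, keeping the 340M script's sequence-length settings (an assumption; adjust to your setup):
```
BERT_MODEL_ARGS=(
    --num-layers 48
    --hidden-size 6144
    --num-attention-heads 96
    --seq-length 512
    --max-position-embeddings 512
)

MODEL_PARALLEL_ARGS=(
    --tensor-model-parallel-size 4
    --pipeline-model-parallel-size 4
)
```
Since TP x PP = 16, this sketch needs at least 16 GPUs (for example, two 8-GPU nodes with `NUM_NODES=2`).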
#!/bin/bash
# Runs the "340M" parameter model (BERT-Large)
export CUDA_DEVICE_MAX_CONNECTIONS=1
GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))
CHECKPOINT_PATH=$1 #<Specify path>
TENSORBOARD_LOGS_PATH=$2 #<Specify path>
VOCAB_FILE=$3 #<Specify path to file>/bert-vocab.txt
DATA_PATH=$4 #<Specify path and file prefix>_text_document
DISTRIBUTED_ARGS=(
--nproc_per_node $GPUS_PER_NODE
--nnodes $NUM_NODES
--master_addr $MASTER_ADDR
--master_port $MASTER_PORT
)
BERT_MODEL_ARGS=(
--num-layers 24
--hidden-size 1024
--num-attention-heads 16
--seq-length 512
--max-position-embeddings 512
--attention-backend auto # Can use (flash/fused/unfused/local)
)
TRAINING_ARGS=(
--micro-batch-size 4
--global-batch-size 32
--train-iters 1000000
--weight-decay 1e-2
--clip-grad 1.0
--fp16
--lr 0.0001
--lr-decay-iters 990000
--lr-decay-style linear
--min-lr 1.0e-5
--lr-warmup-fraction .01
)
MODEL_PARALLEL_ARGS=(
--tensor-model-parallel-size 8 # TP x PP must not exceed WORLD_SIZE (8 GPUs in this config)
--pipeline-model-parallel-size 1
)
DATA_ARGS=(
--data-path $DATA_PATH
--vocab-file $VOCAB_FILE
--split 949,50,1
)
EVAL_AND_LOGGING_ARGS=(
--log-interval 100
--save-interval 10000
--eval-interval 1000
--save $CHECKPOINT_PATH
--load $CHECKPOINT_PATH
--eval-iters 10
--tensorboard-dir $TENSORBOARD_LOGS_PATH
)
torchrun ${DISTRIBUTED_ARGS[@]} pretrain_bert.py \
${BERT_MODEL_ARGS[@]} \
${TRAINING_ARGS[@]} \
${MODEL_PARALLEL_ARGS[@]} \
${DATA_ARGS[@]} \
${EVAL_AND_LOGGING_ARGS[@]}
#!/bin/bash
# Evaluate natural question test data given Wikipedia embeddings and pretrained
# ICT model or a finetuned model for Natural Question task
# Datasets can be downloaded from the following link:
# https://github.com/facebookresearch/DPR/blob/master/data/download_data.py
EVIDENCE_DATA_DIR=<Specify path of Wikipedia dataset>
EMBEDDING_PATH=<Specify path of the embeddings>
CHECKPOINT_PATH=<Specify path of pretrained ICT model or finetuned model>
QA_FILE=<Path of the natural question dev or test dataset>
python tasks/main.py \
--task RETRIEVER-EVAL \
--tokenizer-type BertWordPieceLowerCase \
--num-layers 12 \
--hidden-size 768 \
--num-attention-heads 12 \
--tensor-model-parallel-size 1 \
--micro-batch-size 128 \
--seq-length 512 \
--max-position-embeddings 512 \
--load ${CHECKPOINT_PATH} \
--evidence-data-path ${EVIDENCE_DATA_DIR} \
--embedding-path ${EMBEDDING_PATH} \
--retriever-seq-length 256 \
--vocab-file bert-vocab.txt \
--qa-data-test ${QA_FILE} \
--faiss-use-gpu \
--retriever-report-topk-accuracies 1 5 20 100 \
--fp16 \
--indexer-log-interval 1000 \
--indexer-batch-size 128
#!/bin/bash
WORLD_SIZE=8
DISTRIBUTED_ARGS="--nproc_per_node $WORLD_SIZE \
--nnodes 1 \
--node_rank 0 \
--master_addr localhost \
--master_port 6000"
TASK="LAMBADA"
VALID_DATA=<lambada path>
VOCAB_FILE=gpt2-vocab.json
MERGE_FILE=gpt2-merges.txt
CHECKPOINT=checkpoints/gpt2_345m
python -m torch.distributed.launch $DISTRIBUTED_ARGS ./tasks/main.py \
--task $TASK \
--valid-data $VALID_DATA \
--tokenizer-type GPT2BPETokenizer \
--strict-lambada \
--vocab-file $VOCAB_FILE \
--merge-file $MERGE_FILE \
--load $CHECKPOINT \
--tensor-model-parallel-size 1 \
--num-layers 24 \
--hidden-size 1024 \
--num-attention-heads 16 \
--batch-size 8 \
--seq-length 1024 \
--max-position-embeddings 1024 \
--log-interval 10 \
--fp16 \
--no-load-optim \
--no-load-rng
# Megatron Core Export
This module is used to export Megatron Core models to different inference frameworks.
Currently we support TRTLLM export. In the future we will be adding support for vLLM and others.
## PTQ AND EXPORT
Follow the instructions in [ptq_and_trtllm_export](./ptq_and_trtllm_export) to do post-training quantization, followed by an export to the TRTLLM checkpoint format.
# TRTLLM EXPORT
Follow the instructions in [trtllm_export](./trtllm_export/) to export to the TRTLLM checkpoint format alone.
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Pretrain GPT."""
import os
import sys
from functools import partial
# This file is not located in the project root, so add the repository root to sys.path so that the megatron imports resolve.
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))
from megatron.core import mpu
from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
from megatron.core.datasets.gpt_dataset import GPTDataset, GPTDatasetConfig, MockGPTDataset
from megatron.core.datasets.utils import get_blend_from_list
from megatron.core.enums import ModelType
from megatron.core.models.gpt import GPTModel
from megatron.core.utils import StragglerDetector
from megatron.inference.arguments import add_modelopt_args
from megatron.inference.gpt import loss_func, model_provider
from megatron.training import get_args, get_timers, get_tokenizer, pretrain
from megatron.training.utils import (
get_batch_on_this_cp_rank,
get_batch_on_this_tp_rank,
print_rank_0,
)
stimer = StragglerDetector()
def get_batch(data_iterator):
"""Generate a batch."""
# TODO: this is pretty hacky, find a better way
if (not mpu.is_pipeline_first_stage()) and (not mpu.is_pipeline_last_stage()):
return None, None, None, None, None
# get batches based on the TP rank you are on
batch = get_batch_on_this_tp_rank(data_iterator)
# slice batch along sequence dimension for context parallelism
batch = get_batch_on_this_cp_rank(batch)
return batch.values()
def forward_step(data_iterator, model: GPTModel):
"""Forward training step.
Args:
data_iterator : Input data iterator
model (GPTModel): The GPT Model
"""
timers = get_timers()
# Get the batch.
timers('batch-generator', log_level=2).start()
global stimer
with stimer(bdata=True):
tokens, labels, loss_mask, attention_mask, position_ids = get_batch(data_iterator)
timers('batch-generator').stop()
with stimer:
output_tensor = model(tokens, position_ids, attention_mask, labels=labels)
# [ModelOpt]: model is needed to access ModelOpt distillation losses
return output_tensor, partial(loss_func, loss_mask, model)
def is_dataset_built_on_rank():
return (
mpu.is_pipeline_first_stage() or mpu.is_pipeline_last_stage()
) and mpu.get_tensor_model_parallel_rank() == 0
def core_gpt_dataset_config_from_args(args):
tokenizer = get_tokenizer()
return GPTDatasetConfig(
random_seed=args.seed,
sequence_length=args.seq_length,
blend=get_blend_from_list(args.data_path),
blend_per_split=[
get_blend_from_list(args.train_data_path),
get_blend_from_list(args.valid_data_path),
get_blend_from_list(args.test_data_path),
],
split=args.split,
num_dataset_builder_threads=args.num_dataset_builder_threads,
path_to_cache=args.data_cache_path,
mmap_bin_files=args.mmap_bin_files,
tokenizer=tokenizer,
reset_position_ids=args.reset_position_ids,
reset_attention_mask=args.reset_attention_mask,
eod_mask_loss=args.eod_mask_loss,
create_attention_mask=args.create_attention_mask_in_dataloader,
)
def train_valid_test_datasets_provider(train_val_test_num_samples):
"""Build the train test and validation datasets.
Args:
train_val_test_num_samples : A list containing the number of samples in train test and validation.
"""
args = get_args()
config = core_gpt_dataset_config_from_args(args)
if args.mock_data:
dataset_type = MockGPTDataset
else:
dataset_type = GPTDataset
print_rank_0("> building train, validation, and test datasets for GPT ...")
train_ds, valid_ds, test_ds = BlendedMegatronDatasetBuilder(
dataset_type, train_val_test_num_samples, is_dataset_built_on_rank, config
).build()
print_rank_0("> finished creating GPT datasets ...")
return train_ds, valid_ds, test_ds
if __name__ == "__main__":
# Temporary for transition to core datasets
train_valid_test_datasets_provider.is_distributed = True
pretrain(
train_valid_test_datasets_provider,
model_provider,
ModelType.encoder_or_decoder,
forward_step,
args_defaults={"tokenizer_type": "GPT2BPETokenizer"},
extra_args_provider=add_modelopt_args,
)
# Megatron Model Optimization and Deployment
## Installation
We recommend that users follow TensorRT-LLM's official installation guide to build it from source
and proceed with a containerized environment (`docker.io/tensorrt_llm/release:latest`):
```sh
git clone https://github.com/NVIDIA/TensorRT-LLM.git
cd TensorRT-LLM
git checkout v0.10.0
make -C docker release_build
```
> **TROUBLESHOOTING:** rather than copying each folder separately in `docker/Dockerfile.multi`,
> you may need to copy the entire directory as `COPY ./ /src/tensorrt_llm`, since `git submodule` is
> invoked later and requires `.git` to be present.
Once the container is built, install `nvidia-modelopt` and additional dependencies for sharded checkpoint support:
```sh
pip install "nvidia-modelopt[all]~=0.13.0" --extra-index-url https://pypi.nvidia.com
pip install zarr tensorstore==0.1.45
```
TensorRT-LLM quantization functionalities are currently packaged in `nvidia-modelopt`.
You can find more documentation about `nvidia-modelopt` [here](https://nvidia.github.io/TensorRT-Model-Optimizer/).
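As a quick sanity check that the package is importable inside the container (a suggested check, not part of the original flow):
```sh
python -c "import modelopt; import importlib.metadata as m; print(m.version('nvidia-modelopt'))"
```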
## Support Matrix
The following matrix shows the current support for the PTQ + TensorRT-LLM export flow.
| model | fp16 | int8_sq | fp8 | int4_awq |
|-----------------------------|------|---------| ----| -------- |
| nextllm-2b | x | x | x | |
| nemotron3-8b | x | | x | |
| nemotron3-15b | x | | x | |
| llama2-text-7b | x | x | x | TP2 |
| llama2-chat-70b | x | x | x | TP4 |
Our PTQ + TensorRT-LLM flow has native support for the MCore `GPTModel` with a mixed layer spec (native `ParallelLinear`
and Transformer-Engine `TENorm`). Note that this is not the default MCore GPT spec. You can still load the
following checkpoint formats with some remedy:
| GPTModel | sharded | remedy arguments |
|-----------------------------------|---------|---------------------------------------------|
| megatron.legacy.model | | `--export-legacy-megatron` |
| TE-Fused (default mcore gpt spec) | | `--export-te-mcore-model` |
| TE-Fused (default mcore gpt spec) | x | |
> **TROUBLESHOOTING:** If you are trying to load an unpacked `.nemo` sharded checkpoint, then typically you will
> need to add `additional_sharded_prefix="model."` to `modelopt_load_checkpoint()`, since NeMo has an additional
> `model.` wrapper on top of the `GPTModel`.
> **NOTE:** flag `--export-legacy-megatron` may not work on all legacy checkpoint versions.
## Examples
> **NOTE:** we only provide a simple text generation script to test the generated TensorRT-LLM engines. For
> a production-level API server or enterprise support, see [NeMo](https://github.com/NVIDIA/NeMo) and TensorRT-LLM's
> backend for [NVIDIA Triton Inference Server](https://developer.nvidia.com/nvidia-triton-inference-server).
### Minitron-8B FP8 Quantization and TensorRT-LLM Deployment
First download the Minitron checkpoint from https://huggingface.co/nvidia/Minitron-8B-Base, extract the
sharded checkpoint from the `.nemo` tarball, and fix the tokenizer file name.
> **NOTE:** The following cloning method uses `ssh` and assumes you have registered your SSH key with Hugging Face.
> If you want to clone with `https`, then run `git clone https://huggingface.co/nvidia/Minitron-8B-Base` with an access token.
```sh
git lfs install
git clone git@hf.co:nvidia/Minitron-8B-Base
cd Minitron-8B-Base/nemo
tar -xvf minitron-8b-base.nemo
cd ../..
```
Now launch the PTQ + TensorRT-LLM export script,
```sh
bash examples/export/ptq_and_trtllm_export/ptq_trtllm_minitron_8b.sh ./Minitron-8B-Base None
```
By default, `cnn_dailymail` is used for calibration. The `GPTModel` will have quantizers inserted to simulate the
quantization effect. The checkpoint can optionally be saved (with the quantizers as additional states) and
restored for further evaluation or quantization-aware training. By default, the TensorRT-LLM checkpoint is exported to `/tmp/trtllm_ckpt` and
the engine is built in `/tmp/trtllm_engine`.
The script expects `${CHECKPOINT_DIR}` (`./Minitron-8B-Base/nemo`) to have the following structure:
> **NOTE:** The `.nemo` checkpoints after extraction (including the examples below) should all have the following structure.
```
├── model_weights
│ ├── common.pt
│ ...
├── model_config.yaml
│...
```
> **NOTE:** The script is using `TP=8`. Change `$TP` in the script if your checkpoint has a different tensor
> model parallelism.
Then build the TensorRT engine and run the text generation example using the newly built engine:
```sh
export trtllm_options=" \
--checkpoint_dir /tmp/trtllm_ckpt \
--output_dir /tmp/trtllm_engine \
--max_input_len 2048 \
--max_seq_len 512 \
--max_batch_size 8 "
trtllm-build ${trtllm_options}
python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer nvidia/Minitron-8B-Base
```
### mistral-12B FP8 Quantization and TensorRT-LLM Deployment
First download the Mistral NeMo checkpoint from https://huggingface.co/nvidia/Mistral-NeMo-12B-Base and extract the
sharded checkpoint from the `.nemo` tarball.
> **NOTE:** The following cloning method uses `ssh` and assumes you have registered your SSH key with Hugging Face.
> If you want to clone with `https`, then run `git clone https://huggingface.co/nvidia/Mistral-NeMo-12B-Base` with an access token.
```sh
git lfs install
git clone git@hf.co:nvidia/Mistral-NeMo-12B-Base
cd Mistral-NeMo-12B-Base
tar -xvf Mistral-NeMo-12B-Base.nemo
cd ..
```
Then log in to Hugging Face so that you can access the model.
> **NOTE:** You need a token generated from huggingface.co/settings/tokens and access to mistralai/Mistral-Nemo-Base-2407 on Hugging Face.
```sh
pip install -U "huggingface_hub[cli]"
huggingface-cli login
```
Now launch the PTQ + TensorRT-LLM checkpoint export script,
```sh
bash examples/export/ptq_and_trtllm_export/ptq_trtllm_mistral_12b.sh ./Mistral-NeMo-12B-Base None
```
Then build the TensorRT engine and run the text generation example using the newly built engine:
```sh
export trtllm_options=" \
--checkpoint_dir /tmp/trtllm_ckpt \
--output_dir /tmp/trtllm_engine \
--max_input_len 2048 \
--max_seq_len 512 \
--max_batch_size 8 "
trtllm-build ${trtllm_options}
python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer mistralai/Mistral-Nemo-Base-2407
```
### llama2-text-7b INT8 SmoothQuant and TensorRT-LLM Deployment
> **NOTE:** Due to license restrictions, we do not provide an MCore checkpoint to download. Users can follow
> the instructions in `docs/llama2.md` to convert the checkpoint to the Megatron legacy `GPTModel` format and
> use the `--export-legacy-megatron` flag, which will remap the checkpoint to the MCore `GPTModel` spec
> that we support.
```sh
bash examples/export/ptq_and_trtllm_export/ptq_trtllm_llama_7b.sh ${CHECKPOINT_DIR}
```
The script expects `${CHECKPOINT_DIR}` to have the following structure:
```
├── hf
│ ├── tokenizer.config
│ ├── tokenizer.model
│ ...
├── iter_0000001
│ ├── mp_rank_00
│ ...
├── latest_checkpointed_iteration.txt
```
In short, in addition to the converted Llama Megatron checkpoint, also place the Hugging Face checkpoint inside the directory as
the source of the tokenizer.
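A sketch of how that directory might be assembled, assuming the converted Megatron checkpoint lives in `./llama2-7b-megatron` and the Hugging Face checkpoint has been cloned to `./Llama-2-7b-hf` (both paths are hypothetical):
```sh
# Hypothetical paths; adjust to wherever your converted checkpoint and HF clone live.
CHECKPOINT_DIR=./llama2-7b-megatron
mkdir -p ${CHECKPOINT_DIR}/hf
cp ./Llama-2-7b-hf/tokenizer* ${CHECKPOINT_DIR}/hf/
```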
Then build the TensorRT engine and run the text generation example using the newly built engine:
```sh
export trtllm_options=" \
--checkpoint_dir /tmp/trtllm_ckpt \
--output_dir /tmp/trtllm_engine \
--max_input_len 2048 \
--max_seq_len 512 \
--max_batch_size 8 "
trtllm-build ${trtllm_options}
python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer meta-llama/Llama-2-7b
```
### llama3-8b / llama3.1-8b INT8 SmoothQuant and TensorRT-LLM Deployment
> **NOTE:** For llama3.1, the missing rope_scaling parameter will be fixed in modelopt-0.19 and trtllm-0.13.
> **NOTE:** There are two ways to acquire the checkpoint. Users can follow
> the instructions in `docs/llama2.md` to convert the checkpoint to the Megatron legacy `GPTModel` format and
> use the `--export-legacy-megatron` flag, which will remap the checkpoint to the MCore `GPTModel` spec
> that we support.
> Alternatively, users can download the [nemo model](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/llama38bnemo) from NGC and extract the sharded checkpoint from the `.nemo` tarball.
If users choose to download the model from NGC, first extract the sharded checkpoint from the `.nemo` tarball.
```sh
tar -xvf 8b_pre_trained_bf16.nemo
```
> **NOTE:** You need a token generated from huggingface.co/settings/tokens and access to meta-llama/Llama-3.1-8B or meta-llama/Llama-3-8B on Hugging Face.
```sh
pip install -U "huggingface_hub[cli]"
huggingface-cli login
```
Now launch the PTQ + TensorRT-LLM checkpoint export script for llama-3,
```sh
bash examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_8b.sh ./llama-3-8b-nemo_v1.0 None
```
or for llama-3.1:
```sh
bash examples/export/ptq_and_trtllm_export/ptq_trtllm_llama3_1_8b.sh ./llama-3_1-8b-nemo_v1.0 None
```
Then build the TensorRT engine and run the text generation example using the newly built engine:
```sh
export trtllm_options=" \
--checkpoint_dir /tmp/trtllm_ckpt \
--output_dir /tmp/trtllm_engine \
--max_input_len 2048 \
--max_seq_len 512 \
--max_batch_size 8 "
trtllm-build ${trtllm_options}
# For llama-3
python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3-8B
# For llama-3.1
python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer meta-llama/Meta-Llama-3.1-8B
```
### Mixtral-8x7B FP8 Quantization and TensorRT-LLM Deployment
First download the Mixtral checkpoint from https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/mixtral-8x7b-v01 and extract the
sharded checkpoint from the `.nemo` tarball.
```sh
ngc registry model download-version "nvidia/nemo/mixtral-8x7b-v01:1.0"
cd mixtral-8x7b-v01_v1.0
tar -xvf mixtral.nemo
cd ..
```
Then log in to Hugging Face so that you can access the model.
> **NOTE:** You need a token generated from huggingface.co/settings/tokens and access to mistralai/Mixtral-8x7B-v0.1 on Hugging Face.
```sh
pip install -U "huggingface_hub[cli]"
huggingface-cli login
```
Now launch the PTQ + TensorRT-LLM checkpoint export script,
```sh
bash examples/export/ptq_and_trtllm_export/ptq_trtllm_mixtral_8x7b.sh ./mixtral-8x7b-v01_v1.0/
```
Then build the TensorRT engine and run the text generation example using the newly built engine:
```sh
export trtllm_options=" \
--checkpoint_dir /tmp/trtllm_ckpt \
--output_dir /tmp/trtllm_engine \
--max_input_len 2048 \
--max_seq_len 512 \
--max_batch_size 8 "
trtllm-build ${trtllm_options}
python examples/export/ptq_and_trtllm_export/trtllm_text_generation.py --tokenizer mistralai/Mixtral-8x7B-v0.1
```