"""Utility to get the relevant backend for running inference
This function automatically chooses the TRTLLMBackend when possible and otherwise reverts to the Mcore backend if the user does not specify any backend. The TRT LLM backend is not implemented yet.
Args:
args (Namespace): The user arguments parsed from command line
You can modify these parameters directly in the `train_llama3_8b_h100_fp8.sh` script.
This configuration follows those defined in NeMo Framework's performance scripts, which can be found at [https://github.com/NVIDIA/NeMo/tree/main/scripts/performance](https://github.com/NVIDIA/NeMo/tree/main/scripts/performance).
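For illustration, overriding a few of these settings might look like the sketch below; the variable names are assumptions and may differ from those actually used in `train_llama3_8b_h100_fp8.sh`, so check the script before editing:
```
# Hypothetical variable names -- check train_llama3_8b_h100_fp8.sh for the
# actual parameter names before editing.
TP=1             # tensor-parallel size
PP=1             # pipeline-parallel size
CP=1             # context-parallel size
GBS=128          # global batch size
MBS=1            # micro batch size
SEQ_LENGTH=8192  # sequence length
```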
### FP8 Performance
| Model | #-GPUs | GBS | MBS | Seq Length | TP | PP | CP | VP | EP | GA | Tokens/sec/GPU | TFLOP/sec/GPU |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
As NeMo uses Megatron-Core, for the latest performance benchmarks, please refer to the official [NeMo documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/performance/performance_summary.html).
- **Hardware**: Requires NVIDIA Hopper, Ada, or Blackwell GPUs for FP8 support.
- **Troubleshooting**: If you encounter NaN values or instability with FP8 training, please refer to [Transformer Engine](https://github.com/NVIDIA/TransformerEngine).
Inference with Mixtral 8x7B requires at least 2 GPUs, so a distributed checkpoint with EP>=2 or PP>=2, converted with the above script, is needed.
Megatron-LM includes a simple REST server for text generation in `tools/run_text_generation_server.py`. Launch it with the following script:
```
#!/bin/bash
# This example will start serving the Mixtral 8x7B model.
DISTRIBUTED_ARGS="--nproc_per_node 2 \
                  --nnodes 1 \
                  --node_rank 0 \
                  --master_addr localhost \
                  --master_port 6000"

CHECKPOINT=<Path to checkpoint>
TOKENIZER_MODEL=<Path to tokenizer (e.g. /tokenizer.model)>
```
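The script above stops before the actual server launch. A minimal sketch of the missing `torchrun` invocation is shown below, assuming a checkpoint converted with TP=1, PP=1, and EP=2; the flag values are assumptions and must match your converted checkpoint rather than be copied verbatim:
```
# Sketch only: parallel sizes assume a checkpoint converted with TP=1, PP=1,
# EP=2. The Mixtral architecture flags (--num-layers, --hidden-size,
# --ffn-hidden-size, --num-attention-heads, --num-experts, ...) must also be
# supplied and must match the converted checkpoint.
torchrun $DISTRIBUTED_ARGS tools/run_text_generation_server.py \
    --tensor-model-parallel-size 1 \
    --pipeline-model-parallel-size 1 \
    --expert-model-parallel-size 2 \
    --load ${CHECKPOINT} \
    --tokenizer-type Llama2Tokenizer \
    --tokenizer-model ${TOKENIZER_MODEL} \
    --micro-batch-size 1 \
    --seq-length 1024 \
    --max-position-embeddings 32768
```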
The above also applies to Mixtral 8x22B; set the model config (including hidden_size/head_num/num_layers/ffn_hidden_size) according to the original [config](https://huggingface.co/mistralai/Mixtral-8x22B-v0.1/blob/main/config.json).
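As a rough illustration, the corresponding architecture flags for Mixtral 8x22B would look like the sketch below; `MODEL_ARGS` is a hypothetical helper variable, and the values are read from the HuggingFace config linked above, so verify them against that file before use:
```
# Approximate Mixtral 8x22B architecture flags -- verify each value against
# the HuggingFace config.json linked above, then append MODEL_ARGS to the
# server launch command.
MODEL_ARGS="--num-layers 56 \
            --hidden-size 6144 \
            --ffn-hidden-size 16384 \
            --num-attention-heads 48 \
            --group-query-attention \
            --num-query-groups 8 \
            --num-experts 8"
```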
## Acknowledgements
Contributors outside NVIDIA for the HuggingFace converter and example of Mixtral models in Megatron-Core:
*NOTE: This example is under active development and is expected to change.*
The following walks through all the steps required to pretrain and instruction tune a llava architecture vision-language model (VLM). It is important to precisely follow all steps to obtain the benchmark scores at the end.
This example has been tested on an A100-based DGX cluster. Pretraining and instruction tuning took approximately 1 day and 11 hours, respectively, on 64 GPUs using four-way tensor parallelism (tp=4). Training speed will scale approximately linearly with the number of GPUs available.
Multimodal support in megatron is still under active development. This example is not intended to produce state-of-the-art model quality (that would require more data and model refinements); it is merely intended to demonstrate the multimodal functionality in megatron. If you hit any problems, please open a GitHub issue.
## Setup
### Docker container
You can build a docker container using `examples/multimodal/Dockerfile` to run this example.
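For example, a typical build-and-run sequence might look like the following; the image tag and mounted paths are placeholders:
```
# Build the image from the repository root (the tag name is arbitrary).
docker build -f examples/multimodal/Dockerfile -t megatron-multimodal .

# Start an interactive container with GPU access; adjust the mount to your data.
docker run --gpus all -it --rm -v /path/to/data:/data megatron-multimodal
```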
### Language model
Follow the instructions in [Mistral](../../docs/llama_mistral.md#mistral-7b) to download weights for Mistral-7B-Instruct-v0.3 from HuggingFace and convert to mcore format with tensor parallel size 4.
Please use the tokenizer from HuggingFace.
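For reference, one way to fetch the weights and tokenizer from HuggingFace is sketched below; the local directory is an example path, and the model repository is gated, so you may need to accept the license and authenticate first:
```
# Authenticate if you have not already done so (the model repo is gated).
huggingface-cli login

# Download the Mistral-7B-Instruct-v0.3 weights and tokenizer files.
huggingface-cli download mistralai/Mistral-7B-Instruct-v0.3 --local-dir /some/dir/Mistral-7B-Instruct-v0.3
```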
### Vision model
This example uses the OpenAI CLIP `ViT-L/14@336px` Vision model. To download the weights from OpenAI and convert them to a format that can be loaded in megatron, please run the following:
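The exact converter command is not reproduced here; a rough sketch is shown below, where the script path and flags are assumptions and should be checked against the contents of `examples/multimodal` in your Megatron-LM checkout:
```
# Hypothetical invocation -- verify the converter script name and its flags
# in examples/multimodal before running.
python examples/multimodal/clip_converter.py \
    --download-root /some/download/folder \
    --output /some/output/folder \
    --tensor-parallel-size 4 \
    --use-te
```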
Update the paths to point to the mcore converted CLIP and Mistral models and run the following script to combine the Mistral and CLIP models into a single multimodal checkpoint folder:
4. Run the following command to convert to megatron-energon format:
```
cd <LLaVA-Pretrain dir>/wds
energon prepare ./
```
Select the following values for the presented options:
```
> Please enter a desired train/val/test split like "0.5, 0.2, 0.3" or "8,1,1": 9,1,0
> Do you want to create a dataset.yaml interactively? [Y/n]: Y
> Please enter a number to choose a class: 10 (VQAWebdataset)
> Do you want to set a simple field_map[Y] (or write your own sample_loader [n])? [Y/n]: Y
> Please enter a webdataset field name for 'image' (<class 'torch.Tensor'>): jpg
> Please enter a webdataset field name for 'context' (<class 'str'>): json[0][value]
> Please enter a webdataset field name for 'answers' (typing.Optional[typing.List[str]], default: None): json[1][value]
> Please enter a webdataset field name for 'answer_weights' (typing.Optional[torch.Tensor], default: None):
```
5. Update `pretrain_dataset.yaml` so that both `path` variables point to `LLaVA-Pretrain/wds`
6. Run the following script to pretrain a llava model for image captioning:
```
cd <megatron-lm dir>
examples/multimodal/pretrain_mistral_clip.sh
```
All being well, you should observe training and validation loss curves similar to the following:
<img src="assets/pretrain_curves.png" alt="Pretraining loss curves" width="600"/>
These curves were obtained with a global batch size of 256. Changing this value will likely change the curves. For pretraining and instruction tuning llava models, we have found that loss curves are an unreliable predictor of downstream task performance. Therefore, it is necessary to run test generation and evaluation on a range of metrics to understand model quality. We intend to add training-time zero-shot evaluation in a future update.
You can execute the pretraining script multiple times to resume training. On resuming, the latest model, optimizer, and dataloader state are loaded.
### SFT
1. Prepare an instruction tuning dataset in [megatron-energon format](https://nvidia.github.io/Megatron-Energon/data_prep.html#). NOTE: we do not provide instructions for this.
2. Update `sft_dataset.yaml` so that both `path` variables point to the train and val splits of your instruction tuning dataset.
3. Run the following script to instruction tune the pretrained llava model:
```
examples/multimodal/sft_mistral_clip.sh
```
You can execute the SFT script multiple times to resume training. On resuming, the latest model, optimizer, and dataloader state are loaded.
For the mistral-7b-instruct plus clip llava model, you should obtain a COCO CIDEr score of approximately 94.
### After SFT
#### MMMU
The official MMMU repository is currently not pip-installable, so please clone their code into `examples/multimodal` by running `git clone https://github.com/MMMU-Benchmark/MMMU.git`.
The MMMU dataset is loaded from HuggingFace automatically as part of the code.
Run text generation using `--task MMMU`. Then, run the following command:
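The evaluation command itself is not reproduced here; a rough sketch is shown below, where the script name and flag are assumptions and should be checked against the evaluation helpers in `examples/multimodal`:
```
# Hypothetical -- check examples/multimodal for the actual MMMU evaluation
# script and its expected arguments.
python examples/multimodal/evaluate_mmmu.py --input-path /path/to/generated/outputs
```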