# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Generate text using a vision language model."""
import json
import logging
import os
import sys
from functools import partial
from typing import List, Dict
# Add megatron to the path.
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
)
import torch
import yaml
from config import EvaluationConfig
from evaluation.evaluation_datasets import get_evaluation_dataset
from model import model_provider
from multimodal_args import add_multimodal_extra_args
from megatron.core import parallel_state
from megatron.core.enums import ModelType
from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN
from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings
from megatron.inference.text_generation.api import generate_and_post_process
from megatron.inference.text_generation.forward_step import ForwardStep
from megatron.core.inference.contexts import StaticInferenceContext
from megatron.core.inference.sampling_params import SamplingParams
from megatron.core.inference.engines import StaticInferenceEngine
from megatron.core.inference.inference_request import InferenceRequest, VLMInferenceRequest
from megatron.core.inference.text_generation_controllers.vlm_text_generation_controller import (
VLMTextGenerationController,
)
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
InferenceWrapperConfig,
)
from megatron.core.inference.model_inference_wrappers.multimodal.vlm_inference_wrapper import (
VLMInferenceWrapper,
)
from megatron.training import get_args, get_model, get_tokenizer, print_rank_0, is_last_rank
from megatron.training.checkpointing import load_checkpoint
from megatron.training.initialize import initialize_megatron
def is_first_rank():
"""First tensor and pipeline parallel rank."""
return (
parallel_state.is_pipeline_first_stage(ignore_virtual=True)
and parallel_state.get_tensor_model_parallel_rank() == 0
)
def add_text_generation_args(parser):
"""Text generation arguments."""
group = parser.add_argument_group(title='Vision language model text generation arguments')
group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.')
group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.')
group.add_argument("--top_k", type=int, default=0, help='Top k sampling.')
group.add_argument(
"--out-seq-length", type=int, default=128, help='Length of the output generated text.'
)
group.add_argument("--output-path", type=str, help='Output file path')
group.add_argument('--input-image-path', type=str, help="Input image directory")
group.add_argument(
'--num-partitions', type=int, default=0, help="Number of partitions for inputs."
)
group.add_argument('--partition-id', type=int, default=0, help="Partition index")
group.add_argument("--gt-path", type=str, help="Optional ground truth file")
group.add_argument(
"--task",
type=str,
choices=[
"captioning",
"TextVQA",
"VQAv2",
"ChartQA",
"MMMU",
"OCRBench",
"OCRBench_v2",
"MathVista",
"AI2D",
"InfoVQA",
"SPDocVQA",
"RD_TableBench",
"VideoMME",
"PerceptionTest",
"MotionBench",
"PhysGameBench",
"MVBench",
"inference",
],
help="Generation task to run",
)
group.add_argument(
"--num-samples-per-partition", type=int, default=0, help="Number of samples per partition"
)
group.add_argument("--config-path", type=str, help="Evaluation config file to use.")
# Add common multimodal arguments needed for e.g. building the model.
parser = add_multimodal_extra_args(parser)
return parser
def get_evaluation_dataloader(
task,
input_image_path,
gt_path,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
num_samples_per_partition,
num_partitions,
partition_id,
num_frames,
num_workers,
vision_model_type,
split="validation"
):
"""Build evaluation dataset."""
dataset = get_evaluation_dataset(
task,
input_image_path,
gt_path,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
num_samples_per_partition,
num_partitions,
partition_id,
num_frames,
vision_model_type,
split=split
)
dp_rank = parallel_state.get_data_parallel_rank()
dp_world_size = parallel_state.get_data_parallel_world_size()
sampler = torch.utils.data.DistributedSampler(
dataset, shuffle=False, num_replicas=dp_world_size, rank=dp_rank
)
# TODO: Batched inference is not supported yet.
dataloader = torch.utils.data.DataLoader(
dataset, batch_size=None, num_workers=num_workers, sampler=sampler, pin_memory=True
)
return dataloader
def generate_samples(model, config: EvaluationConfig, print_output):
"""Text generation using a trained vision language model."""
args = get_args()
dataloader = get_evaluation_dataloader(
config.task,
config.input_image_path,
config.gt_path,
args.img_h,
args.img_w,
args.use_tiling,
args.max_num_tiles,
args.use_thumbnail,
config.num_samples_per_partition,
config.num_partitions,
config.partition_id,
args.num_frames,
args.num_workers,
args.vision_model_type,
config.split
)
num_img_embeddings_per_tile = get_num_image_embeddings(
args.img_h,
args.img_w,
args.patch_dim,
args.vision_model_type,
args.disable_vision_class_token,
1,
args.pixel_shuffle,
args.use_tile_tags,
args.max_num_tiles,
args.tokenizer_prompt_format,
)
if args.use_mcore_inference:
inference_wrapper_config = InferenceWrapperConfig(
hidden_size=args.hidden_size,
inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
fp32_residual_connection=args.fp32_residual_connection,
params_dtype=args.params_dtype,
padded_vocab_size=args.padded_vocab_size,
)
inference_wrapped_model = VLMInferenceWrapper(model, inference_wrapper_config)
tokenizer = get_tokenizer()
controller = VLMTextGenerationController(
inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer
)
inference_engine = StaticInferenceEngine(
controller, max_batch_size=1, random_seed=args.seed
)
sampling_params = SamplingParams(
temperature=config.temperature,
top_k=config.top_k,
top_p=config.top_p,
num_tokens_to_generate=config.out_seq_length,
)
for idx, (imgs, num_tiles, sample_id, question, answers, metadata) in enumerate(dataloader):
imgs = imgs.to("cuda")
num_tiles = num_tiles.to("cuda")
conv = get_conversation(config.task, question, metadata)
if not args.use_mcore_inference:
forward_step = partial(VLMForwardStep, num_img_embeddings_per_tile, imgs, num_tiles, args.decoder_seq_length)
inference_context = StaticInferenceContext(max_batch_size=1, max_sequence_length=args.inference_max_seq_length)
if is_first_rank():
if args.use_mcore_inference:
inference_request = VLMInferenceRequest(
request_id=inference_engine.get_new_request_id(),
prompt=conv,
prompt_tokens=controller.tokenize_prompt(conv),
sampling_params=sampling_params,
num_img_embeddings_per_tile=num_img_embeddings_per_tile,
imgs=imgs,
num_tiles=num_tiles,
decoder_seq_length=args.decoder_seq_length,
)
results: List[InferenceRequest] = inference_engine.generate(
inference_requests=[inference_request]
)
resp_sentences = [
tokenizer.detokenize(result.prompt_tokens) + result.generated_text
for result in results
]
else:
resp_sentences, _, _, _ = generate_and_post_process(
model, inference_context,
forward_step=forward_step,
prompts=[conv],
tokens_to_generate=config.out_seq_length,
top_k_sampling=config.top_k,
top_p_sampling=config.top_p,
add_BOS=False,
temperature=config.temperature,
random_seed=args.seed,
detokenize_segments=False,
data_parallel=True,
)
for generation in resp_sentences:
if isinstance(sample_id, torch.Tensor):
sample_id = sample_id.item()
output = {"sample_id": sample_id}
output_name = ""
if config.task == "captioning":
output_name = "caption"
elif config.task in (
"TextVQA",
"VQAv2",
"ChartQA",
"OCRBench",
"MathVista",
"AI2D",
"RealworldQA",
"MotionBench",
"PhysGameBench",
"MVBench",
"InfoVQA",
"SPDocVQA",
"inference",
):
output_name = "answer"
elif config.task in ("MMMU"):
output_name = "text"
elif config.task == "VideoMME":
output_name = "response"
output = question
elif config.task in ["OCRBench_v2", "RD_TableBench"]:
output_name = "predict"
else:
raise NotImplementedError("no output name defined for", config.task)
prompt, generated = get_prompt_and_generated(
generation, args.tokenizer_prompt_format
)
if config.task == "VideoMME":
output["questions"][0][output_name] = generated
else:
output["prompt"] = prompt
output[output_name] = generated
if config.task in ["captioning", "RD_TableBench"]:
output["ground_truth"] = answers
elif config.task in (
"TextVQA",
"VQAv2",
"ChartQA",
"OCRBench",
"OCRBench_v2",
"MathVista",
"AI2D",
"PerceptionTest",
"RealworldQA",
"MotionBench",
"PhysGameBench",
"MVBench",
"InfoVQA",
"SPDocVQA",
"inference",
):
if isinstance(answers, str):
answers = [answers]
output["gt_answer"] = answers
if len(metadata) > 0:
output.update(metadata)
elif config.task == "MMMU":
output["prediction"] = generated
output.update(metadata)
elif config.task == "VideoMME":
pass
else:
raise NotImplementedError("no output processing defined for", config.task)
if print_output:
print(output)
yield output
idx += 1
else:
if args.use_mcore_inference:
inference_request = VLMInferenceRequest(
request_id=inference_engine.get_new_request_id(),
prompt=conv,
prompt_tokens=controller.tokenize_prompt(conv),
sampling_params=sampling_params,
num_img_embeddings_per_tile=num_img_embeddings_per_tile,
imgs=imgs,
num_tiles=num_tiles,
decoder_seq_length=args.decoder_seq_length,
)
inference_engine.generate(
inference_requests=[inference_request]
)
else:
generate_and_post_process(
model, inference_context, forward_step=forward_step, detokenize_segments=False, data_parallel=True
)
idx += 1
def get_evaluation_configs(config_path=None) -> Dict[str, EvaluationConfig]:
"""Get evaluation config(s) from a config file or command-line arguments.
Args:
config_path: Optional path to config file. If not provided, will check args.config_path
or fall back to command-line arguments.
Returns:
Dict[str, EvaluationConfig]: dict of configs.
"""
args = get_args()
configs = {}
# Use provided config_path or fall back to args.config_path
config_file = config_path or args.config_path
# Check whether we're running a single-config eval by checking for the task and output_path args.
if hasattr(args, "task") and args.task and hasattr(args, "output_path") and args.output_path:
# Single config from args
config = EvaluationConfig(
task=args.task,
temperature=args.temperature,
top_p=args.top_p,
top_k=args.top_k,
out_seq_length=args.out_seq_length,
output_path=args.output_path,
input_image_path=args.input_image_path,
gt_path=args.gt_path,
num_partitions=args.num_partitions,
partition_id=args.partition_id,
num_samples_per_partition=args.num_samples_per_partition,
)
if not config.output_path:
default_output_dir = args.output_path if args.output_path else "generated"
os.makedirs(default_output_dir, exist_ok=True)
config.output_path = os.path.join(default_output_dir, args.language_model_type)
return {args.task: config}
elif config_file:
with open(config_file, "r") as f:
config_data = yaml.safe_load(f)
if 'datasets' not in config_data:
print("Error: 'datasets' key not found in config file for batch mode.")
sys.exit(1)
config_dict = config_data['datasets']
for key, value in config_dict.items():
config = EvaluationConfig(**value)
config.dataset = key
if not config.output_path:
# Use args.output_path if available, otherwise use "generated"
default_output_dir = getattr(args, 'output_path', None) or "generated"
os.makedirs(default_output_dir, exist_ok=True)
config.output_path = os.path.join(default_output_dir, f"{args.language_model_type}")
configs[key] = config
return configs
else:
raise ValueError("No config file provided and no task specified.")
def get_output_path(config, dp_rank):
"""Generation output path."""
ckpt_step = None
try:
args = get_args()
ckpt_step = args.ckpt_step
except Exception as e:
print(f"Failed getting args: {type(e).__name__} - {e}")
if ckpt_step is not None:
return f"{config.output_path}-{config.task}-dprank={dp_rank}-partition={config.partition_id}-step={args.ckpt_step}.jsonl"
else:
return f"{config.output_path}-{config.task}-dprank={dp_rank}-partition={config.partition_id}.jsonl"
def generate_and_write_samples(model, config, print_output=True):
"""Generate text and write to an output file."""
dp_rank = parallel_state.get_data_parallel_rank()
if is_first_rank():
output_path = get_output_path(config, dp_rank)
output_file = open(output_path, "w")
print(f"output path: {output_file.name}")
with torch.no_grad():
for output in generate_samples(model, config, print_output):
if is_first_rank():
output_file.write(json.dumps(output) + "\n")
output_file.flush()
if is_first_rank():
output_file.close()
class VLMForwardStep(ForwardStep):
"""Inference forward step for a multimodal model."""
def __init__(
self,
num_img_embeddings_per_tile,
images,
num_tiles,
decoder_seq_length,
model,
inference_context,
):
"""Create multimodal forward step."""
total_num_tiles = torch.sum(num_tiles).item()
num_img_embeddings = num_img_embeddings_per_tile * total_num_tiles
super().__init__(model, inference_context)
self._images = images
self._num_tiles = num_tiles
self._num_img_embeddings = num_img_embeddings
self.decoder_seq_length = decoder_seq_length
self._recv_only_vision_embeds = False
pp_rank = parallel_state.get_pipeline_model_parallel_rank()
# Checks if the previous stage only has a vision encoder, and that the current stage has part of the LM decoder.
# In this case, the current stage should only receive vision embeddings.
if pp_rank > 0:
self._recv_only_vision_embeds = parallel_state.is_inside_encoder(pp_rank - 1) and (not parallel_state.is_inside_decoder(pp_rank - 1)) and parallel_state.is_inside_decoder()
# Checks if the current stage only has a vision encoder
self._encoder_only = parallel_state.is_inside_encoder() and not parallel_state.is_inside_decoder()
def _forward(self, tokens, position_ids, attention_mask):
return self.model(
self._images,
tokens,
position_ids,
attention_mask=None,
inference_context=self.inference_context,
num_image_tiles=self._num_tiles,
runtime_gather_output=True,
)
def __call__(self, tokens, position_ids, attention_mask):
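"""Run the forward step, adjusting the recv buffer length and the sequence length offset to account for image token expansion."""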
num_image_tokens = (tokens == self.model.module.image_token_index).sum().item()
num_tokens = tokens.size(1)
recv_buffer_seq_length = None
if num_image_tokens > 0:
# When there are image tokens and this stage only receives vision embeddings, adjust the recv buffer seq length to match the image embeddings sequence length.
# If there are image tokens and this stage receives full embeddings, make sure we compensate for expansion of image tokens.
# Note that this also sets a recv_buffer_seq_length for the encoder stage; that length is irrelevant since the encoder's recv buffer is never allocated.
if self._recv_only_vision_embeds:
recv_buffer_seq_length = self._num_img_embeddings
else:
recv_buffer_seq_length = min(self._num_img_embeddings + num_tokens - num_image_tokens, self.decoder_seq_length)
elif self._recv_only_vision_embeds:
# If this stage only receives vision embeddings and there are no image tokens we won't run the encoder and therefore shouldn't try to recv.
recv_buffer_seq_length = 0
# If the pipeline stage only has a vision encoder, then it only needs to run when there are image tokens
if not (self._encoder_only and num_image_tokens == 0):
output = super().__call__(tokens, position_ids, attention_mask, recv_buffer_seq_length=recv_buffer_seq_length)
else:
output = None
if isinstance(output, tuple):
logits, _ = output
else:
logits = output
# On the first inference iteration, we compute image tokens.
# On every PP stage (although inference params should only matter for the decoder),
# update the sequence length offset by the number of image tokens.
if num_tokens > 1 and num_image_tokens > 0:
if "image_tokens_count" not in self.inference_context.key_value_memory_dict:
self.inference_context.key_value_memory_dict["image_tokens_count"] = self._num_img_embeddings
if self._num_img_embeddings + num_tokens - num_image_tokens > self.decoder_seq_length:
self.inference_context.sequence_len_offset += self.decoder_seq_length - num_tokens
else:
self.inference_context.sequence_len_offset += (
self.inference_context.key_value_memory_dict["image_tokens_count"] - num_image_tokens
)
return logits
def get_conversation(task, question, metadata=None):
"""Get a conversation for a given task and evaluation question."""
conversation = []
# In all cases, the tokenizer adds possible header tokens for the assistant.
if task == "captioning":
conversation = [
{"role": "system", "content": "Answer the questions."},
{
"role": "user",
"content": f"{IMAGE_TOKEN}\nGive a brief description of this image in one sentence.",
},
]
elif task in ("TextVQA", "InfoVQA", "SPDocVQA"):
conversation = [
{"role": "system", "content": "Follow the user's instruction and answer questions."},
{
"role": "user",
"content": f"{IMAGE_TOKEN}\n{question}\nAnswer the question using a single word, phrase, or number.",
},
]
elif task == "VQAv2":
conversation = [
{"role": "system", "content": "Follow the user's instruction and answer questions."},
{
"role": "user",
"content": f"{IMAGE_TOKEN}\n{question}\nAnswer the question using a single word or phrase.",
},
]
elif task == "ChartQA":
conversation = [
{"role": "system", "content": "Follow the user's instruction and answer questions."},
{
"role": "user",
"content": f"{IMAGE_TOKEN}\n{question}\nAnswer the question using a single word or phrase.",
},
]
elif task == "MMMU":
conversation = [
{"role": "system", "content": "Answer the questions."},
{"role": "user", "content": f"{IMAGE_TOKEN}\n{question}"},
]
elif task == "VideoMME":
q = (
"Select the best answer to the following multiple-choice "
"question based on the video. Respond with only the letter "
"(A, B, C, or D) of the correct option.\n"
)
q += question["questions"][0]["question"] + "\n"
q += question["questions"][0]["choices"][0] + "\n"
q += question["questions"][0]["choices"][1] + "\n"
q += question["questions"][0]["choices"][2] + "\n"
q += question["questions"][0]["choices"][3] + "\n"
conversation = [
{"role": "system", "content": "Answer the questions."},
{"role": "user", "content": f"{IMAGE_TOKEN}\n{q}"},
]
elif task in ("OCRBench", "OCRBench_v2", "RD_TableBench"):
conversation = [
{"role": "system", "content": "Follow the user's instruction and answer questions."},
{"role": "user", "content": f"{IMAGE_TOKEN}\n{question}"},
]
elif task == "MathVista":
conversation = [
{"role": "system", "content": "You are math expert. Use your math knowledge to calculate the answer."},
{"role": "user", "content": f"{IMAGE_TOKEN}\n{question}"},
]
elif task == "RealworldQA":
conversation = [
{"role": "system", "content": "Follow the user's instruction and answer questions."},
{"role": "user", "content": f"{IMAGE_TOKEN}\n{question}"},
]
elif task == "AI2D":
conversation = [
{"role": "system", "content": "Follow the user's instruction and answer questions."},
{"role": "user", "content": f"{IMAGE_TOKEN}\n{question}"},
]
elif task == "MotionBench":
extra_instruction = "Respond with only the letter choice (A, B, C, or D) of the correct option.\n"
conversation = [
{"role": "system", "content": "Answer the questions."},
{"role": "user", "content": f"{IMAGE_TOKEN}\n{question}\n{extra_instruction}"},
]
elif task == "PhysGameBench":
extra_instruction = "Respond with only the letter choice (A, B, C, or D) of the correct option.\n"
conversation = [
{"role": "system", "content": "Answer the questions."},
{"role": "user", "content": f"{IMAGE_TOKEN}\n{question}\n{extra_instruction}"},
]
elif task == "MVBench":
conversation = [
{"role": "system", "content": "Answer the questions."},
{"role": "user", "content": f"{IMAGE_TOKEN}\n{question}\nAnswer the question using a single word or phrase."},
]
elif task in ["PerceptionTest"]:
conversation = [
{"role": "system", "content": "Answer the questions."},
{"role": "user", "content": f"{IMAGE_TOKEN}\n{question}"},
]
elif task == "inference":
conversation = [
{"role": "system", "content": "Answer the questions."},
{"role": "user", "content": f"{question}"},
]
else:
raise NotImplementedError(f"No prompting support for task {task}")
return conversation
def get_prompt_and_generated(prompt_and_generation, prompt_format):
"""Strip prompt and other unnecessary text from generation."""
if prompt_format in ("llama3", "llama3p1"):
splitted = prompt_and_generation.split("<|start_header_id|>assistant<|end_header_id|>\n\n")
prompt = splitted[0]
generated = splitted[1]
generated = generated.split("<|eot_id|>")[0]
elif prompt_format == "mistral":
splitted = prompt_and_generation.split("[/INST]")
prompt = splitted[0]
generated = splitted[1]
generated = generated.split("</s>")[0]
elif prompt_format == "chatml":
splitted = prompt_and_generation.split("<|im_start|> assistant\n")
prompt = splitted[0]
generated = splitted[1]
generated = generated.split("<|im_end|>")[0]
elif prompt_format in ("nvlm-yi-34b", "qwen2p0", "qwen2p5"):
splitted = prompt_and_generation.split("<|im_start|>assistant\n")
prompt = splitted[0]
generated = splitted[1]
generated = generated.split("<|im_end|>")[0]
elif prompt_format in ("nemotron5"):
splitted = prompt_and_generation.split("<SPECIAL_14>assistant\n")
prompt = splitted[0]
generated = splitted[1]
generated = generated.split("<SPECIAL_15>")[0]
elif prompt_format in ("nemotron5-aligned"):
splitted = prompt_and_generation.split("Assistant\n")
prompt = splitted[0]
generated = splitted[1]
generated = generated.split("[PREFIX]")[0]
generated = generated.split("\\n")[0]
else:
raise ValueError(f"Prompt format {prompt_format} is not supported.")
# Remove possible garbage.
generated = generated.strip()
return prompt, generated
def run_eval(config, iteration=None):
"""Run evaluation for the given task."""
print(f"====== {config.task} {config.dataset} at iteration={iteration} scores ======")
if config.task == "TextVQA":
from evaluation.evaluate_textvqa import textvqa_eval
avg_acc = textvqa_eval(config.output_path)
score = {"TextVQA accuracy": avg_acc}
with open(config.output_path + "-scores.txt", "a") as f:
f.write(f"{config.task} {config.dataset} at iteration={iteration} TextVQA accuracy: {score}\n")
elif config.task == "OCRBench":
from evaluation.evaluate_ocrbench import ocrbench_eval
log, avg_acc = ocrbench_eval(config.output_path)
score = {"OCRBench accuracy": avg_acc}
with open(config.output_path + "-scores.txt", "a") as f:
f.write(f"{config.task} {config.dataset} at iteration={iteration} OCRBench accuracy: {score}\n")
f.write(f"{log}\n")
elif config.task == "MathVista":
from evaluation.evaluate_mathvista import mathvista_eval
avg_acc = mathvista_eval(config.output_path)
score = {"MathVista accuracy": avg_acc}
with open(config.output_path + "-scores.txt", "a") as f:
f.write(f"{config.task} {config.dataset} at iteration={iteration} MathVista accuracy: {score}\n")
elif config.task == "ChartQA":
from evaluation.evaluate_chartqa import chartqa_eval
avg_acc = chartqa_eval(config.output_path)
score = {"ChartQA accuracy": avg_acc}
with open(config.output_path + "-scores.txt", "a") as f:
f.write(f"{config.task} {config.dataset} at iteration={iteration} ChartQA accuracy: {score}\n")
elif config.task == "SPDocVQA":
from evaluation.evaluate_spdocvqa import spdocvqa_eval
avg_acc = spdocvqa_eval(config.output_path)
score = {"SPDocVQA accuracy": avg_acc}
with open(config.output_path + "-scores.txt", "a") as f:
f.write(f"{config.task} {config.dataset} at iteration={iteration} SPDocVQA accuracy: {score}\n")
elif config.task == "RealworldQA":
from evaluation.evaluate_realworldqa import realworldqa_eval
avg_acc = realworldqa_eval(config.output_path)
score = {"RealworldQA accuracy": avg_acc}
with open(config.output_path + "-scores.txt", "a") as f:
f.write(f"{config.task} {config.dataset} at iteration={iteration} RealworldQA accuracy: {score}\n")
elif config.task == "AI2D":
from evaluation.evaluate_ai2d import ai2d_eval
avg_acc = ai2d_eval(config.output_path)
score = {f"AI2D {config.dataset} accuracy": avg_acc}
with open(config.output_path + "-scores.txt", "a") as f:
f.write(f"{config.task} {config.dataset} at iteration={iteration} AI2D accuracy: {score}\n")
elif config.task == "MMMU":
from evaluation.evaluate_mmmu import convert_to_mmmu_format
from examples.multimodal.evaluation.mmmu_utils import mmmu_main_eval
result_file = convert_to_mmmu_format(config.output_path)
result = json.load(open(result_file))
mmmu_results = mmmu_main_eval(result, {"answer_dict": config.gt_path})
with open(config.output_path + "-scores.txt", "a") as f:
f.write(f"{config.task} {config.split} at iteration={iteration} :\n")
for cat, cat_val in mmmu_results.items():
if 'Overall' in cat:
cat = cat.replace("Overall-", "")
print(f'{cat}: {cat_val["acc"] * 100:.2f}')
f.write(f'{cat}: {cat_val["acc"] * 100:.2f}\n')
score = {"MMMU val accuracy": mmmu_results['Overall']['acc']}
elif config.task == 'captioning':
from evaluation.evaluate_coco import coco_captioning_eval
cider_score = coco_captioning_eval(config.output_path, config.gt_path)
score = {f"{config.task} {config.dataset} CIDEr": cider_score}
with open(config.output_path + "-scores.txt", "a") as f:
f.write(f"{config.task} {config.dataset} CIDEr scores at iteration={iteration}: {cider_score}\n")
elif config.task == 'MotionBench':
from evaluation.evaluate_video_motionbench import motionbench_eval
avg_acc = motionbench_eval(config.output_path)
score = {f"MotionBench accuracy": avg_acc}
with open(config.output_path + "-scores.txt", "a") as f:
f.write(f"{config.task} {config.dataset} scores at iteration={iteration}: {score}\n")
elif config.task == 'PhysGameBench':
from evaluation.evaluate_video_phys_game_bench import phys_game_bench_eval
avg_acc_dict = phys_game_bench_eval(config.output_path)
score = {f"PhysGame Total accuracy": avg_acc_dict['Physgame-Total-Acc']}
with open(config.output_path + "-scores.txt", "a") as f:
f.write(f"{config.task} {config.dataset} scores at iteration={iteration}: {avg_acc_dict}\n")
elif config.task == "MVBench":
from evaluation.evaluate_video_mvbench import mvbench_eval
avg_acc_dict = mvbench_eval(config.output_path)
score = {f"MVBench accuracy": avg_acc_dict['total-acc']}
with open(config.output_path + "-scores.txt", "a") as f:
f.write(f"{config.task} {config.dataset} scores at iteration={iteration}: {avg_acc_dict}\n")
elif config.task == "inference":
score = {"Inference accuracy:": None}
else:
raise NotImplementedError(f"Evaluation of {config.task} not implemented yet")
print(score)
return score
def run_evaluation_loop(model, configs, output_dir_override=None, iteration=None, print_output=True):
"""
Common evaluation loop used by both online evaluation during training and standalone evaluation.
Args:
model: The model to evaluate
configs: Dict[str, EvaluationConfig] - dictionary of evaluation configs
output_dir_override: Optional directory to override the output path in configs
iteration: Optional iteration number for logging
print_output: Whether to print generation output
Returns:
Dict[str, float]: Dictionary of evaluation scores
"""
args = get_args()
scores = {}
for key, config in configs.items():
# Handle output path override for online evaluation
if output_dir_override:
config.output_path = os.path.join(output_dir_override, args.language_model_type)
# Generate samples and write to file
generate_and_write_samples(model, config, print_output=print_output)
# Synchronize before evaluation
torch.distributed.barrier()
# Run evaluation on the last rank
if is_last_rank():
task_scores = run_eval(config, iteration=iteration)
scores.update(task_scores)
# Synchronize after evaluation
torch.distributed.barrier()
return scores
def eval_tasks():
"""Vision language model text generation for single or batch tasks."""
initialize_megatron(extra_args_provider=add_text_generation_args)
args = get_args()
def wrapped_model_provider(pre_process, post_process, add_encoder=True, add_decoder=True):
return model_provider(pre_process, post_process, add_encoder=add_encoder, add_decoder=add_decoder,
parallel_output=False)
# Set up model and load checkpoint.
model = get_model(wrapped_model_provider, model_type=ModelType.encoder_and_decoder, wrap_with_ddp=False)
if args.load is not None:
_ = load_checkpoint(model, None, None)
model = model[0]
model.eval()
configs = get_evaluation_configs()
# Use the common evaluation loop
run_evaluation_loop(model, configs, iteration=args.ckpt_step)
if __name__ == "__main__":
eval_tasks()
__module__: megatron.energon
__class__: Metadataset
splits:
train:
datasets:
- weight: 1.
path: <path_to_sft_dataset_in_energon_format>
subflavors:
augmentation: false
val:
datasets:
- weight: 1.
path: <path_to_sft_dataset_in_energon_format>
subflavors:
augmentation: false
#!/bin/bash
# Run SFT on a pretrained multimodal model
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
MODEL_NAME="mcore-llava-mistral-7b-instruct-clip336-sft"
# Check that the user has set an output path for model checkpoints.
if [[ -z $WORKSPACE ]]; then
echo "Please set WORKSPACE for storing your model checkpoints."
exit 1
fi
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
if [[ -z $LOAD_NAME ]]; then
echo "Please set LOAD_NAME for input model name."
exit 1
fi
if [[ -z $LOAD_ITER ]]; then
echo "Please set LOAD_ITER for pre-trained input model iteration."
exit 1
fi
CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/sft_dataset.yaml"
DEBUG=0
if [[ $DEBUG -eq 1 ]]; then
BZ=8
NW=1
HD=0.0
LI=1
EXTRA_ARGS=""
NONDETERMINISTIC_ATTN=1
else
BZ=128
NW=2
HD=0.1
LI=10
EXTRA_ARGS=""
NONDETERMINISTIC_ATTN=1
fi
OPTIONS=" \
--apply-layernorm-1p \
--attention-softmax-in-fp32 \
--use-checkpoint-args \
--use-distributed-optimizer \
--transformer-impl transformer_engine \
--use-te \
--normalization RMSNorm \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--num-workers ${NW} \
--exit-duration-in-mins 230 \
--use-flash-attn \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout ${HD} \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--num-attention-heads 32 \
--seq-length 576 \
--decoder-seq-length 2048 \
--max-position-embeddings 4096 \
--ffn-hidden-size 14336 \
--train-iters 20000 \
--micro-batch-size 1 \
--global-batch-size ${BZ} \
--lr-decay-iters 20000 \
--lr-warmup-fraction .01 \
--lr 1e-6 \
--min-lr 1e-7 \
--lr-decay-style cosine \
--log-interval ${LI} \
--eval-iters 10 \
--eval-interval 500 \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model mistralai/Mistral-7B-Instruct-v0.3 \
--tokenizer-prompt-format mistral \
--data-path ${DATA_TRAIN} \
--prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \
--save-interval 500 \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--split 100,0,0 \
--clip-grad 0.5 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--log-params-norm \
--log-num-zeros-in-grad \
--eod-mask-loss \
--freeze-ViT \
--patch-dim 14 \
--img-h 336 \
--img-w 336 \
--dataloader-type external \
--tensorboard-dir ${TENSORBOARD_DIR} \
--language-model-type=mistral_7b \
--disable-vision-class-token \
${EXTRA_ARGS} \
--distributed-timeout-minutes 60 \
--ckpt-format torch
"
export NVTE_APPLY_QK_LAYER_SCALING=0
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${NONDETERMINISTIC_ATTN}
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
#!/bin/bash
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
INPUT_IMAGE_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"
NUM_FRAMES=1
while [[ $# -gt 0 ]]; do
case $1 in
-i|--input-image-path)
INPUT_IMAGE_PATH="$2"
shift
shift
;;
--num-frames)
NUM_FRAMES="$2"
shift
shift
;;
-o|--output-path)
OUTPUT_PATH="$2"
shift
shift
;;
-m|--model-path)
MODEL_PATH="$2"
shift
shift
;;
-t|--task)
TASK="$2"
shift
shift
;;
-g|--gt-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
-*|--*)
echo "Invalid option $1"
exit 1
;;
esac
done
# Please modify these as needed.
NUM_PARTITIONS=0
START=0
END=0
for PARTITION_ID in $( eval echo {$START..$END} )
do
torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \
--apply-layernorm-1p \
--attention-softmax-in-fp32 \
--use-flash-attn \
--transformer-impl transformer_engine \
--use-te \
--use-checkpoint-args \
--normalization RMSNorm \
--language-model-type mistral_7b \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--group-query-attention \
--num-query-groups 8 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--max-position-embeddings 4096 \
--no-masked-softmax-fusion \
--load ${MODEL_PATH} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model mistralai/Mistral-7B-Instruct-v0.3 \
--tokenizer-prompt-format mistral \
--bf16 \
--micro-batch-size 1 \
--seq-length 2048 \
--out-seq-length 12 \
--temperature 1.0 \
--img-h 336 \
--img-w 336 \
--patch-dim 14 \
--seed 153 \
--top_k 1 \
--no-load-rng \
--no-load-optim \
--input-image-path ${INPUT_IMAGE_PATH} \
--num-partitions ${NUM_PARTITIONS} \
--partition-id ${PARTITION_ID} \
--output-path ${OUTPUT_PATH} \
--gt-path ${GROUNDTRUTH_PATH} \
--task ${TASK} \
--disable-vision-class-token \
--num-frames ${NUM_FRAMES} \
--ckpt-format torch
done
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Pretrain or SFT multimodal."""
import math
import os
import sys
from functools import partial
import torch
import yaml
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
)
from dataloader_provider import train_valid_test_dataloaders_provider, is_first_or_last_stage
from model import model_provider
from multimodal_args import add_multimodal_extra_args
from megatron.core import mpu, tensor_parallel
from megatron.core.enums import ModelType
from megatron.core.models.multimodal import context_parallel
from megatron.core.models.multimodal.llava_model import IGNORE_INDEX, LLaVAModel
from megatron.core.packed_seq_params import PackedSeqParams
from megatron.core.parallel_state import (
get_tensor_model_parallel_rank,
get_pipeline_model_parallel_world_size,
is_pipeline_last_stage,
)
from megatron.training import get_args, get_timers, get_tokenizer, pretrain
from megatron.training.utils import is_last_rank, get_batch_on_this_cp_rank
def get_batch(data_iterator, image_token_index, img_seq_len):
"""Generate a batch
Note: attn_mask_type in layer_specs.py sets the attention mask. Attention mask is None here.
"""
imgs = None
tokens = None
labels = None
loss_mask = None
attention_mask = None
position_ids = None
num_tiles = None
packed_seq_params = None
args = get_args()
# Dataloader doesn't run on the middle stages in a pipeline parallel model.
pp_size = get_pipeline_model_parallel_world_size()
if not is_first_or_last_stage(pp_size, args.encoder_pipeline_model_parallel_size):
# Note these are all set to None above.
return tokens, labels, loss_mask, attention_mask, position_ids, imgs, num_tiles, packed_seq_params
# Broadcast data.
torch.cuda.nvtx.range_push("get_data")
if data_iterator is not None and get_tensor_model_parallel_rank() == 0:
data = next(data_iterator)
else:
data = None
data_text = tensor_parallel.broadcast_data(["tokens"], data, torch.int64)["tokens"]
labels = tensor_parallel.broadcast_data(["labels"], data, torch.int64)["labels"]
imgs = tensor_parallel.broadcast_data(["imgs"], data, torch.float32)["imgs"]
num_tiles = tensor_parallel.broadcast_data(["num_tiles"], data, torch.int32)["num_tiles"]
cu_lengths = tensor_parallel.broadcast_data(["cu_lengths"], data, torch.int32)["cu_lengths"]
max_lengths = tensor_parallel.broadcast_data(["max_lengths"], data, torch.int32)["max_lengths"]
# No image input (text-only sample) if the dataloader returned a size 1 image.
if imgs.shape == torch.Size([1, 1]):
# FSDP can hang with text-only samples. A workaround is to run a valid dummy image through the vision
# model and then add image embeddings with a zero multiplier.
if args.use_torch_fsdp2:
imgs = torch.zeros((1, 3, args.img_h, args.img_w), dtype=torch.float32, device=data_text.device)
num_tiles = torch.tensor([], dtype=torch.int, device=data_text.device)
else:
# A similar workaround is not needed without FSDP, so we can use an empty image.
# FIXME: text-only data can still cause a hang in the special case where
# the vision model is on its own pipeline rank and --freeze-ViT is enabled.
imgs = torch.tensor([], dtype=torch.float32, device=data_text.device)
num_tiles = torch.tensor([], dtype=torch.int, device=data_text.device)
# Last pipeline parallel stage doesn't need images.
if pp_size > 1 and is_pipeline_last_stage():
imgs = None
# If cu_lengths and max_lengths are non-dummy, construct PackedSeqParams. Otherwise, leave it at None.
if cu_lengths.shape != torch.Size([1, 1]):
assert (
cu_lengths.shape[0] == max_lengths.shape[0] == 1
), "micro-batch-size must be 1 for packing"
cu_lengths = cu_lengths[0]
max_lengths = max_lengths[0]
packed_seq_params = PackedSeqParams(
qkv_format="thd",
cu_seqlens_q=cu_lengths,
cu_seqlens_kv=cu_lengths,
max_seqlen_q=max_lengths,
max_seqlen_kv=max_lengths,
)
torch.cuda.nvtx.range_pop()
tokens_ = data_text.long()
torch.cuda.nvtx.range_push("index tokens")
tokenizer = get_tokenizer()
text_length = tokens_.shape[1]
tokens = tokens_[:, :text_length].contiguous()
labels = labels[:, 1 : text_length + 1].contiguous()
assert tokens.shape == labels.shape, f"tokens: {tokens.shape} != labels: {labels.shape}"
torch.cuda.nvtx.range_pop()
torch.cuda.nvtx.range_push("get_ltor_masks_and_position_ids")
loss_mask, position_ids = get_ltor_masks_and_position_ids(tokens, labels, tokenizer.pad)
torch.cuda.nvtx.range_pop()
# If context parallel is enabled, must shard inputs to CP ranks.
if args.context_parallel_size > 1 or args.sequence_parallel:
assert tokens.shape[0] == 1, "micro-batch-size > 1 not supported yet with CP"
num_image_tokens = torch.sum(tokens == image_token_index).item()
num_image_embeddings = img_seq_len * imgs.shape[0] - num_image_tokens
seq_len = text_length + num_image_embeddings
# CP expects the sequence length to be divisible by the CP size, so apply padding.
mp_padding_needed = context_parallel.get_padding(
seq_len, args.context_parallel_size,
args.tensor_model_parallel_size, args.sequence_parallel,
)
tokens, position_ids, labels, loss_mask = [torch.nn.functional.pad(item, (0, mp_padding_needed)) for item in (tokens, position_ids, labels, loss_mask)]
# Get PackedSeqParams that indicate the amount of padding for TransformerEngine.
packed_seq_params = context_parallel.get_packed_seq_params(tokens, num_image_embeddings, mp_padding_needed, args.context_parallel_size, True)
return (
tokens,
labels,
loss_mask,
attention_mask,
position_ids,
imgs,
num_tiles,
packed_seq_params,
)
def get_ltor_masks_and_position_ids(input_ids, target, pad_token):
"""Build masks and position id for left to right model."""
seq_length = input_ids.shape[1]
# Position ids.
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
# Loss mask.
loss_mask = torch.ones(target.size(), dtype=torch.float, device=input_ids.device)
loss_mask[target == pad_token] = 0.0 # mask paddings
loss_mask[target == IGNORE_INDEX] = 0.0 # mask prompts
return loss_mask, position_ids
def get_mask_start_and_end_idx(arr):
"""
Returns a list of tuples holding the start and end index in arr of the contiguous non-zero
subarrays.
For instance, if arr = [0, 1, 0, 0, 1, 1]
get_mask_start_and_end_idx(arr) = [(1, 1), (4, 5)]
such that arr[1:1+1] = [1] and arr[4:5+1] = [1, 1]
"""
mask = (arr != 0)
mask_int = mask.int()
diff = mask_int[1:] - mask_int[:-1]
start_indices = (diff == 1).nonzero(as_tuple=False).flatten() + 1
end_indices = (diff == -1).nonzero(as_tuple=False).flatten()
if len(mask)==0: return []
if mask[0]:
start_indices = torch.cat((torch.tensor([0], device=arr.device), start_indices))
if mask[-1]:
end_indices = torch.cat((end_indices, torch.tensor([len(arr) - 1], device=arr.device)))
sequences = list(zip(start_indices.tolist(), end_indices.tolist()))
return sequences
def scaled_loss_func(loss_mask, output_tensor):
"""
Scaled loss function
Scale the loss for each conversation turn using the formula:
1 / sum_j[ sqrt(length(loss_turn_j)) ] * sum_i[ sum(loss_turn_i) / sqrt(length(loss_turn_i)) ]
Where we use the loss mask to infer the start / end of the conversation turns.
"""
args = get_args()
losses = output_tensor.float()
loss_list = []
num_valid_labels_list = []
for idx in range(losses.shape[0]):
loss_this_sample = losses[idx]
turn_start_end_list = get_mask_start_and_end_idx(loss_mask[idx])
for turn_start, turn_end in turn_start_end_list:
# compute loss for each turn
loss_this_turn = loss_this_sample[turn_start:turn_end+1].sum()
assert (1 - loss_mask)[idx][turn_start:turn_end+1].sum() < 1.0
num_valid_labels_this_turn = turn_end - turn_start + 1
loss_this_turn = loss_this_turn / num_valid_labels_this_turn
loss_list.append(loss_this_turn)
# append num of valid labels for each turn
num_valid_labels_list.append(num_valid_labels_this_turn)
base_num = sum([math.sqrt(each) for each in num_valid_labels_list])
for idx in range(len(loss_list)):
# normalize loss for each turn
loss_list[idx] = loss_list[idx] * math.sqrt(num_valid_labels_list[idx]) / base_num
# Some ranks may not get loss tokens due to Context Parallel Sharding
if len(loss_list) > 0:
total_loss = torch.stack(loss_list).sum()
total_tokens = torch.ones_like(total_loss)
elif len(loss_list) == 0 and args.context_parallel_size > 1:
total_tokens = loss_mask.sum()
total_loss = torch.sum(losses.view(-1) * loss_mask)
else:
raise RuntimeError("loss_list for loss scaling per conversation unexpectedly got empty list")
num_tokens = total_tokens.clone().detach().to(torch.int)
reporting_loss = torch.cat([total_loss.clone().detach().view(1), num_tokens.view(1)])
return (total_loss, num_tokens, {'lm loss': reporting_loss})
def loss_func(loss_mask, output_tensor):
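"""Sum the masked language model losses and return the number of loss tokens used for normalization."""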
args = get_args()
losses = output_tensor.view(-1).float()
loss_mask = loss_mask.contiguous().view(-1).float()
loss = torch.sum(losses * loss_mask)
num_tokens = loss_mask.sum().clone().detach().to(torch.int)
reporting_loss = torch.cat([loss.clone().detach().view(1), num_tokens.view(1)])
return (loss, num_tokens, {'lm loss': reporting_loss})
def forward_step(data_iterator, model: LLaVAModel):
"""Forward training step.
Args:
data_iterator (torch.utils.data.dataloader): Input data iterator
model: Multimodal model
Returns:
output_tensor (torch.Tensor): Loss of shape [b, s] if labels are provided, otherwise logits of shape [b, s, vocab_size].
loss_func (callable): Loss function with a loss mask specified.
"""
timers = get_timers()
# Get the batch.
timers('batch-generator', log_level=2).start()
(
tokens,
labels,
loss_mask,
attention_mask,
position_ids,
images,
num_image_tiles,
packed_seq_params,
) = get_batch(data_iterator, model.module.module.image_token_index, model.module.module.img_seq_len)
timers('batch-generator').stop()
output_tensor, loss_mask = model(
images,
tokens,
position_ids,
attention_mask,
labels,
loss_mask,
num_image_tiles=num_image_tiles,
packed_seq_params=packed_seq_params,
)
args = get_args()
if args.use_loss_scaling:
loss_function = partial(scaled_loss_func, loss_mask)
else:
loss_function = partial(loss_func, loss_mask)
return output_tensor, loss_function
def llava_embedding_ranks(pp_ranks):
"""LLava's embedding ranks consist of the decoder's first and last ranks (ie, the ViT has no embeddings).
Args:
pp_ranks: A list of global ranks that constitute a pipeline group.
"""
args = get_args()
# encoder size is also the index to the first rank of the decoder.
epp = args.encoder_pipeline_model_parallel_size
last_rank = pp_ranks[-1]
if len(pp_ranks) == 1 or pp_ranks[epp] == last_rank:
return [last_rank]
else:
return [pp_ranks[epp], last_rank]
def llava_position_embedding_ranks(pp_ranks):
"""LLava's embedding ranks consist of the singular rank of the model or the decoder's first rank.
Args:
pp_ranks: A list of global ranks that constitute a pipeline group.
"""
args = get_args()
# encoder size is also the index to the first rank of the decoder.
epp = args.encoder_pipeline_model_parallel_size
last_rank = pp_ranks[-1]
if len(pp_ranks) == 1:
return [last_rank]
else:
return [pp_ranks[epp]]
def run_online_eval(model):
"""Run an evaluation benchmark during training."""
args = get_args()
# Online evaluation config is not defined. Do nothing.
if not args.online_evaluation_config:
return []
from config import EvaluationConfig
# Import the common evaluation functions
from run_text_generation import get_evaluation_configs, run_evaluation_loop
# Use the common config loading function
configs = get_evaluation_configs(config_path=args.online_evaluation_config)
# The inference code assumes the first rank is the leader.
# Tensorboard writer is on the last rank.
# We must write to a storage space that all ranks see.
output_dir = os.path.join(args.save, "online_eval")
os.makedirs(output_dir, exist_ok=True)
# Use the common evaluation loop
scores = run_evaluation_loop(model[0].module, configs, output_dir_override=output_dir, print_output=False)
return [scores]
def write_eval_to_tensorboard(data, iteration, writer, walltime=None):
"""Write evaluation data to Tensorboard."""
if not writer:
return
for item in data:
for k, v in item.items():
writer.add_scalar(k, v, iteration, walltime=walltime)
def write_online_eval_to_tensorboard(data, iteration, writer, walltime=None):
"""Write online evaluation data to Tensorboard."""
import shutil
args = get_args()
# Define source and destination directories
source_dir = os.path.join(args.save, "online_eval")
destination_dir = os.path.join(args.save, f"online_eval_{iteration}")
if os.path.exists(source_dir):
print("Moving online eval data from", source_dir, "to", destination_dir)
# Move the directory (back up the generation)
shutil.move(source_dir, destination_dir)
write_eval_to_tensorboard(data, iteration, writer, walltime)
if __name__ == "__main__":
train_valid_test_dataloaders_provider.is_distributed = True
pretrain(
train_valid_test_dataloaders_provider,
model_provider,
ModelType.encoder_and_decoder,
forward_step,
args_defaults={'tokenizer_type': 'GPT2BPETokenizer'},
extra_args_provider=add_multimodal_extra_args,
process_non_loss_data_func=write_online_eval_to_tensorboard,
get_embedding_ranks=llava_embedding_ranks,
get_position_embedding_ranks=llava_position_embedding_ranks,
non_loss_data_func=run_online_eval,
)
# NVIDIA TensorRT Model Optimizer (ModelOpt) Integration
ModelOpt (`nvidia-modelopt`) provides end-to-end model optimization for NVIDIA hardware, including
quantization, sparsity, knowledge distillation, pruning, and neural architecture search.
You can find more info about ModelOpt in our GitHub repository: https://github.com/NVIDIA/TensorRT-Model-Optimizer.
We support Megatron Core `GPTModel` and `MambaModel` as well as task-specific optimizations
such as speculative decoding. Users can choose to start from either the Megatron-LM or the NeMo framework.
The optimized model can be deployed with NVIDIA TensorRT-LLM, vLLM, or SGLang.
## Table of Contents
[[_TOC_]]
## Getting Started with Post-Training Quantization
> **IMPORTANT:** Example scripts require basic access (generally available) to
> NVIDIA GPU Cloud (NGC). If you have yet to register and acquire an `NGC_CLI_API_KEY`,
> please first register at https://ngc.nvidia.com/signin.

Log in to the nvcr.io docker registry (using `NGC_CLI_API_KEY`) and start an interactive
session **at the root of the megatron-lm repo!** Export your `NGC_CLI_API_KEY` in the environment.
```sh
docker login nvcr.io
docker run --gpus all --init -it --rm -v $PWD:/workspace/megatron-lm \
nvcr.io/nvidia/pytorch:24.10-py3 bash
cd /workspace/megatron-lm/examples/post_training/modelopt
export NGC_CLI_API_KEY=
```
Now let's start a simple FP8 quantization task. You must provide an `HF_TOKEN` that grants you
access to `meta-llama/Llama-3.2-1B-Instruct`.
```sh
export HF_TOKEN=
bash convert.sh meta-llama/Llama-3.2-1B-Instruct
MLM_MODEL_CKPT=/tmp/megatron_workspace/meta-llama/Llama-3.2-1B-Instruct_mlm bash quantize.sh meta-llama/Llama-3.2-1B-Instruct fp8
```
The model card name (see the list of supported models in `conf/`) is expected as an input to all the sample scripts.
Other arguments are specified as variables (e.g. `TP=8`), which you can either set before the `bash` command
or export to the current bash environment upfront, as shown below.
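For instance, a minimal sketch of both styles, reusing the `quantize.sh` invocation from the previous step (other required variables such as `MLM_MODEL_CKPT` are passed the same way):

```sh
# Set the tensor model parallel size for this invocation only ...
TP=8 MLM_MODEL_CKPT=/tmp/megatron_workspace/meta-llama/Llama-3.2-1B-Instruct_mlm \
    bash quantize.sh meta-llama/Llama-3.2-1B-Instruct fp8

# ... or export it upfront for the rest of the session.
export TP=8
MLM_MODEL_CKPT=/tmp/megatron_workspace/meta-llama/Llama-3.2-1B-Instruct_mlm \
    bash quantize.sh meta-llama/Llama-3.2-1B-Instruct fp8
```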
The script will perform per-tensor FP8 fake quantization and generate some tokens as an indication that the quantized model still behaves correctly. The end results are stored in `/tmp/megatron_workspace/meta-llama/Llama-3.2-1B-Instruct_quant`. This is a Megatron Core (MCore) distributed checkpoint (with additional states), which can be loaded for quantization-aware training (QAT) or exported for deployment.
## Export for TensorRT-LLM, vLLM, SGLang Deployment
For supported Hugging Face models, TensorRT Model Optimizer can export the quantized model to
an HF-like checkpoint with real-quantized weights.
```sh
MLM_MODEL_CKPT=/tmp/megatron_workspace/meta-llama/Llama-3.2-1B-Instruct_quant bash export.sh meta-llama/Llama-3.2-1B-Instruct
```
> **NOTE:** The HF-like export only supports pipeline parallelism (`PP`). Other parallelism must be
> set to 1. The exported checkpoint is sharded with safetensors. Although it is HF-like, this format
> currently cannot be loaded by `from_pretrained()`.
The exported checkpoint is stored in `/tmp/megatron_workspace/meta-llama/Llama-3.1-8B-Instruct_export`, which can be provided as an input to most of the `LLM` APIs. For example:
```
vllm serve /tmp/megatron_workspace/meta-llama/Llama-3.1-8B-Instruct_export --quantization modelopt
```
> **TROUBLESHOOTING:** You need a device with `sm>=89` (Ada Lovelace or Hopper) for FP8 compute.
## Advanced Usage
TBD
MLM_MODEL_CFG=$1
# Bash coloring
RED='\033[0;31m'
YELLOW='\033[0;33m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
WHITE='\033[0;37m'
# Predefined logging
MLM_ERROR="${RED}ERROR: ${WHITE}"
MLM_WARNING="${YELLOW}WARNING:${WHITE}"
if [ -z ${SANDBOX_ENV_SETUP} ]; then
printf "${MLM_WARNING} ${PURPLE}SANDBOX_ENV_SETUP${WHITE} is not set!\n"
else
source ${SANDBOX_ENV_SETUP}
fi
if [ -z ${SCRIPT_DIR} ]; then
printf "${MLM_ERROR} Variable ${PURPLE}SCRIPT_DIR${WHITE} must be set!\n"
exit 1
fi
if [ -z ${MLM_MODEL_CFG} ]; then
printf "${MLM_ERROR} Variable ${PURPLE}MLM_MODEL_CFG${WHITE} must be set!\n"
exit 1
fi
if [ -z ${MLM_ENV_SETUP} ]; then
printf "${MLM_WARNING} Variable ${PURPLE}MLM_ENV_SETUP${WHITE} not set! (only needed when launching with slurm)\n"
else
source ${MLM_ENV_SETUP}
fi
if [ -z ${MLM_EXTRA_ARGS} ]; then
printf "${MLM_WARNING} Use ${PURPLE}MLM_EXTRA_ARGS${WHITE} to provide additional arguments!\n"
fi
if [ -z ${MLM_WORK_DIR} ]; then
export MLM_WORK_DIR=/tmp/megatron_workspace
printf "${MLM_WARNING} Variable ${PURPLE}MLM_WORK_DIR${WHITE} is set (default: ${MLM_WORK_DIR})!\n"
fi
if [ -z ${TP} ]; then
TP=1
printf "${MLM_WARNING} Variable ${PURPLE}TP${WHITE} not set! (default: ${TP})\n"
fi
if [ -z ${EP} ]; then
EP=1
printf "${MLM_WARNING} Variable ${PURPLE}EP${WHITE} not set! (default: ${EP})\n"
fi
if [ -z ${PP} ]; then
PP=1
printf "${MLM_WARNING} Variable ${PURPLE}PP${WHITE} not set! (default: ${PP})\n"
fi
if [ -z ${DP} ]; then
DP=1
printf "${MLM_WARNING} Variable ${PURPLE}DP${WHITE} not set! (default: ${DP})\n"
fi
if [ -z ${LAUNCH_SCRIPT} ]; then
LAUNCH_SCRIPT="torchrun --nproc_per_node=$((TP * EP * PP * DP))"
fi
# Install TensorRT Model Optimizer if it is not installed already.
if [ -z ${MLM_SKIP_INSTALL} ]; then
pip install -r ${SCRIPT_DIR}/requirements.txt
fi
export TOKENIZERS_PARALLELISM=False
export OMP_NUM_THREADS=1
export NCCL_IB_SL=1
export NCCL_IB_TIMEOUT=22
export CUDA_DEVICE_MAX_CONNECTIONS=1
# TE specific warning
printf "${MLM_WARNING} If you see core_attention _extra_state missing error, use --export-force-local-attention\n"
# Base model specific arguments
if [ -z ${SANDBOX_ROOT} ]; then
source "${SCRIPT_DIR}/conf/${MLM_MODEL_CFG}.sh"
else
source "${SANDBOX_ROOT}/conf/model/${MLM_MODEL_CFG}.sh"
fi
#!/bin/bash
TOKENIZER_MODEL="deepseek-ai/DeepSeek-R1"
MODEL_ARGS=" \
--save-interval 100000 \
--micro-batch-size 1 \
--bf16 \
--no-masked-softmax-fusion \
--disable-bias-linear \
--untie-embeddings-and-output-weights \
--use-rotary-position-embeddings \
--rotary-percent 1.0 \
--no-rope-fusion \
--no-position-embedding \
--normalization RMSNorm \
--swiglu \
--num-layers 61 \
--hidden-size 7168 \
--ffn-hidden-size 18432 \
--num-attention-heads 128 \
--kv-channels 128 \
--multi-latent-attention \
--kv-lora-rank 512 \
--q-lora-rank 1536 \
--qk-head-dim 128 \
--qk-layernorm \
--qk-pos-emb-head-dim 64 \
--num-experts 256 \
--moe-layer-freq [0]*3+[1]*58 \
--moe-ffn-hidden-size 2048 \
--moe-router-score-function sigmoid \
--moe-router-bias-update-rate 0.001 \
--moe-router-enable-expert-bias \
--moe-router-topk 8 \
--moe-router-pre-softmax \
--moe-router-topk-scaling-factor 2.5 \
--moe-shared-expert-overlap \
--moe-shared-expert-intermediate-size 2048 \
--moe-aux-loss-coeff 1e-2 \
--moe-router-load-balancing-type seq_aux_loss \
--moe-token-dispatcher-type alltoall \
--moe-token-drop-policy probs \
--seq-length 4096 \
--max-position-embeddings 163840 \
--tokenizer-type HuggingFaceTokenizer \
--make-vocab-size-divisible-by 40 \
--use-mcore-models \
--rotary-base 10000 \
--rotary-percent 1.0 \
--rotary-scaling-factor 40 \
--mscale 1.0 \
--mscale-all-dim 1.0 \
--recompute-activations \
--moe-layer-recompute \
"
#!/bin/bash
TOKENIZER_MODEL="deepseek-ai/DeepSeek-V2-Lite"
MODEL_ARGS=" \
--save-interval 100000 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--micro-batch-size 1 \
--bf16 \
--no-masked-softmax-fusion \
--disable-bias-linear \
--untie-embeddings-and-output-weights \
--position-embedding-type rope \
--no-rope-fusion \
--normalization RMSNorm \
--norm-epsilon 1e-6 \
--swiglu \
--num-layers 27 \
--hidden-size 2048 \
--ffn-hidden-size 10944 \
--num-attention-heads 16 \
--kv-channels 16 \
--multi-latent-attention \
--kv-lora-rank 512 \
--v-head-dim 128 \
--qk-head-dim 128 \
--qk-layernorm \
--qk-pos-emb-head-dim 64 \
--num-experts 64 \
--moe-layer-freq ([0]+[1]*26) \
--moe-ffn-hidden-size 1408 \
--moe-grouped-gemm \
--moe-router-score-function softmax \
--moe-router-topk 6 \
--moe-router-topk-scaling-factor 1.0 \
--moe-router-pre-softmax \
--moe-shared-expert-intermediate-size 2816 \
--moe-aux-loss-coeff 1e-3 \
--moe-token-dispatcher-type alltoall \
--moe-token-drop-policy probs \
--moe-router-load-balancing-type seq_aux_loss \
--seq-length 1024 \
--max-position-embeddings 1024 \
--tokenizer-type HuggingFaceTokenizer \
--make-vocab-size-divisible-by 3200 \
--attention-softmax-in-fp32 \
--use-mcore-models \
--rotary-percent 1.0 \
--rotary-base 10000 \
--rotary-scaling-factor 40 \
--mscale 0.707 \
--mscale-all-dim 0.707 \
--sequence-parallel \
"
#!/bin/bash
if [ -z ${HF_MODEL_CKPT} ]; then
HF_MODEL_CKPT=meta-llama/Llama-3.1-8B-Instruct
TOKENIZER_MODEL=nvidia/Llama-3.1-70B-Instruct-FP8
else
TOKENIZER_MODEL=${HF_MODEL_CKPT}
fi
MODEL_ARGS=" \
--save-interval 100000 \
--micro-batch-size 1 \
--bf16 \
--no-masked-softmax-fusion \
--disable-bias-linear \
--untie-embeddings-and-output-weights \
--use-rotary-position-embeddings \
--rotary-percent 1.0 \
--no-rope-fusion \
--no-position-embedding \
--normalization RMSNorm \
--swiglu \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--group-query-attention \
--num-query-groups 8 \
--seq-length 4096 \
--max-position-embeddings 8192 \
--tokenizer-type HuggingFaceTokenizer \
--make-vocab-size-divisible-by 1 \
--use-mcore-models \
--rotary-base 500000 \
--use-rope-scaling \
"
#!/bin/bash
if [ -z ${HF_MODEL_CKPT} ]; then
HF_MODEL_CKPT=meta-llama/Llama-3.2-1B-Instruct
TOKENIZER_MODEL=nvidia/Llama-3.1-70B-Instruct-FP8
else
TOKENIZER_MODEL=${HF_MODEL_CKPT}
fi
MODEL_ARGS=" \
--save-interval 100000 \
--micro-batch-size 1 \
--bf16 \
--no-masked-softmax-fusion \
--disable-bias-linear \
--use-rotary-position-embeddings \
--no-rope-fusion \
--no-position-embedding \
--normalization RMSNorm \
--swiglu \
--num-layers 16 \
--hidden-size 2048 \
--ffn-hidden-size 8192 \
--num-attention-heads 32 \
--group-query-attention \
--num-query-groups 8 \
--seq-length 4096 \
--max-position-embeddings 8192 \
--tokenizer-type HuggingFaceTokenizer \
--make-vocab-size-divisible-by 1 \
--use-mcore-models \
--rotary-percent 1.0 \
--rotary-base 500000 \
--use-rope-scaling \
--export-force-local-attention \
"
#!/bin/bash
if [ -z ${HF_MODEL_CKPT} ]; then
HF_MODEL_CKPT=meta-llama/Llama-4-Maverick-17B-128E-Instruct
TOKENIZER_MODEL=meta-llama/Llama-4-Maverick-17B-128E-Instruct
else
TOKENIZER_MODEL=${HF_MODEL_CKPT}
fi
MODEL_ARGS=" \
--recompute-activations \
--save-interval 100000 \
--micro-batch-size 1 \
--bf16 \
--no-masked-softmax-fusion \
--disable-bias-linear \
--untie-embeddings-and-output-weights \
--position-embedding-type rope \
--no-rope-fusion \
--normalization RMSNorm \
--swiglu \
--num-layers 48 \
--hidden-size 5120 \
--ffn-hidden-size 16384 \
--num-attention-heads 40 \
--group-query-attention \
--num-query-groups 8 \
--num-experts 128 \
--moe-layer-freq ([0,1]*24) \
--moe-layer-recompute \
--moe-ffn-hidden-size 8192 \
--moe-router-score-function sigmoid \
--moe-router-topk 1 \
--moe-router-topk-scaling-factor 1.0 \
--moe-router-dtype fp32 \
--moe-shared-expert-intermediate-size 8192 \
--moe-aux-loss-coeff 1e-3 \
--moe-token-dispatcher-type alltoall \
--moe-token-drop-policy probs \
--moe-router-load-balancing-type seq_aux_loss \
--seq-length 2048 \
--max-position-embeddings 2048 \
--tokenizer-type HuggingFaceTokenizer \
--make-vocab-size-divisible-by 1 \
--use-mcore-models \
--rotary-percent 1.0 \
--rope-scaling-factor 8.0 \
--rotary-base 500000 \
--rotary-interleaved \
--no-rope-freq 4 \
--export-moe-apply-probs-on-input \
"
#!/bin/bash
if [ -z ${HF_MODEL_CKPT} ]; then
HF_MODEL_CKPT=meta-llama/Llama-4-Scout-17B-16E-Instruct
TOKENIZER_MODEL=meta-llama/Llama-4-Scout-17B-16E-Instruct
else
TOKENIZER_MODEL=${HF_MODEL_CKPT}
fi
MODEL_ARGS=" \
--save-interval 100000 \
--micro-batch-size 1 \
--bf16 \
--no-masked-softmax-fusion \
--disable-bias-linear \
--untie-embeddings-and-output-weights \
--position-embedding-type rope \
--no-rope-fusion \
--normalization RMSNorm \
--swiglu \
--num-layers 48 \
--hidden-size 5120 \
--ffn-hidden-size 16384 \
--num-attention-heads 40 \
--group-query-attention \
--num-query-groups 8 \
--qk-layernorm \
--num-experts 16 \
--moe-ffn-hidden-size 8192 \
--moe-router-score-function sigmoid \
--moe-router-topk 1 \
--moe-router-topk-scaling-factor 1.0 \
--moe-router-dtype fp32 \
--moe-shared-expert-intermediate-size 8192 \
--moe-aux-loss-coeff 1e-3 \
--moe-token-dispatcher-type alltoall \
--moe-token-drop-policy probs \
--moe-router-load-balancing-type seq_aux_loss \
--seq-length 4096 \
--max-position-embeddings 4096 \
--tokenizer-type HuggingFaceTokenizer \
--make-vocab-size-divisible-by 128 \
--use-mcore-models \
--rotary-interleaved \
--rotary-percent 1.0 \
--rotary-base 500000 \
--rope-scaling-factor 8.0 \
--use-rope-scaling \
--sequence-parallel \
--no-bias-swiglu-fusion \
--export-qk-l2-norm \
--export-moe-apply-probs-on-input \
"
#!/bin/bash
if [ -z ${HF_MODEL_CKPT} ]; then
HF_MODEL_CKPT=nvidia/Nemotron-H-4B-Instruct
TOKENIZER_MODEL=nvidia/Nemotron-H-4B-Instruct
else
TOKENIZER_MODEL=${HF_MODEL_CKPT}
fi
MODEL_ARGS=" \
--save-interval 100000 \
--micro-batch-size 1 \
--bf16 \
--no-masked-softmax-fusion \
--disable-bias-linear \
--untie-embeddings-and-output-weights \
--use-rotary-position-embeddings \
--rotary-percent 0.5 \
--no-rope-fusion \
--no-position-embedding \
--normalization RMSNorm \
--squared-relu \
--num-layers 52 \
--hidden-size 3072 \
--ffn-hidden-size 12288 \
--kv-channels 128 \
--num-attention-heads 32 \
--group-query-attention \
--num-query-groups 8 \
--hybrid-override-pattern M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M- \
--mamba-head-dim 64 \
--mamba-num-heads 112 \
--mamba-num-groups 8 \
--mamba-state-dim 128 \
--seq-length 4096 \
--max-position-embeddings 8192 \
--tokenizer-type HuggingFaceTokenizer \
--make-vocab-size-divisible-by 1 \
--use-mcore-models \
--rotary-base 10000 \
--export-model-type MambaModel \
"
#!/bin/bash
if [ -z ${HF_MODEL_CKPT} ]; then
HF_MODEL_CKPT=nvidia/Nemotron-H-8B-Base-8K
TOKENIZER_MODEL=nvidia/Nemotron-H-8B-Base-8K
else
TOKENIZER_MODEL=${HF_MODEL_CKPT}
fi
MODEL_ARGS=" \
--save-interval 100000 \
--micro-batch-size 1 \
--bf16 \
--no-masked-softmax-fusion \
--disable-bias-linear \
--untie-embeddings-and-output-weights \
--use-rotary-position-embeddings \
--no-rope-fusion \
--no-position-embedding \
--normalization RMSNorm \
--squared-relu \
--num-layers 52 \
--hidden-size 4096 \
--ffn-hidden-size 21504 \
--num-attention-heads 32 \
--group-query-attention \
--num-query-groups 8 \
--hybrid-override-pattern M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M*-M-M-M-M-M- \
--is-hybrid-model \
--mamba-head-dim 64 \
--mamba-num-heads 128 \
--mamba-num-groups 8 \
--mamba-state-dim 128 \
--seq-length 4096 \
--max-position-embeddings 8192 \
--tokenizer-type HuggingFaceTokenizer \
--make-vocab-size-divisible-by 1 \
--use-mcore-models \
--rotary-percent 0.5 \
--rotary-base 500000 \
--export-model-type MambaModel \
"
# --rotary-base 10000 \
#!/bin/bash
if [ -z ${HF_MODEL_CKPT} ]; then
HF_MODEL_CKPT=nvidia/Nemotron-Mini-4B-Instruct
TOKENIZER_MODEL=nvidia/Nemotron-Mini-4B-Instruct
else
TOKENIZER_MODEL=${HF_MODEL_CKPT}
fi
MODEL_ARGS=" \
--save-interval 100000 \
--micro-batch-size 1 \
--bf16 \
--no-masked-softmax-fusion \
--disable-bias-linear \
--untie-embeddings-and-output-weights \
--use-rotary-position-embeddings \
--rotary-percent 0.5 \
--no-rope-fusion \
--no-position-embedding \
--normalization LayerNorm \
--apply-layernorm-1p \
--squared-relu \
--num-layers 32 \
--hidden-size 3072 \
--ffn-hidden-size 9216 \
--num-attention-heads 24 \
--group-query-attention \
--num-query-groups 8 \
--seq-length 4096 \
--max-position-embeddings 4096 \
--tokenizer-type HuggingFaceTokenizer \
--make-vocab-size-divisible-by 1 \
--use-mcore-models \
--rotary-base 10000 \
"
#!/bin/bash
if [ -z ${HF_MODEL_CKPT} ]; then
HF_MODEL_CKPT=Qwen/Qwen3-235B-A22B
TOKENIZER_MODEL=Qwen/Qwen3-235B-A22B
else
TOKENIZER_MODEL=${HF_MODEL_CKPT}
fi
MODEL_ARGS=" \
--save-interval 100000 \
--micro-batch-size 1 \
--bf16 \
--no-masked-softmax-fusion \
--disable-bias-linear \
--untie-embeddings-and-output-weights \
--position-embedding-type rope \
--no-rope-fusion \
--normalization RMSNorm \
--swiglu \
--num-layers 94 \
--hidden-size 4096 \
--ffn-hidden-size 12288 \
--num-attention-heads 64 \
--group-query-attention \
--num-query-groups 4 \
--kv-channels 128 \
--qk-layernorm \
--num-experts 128 \
--moe-ffn-hidden-size 1536 \
--moe-router-topk 8 \
--moe-router-dtype fp32 \
--moe-aux-loss-coeff 1e-3 \
--moe-token-dispatcher-type alltoall \
--moe-router-load-balancing-type aux_loss \
--moe-layer-recompute \
--seq-length 4096 \
--max-position-embeddings 40960 \
--tokenizer-type HuggingFaceTokenizer \
--make-vocab-size-divisible-by 1187 \
--use-mcore-models \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--no-bias-swiglu-fusion \
--sequence-parallel \
"
#!/bin/bash
if [ -z ${HF_MODEL_CKPT} ]; then
HF_MODEL_CKPT=Qwen/Qwen3-30B-A3B
TOKENIZER_MODEL=Qwen/Qwen3-30B-A3B
else
TOKENIZER_MODEL=${HF_MODEL_CKPT}
fi
MODEL_ARGS=" \
--save-interval 100000 \
--micro-batch-size 1 \
--bf16 \
--no-masked-softmax-fusion \
--disable-bias-linear \
--untie-embeddings-and-output-weights \
--position-embedding-type rope \
--no-rope-fusion \
--normalization RMSNorm \
--swiglu \
--num-layers 48 \
--hidden-size 2048 \
--ffn-hidden-size 6144 \
--num-attention-heads 32 \
--group-query-attention \
--num-query-groups 4 \
--kv-channels 128 \
--qk-layernorm \
--num-experts 128 \
--moe-ffn-hidden-size 768 \
--moe-router-topk 8 \
--moe-router-dtype fp32 \
--moe-aux-loss-coeff 1e-3 \
--moe-token-dispatcher-type alltoall \
--moe-router-load-balancing-type aux_loss \
--moe-layer-recompute \
--seq-length 4096 \
--max-position-embeddings 40960 \
--tokenizer-type HuggingFaceTokenizer \
--make-vocab-size-divisible-by 1187 \
--use-mcore-models \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--no-bias-swiglu-fusion \
--sequence-parallel \
"
#!/bin/bash
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
# Common arguments and base model specific arguments
source "${SCRIPT_DIR}/conf/arguments.sh"
# Default arguments of this script
MLM_DEFAULT_ARGS="--finetune --auto-detect-ckpt-format --export-te-mcore-model --use-cpu-initialization"
if [ -z ${HF_TOKEN} ]; then
printf "${MLM_WARNING} Variable ${PURPLE}HF_TOKEN${WHITE} is not set! HF snapshot download may fail!\n"
fi
if [ -z ${MLM_MODEL_SAVE} ]; then
MLM_MODEL_SAVE=${MLM_WORK_DIR}/${MLM_MODEL_CFG}_mlm
printf "${MLM_WARNING} Variable ${PURPLE}MLM_MODEL_SAVE${WHITE} is not set (default: ${MLM_MODEL_SAVE})!\n"
fi
if [ -z ${MLM_MODEL_CKPT} ]; then
if [ -z ${HF_MODEL_CKPT} ]; then
HF_MODEL_CKPT=${1}
fi
${LAUNCH_SCRIPT} ${SCRIPT_DIR}/convert_model.py \
${MODEL_ARGS} \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--tokenizer-model ${TOKENIZER_MODEL} \
--pretrained-model-path ${HF_MODEL_CKPT} \
--save ${MLM_MODEL_SAVE} \
${MLM_DEFAULT_ARGS} ${MLM_EXTRA_ARGS}
else
${LAUNCH_SCRIPT} ${SCRIPT_DIR}/convert_model.py \
${MODEL_ARGS} \
--tensor-model-parallel-size ${TP} \
--pipeline-model-parallel-size ${PP} \
--tokenizer-model ${TOKENIZER_MODEL} \
--load ${MLM_MODEL_CKPT} \
--save ${MLM_MODEL_SAVE} \
${MLM_DEFAULT_ARGS} ${MLM_EXTRA_ARGS}
fi
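# Example invocation (hypothetical script name and model; the required environment, e.g.
# MLM_MODEL_CFG, TP/PP and LAUNCH_SCRIPT, is provided by conf/arguments.sh sourced above):
#   HF_MODEL_CKPT=meta-llama/Llama-3.1-8B-Instruct bash convert.sh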
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
"""Convert a GPTModel."""
import functools
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")))
import modelopt.torch.speculative as mtsp
import torch
from modelopt.torch.export import import_mcore_gpt_from_hf
from megatron.core import mpu
from megatron.core.enums import ModelType
from megatron.post_training.arguments import add_modelopt_args
from megatron.post_training.checkpointing import load_modelopt_checkpoint
from megatron.post_training.model_provider import model_provider
from megatron.training import get_args # , get_model
from megatron.training.checkpointing import save_checkpoint
from megatron.training.initialize import initialize_megatron
from megatron.training.utils import print_rank_0, unwrap_model
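
# Map from the requested EAGLE variant (assumed to arrive via ModelOpt's
# --export-eagle-algorithm argument; see add_modelopt_args) to the corresponding
# ModelOpt speculative-decoding default configuration.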
ALGO_TO_CONFIG = {
    "eagle1": mtsp.config.EAGLE1_DEFAULT_CFG,
    "eagle3": mtsp.config.EAGLE3_DEFAULT_CFG,
    "eagle-mtp": mtsp.config.EAGLE_MTP_DEFAULT_CFG,
}

def add_convert_args(parser):
    """Add arguments for ModelOpt checkpoint conversion."""
    group = parser.add_argument_group(title='ModelOpt MCore checkpoint conversion')
    group.add_argument(
        "--pretrained-model-path", type=str, default=None, help="HuggingFace pretrained model"
    )
    group.add_argument(
        "--extra-model-path", type=str, default=None, help="Extra module weights to load"
    )
    add_modelopt_args(parser)
    return parser

def get_model(model_provider_func, model_type=ModelType.encoder_or_decoder, wrap_with_ddp=True):
    """Build the model (a single, unwrapped model chunk; DDP wrapping is not used here)."""
    args = get_args()
    args.model_type = model_type
    pre_process = mpu.is_pipeline_first_stage()
    post_process = mpu.is_pipeline_last_stage()
    model = model_provider_func(pre_process=pre_process, post_process=post_process)
    model.model_type = model_type
    return [model]

def check_arguments():
    """Check user arguments."""
    args = get_args()
    if args.num_layers_per_virtual_pipeline_stage is not None:
        print_rank_0("Interleaved pipeline schedule is not supported for checkpoint conversion.")
        exit()
    if getattr(args, 'moe_grouped_gemm', False):
        print_rank_0("WARNING: Forcing moe_grouped_gemm to False for PTQ and export.")
        args.moe_grouped_gemm = False

if __name__ == "__main__":
    initialize_megatron(
        extra_args_provider=add_convert_args,
        args_defaults={
            'tokenizer_type': 'HuggingFaceTokenizer',
            'no_load_rng': True,
            'no_load_optim': True,
        },
    )
    check_arguments()

    args = get_args()

    model = get_model(functools.partial(model_provider, parallel_output=True), wrap_with_ddp=False)
    unwrapped_model = unwrap_model(model)[0]

    if args.pretrained_model_path is not None:
        workspace_dir = os.environ.get("MLM_WORK_DIR", "/tmp")
        import_mcore_gpt_from_hf(unwrapped_model, args.pretrained_model_path, workspace_dir)
    elif args.load is not None:
        _ = load_modelopt_checkpoint(model)

    if args.export_num_eagle_layers > 0:
        mtsp_config = ALGO_TO_CONFIG[args.export_eagle_algorithm]
        mtsp_config["config"]["draft_vocab_size"] = args.export_draft_vocab_size
        unwrapped_model = mtsp.convert(unwrapped_model, mtsp_config)

    if args.extra_model_path is not None:
        eagle_module = getattr(unwrapped_model, "eagle_module", None)
        if eagle_module is not None:
            mcore_eagle_state_dict = torch.load(args.extra_model_path)
            eagle_module.load_state_dict(mcore_eagle_state_dict, strict=False)

    if args.export_num_medusa_heads > 0:
        config = {"medusa_num_heads": args.export_num_medusa_heads, "medusa_num_layers": 1}
        unwrapped_model = mtsp.convert(unwrapped_model, [("medusa", config)])

    if args.export_num_mtp > 0:
        config = {
            "mtp_num_module": args.export_num_mtp,
            "mtp_num_layers": 1,
            "mtp_freeze_list": args.export_freeze_mtp,
            "use_last_layernorm": False,
        }
        unwrapped_model = mtsp.convert(unwrapped_model, [("mtp", config)])

    print_rank_0(f"Converted Model:\n {model}")

    torch.distributed.barrier()
    save_checkpoint(1, model, None, None, 0)
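
# Example invocation (illustrative; in this workflow the launcher shell script passes
# MODEL_ARGS plus the flags below, and paths/parallel sizes are placeholders):
#   torchrun --nproc_per_node 1 convert_model.py ${MODEL_ARGS} \
#       --tensor-model-parallel-size 1 --pipeline-model-parallel-size 1 \
#       --tokenizer-model deepseek-ai/DeepSeek-V2-Lite \
#       --pretrained-model-path deepseek-ai/DeepSeek-V2-Lite \
#       --save /workspace/deepseek-v2-lite_mlm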