Commit d520d24f authored by silencealiang

Merge branch 'main' into 'main'

Upgrade Megatron to v0.10

See merge request !3
parents 3aca1415 481609bb
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import argparse
import os
import sys
# Add megatron and the multimodal example to the path.
sys.path.append(
os.path.abspath(
os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir, os.path.pardir)
)
)
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
import torch
from transformers import AutoModel
from examples.multimodal.model import model_provider
from examples.multimodal.multimodal_args import add_multimodal_extra_args
from megatron.training import get_model
from megatron.training.checkpointing import load_checkpoint
from megatron.training.initialize import initialize_megatron
def run_mcore_vision(model_path):
"""Run mcore vision model."""
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
# Megatron has some mandatory flags.
sys.argv = [
"ignore_me.py",
"--micro-batch-size=1",
"--num-layers=2",
"--vision-model-type=internvit",
"--language-model-type=mistral_7b",
"--tokenizer-prompt-format=mistral",
"--tokenizer-type=MultimodalTokenizer",
"--tokenizer-model=mistralai/Mistral-7B-Instruct-v0.3",
"--vocab-size=1024",
"--hidden-size=64",
"--num-attention-heads=8",
"--seq-length=1024",
"--decoder-seq-length=2048",
"--max-position-embeddings=2048",
"--bf16",
"--img-h=448",
"--img-w=448",
"--patch-dim=14",
"--tensor-model-parallel-size=8",
"--use-te",
f"--pretrained-checkpoint={model_path}",
]
initialize_megatron(extra_args_provider=add_multimodal_extra_args)
def wrapped_model_provider(pre_process, post_process):
return model_provider(pre_process, post_process, parallel_output=False)
# Set up model and load checkpoint.
model = get_model(wrapped_model_provider, wrap_with_ddp=False)
vision_model = model[0].module.vision_model
load_checkpoint([vision_model], None, None)
vision_model.eval()
images = torch.ones((1, 3, 448, 448), dtype=torch.bfloat16, device="cuda")
output = vision_model(images)
return output
def run_hf_vision(model_name):
"""Run HF vision model."""
model = (
AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, trust_remote_code=True)
.cuda()
.eval()
)
images = torch.ones((1, 3, 448, 448), dtype=torch.bfloat16, device="cuda")
outputs = model(images, return_dict=True)
return outputs
def main(mcore_model, hf_model):
"""Compare vision model outputs between mcore and HF given the same fixed input."""
mcore = run_mcore_vision(mcore_model)
if torch.distributed.get_rank() == 0:
hf = run_hf_vision(hf_model)
hf = hf["last_hidden_state"]
# Compare logits. Due to different attention implementations and other details,
# there will be numerical differences.
diff = (mcore - hf).abs()
mean_diff = diff.mean().item()
max_diff = diff.max().item()
print(f"mean diff {mean_diff}, max diff {max_diff}")
assert mean_diff < 0.1, "mean output difference is greater than expected"
assert max_diff < 50, "max output difference is greater than expected"
print("lgtm")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Check mcore vision model output vs. HF numerically.",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"--mcore-model", type=str, required=True, help="directory for mcore model weights"
)
parser.add_argument("--hf-model", type=str, required=True, help="Model name in HF")
args = parser.parse_args()
main(args.mcore_model, args.hf_model)
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN
def add_multimodal_extra_args(parser):
"""Extra arguments."""
group = parser.add_argument_group(title='multimodal arguments')
group.add_argument('--dataset-config', type=str, default=None)
group.add_argument("--prompt-path", type=str, default=None)
group.add_argument('--freeze-LM', action='store_true', default=False)
group.add_argument('--freeze-ViT', action='store_true', default=False)
group.add_argument('--language-model-type', type=str, required=True)
group.add_argument('--vision-model-type', type=str, default="clip")
group.add_argument("--disable-vision-class-token", action="store_true", default=False)
group.add_argument(
"--allow-missing-vision-projection-checkpoint", action="store_true", default=False
)
group.add_argument("--use-te", action="store_true", default=False)
group.add_argument(
"--dataloader-save", type=str, default=None, help="Energon dataloader state save path"
)
group.add_argument(
"--use-tiling", action="store_true", default=False, help="Use input image tiling"
)
group.add_argument("--max-num-tiles", type=int, default=1, help="Maximum number of image tiles")
group.add_argument(
"--use-thumbnail", action="store_true", default=False, help="Add image thumbnail as a tile"
)
group.add_argument(
"--dataloader-seq-length",
type=int,
help="Make dataloader to produce sequences of specific length.",
)
group.add_argument(
"--num-frames",
type=int,
default=1,
help="Number of frames to regularly sample from the video as input to the model.",
)
group.add_argument(
"--online-evaluation-config", type=str, help="Config file for online evaluation."
)
group.add_argument(
"--special-tokens",
nargs="*",
default=[IMAGE_TOKEN],
help="Special tokens used in the multimodal model",
)
group.add_argument(
"--tokenizer-prompt-format",
type=str,
choices=["mistral", "llama3", "chatml", "nvlm-yi-34b", "qwen2p0"],
required=True,
help="Prompt format to use with the tokenizer.",
)
group.add_argument("--pixel-shuffle", action="store_true", default=False)
group.add_argument(
"--image-tag-type",
type=str,
choices=["nvlm", "internvl", ""],
default="", # Default: Image tag not used.
help="Surround image tokens with tags.",
)
group.add_argument("--use-tile-tags", action="store_true", default=False, help="Use tile tags")
group.add_argument(
"--packing-buffer-size",
type=int,
default=None, # Packing is disabled by default.
help="Enable sample packing by setting the buffer size to > 0",
)
group.add_argument(
"--packing-seq-length", type=int, default=0, help="Packing sequence length. Must be > 0 if using packing."
)
return parser
NVLM
====
Please refer to the [NVLM paper](https://arxiv.org/pdf/2409.11402) for details.
*NOTE: VLMs in Megatron are under active development and are expected to change.*
# Setup
## Docker image
Please use `examples/multimodal/Dockerfile`.
## Dataset preparation
Please refer to Tables 4 and 6 in the [NVLM paper](https://arxiv.org/pdf/2409.11402) for the full list of pretraining and SFT datasets.
Please refer to https://nvidia.github.io/Megatron-Energon/data_prep.html for instructions on preparing datasets in the Megatron Energon format.
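If the Megatron-Energon tooling is installed (for example via `pip install megatron-energon`), dataset preparation is typically driven by its `energon prepare` command. The following is a rough sketch only; the dataset path is a placeholder and the interactive prompts may differ between versions:
```
energon prepare <path to your dataset in webdataset format>
```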
## Model conversion
### Vision model
NVLM 1.0 models use [OpenGVLab/InternViT-6B-448px-V1-5](https://huggingface.co/OpenGVLab/InternViT-6B-448px-V1-5) from HuggingFace.
Please download it and run the following command to convert it to Megatron format.
```
python examples/multimodal/model_converter/internvit_converter.py --output-dir <some output dir> --use-te --tensor-parallel-size 8
```
### 34B Language model
NVLM 1.0 34B starts from [NousResearch/Nous-Hermes-2-Yi-34B](https://huggingface.co/NousResearch/Nous-Hermes-2-Yi-34B) on HuggingFace.
Please download it and run the following command to convert it to Megatron format.
```
python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \
--load-dir <hf model directory> --save-dir <output dir> --tokenizer-model <hf model directory> \
--saver-transformer-impl transformer_engine --model-size yi-34B --make-vocab-size-divisible-by 1
```
### 72B Language model
NVLM 1.0 72B starts from [Qwen/Qwen2-72B-Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct) on HuggingFace.
Please download it and run the following command to convert it to Megatron format.
```
python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \
--load-dir <hf model directory> --save-dir <output directory> --tokenizer-model <hf model directory> \
--saver-transformer-impl transformer_engine --model-size qwen2.5-72Bf
```
### Combined checkpoint
Combine the [InternViT vision model](#vision-model) checkpoint with the [34B](#34b-language-model) or [72B](#72b-language-model) language model by running:
```
examples/multimodal/combine_lm_vision_checkpoints.sh <language model directory> <vision model directory> <output directory> nvlm
```
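For example, assuming the converted checkpoints from the steps above were written to hypothetical directories under `/workspace`, the 34B combination could look like the following (the output directory name matches the `combined-yi-34b-internvit-tp8-mcore` checkpoint expected by the 34B pretraining script):
```
examples/multimodal/combine_lm_vision_checkpoints.sh /workspace/yi-34b-mcore-tp8 /workspace/internvit-mcore-tp8 /workspace/combined-yi-34b-internvit-tp8-mcore nvlm
```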
# Training
## 34B
1. Pretraining: please run `examples/multimodal/nvlm/pretrain_yi_34b_internvit_6b.sh`. Please use the InternViT + 34B [combined checkpoint](#combined-checkpoint) and tokenizer from HuggingFace. An example launch is shown after this list.
2. SFT: please run `examples/multimodal/nvlm/sft_34b_internvit.sh` using the checkpoint from step 1.
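The training scripts pick the launch mode from the `BATCH` environment variable (`BATCH=0` runs `torchrun` interactively, anything else goes through `srun`), while `DEBUG` and the workspace, tokenizer, and data-blend placeholders are set inside the script itself. A minimal interactive launch, assuming those placeholders have been filled in, is:
```
cd <megatron-lm root>
BATCH=0 bash examples/multimodal/nvlm/pretrain_yi_34b_internvit_6b.sh
```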
## 72B
1. Pretraining: please run `examples/multimodal/nvlm/pretrain_qwen20_72b_internvit_6b.sh`. Please use the InternViT + 72B [combined checkpoint](#combined-checkpoint) and tokenizer from HuggingFace.
2. Convert the pretraining checkpoint from step 1 to pipeline parallel size = 4 for SFT (an uneven split is also possible; see the note after this list). Please run
```
python examples/multimodal/nvlm/pp_checkpoint_converter.py --input <pretrained checkpoint directory> \
--input-pipeline-parallel 1 --output <some output dir> --output-pipeline-parallel 4 \
--tensor-parallel 8
```
3. SFT: please run `examples/multimodal/nvlm/sft_qwen20_72b_internvit_6b.sh` using the checkpoint from step 2.
4. To convert the checkpoint with pipeline parallel size = 4 back to 1 for evaluation, please run
```
python examples/multimodal/nvlm/pp_checkpoint_converter.py --input <sft checkpoint directory> \
--input-pipeline-parallel 4 --output <some output dir> --output-pipeline-parallel 1 \
--tensor-parallel 8
```
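The converter also accepts `--num-layers-per-pp-rank` for an uneven split. As an illustrative sketch for the 80-layer 72B model (the per-rank layer counts below are placeholders; they must be positive and sum to 80):
```
python examples/multimodal/nvlm/pp_checkpoint_converter.py --input <pretrained checkpoint directory> \
    --input-pipeline-parallel 1 --output <some output dir> --output-pipeline-parallel 4 \
    --tensor-parallel 8 --num-layers-per-pp-rank 19 20 20 21
```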
# Evaluation
Run the text generation script.
- 34B
```
examples/multimodal/nvlm/run_text_generation_yi_34b_internvit_6b.sh --input-image-path /path/to/input/images --output-path /some/output/directory \
--model-path /path/to/model.pt --gt-path /path/to/groundtruth/file --task generation-task-name --use-tiling
```
- 72B
```
examples/multimodal/nvlm/run_text_generation_qwen20_72b_internvit_6b.sh --input-image-path /path/to/input/images --output-path /some/output/directory \
--model-path /path/to/model.pt --gt-path /path/to/groundtruth/file --task generation-task-name --use-tiling
```
where `--task generation-task-name` is the name of the evaluation benchmark such as `captioning`, `MMMU` or `TextVQA`.
Then, run one of the evaluation scripts from `examples/multimodal`. For example
```
python examples/multimodal/evaluate_mmmu.py --input-path /output/directory/from/generation
```
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
""""
NOTE: NVLM uses InternViT with tensor parallel (TP) size = 8.
Since InternViT has 25 attention heads and Megatron currently requires the number of attention heads
to be divisible by the TP size, we add 7 dummy zero attention heads to have 32 attention heads.
This workaround requires some changes to how we compute RMSNorm, Attention etc.
Additionally, InternViT introduces some unique features like Layer Scaling.
Those code changes are gathered here.
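For example, with TP = 8 each rank holds 32 / 8 = 4 heads: ranks 0-5 hold the first 24 real
heads, rank 6 holds 1 real head plus 3 dummy heads, and rank 7 holds only dummy heads.
InternViTRMSNorm._gather_var and InternViTTEDotProductAttention below compute (or zero out)
their per-rank statistics and outputs accordingly.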
"""
from functools import partial
from typing import Dict
import torch
from megatron.core.dist_checkpointing.mapping import ShardedStateDict
from megatron.core.extensions.transformer_engine import (
TEColumnParallelLinear,
TEDotProductAttention,
TERowParallelLinear,
)
from megatron.core.parallel_state import (
get_tensor_model_parallel_group,
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
)
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.spec_utils import ModuleSpec, build_module
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint
class InternViTRMSNorm(MegatronModule):
def __init__(
self,
config,
hidden_size: int,
eps: float = 1e-6,
sequence_parallel: bool = False,
compute_var: bool = False,
):
"""Custom RMSNorm for InternViT.
Args:
config (TransformerConfig): Config.
hidden_size (int): Input hidden size.
eps (float): epsilon to use for the norm, default to 1e-6
sequence_parallel (bool): Set to true if sequence parallelism is being used,
this marks the weights as needing to be allreduced.
compute_var (bool): Indicator to compute statistic manually.
"""
super().__init__(config=config)
self.config = config
self.eps = eps
self.weight = torch.nn.Parameter(torch.ones(hidden_size))
self._compute_var = compute_var
assert not sequence_parallel, "Sequence parallelism is not supported with InternViT."
setattr(self.weight, 'sequence_parallel', sequence_parallel)
def _norm(self, x, var):
if var is None:
var = x.pow(2).mean(-1, keepdim=True)
return x * torch.rsqrt(var + self.eps)
def forward(self, x):
"""Run RMSNorm with an option to compute custom statistic."""
var = None
if self._compute_var:
unpadded_hidden_size = self.config.hidden_size # 3200
max_dim = x.shape[-1] # 128
x = x.reshape(x.size(0), x.size(1), -1)
var = self._gather_var(x.float().pow(2), max_dim) / unpadded_hidden_size
output = self._norm(x.float(), var).type_as(x)
output = output * self.weight
if self._compute_var:
output = output.reshape(output.size(0), output.size(1), -1, max_dim)
return output
def _gather_var(self, input_, max_dim, valid_ranks=6):
"""Compute statistic across the non-dummy heads."""
world_size = get_tensor_model_parallel_world_size()
assert world_size == 8, "tested only with TP=8"
# Size and dimension.
last_dim = input_.dim() - 1
rank = get_tensor_model_parallel_rank()
if rank < valid_ranks: # Ranks 0-5 have 24 non-dummy attention heads.
var = input_.sum(-1, keepdim=True)
elif rank == valid_ranks: # Rank 6 has 1 non-dummy attention head.
var = input_[..., :max_dim].sum(-1, keepdim=True)
else:
var = input_.sum(-1, keepdim=True) * 0.0 # Zero-out the dummy heads.
tensor_list = [torch.empty_like(var) for _ in range(world_size)]
tensor_list[rank] = var
torch.distributed.all_gather(tensor_list, var, group=get_tensor_model_parallel_group())
output = torch.cat(tensor_list, dim=last_dim).contiguous()
return output.sum(-1, keepdim=True)
def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata={}):
# in InternVitSelfAttention the q_layernorm and k_layernorm weights
# are tensor-parallel so must be converted to sharded tensors
if 'q_layernorm' in prefix or 'k_layernorm' in prefix:
state_dict = self.state_dict(prefix='', keep_vars=True)
return make_sharded_tensors_for_checkpoint(
state_dict, prefix, {'weight': 0}, sharded_offsets
)
else:
return super().sharded_state_dict(prefix, sharded_offsets, metadata)
def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec:
# Dense MLP w/ or w/o TE modules.
return ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear,
linear_fc2=TERowParallelLinear if use_te else RowParallelLinear,
),
)
# Handle InternViT's layer scaling.
def _bias_dropout_add_func_internvit(ls, x_with_bias, residual, prob, training):
x, bias = x_with_bias # unpack
residual = residual if residual.dtype == x.dtype else residual.to(x.dtype)
if bias is not None:
x = x + bias
out = torch.nn.functional.dropout(x, p=prob, training=training)
out = residual + out * ls
return out
else:
out = torch.nn.functional.dropout(x, p=prob, training=training)
out = residual + out * ls
return out
def bias_dropout_add_unfused_internvit(ls, training):
"""Bias-dropout-add as in Megatron but with added LayerScaling handling."""
def _bias_dropout_add(x_with_bias, residual, prob):
return _bias_dropout_add_func_internvit(ls, x_with_bias, residual, prob, training)
return _bias_dropout_add
def get_bias_dropout_add_internvit(ls, training, fused):
"""Bias-dropout-add as in Megatron but with added LayerScaling handling."""
assert not fused, "Fused bias-dropout-add not implemented for InternViT."
return bias_dropout_add_unfused_internvit(ls, training)
# Add InternViT specialties to our default TransformerLayer.
class InternViTTransformerLayer(TransformerLayer):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.ls1 = torch.nn.Parameter(torch.ones(self.config.hidden_size))
self.ls2 = torch.nn.Parameter(torch.ones(self.config.hidden_size))
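        # Bind the learnable layer-scale parameters as the first argument of the
        # bias-dropout-add getters, so that the later self_attn_bda(training, fused) and
        # mlp_bda(training, fused) calls resolve to get_bias_dropout_add_internvit(ls, training, fused).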
self.self_attn_bda = partial(self.self_attn_bda, self.ls1)
self.mlp_bda = partial(self.mlp_bda, self.ls2)
# Override a few things that are special in InternViT and not supported by the SelfAttention class.
class InternViTSelfAttention(SelfAttention):
def __init__(
self, config: TransformerConfig, submodules: SelfAttentionSubmodules, *args, **kwargs
):
super().__init__(config=config, submodules=submodules, *args, **kwargs)
# Need to override linear_qkv, q_layernorm and k_layernorm.
qkv_bias = False
self.linear_qkv = build_module(
submodules.linear_qkv,
self.config.hidden_size,
self.query_projection_size + 2 * self.kv_projection_size,
config=self.config,
init_method=self.config.init_method,
gather_output=False,
bias=qkv_bias,
skip_bias_add=False,
is_expert=False,
tp_comm_buffer_name='qkv',
)
qk_layernorm_hidden_size = (
self.hidden_size_per_attention_head * self.num_attention_heads_per_partition
) # 512 for internvit
self.q_layernorm = build_module(
submodules.q_layernorm,
hidden_size=qk_layernorm_hidden_size,
config=self.config,
eps=self.config.layernorm_epsilon,
compute_var=True,
)
self.k_layernorm = build_module(
submodules.k_layernorm,
hidden_size=qk_layernorm_hidden_size,
config=self.config,
eps=self.config.layernorm_epsilon,
compute_var=True,
)
class InternViTTEDotProductAttention(TEDotProductAttention):
"""Adjusted Attention for InternViT"""
def forward(self, *args, **kwargs):
"""Regular TEDotProductAttention + zero-out dummy attention heads."""
out = super().forward(*args, **kwargs)
# This makes sure the dummy attention heads are zeroed out.
mask = torch.ones_like(out, dtype=out.dtype, device=out.device)
rank = get_tensor_model_parallel_rank()
max_dim = out.shape[-1] # 128
valid_ranks = 6
if rank == valid_ranks:
mask[..., max_dim:] *= 0.0
elif rank > valid_ranks:
mask *= 0.0
out *= mask
return out
def get_internvit_layer_spec(use_te) -> ModuleSpec:
mlp = get_mlp_module_spec(use_te) # no norm
return ModuleSpec(
module=InternViTTransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=InternViTRMSNorm,
self_attention=ModuleSpec(
module=InternViTSelfAttention,
params={"attn_mask_type": AttnMaskType.no_mask},
submodules=SelfAttentionSubmodules(
linear_qkv=TEColumnParallelLinear if use_te else ColumnParallelLinear,
core_attention=TEDotProductAttention if use_te else DotProductAttention,
linear_proj=TERowParallelLinear if use_te else RowParallelLinear,
q_layernorm=InternViTRMSNorm,
k_layernorm=InternViTRMSNorm,
),
),
self_attn_bda=get_bias_dropout_add_internvit,
pre_mlp_layernorm=InternViTRMSNorm,
mlp=mlp,
mlp_bda=get_bias_dropout_add_internvit,
),
)
{
"COMMENT": "Mixture of our own custom prompts and some prompts from https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain/viewer and https://huggingface.co/datasets/HuggingFaceM4/M3IT",
"Captioning": {
"raw": [
"Can you briefly explain what you see in the image?",
"Describe what's happening in this image in one short sentence.",
"Write a short caption that accurately represents the content of this image.",
"Please generate a descriptive caption for the image provided.",
"How would you summarize the scene depicted in the picture in short?",
"Describe the image briefly.",
"Write a succinct description of the image, capturing its main components, the relationships between them, and any notable details.",
"Create a concise caption that accurately describes the main elements in the image provided.",
"Write a brief, yet comprehensive, description of the image.",
"Describe the image in a clear and concise manner.",
"For the given image, provide a one-sentence summary that captures the most important details.",
"Generate a short caption for the picture.",
"Write a short and informative description that highlights the primary subjects and actions occurring in the given image.",
"Provide a concise and informative caption for the image, focusing on the primary subjects.",
"Write a clear description of the image, make sure the key features are well covered.",
"Offer a succinct explanation of the picture presented."
]
},
"CaptioningPretraining": {
"raw": [
"Give a brief description of image.",
"Give a brief description of the image.",
"Provide a brief description of the given image.",
"Provide a one-sentence caption for the provided image.",
"Write a terse but informative summary of the picture.",
"Describe the image concisely.",
"Generate a clear and concise summary of the photo."
]
},
"CaptioningSFT": {
"raw": [
"Give a brief description of the image.",
"Give a short and clear explanation of the subsequent image.",
"Present a compact description of the photo's key features.",
"Provide a brief description of the given image.",
"Provide a one-sentence caption for the provided image.",
"Render a clear and concise summary of the photo.",
"Share a concise interpretation of the image provided.",
"Summarize the visual content of the image.",
"Write a terse but informative summary of the picture.",
"Describe the image concisely."
]
},
"VQAPretraining": {
"raw": [
"Question: {} Short answer:",
"Question: {} Answer:"
]
},
"VQASFT": {
"raw": [
"{}",
"{}\nAnswer the question using a single word or phrase."
],
"docvqa": [
"{}",
"{}\nAnswer this question using the text in the image directly."
]
},
"DocPretraining": {
"raw": [
"Retrieve the text from the given pdf image.",
"Extract the text from the provided document.",
"Transcribe the text displayed in the image."
],
"ocr_multi": [
"Apply grounded Optical Character Recognition (OCR) to the provided image.",
"Extract all texts and their bounding boxes from the given image using grounded OCR.",
"Extract and transcribe all visible text from the provided image, ensuring accurate spatial recognition.",
"Conduct a detailed optical character recognition analysis on this image, maintaining the text's original layout and positioning.",
"Execute a thorough text recognition procedure on this visual input, ensuring that the spatial arrangement of the text is accurately represented.",
"Perform an in-depth OCR scan of the image, capturing both the content and contextual positioning of all textual information.",
"OCR with grounding:"
],
"md": [
"Extract the text from the given image and format it in Markdown.",
"Convert the text from the provided image into Markdown format.",
"Transform the text from the given image into Markdown syntax.",
"Extract and convert the text from the image to Markdown.",
"Retrieve the text from the image and present it in Markdown format."
],
"grounded_ocr": [
"{}. Text:",
"Recognize the text in this region: {}.",
"Identify the text in this area: {}.",
"Detect the text within this section: {}."
],
"referring_grounding": [
"Region of \"{}\" is:",
"Locate the text \"{}\" in the image.",
"Identify the text \"{}\" in the image and provide the coordinates."
]
},
"CaptioningDetailed": {
"raw": [
"Create a comprehensive paragraph that captures the essence of the image while weaving a cohesive narrative around its elements.",
"Compose a paragraph that thoroughly describes the image's content, providing context and connections between different aspects of the scene.",
"Provide a detailed, paragraph-length description of the image that paints a vivid picture and tells a coherent story.",
"Write a rich and engaging paragraph that delves into the image's components, describing not only what is seen but also how the elements relate to one another.",
"Give a well-rounded, paragraph-length explanation of the image, describing the scene and its components while forming a complete and engaging narrative.",
"Produce a paragraph that not only describes the individual elements in the image but also weaves them together to form a cohesive, connected account.",
"Construct a paragraph that captures the image's details and context, offering a more in-depth and engaging story than a simple caption.",
"Compose a descriptive paragraph that brings the image to life through detailed storytelling, connecting the various visual elements into a unified narrative.",
"Create a paragraph that provides an extensive and interconnected description of the image, ensuring that the narrative is both detailed and cohesive.",
"Write a compelling and detailed paragraph that delves into the image's components, linking them together to create a unified and engaging story."
]
},
"OCR": {
"raw": [
"Can you read the text from image and output here?",
"Extract and document the text from the provided image.",
"Converting the text embedded in this image into a readable document.",
"Transcribe all the text you find.",
"Can you extract all visible text from the image here?"
],
"markdown": [
"Can you extract all visible text from the provided image?",
"Converting the text embedded in this image into a readable markdown document.",
"Can you read the text in the document as markdown?",
"Transcribe the document as markdown.",
"Extract and document the text from the provided image."
],
"table_markdown": [
"Can you extract all visible text from the provided table?",
"Can you read the text in the provided table as markdown?",
"Transcribe the table as markdown.",
"Extract and document the text from the provided table image."
],
"plain": [
"Transcribe the document as plain text.",
"Extract and document the text from the provided image.",
"Converting the text embedded in this image into a readable document.",
"Transcribe all the text you find.",
"Can you extract all visible text from the image here?"
],
"bbox_plain": [
"Transcribe the document as plain text along with bounding boxes.",
"Extract and document the text from the provided image along with bounding boxes.",
"Converting the text embedded in this image into a readable documen along with bounding boxes.",
"Can you extract all visible text with bounding boxes from the image here?"
]
},
"VQA": {
"raw": [
"Given the image, answer the following question with few words.",
"Answer the following question: ",
"What is the answer to this question?",
"Write the answer: ",
"Please answer this question: "
]
},
"Embedded": {
"raw": [
"Given the image, answer the following question with few words.",
"Answer the following question: ",
"What is the answer to this question?",
"Write the answer: ",
"Please answer this question: "
]
}
}
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import argparse
import os
import sys
import torch
# Add megatron to the path.
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir, os.path.pardir))
)
def split(input_dir, base_output_dir, input_pp, output_pp, num_tp, num_layers_per_pp_rank):
"""Split pipeline parallel size = 1 checkpoint to pipeline parallel size N."""
for tp in range(num_tp):
path = os.path.join(input_dir, f"mp_rank_0{tp}", "model_optim_rng.pt")
sd = torch.load(path)
if num_layers_per_pp_rank is None:
num_layers = sd["args"].num_layers
assert num_layers % output_pp == 0, "specify --num-layers-per-pp-rank for an uneven split"
num_layers_per_pp_rank = [num_layers // output_pp] * output_pp
layer_lb = 0
for pp in range(output_pp):
assert num_layers_per_pp_rank[pp] > 0, "each pp rank must have at least 1 layer"
layer_ub = layer_lb + num_layers_per_pp_rank[pp]
new_sd = sd.copy()
new_sd["model"] = dict()
for k, v in sd["model"].items():
# First pp rank has vision model.
if pp == 0 and ("vision_model" in k or "vision_projection" in k):
new_sd["model"][k] = v
continue
# Only the first pp rank has the word embeddings.
if "language_model.embedding.word_embeddings" in k and pp == 0:
new_sd["model"][k] = v
# Only the last pp rank has the output layer.
if "language_model.output_layer" in k and pp == output_pp - 1:
new_sd["model"][k] = v
# Only the last pp rank has final layer norm.
if "language_model.decoder.final_layernorm" in k and pp == output_pp - 1:
new_sd["model"][k] = v
if "language_model.decoder.layers" in k:
layer_num = int(k.split(".")[3])
if layer_lb <= layer_num and layer_num < layer_ub:
# On all pp ranks, megatron starts layer nums from 0!
new_layer_num = int(layer_num - layer_lb)
k_splitted = k.split(".")
k_splitted[3] = str(new_layer_num)
new_k = ".".join(k_splitted)
new_sd["model"][new_k] = v
output_dir = os.path.join(base_output_dir, f"iter_0000001/mp_rank_0{tp}_00{pp}")
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "model_optim_rng.pt")
torch.save(new_sd, output_path)
print(f"processed tp rank: {tp}/{num_tp - 1} and pp rank: {pp}/{output_pp - 1}")
layer_lb = layer_ub
# This is needed for megatron checkpoint loading.
with open(os.path.join(base_output_dir, "latest_checkpointed_iteration.txt"), "w") as f:
f.write("1")
def combine(input_dir, base_output_dir, input_pp, output_pp, num_tp, num_layers_per_pp_rank):
"""Combine pipeline parallel size = N checkpoint to pipeline parallel size 1."""
for tp in range(num_tp):
new_sd = None
layer_num_offset = 0
max_layer_num = 0
for pp in range(input_pp):
path = os.path.join(input_dir, f"mp_rank_0{tp}_00{pp}", "model_optim_rng.pt")
sd = torch.load(path)
if pp == 0:
new_sd = sd.copy()
new_sd["model"] = dict()
new_sd["args"].pipeline_model_parallel_size = 1
assert new_sd is not None
for k, v in sd["model"].items():
# First pp rank has vision model.
if pp == 0 and ("vision_model" in k or "vision_projection" in k):
new_sd["model"][k] = v
continue
# Only the first pp rank has the word embeddings.
if "language_model.embedding.word_embeddings" in k and pp == 0:
new_sd["model"][k] = v
# Only the last pp rank has the output layer.
if "language_model.output_layer" in k and pp == input_pp - 1:
new_sd["model"][k] = v
# Only the last pp rank has final layer norm.
if "language_model.decoder.final_layernorm" in k and pp == input_pp - 1:
new_sd["model"][k] = v
if "language_model.decoder.layers" in k:
layer_num = int(k.split(".")[3])
# On all pp ranks, megatron starts layer nums from 0!
new_layer_num = layer_num_offset + layer_num
if new_layer_num > max_layer_num:
max_layer_num = new_layer_num
k_splitted = k.split(".")
k_splitted[3] = str(new_layer_num)
new_k = ".".join(k_splitted)
new_sd["model"][new_k] = v
print(f"processed tp rank: {tp}/{num_tp - 1} and pp rank: {pp}/{input_pp - 1}")
layer_num_offset = max_layer_num + 1
output_dir = os.path.join(base_output_dir, f"iter_0000001/mp_rank_0{tp}")
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "model_optim_rng.pt")
torch.save(new_sd, output_path)
# This is needed for megatron checkpoint loading.
with open(os.path.join(base_output_dir, "latest_checkpointed_iteration.txt"), "w") as f:
f.write("1")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Change pipeline parallelism for a model",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"--input", type=str, required=True, help="Input model directory"
)
parser.add_argument(
"--input-pipeline-parallel", type=int, required=True, help="Input model pipeline parallelism"
)
parser.add_argument(
"--output", type=str, required=True, help="Output model directory"
)
parser.add_argument(
"--output-pipeline-parallel", type=int, required=True, help="Output model pipeline parallelism"
)
parser.add_argument(
"--tensor-parallel", type=int, required=True, help="Model tensor parallel size",
)
parser.add_argument(
"--num-layers-per-pp-rank", type=int, default=None, nargs="*", help="Specify this for uneven pipeline parallel split",
)
args = parser.parse_args()
f = None
if args.input_pipeline_parallel == 1 and args.output_pipeline_parallel > 1:
f = split
elif args.input_pipeline_parallel > 1 and args.output_pipeline_parallel == 1:
f = combine
else:
raise NotImplementedError("Only pipeline parallel 1 to N and N to 1 are supported")
f(args.input, args.output, args.input_pipeline_parallel, args.output_pipeline_parallel, args.tensor_parallel, args.num_layers_per_pp_rank)
print("done.")
__module__: megatron.energon
__class__: Metadataset
splits:
train:
datasets:
- weight: 0.579 # Datasets are weighted according to their size. Weights sum up to 1.
path: <path to laion dataset>
subflavors:
augmentation: False
- weight: 0.02
path: <path to coco>
subflavors:
augmentation: False
- weight: 0.01
path: <path to vqav2 dataset>
subflavors:
augmentation: False
      # Please refer to Table 4 in https://arxiv.org/pdf/2409.11402 for the full list of pretraining datasets.
      # Please refer to https://nvidia.github.io/Megatron-Energon/data_prep.html for instructions on preparing datasets in the Megatron Energon format.
val:
datasets:
- weight: 1.
path: <path to validation dataset>
subflavors:
augmentation: False
#!/bin/bash
# Your SBATCH commands here if using SLURM.
# Please launch this script from megatron-lm root.
# Train a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export TOKENIZERS_PARALLELISM="false"
DEBUG=0
if [[ $BATCH -eq 0 ]]; then
DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
MODEL_NAME="mcore-qwen20-72b-internvit-${DATETIME}"
else
MODEL_NAME="mcore-qwen20-72b-internvit"
fi
WORKSPACE="<some dir>"
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
CHECKPOINT_DIR="${WORKSPACE}/combined-qwen2.0-72b-instruct-internvit-6b-448px-1.5-tp8-te"
DATA_TRAIN="${SOURCE}/examples/multimodal/nvlm/pretrain_blend.yaml"
if [[ $DEBUG -eq 1 ]]; then
MBZ=1
BZ=1
NW=0
AD=0.0
HD=0.0
LI=1
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
else
MBZ=1
BZ=2048
NW=8
AD=0.1
HD=0.1
LI=5
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
fi
SEQ_LEN=256 # Image embeddings sequence length.
DECODER_SEQ_LEN=512 # Language model sequence length.
MAX_POS_EMBED=512
OPTIONS=" \
--use-checkpoint-args \
--exit-duration-in-mins 230 \
--disable-bias-linear \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model ${WORKSPACE}/<path to tokenizer model> \
--tokenizer-prompt-format qwen2p0 \
--transformer-impl transformer_engine \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--untie-embeddings-and-output-weights \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--num-layers 80 \
--hidden-size 8192 \
--ffn-hidden-size 29568 \
--add-qkv-bias \
--num-attention-heads 64 \
--use-distributed-optimizer \
--use-te \
--num-workers ${NW} \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings 32768 \
--train-samples 122880000 \
--lr-decay-samples 25600000 \
--lr-warmup-samples 83200 \
--micro-batch-size ${MBZ} \
--global-batch-size ${BZ} \
--lr 1e-4 \
--min-lr 2.5e-5 \
--lr-decay-style cosine \
--log-interval ${LI} \
--eval-iters 10 \
--eval-interval 500 \
--data-path ${DATA_TRAIN} \
--prompt-path ${SOURCE}/examples/multimodal/nvlm/nvlm_prompts.json \
--save-interval 5000 \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--split 100,0,0 \
--clip-grad 10.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--bf16 \
--eod-mask-loss \
--freeze-ViT \
--freeze-LM \
--patch-dim 14 \
--img-h 448 \
--img-w 448 \
--dataloader-type external \
--tensorboard-dir ${TENSORBOARD_DIR} \
--language-model-type qwen2.0_72B \
${EXTRA_ARGS} \
--allow-missing-vision-projection-checkpoint \
--vision-model-type internvit \
--disable-vision-class-token \
--log-params-norm \
--log-num-zeros-in-grad \
--ckpt-format torch \
--pixel-shuffle \
--image-tag-type nvlm
"
export NVTE_APPLY_QK_LAYER_SCALING=0
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${ALLOW_NONDETERMINISTIC}
# Interactive or batch mode
if [[ $BATCH -eq 0 ]]; then
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
else
run_cmd="python -u ${SOURCE}/examples/multimodal/train.py ${OPTIONS}"
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
srun -l --verbose \
--container-image <path to docker image> \
--container-mounts "<some mount>" \
--output=${LOGS_DIR}/%x_%j_$DATETIME.log \
sh -c "${run_cmd}"
set +x
fi
#!/bin/bash
# Your SBATCH commands here if using SLURM.
# Please launch this script from megatron-lm root.
# Train a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export TOKENIZERS_PARALLELISM="false"
DEBUG=0
if [[ $BATCH -eq 0 ]]; then
DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
MODEL_NAME="mcore-nous-yi34b-internvit-mlp-${DATETIME}"
else
MODEL_NAME="mcore-nous-yi34b-internvit-mlp"
fi
WORKSPACE="<some dir>"
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
LOAD_NAME="combined-yi-34b-internvit-tp8-mcore"
CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}"
DATA_TRAIN="${SOURCE}/examples/multimodal/nvlm/pretrain_blend.yaml"
if [[ $DEBUG -eq 1 ]]; then
MBZ=1
BZ=1
NW=0
LI=1
AD=0.0
HD=0.0
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
else
MBZ=1
BZ=2048
NW=8
LI=5
AD=0.1
HD=0.1
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
fi
SEQ_LEN=256 # Image embeddings sequence length.
DECODER_SEQ_LEN=512 # Language model sequence length.
MAX_POS_EMBED=512
OPTIONS=" \
--swiglu \
--use-distributed-optimizer \
--num-workers ${NW} \
--num-layers 60 \
--hidden-size 7168 \
--normalization RMSNorm \
--num-attention-heads 56 \
--exit-duration-in-mins 230 \
--group-query-attention \
--num-query-groups 8 \
--ffn-hidden-size 20480 \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model ${WORKSPACE}/<path to tokenizer> \
--tokenizer-prompt-format nvlm-yi-34b \
--vocab-size 64000 \
--make-vocab-size-divisible-by 1 \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 5000000 \
--disable-bias-linear \
--tensor-model-parallel-size 8 \
--language-model-type yi-34b \
--vision-model-type internvit \
--micro-batch-size ${MBZ} \
--global-batch-size ${BZ} \
--train-samples 122880000 \
--lr-decay-samples 25600000 \
--lr-warmup-samples 83200 \
--lr 1e-4 \
--min-lr 2.5e-5 \
--lr-decay-style cosine \
--clip-grad 10.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--eod-mask-loss \
--bf16 \
--tensorboard-dir=${TENSORBOARD_DIR} \
--freeze-LM \
--freeze-ViT \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--data-path ${DATA_TRAIN} \
--dataloader-type external \
--split 100,0,0 \
--prompt-path ${SOURCE}/examples/multimodal/nvlm/nvlm_prompts.json \
--log-interval ${LI} \
--save-interval 2000 \
--eval-interval 500 \
--eval-iters 10 \
--log-params-norm \
--log-num-zeros-in-grad \
${EXTRA_ARGS} \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--allow-missing-vision-projection-checkpoint \
--disable-vision-class-token \
--use-te \
--use-checkpoint-args \
--ckpt-format torch \
--pixel-shuffle \
--image-tag-type nvlm
"
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${ALLOW_NONDETERMINISTIC}
export NVTE_APPLY_QK_LAYER_SCALING=0
# Interactive or batch mode
if [[ $BATCH -eq 0 ]]; then
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
else
run_cmd="python -u ${SOURCE}/examples/multimodal/train.py ${OPTIONS}"
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
srun -l --verbose \
--container-image <path to docker image> \
--container-mounts "<some mount>" \
--output=${LOGS_DIR}/%x_%j_$DATETIME.log \
sh -c "${run_cmd}"
set +x
fi
#!/bin/bash
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
export TOKENIZERS_PARALLELISM="false"
INPUT_IMAGE_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"
USE_TILING=0
USE_PIXEL_SHUFFLE_ONLY=0
while [[ $# -gt 0 ]]; do
case $1 in
--input-image-path)
INPUT_IMAGE_PATH="$2"
shift
shift
;;
-o|--output-path)
OUTPUT_PATH="$2"
shift
shift
;;
-m|--model-path)
MODEL_PATH="$2"
shift
shift
;;
--task)
TASK="$2"
shift
shift
;;
-g|--gt-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
--use-tiling)
USE_TILING=1
shift
;;
--use-pixel-shuffle-only)
USE_PIXEL_SHUFFLE_ONLY=1
shift
;;
-*|--*)
echo "Invalid option $1"
exit 1
;;
esac
done
# Please modify these as needed.
NUM_PARTITIONS=0
START=0
END=0
SEQ_LEN=1024 # Image embeddings sequence length.
DECODER_SEQ_LEN=8192 # Language model sequence length.
MAX_POS_EMBED=8192
# Additional arguments.
EXTRA_ARGS=""
if [[ $USE_TILING -eq 1 ]]; then
EXTRA_ARGS+=" --pixel-shuffle --use-tiling --max-num-tiles 6 --use-thumbnail --use-tile-tags"
SEQ_LEN=261 # Image embeddings sequence length (256 image embeddings + 5 tile tag embeddings).
fi
if [[ $USE_PIXEL_SHUFFLE_ONLY -eq 1 ]]; then
EXTRA_ARGS+=" --pixel-shuffle"
SEQ_LEN=256
fi
for PARTITION_ID in $( eval echo {$START..$END} )
do
torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \
--attention-softmax-in-fp32 \
--no-masked-softmax-fusion \
--swiglu \
--num-layers 80 \
--hidden-size 8192 \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--num-attention-heads 64 \
--exit-on-missing-checkpoint \
--group-query-attention \
--num-query-groups 8 \
--ffn-hidden-size 29568 \
--load ${MODEL_PATH} \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model <tokenizer model path> \
--tokenizer-prompt-format qwen2p0 \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--disable-bias-linear \
--add-qkv-bias \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--language-model-type qwen2.0_72B \
--vision-model-type internvit \
--micro-batch-size 1 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--bf16 \
--freeze-LM \
--freeze-ViT \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--use-te \
--transformer-impl transformer_engine \
--use-checkpoint-args \
--out-seq-length 16 \
--temperature 1.0 \
--seed 1234 \
--top_k 1 \
--no-load-rng \
--no-load-optim \
--num-partitions ${NUM_PARTITIONS} \
--partition-id ${PARTITION_ID} \
--output-path ${OUTPUT_PATH} \
--gt-path ${GROUNDTRUTH_PATH} \
--disable-vision-class-token \
--input-image-path ${INPUT_IMAGE_PATH} \
${EXTRA_ARGS} \
--task ${TASK} \
--image-tag-type nvlm \
--ckpt-format torch
done
#!/bin/bash
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
INPUT_IMAGE_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"
USE_TILING=0
USE_PIXEL_SHUFFLE_ONLY=0
while [[ $# -gt 0 ]]; do
case $1 in
--input-image-path)
INPUT_IMAGE_PATH="$2"
shift
shift
;;
-o|--output-path)
OUTPUT_PATH="$2"
shift
shift
;;
-m|--model-path)
MODEL_PATH="$2"
shift
shift
;;
--task)
TASK="$2"
shift
shift
;;
-g|--gt-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
--use-tiling)
USE_TILING=1
shift
;;
--use-pixel-shuffle-only)
USE_PIXEL_SHUFFLE_ONLY=1
shift
;;
-*|--*)
echo "Invalid option $1"
exit 1
;;
esac
done
# Please modify these as needed.
NUM_PARTITIONS=0
START=0
END=0
SEQ_LEN=1024 # Image embeddings sequence length.
DECODER_SEQ_LEN=8192 # Language model sequence length.
MAX_POS_EMBED=8192
# Additional arguments.
EXTRA_ARGS=""
if [[ $USE_TILING -eq 1 ]]; then
EXTRA_ARGS+=" --pixel-shuffle --use-tiling --max-num-tiles 6 --use-thumbnail --use-tile-tags"
SEQ_LEN=261 # Image embeddings sequence length (256 image embeddings + 5 tile tag embeddings).
fi
if [[ $USE_PIXEL_SHUFFLE_ONLY -eq 1 ]]; then
EXTRA_ARGS+=" --pixel-shuffle"
SEQ_LEN=256
fi
for PARTITION_ID in $( eval echo {$START..$END} )
do
torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \
--attention-softmax-in-fp32 \
--no-masked-softmax-fusion \
--swiglu \
--num-layers 60 \
--hidden-size 7168 \
--normalization RMSNorm \
--num-attention-heads 56 \
--exit-on-missing-checkpoint \
--group-query-attention \
--num-query-groups 8 \
--ffn-hidden-size 20480 \
--load ${MODEL_PATH} \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model <tokenizer model path> \
--tokenizer-prompt-format nvlm-yi-34b \
--vocab-size 64000 \
--make-vocab-size-divisible-by 1 \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 5000000 \
--disable-bias-linear \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--language-model-type yi-34b \
--vision-model-type internvit \
--micro-batch-size 1 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--bf16 \
--freeze-LM \
--freeze-ViT \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--use-te \
--transformer-impl transformer_engine \
--use-checkpoint-args \
--out-seq-length 16 \
--temperature 1.0 \
--seed 1234 \
--top_k 1 \
--no-load-rng \
--no-load-optim \
--num-partitions ${NUM_PARTITIONS} \
--partition-id ${PARTITION_ID} \
--output-path ${OUTPUT_PATH} \
--gt-path ${GROUNDTRUTH_PATH} \
--disable-vision-class-token \
--input-image-path ${INPUT_IMAGE_PATH} \
${EXTRA_ARGS} \
--task ${TASK} \
        --image-tag-type nvlm \
--ckpt-format torch
done
#!/bin/bash
# Your SBATCH commands here if using SLURM.
# Please launch this script from megatron-lm root.
# Train a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_ALGO=^NVLS
export TOKENIZERS_PARALLELISM="false"
DEBUG=0
if [[ $BATCH -eq 0 ]]; then
DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
MODEL_NAME="mcore-nous-yi34b-internvit-mlp-sft-${DATETIME}"
else
MODEL_NAME="mcore-nous-yi34b-internvit-mlp-sft"
fi
WORKSPACE="<some dir>"
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
LOAD_NAME="mcore-nous-yi34b-internvit-mlp" # From pretraining
CHECKPOINT_DIR="${WORKSPACE}/output/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/nvlm/sft_blend.yaml"
if [[ $DEBUG -eq 1 ]]; then
MBZ=1
BZ=1
NW=0
LI=1
AD=0.0
HD=0.0
ALLOW_NONDETERMINISTIC=1
    # Can run out of GPU memory in interactive mode without this.
# This is just for interactive testing purposes. Do not use for proper training.
EXTRA_ARGS=" --freeze-LM"
else
MBZ=1
BZ=128
NW=2
LI=5
AD=0.0
HD=0.0
ALLOW_NONDETERMINISTIC=1
EXTRA_ARGS=""
fi
SEQ_LEN=261 # Image embeddings sequence length (256 image embeddings + 5 tile tag embeddings).
DECODER_SEQ_LEN=3200 # Language model sequence length.
MAX_POS_EMBED=3200
OPTIONS=" \
--swiglu \
--use-distributed-optimizer \
--num-workers ${NW} \
--num-layers 60 \
--hidden-size 7168 \
--normalization RMSNorm \
--num-attention-heads 56 \
--exit-duration-in-mins 230 \
--group-query-attention \
--num-query-groups 8 \
--ffn-hidden-size 20480 \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model ${WORKSPACE}/<tokenizer path> \
--tokenizer-prompt-format nvlm-yi-34b \
--vocab-size 64000 \
--make-vocab-size-divisible-by 1 \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 5000000 \
--disable-bias-linear \
--tensor-model-parallel-size 8 \
--language-model-type yi-34b \
--vision-model-type internvit \
--micro-batch-size ${MBZ} \
--global-batch-size ${BZ} \
--train-samples 30000000 \
--lr-decay-samples 25600000 \
--lr-warmup-samples 83200 \
--lr 2e-6 \
--min-lr 2.5e-7 \
--lr-decay-style cosine \
--split 100,0,0 \
--clip-grad 10 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--eod-mask-loss \
--bf16 \
--tensorboard-dir=${TENSORBOARD_DIR} \
--freeze-ViT \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--data-path ${DATA_TRAIN} \
--dataloader-type external \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--prompt-path ${SOURCE}/examples/multimodal/nvlm/nvlm_prompts.json \
--log-interval ${LI} \
--load ${FINETUNE_DIR} \
--save ${FINETUNE_DIR} \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--save-interval 5000 \
--eval-interval 500 \
--eval-iters 10 \
--log-params-norm \
--log-num-zeros-in-grad \
${EXTRA_ARGS} \
--disable-vision-class-token \
--use-te \
--ckpt-format torch \
--pixel-shuffle \
--use-tiling \
--max-num-tiles 6 \
--use-thumbnail \
--use-tile-tags \
--image-tag-type nvlm
"
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${ALLOW_NONDETERMINISTIC}
export NVTE_APPLY_QK_LAYER_SCALING=0
# Interactive or batch mode
if [[ $BATCH -eq 0 ]]; then
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
else
run_cmd="python -u ${SOURCE}/examples/multimodal/train.py ${OPTIONS}"
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
srun -l --verbose \
--container-image <path to docker image> \
--container-mounts "<some mount>" \
--output=${LOGS_DIR}/%x_%j_$DATETIME.log \
sh -c "${run_cmd}"
set +x
fi
__module__: megatron.energon
__class__: Metadataset
splits:
train:
datasets:
      - weight: 0.01 # Datasets are weighted according to their size. Weights sum up to 1.
path: <path to coco>
subflavors:
augmentation: False
- weight: 0.02
path: <path to clevr-math dataset>
subflavors:
augmentation: False
      # Please refer to Table 6 in https://arxiv.org/pdf/2409.11402 for the full list of SFT datasets.
      # Please refer to https://nvidia.github.io/Megatron-Energon/data_prep.html for instructions on preparing datasets in the Megatron Energon format.
val:
datasets:
- weight: 1.
path: <path to validation dataset>
subflavors:
augmentation: False
#!/bin/bash
# Your SBATCH commands here if using SLURM.
# Please launch this script from megatron-lm root.
# Train a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_ALGO=^NVLS
export TOKENIZERS_PARALLELISM="false"
DEBUG=0
if [[ $BATCH -eq 0 ]]; then
DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
MODEL_NAME="mcore-qwen20-72b-internvit-sft-${DATETIME}"
else
MODEL_NAME="mcore-qwen20-72b-internvit-sft"
fi
WORKSPACE="<some dir>"
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR="${OUTPUT}/checkpoints"
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
# From pretraining. The pretraining checkpoint must be manually split to 4 pipeline parallel stages.
# Please refer to README.md and run examples/multimodal/nvlm/pp_checkpoint_converter.py.
LOAD_NAME="mcore-qwen20-72b-internvit-pp4"
CHECKPOINT_DIR="${WORKSPACE}/output/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/nvlm/sft_blend.yaml"
if [[ $DEBUG -eq 1 ]]; then
MBZ=1
BZ=1
NW=0
AD=0.0
HD=0.0
LI=1
# This is just for interactive testing purposes. Do not use for proper training.
EXTRA_ARGS="--freeze-LM"
ALLOW_NONDETERMINISTIC=1
else
MBZ=1
BZ=256
NW=8
AD=0.0
HD=0.0
LI=5
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
fi
SEQ_LEN=261 # Image embeddings sequence length (256 image embeddings + 5 tile tag embeddings).
DECODER_SEQ_LEN=3200 # Language model sequence length.
MAX_POS_EMBED=8192
OPTIONS=" \
--use-checkpoint-args \
--exit-duration-in-mins 230 \
--disable-bias-linear \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model ${WORKSPACE}/<tokenizer model path> \
--tokenizer-prompt-format qwen2p0 \
--transformer-impl transformer_engine \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--untie-embeddings-and-output-weights \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 4 \
--num-layers 80 \
--hidden-size 8192 \
--ffn-hidden-size 29568 \
--add-qkv-bias \
--num-attention-heads 64 \
--use-distributed-optimizer \
--use-te \
--num-workers ${NW} \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings 32768 \
--train-samples 122880000 \
--lr-decay-samples 25600000 \
--lr-warmup-samples 83200 \
--micro-batch-size ${MBZ} \
--global-batch-size ${BZ} \
--lr 2e-6 \
--min-lr 2.5e-7 \
--lr-decay-style cosine \
--log-interval ${LI} \
--eval-iters 10 \
--eval-interval 500 \
--data-path ${DATA_TRAIN} \
--prompt-path ${SOURCE}/examples/multimodal/nvlm/nvlm_prompts.json \
--save-interval 10000 \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--split 100,0,0 \
--clip-grad 10.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--bf16 \
--eod-mask-loss \
--freeze-ViT \
--patch-dim 14 \
--img-h 448 \
--img-w 448 \
--dataloader-type external \
--tensorboard-dir ${TENSORBOARD_DIR} \
--language-model-type qwen2.0_72B \
${EXTRA_ARGS} \
--vision-model-type internvit \
--disable-vision-class-token \
--log-params-norm \
--log-num-zeros-in-grad \
--ckpt-format torch \
--pixel-shuffle \
--use-tiling \
--max-num-tiles 6 \
--use-thumbnail \
--use-tile-tags \
--image-tag-type nvlm
"
export NVTE_APPLY_QK_LAYER_SCALING=0
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${ALLOW_NONDETERMINISTIC}
# Interactive or batch mode
if [[ $BATCH -eq 0 ]]; then
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
else
run_cmd="python -u ${SOURCE}/examples/multimodal/train.py ${OPTIONS}"
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
srun -l --verbose \
--container-image <path to docker image> \
--container-mounts "<some mount>" \
--output=${LOGS_DIR}/%x_%j_$DATETIME.log \
sh -c "${run_cmd}"
set +x
fi
__module__: megatron.energon
__class__: Metadataset
splits:
train:
datasets:
- weight: 1.
path: <path_to_pretraining_dataset_in_energon_format>
subflavors:
augmentation: false
val:
datasets:
- weight: 1.
path: <path_to_pretraining_dataset_in_energon_format>
subflavors:
augmentation: false
#!/bin/bash
# Pretrain a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
MODEL_NAME="mcore-llava-mistral-7b-instruct-clip336-pretraining"
# Check that the user has set an output path for model checkpoints.
if [[ -z $WORKSPACE ]]; then
echo "Please set WORKSPACE for storing your model checkpoints."
exit 1
fi
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
if [[ -z $LOAD_NAME ]]; then
echo "Please set LOAD_NAME for input model name."
exit 1
fi
if [[ -z $TOKENIZER_MODEL ]]; then
echo "Please set TOKENIZER_MODEL for tokenizer model name."
exit 1
fi
CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/pretrain_dataset.yaml"
DEBUG=0
if [[ $DEBUG -eq 1 ]]; then
BZ=32
NW=2
HD=0.0
LI=1
EXTRA_ARGS=""
NONDETERMINISTIC_ATTN=1
else
BZ=256
NW=2
HD=0.1
LI=10
EXTRA_ARGS=""
NONDETERMINISTIC_ATTN=1
fi
OPTIONS=" \
--apply-layernorm-1p \
--attention-softmax-in-fp32 \
--use-checkpoint-args \
--use-distributed-optimizer \
--transformer-impl transformer_engine \
--use-te \
--normalization RMSNorm \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--num-workers ${NW} \
--exit-duration-in-mins 230 \
--use-flash-attn \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout ${HD} \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--num-attention-heads 32 \
--seq-length 576 \
--decoder-seq-length 1024 \
--max-position-embeddings 4096 \
--ffn-hidden-size 14336 \
--train-iters 20000 \
--micro-batch-size 1 \
--global-batch-size ${BZ} \
--lr-decay-iters 20000 \
--lr-warmup-fraction .01 \
--lr 0.00015 \
--min-lr 1.0e-5 \
--lr-decay-style cosine \
--log-interval ${LI} \
--eval-iters 10 \
--eval-interval 1000 \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \
--tokenizer-prompt-format mistral \
--data-path ${DATA_TRAIN} \
--prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \
--save-interval 1000 \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--split 100,0,0 \
--clip-grad 1.0 \
--weight-decay 1e-2 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--log-params-norm \
--log-num-zeros-in-grad \
--bf16 \
--eod-mask-loss \
--freeze-LM \
--freeze-ViT \
--patch-dim 14 \
--img-h 336 \
--img-w 336 \
--dataloader-type external \
--tensorboard-dir ${TENSORBOARD_DIR} \
--language-model-type=mistral_7b \
--disable-vision-class-token \
${EXTRA_ARGS} \
--distributed-timeout-minutes 60 \
--allow-missing-vision-projection-checkpoint \
--ckpt-format torch
"
export NVTE_APPLY_QK_LAYER_SCALING=0
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${NONDETERMINISTIC_ATTN}
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Generate text using a vision language model."""
import json
import logging
import os
import sys
from functools import partial
# Add megatron to the path.
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
)
import torch
import yaml
from config import EvaluationConfig
from evaluation_datasets import get_evaluation_dataset
from model import model_provider
from multimodal_args import add_multimodal_extra_args
from megatron.core import parallel_state
from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings
from megatron.inference.text_generation.api import generate_and_post_process
from megatron.inference.text_generation.forward_step import ForwardStep
from megatron.inference.text_generation.communication import broadcast_int_list
from megatron.training import get_args, get_model, get_tokenizer, print_rank_0
from megatron.training.checkpointing import load_checkpoint
from megatron.training.initialize import initialize_megatron
def add_text_generation_args(parser):
"""Text generation arguments."""
group = parser.add_argument_group(title='Vision language model text generation arguments')
group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.')
group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.')
group.add_argument("--top_k", type=int, default=0, help='Top k sampling.')
group.add_argument(
"--out-seq-length", type=int, default=1024, help='Length of the output generated text.'
)
group.add_argument("--output-path", type=str, help='Output file path')
group.add_argument('--input-image-path', type=str, help="Input image directory")
group.add_argument(
'--num-partitions', type=int, default=0, help="Number of partitions for inputs."
)
group.add_argument('--partition-id', type=int, default=0, help="Partition index")
group.add_argument("--gt-path", type=str, help="Optional ground truth file")
group.add_argument(
"--task",
type=str,
choices=[
"captioning",
"TextVQA",
"VQAv2",
"ChartQA",
"MMMU",
"VideoMME",
"OCRBench",
"MathVista",
"AI2D",
],
help="Generation task to run",
)
group.add_argument(
"--num-samples-per-partition", type=int, default=0, help="Number of samples per partition"
)
group.add_argument("--config-path", type=str, help="Evaluation config file to use.")
# Add common multimodal arguments needed for e.g. building the model.
parser = add_multimodal_extra_args(parser)
return parser
def get_evaluation_dataloader(
task,
input_image_path,
gt_path,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
num_samples_per_partition,
num_partitions,
partition_id,
num_frames,
num_workers,
vision_model_type,
):
"""Build evaluation dataset."""
dataset = get_evaluation_dataset(
task,
input_image_path,
gt_path,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
num_samples_per_partition,
num_partitions,
partition_id,
num_frames,
vision_model_type,
)
dp_rank = parallel_state.get_data_parallel_rank()
dp_world_size = parallel_state.get_data_parallel_world_size()
sampler = torch.utils.data.DistributedSampler(
dataset, shuffle=False, num_replicas=dp_world_size, rank=dp_rank
)
# TODO: Batched inference is not supported yet.
dataloader = torch.utils.data.DataLoader(
dataset, batch_size=None, num_workers=num_workers, sampler=sampler, pin_memory=True
)
return dataloader
def generate_samples(model, config: EvaluationConfig, print_output):
"""Text generation using a trained vision language model."""
args = get_args()
dataloader = get_evaluation_dataloader(
config.task,
config.input_image_path,
config.gt_path,
args.img_h,
args.img_w,
args.use_tiling,
args.max_num_tiles,
args.use_thumbnail,
config.num_samples_per_partition,
config.num_partitions,
config.partition_id,
args.num_frames,
args.num_workers,
args.vision_model_type,
)
num_img_embeddings_per_tile = get_num_image_embeddings(
args.img_h,
args.img_w,
args.patch_dim,
args.vision_model_type,
args.disable_vision_class_token,
1,
args.pixel_shuffle,
args.use_tile_tags,
)
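# Number of language-model embeddings each image tile expands to; the forward step
# uses this to size its inference buffers.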
for idx, (imgs, num_tiles, sample_id, question, answers, metadata) in enumerate(dataloader):
imgs = imgs.to("cuda")
num_tiles = num_tiles.to("cuda")
conv = get_conversation(config.task, question)
forward_step = partial(VLMForwardStep, num_img_embeddings_per_tile, imgs, num_tiles, args.decoder_seq_length)
if is_first_rank():
resp_sentences, _, _, _ = generate_and_post_process(
model,
forward_step=forward_step,
prompts=[conv],
tokens_to_generate=config.out_seq_length,
top_k_sampling=config.top_k,
top_p_sampling=config.top_p,
add_BOS=False,
temperature=config.temperature,
random_seed=args.seed,
detokenize_segments=False,
data_parallel=True,
)
for generation in resp_sentences:
if isinstance(sample_id, torch.Tensor):
sample_id = sample_id.item()
output = {"sample_id": sample_id}
output_name = ""
if config.task == "captioning":
output_name = "caption"
elif config.task in (
"TextVQA",
"VQAv2",
"ChartQA",
"OCRBench",
"MathVista",
"AI2D",
):
output_name = "answer"
elif config.task in ("MMMU"):
output_name = "text"
elif config.task == "VideoMME":
output_name = "response"
output = question
else:
raise NotImplementedError("no output name defined for", config.task)
prompt, generated = get_prompt_and_generated(
generation, args.tokenizer_prompt_format
)
if config.task == "VideoMME":
output["questions"][0][output_name] = generated
else:
output[output_name] = generated
output["prompt"] = prompt
if config.task == "captioning":
output["ground_truth"] = answers
elif config.task in (
"TextVQA",
"VQAv2",
"ChartQA",
"OCRBench",
"MathVista",
"AI2D",
):
if isinstance(answers, str):
answers = [answers]
output["gt_answer"] = answers
if len(metadata) > 0:
output.update(metadata)
elif config.task == "MMMU":
output["prediction"] = generated
output.update(metadata)
else:
raise NotImplementedError("no output processing defined for", config.task)
if print_output:
print(output)
yield output
idx += 1
else:
generate_and_post_process(
model, forward_step=forward_step, detokenize_segments=False, data_parallel=True
)
idx += 1
def get_evaluation_config():
"""Get evaluation config from a config file or command-line arguments."""
args = get_args()
if args.config_path:
with open(args.config_path, "r") as f:
config_dict = yaml.safe_load(f)
config = EvaluationConfig(**config_dict)
else:
config = EvaluationConfig(
task=args.task,
temperature=args.temperature,
top_p=args.top_p,
top_k=args.top_k,
out_seq_length=args.out_seq_length,
output_path=args.output_path,
input_image_path=args.input_image_path,
gt_path=args.gt_path,
num_partitions=args.num_partitions,
partition_id=args.partition_id,
num_samples_per_partition=args.num_samples_per_partition,
)
# Default output path if not defined...
if not config.output_path:
os.makedirs("generated", exist_ok=True)
config.output_path = "generated/" + args.language_model_type
return config
def is_first_rank():
"""First tensor and pipeline parallel rank."""
return (
parallel_state.is_pipeline_first_stage(ignore_virtual=True)
and parallel_state.get_tensor_model_parallel_rank() == 0
)
def get_output_path(config, dp_rank):
"""Generation output path."""
return (
f"{config.output_path}-{config.task}-dprank={dp_rank}-partition={config.partition_id}.jsonl"
)
def generate_and_write_samples(model, config, print_output=True):
"""Generate text and write to an output file."""
dp_rank = parallel_state.get_data_parallel_rank()
if is_first_rank():
output_path = get_output_path(config, dp_rank)
output_file = open(output_path, "w")
print(f"output path: {output_file.name}")
with torch.no_grad():
for output in generate_samples(model, config, print_output):
if is_first_rank():
output_file.write(json.dumps(output) + "\n")
output_file.flush()
if is_first_rank():
output_file.close()
class VLMForwardStep(ForwardStep):
"""Inference forward step for a multimodal model."""
def __init__(
self,
num_img_embeddings_per_tile,
images,
num_tiles,
decoder_seq_length,
model,
max_batch_size,
max_sequence_length,
):
"""Create multimodal forward step."""
total_num_tiles = torch.sum(num_tiles).item()
num_img_embeddings = num_img_embeddings_per_tile * total_num_tiles
super().__init__(model, max_batch_size, max_sequence_length + num_img_embeddings)
self._images = images
self._num_tiles = num_tiles
self._num_img_embeddings = num_img_embeddings
self.decoder_seq_length = decoder_seq_length
self._recv_only_vision_embeds = False
pp_rank = parallel_state.get_pipeline_model_parallel_rank()
# Checks if the previous stage only has a vision encoder, and that the current stage has part of the LM decoder.
# In this case, the current stage should only receive vision embeddings.
if pp_rank > 0:
self._recv_only_vision_embeds = parallel_state.is_inside_encoder(pp_rank - 1) and (not parallel_state.is_inside_decoder(pp_rank - 1)) and parallel_state.is_inside_decoder()
# Checks if the current stage only has a vision encoder
self._encoder_only = parallel_state.is_inside_encoder() and not parallel_state.is_inside_decoder()
def _forward(self, tokens, position_ids, attention_mask):
return self.model(
self._images,
tokens,
position_ids,
attention_mask=None,
inference_params=self.inference_params,
num_image_tiles=self._num_tiles,
runtime_gather_output=True,
)
def __call__(self, tokens, position_ids, attention_mask):
num_image_tokens = (tokens == self.model.image_token_index).sum().item()
num_tokens = tokens.size(1)
recv_buffer_seq_length = None
if num_image_tokens > 0:
# When there are image tokens and this stage only receives vision embeddings, adjust the recv buffer seq length to match the image embeddings sequence length.
# If there are image tokens and this stage receives full embeddings, make sure we compensate for expansion of image tokens.
# Note that this also sets a recv_buffer_seq_length for the encoder stage; that length is irrelevant since its recv buffer is never allocated.
if self._recv_only_vision_embeds:
recv_buffer_seq_length = self._num_img_embeddings
else:
recv_buffer_seq_length = min(self._num_img_embeddings + num_tokens - num_image_tokens, self.decoder_seq_length)
elif self._recv_only_vision_embeds:
# If this stage only receives vision embeddings and there are no image tokens we won't run the encoder and therefore shouldn't try to recv.
recv_buffer_seq_length = 0
# If the pipeline stage only has a vision encoder, then it only needs to run when there are image tokens
if not (self._encoder_only and num_image_tokens == 0):
output = super().__call__(tokens, position_ids, attention_mask, recv_buffer_seq_length=recv_buffer_seq_length)
else:
output = None
if isinstance(output, tuple):
logits, _ = output
else:
logits = output
# On the first inference iteration, we compute image tokens.
# On every PP stage (although inference params should only matter for the decoder),
# update the sequence length offset by the number of image tokens.
if num_tokens > 1 and num_image_tokens > 0:
if "image_tokens_count" not in self.inference_params.key_value_memory_dict:
self.inference_params.key_value_memory_dict["image_tokens_count"] = self._num_img_embeddings
if self._num_img_embeddings + num_tokens - num_image_tokens > self.decoder_seq_length:
self.inference_params.sequence_len_offset += self.decoder_seq_length - num_tokens
else:
self.inference_params.sequence_len_offset += (
self.inference_params.key_value_memory_dict["image_tokens_count"] - num_image_tokens
)
return logits
def get_conversation(task, question):
"""Get a conversation for a given task and evaluation question."""
conversation = []
# In all cases, the tokenizer itself adds any header tokens needed for the assistant turn.
if task == "captioning":
conversation = [
{"role": "system", "content": "Answer the questions."},
{
"role": "user",
"content": "<image>\nProvide a one-sentence caption for provided image.",
},
]
elif task in ("TextVQA", "VQAv2", "ChartQA"):
conversation = [
{"role": "system", "content": "Answer the questions."},
{
"role": "user",
"content": f"<image>\n{question}\nAnswer the question using a single word or phrase.",
},
]
elif task in ("OCRBench", "MathVista", "AI2D"):
conversation = [
{"role": "system", "content": "Answer the questions."},
{"role": "user", "content": f"<image>\n{question}"},
]
elif task == "MMMU":
conversation = [
{"role": "system", "content": "Answer the questions."},
{"role": "user", "content": question},
]
elif task == "VideoMME":
q = (
"Select the best answer to the following multiple-choice "
"question based on the video. Respond with only the letter "
"(A, B, C, or D) of the correct option.\n"
)
q += question["questions"][0]["question"] + "\n"
q += question["questions"][0]["choices"][0] + "\n"
q += question["questions"][0]["choices"][1] + "\n"
q += question["questions"][0]["choices"][2] + "\n"
q += question["questions"][0]["choices"][3] + "\n"
conversation = [
{"role": "system", "content": "Answer the questions."},
{"role": "user", "content": f"<image>\n{question}"},
]
return conversation
def get_prompt_and_generated(prompt_and_generation, prompt_format):
"""Strip prompt and other unnecessary text from generation."""
if prompt_format == "llama3":
splitted = prompt_and_generation.split("<|start_header_id|>assistant<|end_header_id|>\n\n")
prompt = splitted[0]
generated = splitted[1]
generated = generated.split("<|eot_id|>")[0]
elif prompt_format == "mistral":
splitted = prompt_and_generation.split("[/INST]")
prompt = splitted[0]
generated = splitted[1]
generated = generated.split("</s>")[0]
elif prompt_format == "chatml":
splitted = prompt_and_generation.split("<|im_start|> assistant\n")
prompt = splitted[0]
generated = splitted[1]
generated = generated.split("<|im_end|>")[0]
elif prompt_format in ("nvlm-yi-34b", "qwen2p0"):
splitted = prompt_and_generation.split("<|im_start|>assistant\n")
prompt = splitted[0]
generated = splitted[1]
generated = generated.split("<|im_end|>")[0]
# Remove possible garbage.
generated = generated.strip()
generated = generated.split("\n\n")[0]
generated = generated.split("\n")[0]
return prompt, generated
def main():
"""Vision language model text generation."""
initialize_megatron(extra_args_provider=add_text_generation_args)
if torch.distributed.get_rank() == 0:
logging.getLogger(__name__).warning(
"Models using pipeline parallelism are not supported yet."
)
args = get_args()
def wrapped_model_provider(pre_process, post_process):
return model_provider(pre_process, post_process, parallel_output=False)
# Set up model and load checkpoint.
model = get_model(wrapped_model_provider, wrap_with_ddp=False)
if args.load is not None:
_ = load_checkpoint(model, None, None)
model = model[0]
model.eval()
config = get_evaluation_config()
generate_and_write_samples(model, config)
if __name__ == "__main__":
main()
__module__: megatron.energon
__class__: Metadataset
splits:
train:
datasets:
- weight: 1.
path: <path_to_sft_dataset_in_energon_format>
subflavors:
augmentation: false
val:
datasets:
- weight: 1.
path: <path_to_sft_dataset_in_energon_format>
subflavors:
augmentation: false
#!/bin/bash
# Run SFT on a pretrained multimodal model
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
MODEL_NAME="mcore-llava-mistral-7b-instruct-clip336-sft"
# Check that the user has set an output path for model checkpoints.
if [[ -z $WORKSPACE ]]; then
echo "Please set WORKSPACE for storing your model checkpoints."
exit 1
fi
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
if [[ -z $LOAD_NAME ]]; then
echo "Please set LOAD_NAME for input model name."
exit 1
fi
if [[ -z $LOAD_ITER ]]; then
echo "Please set LOAD_ITER for pre-trained input model iteration."
exit 1
fi
if [[ -z $TOKENIZER_MODEL ]]; then
echo "Please set TOKENIZER_MODEL for tokenizer model name."
exit 1
fi
CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/sft_dataset.yaml"
DEBUG=0
if [[ $DEBUG -eq 1 ]]; then
BZ=8
NW=1
HD=0.0
LI=1
EXTRA_ARGS=""
NONDETERMINISTIC_ATTN=1
else
BZ=128
NW=2
HD=0.1
LI=10
EXTRA_ARGS=""
NONDETERMINISTIC_ATTN=1
fi
OPTIONS=" \
--apply-layernorm-1p \
--attention-softmax-in-fp32 \
--use-checkpoint-args \
--use-distributed-optimizer \
--transformer-impl transformer_engine \
--use-te \
--normalization RMSNorm \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--num-workers ${NW} \
--exit-duration-in-mins 230 \
--use-flash-attn \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout ${HD} \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--num-attention-heads 32 \
--seq-length 576 \
--decoder-seq-length 2048 \
--max-position-embeddings 4096 \
--ffn-hidden-size 14336 \
--train-iters 20000 \
--micro-batch-size 1 \
--global-batch-size ${BZ} \
--lr-decay-iters 20000 \
--lr-warmup-fraction .01 \
--lr 1e-6 \
--min-lr 1e-7 \
--lr-decay-style cosine \
--log-interval ${LI} \
--eval-iters 10 \
--eval-interval 500 \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model ${WORKSPACE}/${TOKENIZER_MODEL} \
--tokenizer-prompt-format mistral \
--data-path ${DATA_TRAIN} \
--prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \
--save-interval 500 \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--split 100,0,0 \
--clip-grad 0.5 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--log-params-norm \
--log-num-zeros-in-grad \
--eod-mask-loss \
--freeze-ViT \
--patch-dim 14 \
--img-h 336 \
--img-w 336 \
--dataloader-type external \
--tensorboard-dir ${TENSORBOARD_DIR} \
--language-model-type=mistral_7b \
--disable-vision-class-token \
${EXTRA_ARGS} \
--distributed-timeout-minutes 60 \
--ckpt-format torch
"
export NVTE_APPLY_QK_LAYER_SCALING=0
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${NONDETERMINISTIC_ATTN}
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
#!/bin/bash
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
GROUNDTRUTH_PATH="placeholder"
NUM_FRAMES=1
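# Parse command-line options; each flag consumes its value, hence the double shift below.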
while [[ $# -gt 0 ]]; do
case $1 in
--input-image-path)
INPUT_IMAGE_PATH="$2"
shift
shift
;;
--num-frames)
NUM_FRAMES="$2"
shift
shift
;;
-g|--groundtruth-path|--gt-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
-o|--output-path)
OUTPUT_PATH="$2"
shift
shift
;;
-m|--model-path)
MODEL_PATH="$2"
shift
shift
;;
-t|--tokenizer-path)
TOKENIZER_PATH="$2"
shift
shift
;;
--task)
TASK="$2"
shift
shift
;;
-*|--*)
echo "Invalid option $1"
exit 1
;;
esac
done
# Please modify these as needed.
NUM_PARTITIONS=0
START=0
END=0
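# NUM_PARTITIONS splits the evaluation data across independent runs; the loop generates
# outputs for partition ids START..END (0..0 runs a single partition).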
for PARTITION_ID in $( eval echo {$START..$END} )
do
torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \
--apply-layernorm-1p \
--attention-softmax-in-fp32 \
--use-flash-attn \
--transformer-impl transformer_engine \
--use-te \
--use-checkpoint-args \
--normalization RMSNorm \
--language-model-type mistral_7b \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--group-query-attention \
--num-query-groups 8 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--max-position-embeddings 4096 \
--no-masked-softmax-fusion \
--load ${MODEL_PATH} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model ${TOKENIZER_PATH} \
--tokenizer-prompt-format mistral \
--bf16 \
--micro-batch-size 1 \
--seq-length 2048 \
--out-seq-length 12 \
--temperature 1.0 \
--img-h 336 \
--img-w 336 \
--patch-dim 14 \
--seed 153 \
--top_k 1 \
--no-load-rng \
--no-load-optim \
--input-image-path ${INPUT_IMAGE_PATH} \
--num-partitions ${NUM_PARTITIONS} \
--partition-id ${PARTITION_ID} \
--output-path ${OUTPUT_PATH} \
--gt-path ${GROUNDTRUTH_PATH} \
--task ${TASK} \
--disable-vision-class-token \
--num-frames ${NUM_FRAMES} \
--ckpt-format torch
done