# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import argparse
import os
import sys
# Add megatron and the multimodal example to the path.
sys.path.append(
os.path.abspath(
os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir, os.path.pardir)
)
)
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
import torch
from transformers import AutoModel
from examples.multimodal.model import model_provider
from examples.multimodal.multimodal_args import add_multimodal_extra_args
from megatron.training import get_model
from megatron.training.checkpointing import load_checkpoint
from megatron.training.initialize import initialize_megatron
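# Example invocation (a sketch; adjust the script path to your checkout). The hardcoded
# --tensor-model-parallel-size=8 below means the comparison must be launched with 8 GPUs, e.g.:
#   torchrun --nproc_per_node 8 <path to this script> \
#       --mcore-model <mcore InternViT checkpoint dir> --hf-model OpenGVLab/InternViT-6B-448px-V1-5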
def run_mcore_vision(model_path):
"""Run mcore vision model."""
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
# Megatron has some mandatory flags.
sys.argv = [
"ignore_me.py",
"--micro-batch-size=1",
"--num-layers=2",
"--vision-model-type=internvit",
"--language-model-type=mistral_7b",
"--tokenizer-prompt-format=mistral",
"--tokenizer-type=MultimodalTokenizer",
"--tokenizer-model=mistralai/Mistral-7B-Instruct-v0.3",
"--vocab-size=1024",
"--hidden-size=64",
"--num-attention-heads=8",
"--seq-length=1024",
"--decoder-seq-length=2048",
"--max-position-embeddings=2048",
"--bf16",
"--img-h=448",
"--img-w=448",
"--patch-dim=14",
"--tensor-model-parallel-size=8",
"--use-te",
f"--pretrained-checkpoint={model_path}",
]
initialize_megatron(extra_args_provider=add_multimodal_extra_args)
def wrapped_model_provider(pre_process, post_process):
return model_provider(pre_process, post_process, parallel_output=False)
# Set up model and load checkpoint.
model = get_model(wrapped_model_provider, wrap_with_ddp=False)
vision_model = model[0].module.vision_model
load_checkpoint([vision_model], None, None)
vision_model.eval()
images = torch.ones((1, 3, 448, 448), dtype=torch.bfloat16, device="cuda")
output = vision_model(images)
return output
def run_hf_vision(model_name):
"""Run HF vision model."""
model = (
AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, trust_remote_code=True)
.cuda()
.eval()
)
images = torch.ones((1, 3, 448, 448), dtype=torch.bfloat16, device="cuda")
outputs = model(images, return_dict=True)
return outputs
def main(mcore_model, hf_model):
"""Compare vision model outputs between mcore and HF given the same fixed input."""
mcore = run_mcore_vision(mcore_model)
if torch.distributed.get_rank() == 0:
hf = run_hf_vision(hf_model)
hf = hf["last_hidden_state"]
# Compare logits. Due to different attention implementations and other details,
# there will be numerical differences.
diff = (mcore - hf).abs()
mean_diff = diff.mean().item()
max_diff = diff.max().item()
print(f"mean diff {mean_diff}, max diff {max_diff}")
assert mean_diff < 0.1, "mean output difference is greater than expected"
assert max_diff < 50, "max output difference is greater than expected"
print("lgtm")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Check mcore vision model output vs. HF numerically.",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"--mcore-model", type=str, required=True, help="directory for mcore model weights"
)
parser.add_argument("--hf-model", type=str, required=True, help="Model name in HF")
args = parser.parse_args()
main(args.mcore_model, args.hf_model)
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN
def add_multimodal_extra_args(parser):
"""Extra arguments."""
group = parser.add_argument_group(title='multimodal arguments')
group.add_argument('--dataset-config', type=str, default=None)
group.add_argument("--prompt-path", type=str, default=None)
group.add_argument('--freeze-LM', action='store_true', default=False)
group.add_argument('--freeze-ViT', action='store_true', default=False)
group.add_argument('--language-model-type', type=str, required=True)
group.add_argument('--vision-model-type', type=str, default="clip")
group.add_argument("--disable-vision-class-token", action="store_true", default=False)
group.add_argument(
"--allow-missing-vision-projection-checkpoint", action="store_true", default=False
)
group.add_argument("--use-te", action="store_true", default=False)
group.add_argument(
"--dataloader-save", type=str, default=None, help="Energon dataloader state save path"
)
group.add_argument(
"--use-tiling", action="store_true", default=False, help="Use input image tiling"
)
group.add_argument("--max-num-tiles", type=int, default=1, help="Maximum number of image tiles")
group.add_argument(
"--use-thumbnail", action="store_true", default=False, help="Add image thumbnail as a tile"
)
group.add_argument(
"--dataloader-seq-length",
type=int,
help="Make dataloader to produce sequences of specific length.",
)
group.add_argument(
"--num-frames",
type=int,
default=1,
help="Number of frames to regularly sample from the video as input to the model.",
)
group.add_argument(
"--online-evaluation-config", type=str, help="Config file for online evaluation."
)
group.add_argument(
"--special-tokens",
nargs="*",
default=[IMAGE_TOKEN],
help="Special tokens used in the multimodal model",
)
group.add_argument(
"--tokenizer-prompt-format",
type=str,
choices=["mistral", "llama3", "chatml", "nvlm-yi-34b", "qwen2p0", "qwen2p5", "llama3p1", "nemotron5",
"nemotron5-aligned"],
required=True,
help="Prompt format to use with the tokenizer.",
)
group.add_argument("--pixel-shuffle", action="store_true", default=False)
group.add_argument(
"--image-tag-type",
type=str,
choices=["nvlm", "internvl", ""],
default="", # Default: Image tag not used.
help="Surround image tokens with tags.",
)
group.add_argument("--use-tile-tags", action="store_true", default=False, help="Use tile tags")
group.add_argument(
"--packing-buffer-size",
type=int,
default=None, # Packing is disabled by default.
help="Enable sample packing by setting the buffer size to > 0",
)
group.add_argument(
"--packing-seq-length", type=int, default=0, help="Packing sequence length. Must be > 0 if using packing."
)
group.add_argument(
"--recompute-vision", action="store_true", default=False, help="Enable activation checkpointing in the vision model"
)
group.add_argument(
"--use-loss-scaling", action="store_true", default=False, help="Scale loss based on conversation turn length (in tokens)."
)
group.add_argument(
"--force-system-message", action="store_true", default=False, help="Force a specific system message"
)
group.add_argument("--eos-id", type=int, help="termination id for MultiModal Tokenizer")
group.add_argument(
"--use-area-weighted-aspect-ratio", action="store_true", default=False,
help=(
"When --use-tiling is True, find the aspect ratio to use based on the original ",
"image aspect ratio and the area covered by the tiles.")
)
group.add_argument("--use-mcore-inference", action="store_true", default=False, help="Use the MCore inference API")
return parser
NVLM
====
Please refer to the [NVLM paper](https://arxiv.org/pdf/2409.11402) for details.
*NOTE: VLMs in Megatron are under active development and are expected to change.*
# Checkpoints
NVLM 1.0 model weights are publicly available on HuggingFace in both HuggingFace and Megatron-Core formats.
- NVLM-1.0-D 72B [HuggingFace version](https://huggingface.co/nvidia/NVLM-D-72B)
- NVLM-1.0-D 72B [Megatron-Core version](https://huggingface.co/nvidia/NVLM-D-72B-mcore)
# Setup
## Docker image
Please use `examples/multimodal/Dockerfile`.
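For example, the image can be built from the repository root (the image tag below is only a placeholder):
```
docker build -f examples/multimodal/Dockerfile -t <your image tag> .
```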
## Dataset preparation
Please refer to Tables 4 and 6 in the [NVLM paper](https://arxiv.org/pdf/2409.11402) for the full list of pretraining and SFT datasets.
Please refer to https://nvidia.github.io/Megatron-Energon/data_prep.html for instructions on preparing datasets in the Megatron Energon format.
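As a minimal sketch, assuming your samples are already packed as WebDataset shards, the Megatron-Energon CLI can be used to generate the dataset metadata:
```
energon prepare <path to webdataset directory>
```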
## Model conversion
### Vision model
NVLM 1.0 models use [OpenGVLab/InternViT-6B-448px-V1-5](https://huggingface.co/OpenGVLab/InternViT-6B-448px-V1-5) from HuggingFace.
Please download it and run the following command to convert it to Megatron format.
```
python examples/multimodal/model_converter/internvit_converter.py --output-dir <some output dir> --use-te --tensor-parallel-size 8
```
### 34B Language model
NVLM 1.0 34B starts from [NousResearch/Nous-Hermes-2-Yi-34B](https://huggingface.co/NousResearch/Nous-Hermes-2-Yi-34B) from HuggingFace.
Please download it and run the following command to convert it to Megatron format.
```
python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \
--load-dir <hf model directory> --save-dir <output dir> --tokenizer-model <hf model name/directory> \
--saver-transformer-impl transformer_engine --model-size yi-34B --make-vocab-size-divisible-by 1
```
### 72B Language model
NVLM 1.0 72B starts from [Qwen/Qwen2-72B-Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct) from HuggingFace.
Please download it and run the following command to convert it to Megatron format.
```
python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \
--load-dir <hf model directory> --save-dir <output directory> --tokenizer-model <hf model name/directory> \
--saver-transformer-impl transformer_engine --model-size qwen2.5-72Bf
```
### Combined checkpoint
Combine the vision model checkpoint from [InternViT](#vision-model) with the [34B](#34b-language-model) or [72B](#72b-language-model) language model by running:
```
examples/multimodal/combine_lm_vision_checkpoints.sh <language model directory> <vision model directory> <output directory> nvlm
```
# Training
## 34B
1. Pretraining: please run `examples/multimodal/nvlm/pretrain_yi_34b_internvit_6b.sh`. Please use the InternViT + 34B [combined checkpoint](#combined-checkpoint) and tokenizer from HuggingFace.
2. SFT: please run `examples/multimodal/nvlm/sft_34b_internvit.sh` using the checkpoint from 1.
## 72B
1. Pretraining: please run `examples/multimodal/nvlm/pretrain_qwen20_72b_internvit_6b.sh`. Please use the InternViT + 72B [combined checkpoint](#combined-checkpoint) and tokenizer from HuggingFace.
2. Convert the pretraining checkpoint from step 1 to pipeline parallel size = 4 for SFT. Please run:
```
python examples/multimodal/nvlm/pp_checkpoint_converter.py --input <pretrained checkpoint directory> \
--input-pipeline-parallel 1 --output <some output dir> --output-pipeline-parallel 4 \
--tensor-parallel 8
```
3. SFT: please run `examples/multimodal/nvlm/sft_qwen20_72b_internvit_6b.sh` using the checkpoint from step 2.
4. To convert the checkpoint with pipeline parallel size = 4 back to 1 for evaluation, please run:
```
python examples/multimodal/nvlm/pp_checkpoint_converter.py --input <sft checkpoint directory> \
--input-pipeline-parallel 4 --output <some output dir> --output-pipeline-parallel 1 \
--tensor-parallel 8
```
# Evaluation
Run the text generation script.
- 34B
```
examples/multimodal/nvlm/run_text_generation_yi_34b_internvit_6b.sh --input-image-path /path/to/input/images --output-path /some/output/directory \
--model-path /path/to/model.pt --gt-path /path/to/groundtruth/file --task generation-task-name --use-tiling
```
- 72B
```
examples/multimodal/nvlm/run_text_generation_qwen20_72b_internvit_6b.sh --input-image-path /path/to/input/images --output-path /some/output/directory \
--model-path /path/to/model.pt --gt-path /path/to/groundtruth/file --task generation-task-name --use-tiling
```
where `--task generation-task-name` is the name of the evaluation benchmark such as `captioning`, `MMMU` or `TextVQA`.
Then, run one of the evaluation scripts from `examples/multimodal`. For example:
```
python examples/multimodal/evaluate_mmmu.py --input-path /output/directory/from/generation
```
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
""""
NOTE: NVLM uses InternViT with tensor parallel (TP) size = 8.
Since InternViT has 25 attention heads and Megatron currently requires the number of attention heads
to be divisible by the TP size, we add 7 dummy zero attention heads to have 32 attention heads.
This workaround requires some changes to how we compute RMSNorm, Attention etc.
Additionally, InternViT introduces some unique features like Layer Scaling.
Those code changes are gathered here.
"""
from functools import partial
import torch
from megatron.core.utils import divide
from megatron.core.extensions.transformer_engine import (
TEColumnParallelLinear,
TEDotProductAttention,
TERowParallelLinear,
)
from megatron.core.parallel_state import (
get_tensor_model_parallel_group,
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
)
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.spec_utils import ModuleSpec, build_module
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint
from examples.multimodal.layer_scaling import LayerScalingTransformerLayer, get_bias_dropout_add_layer_scaling
try:
import apex
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
from megatron.core.transformer.torch_norm import WrappedTorchNorm
HAVE_APEX = True
LNImpl = FusedLayerNorm
except ImportError:
import warnings
from megatron.core.transformer.torch_norm import WrappedTorchNorm
warnings.warn('Apex is not installed. Falling back to Torch Norm.')
LNImpl = WrappedTorchNorm
class InternViTRMSNorm(MegatronModule):
def __init__(
self,
config,
hidden_size: int,
eps: float = 1e-6,
sequence_parallel: bool = False,
compute_var: bool = False,
):
"""Custom RMSNorm for InternViT.
Args:
config (TransformerConfig): Config.
hidden_size (int): Input hidden size.
eps (float): epsilon to use for the norm, default to 1e-6
sequence_parallel (bool): Set to true if sequence parallelism is being used,
this marks the weights as needing to be allreduced.
compute_var (bool): Indicator to compute statistic manually.
"""
super().__init__(config=config)
self.config = config
self.eps = eps
self.weight = torch.nn.Parameter(torch.ones(hidden_size))
self._compute_var = compute_var
assert not sequence_parallel, "Sequence parallelism is not supported with InternViT."
setattr(self.weight, 'sequence_parallel', sequence_parallel)
def _norm(self, x, var):
if var is None:
var = x.pow(2).mean(-1, keepdim=True)
return x * torch.rsqrt(var + self.eps)
def forward(self, x):
"""Run RMSNorm with an option to compute custom statistic."""
var = None
if self._compute_var:
unpadded_hidden_size = self.config.hidden_size # 3200
max_dim = x.shape[-1] # 128
x = x.reshape(x.size(0), x.size(1), -1)
var = self._gather_var(x.float().pow(2), max_dim) / unpadded_hidden_size
output = self._norm(x.float(), var).type_as(x)
output = output * self.weight
if self._compute_var:
output = output.reshape(output.size(0), output.size(1), -1, max_dim)
return output
def _gather_var(self, input_, max_dim):
"""Compute statistic across the non-dummy heads."""
world_size = get_tensor_model_parallel_world_size()
# Size and dimension.
last_dim = input_.dim() - 1
rank = get_tensor_model_parallel_rank()
num_attention_heads_per_partition = divide(self.config.num_attention_heads, world_size)
valid_ranks = 24 // num_attention_heads_per_partition
residual_heads = 25 % num_attention_heads_per_partition
if residual_heads == 0:
residual_heads = num_attention_heads_per_partition
max_dim = max_dim * residual_heads
if rank < valid_ranks: # Ranks without any dummy attention heads.
var = input_.sum(-1, keepdim=True)
elif rank == valid_ranks: # The only rank which may contain 'residual_heads' dummy attention heads.
var = input_[..., :max_dim].sum(-1, keepdim=True)
else:
var = input_.sum(-1, keepdim=True) * 0.0 # All heads in these ranks are dummy heads: Zero-out.
tensor_list = [torch.empty_like(var) for _ in range(world_size)]
tensor_list[rank] = var
torch.distributed.all_gather(tensor_list, var, group=get_tensor_model_parallel_group())
output = torch.cat(tensor_list, dim=last_dim).contiguous()
return output.sum(-1, keepdim=True)
def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata={}):
# in InternVitSelfAttention the q_layernorm and k_layernorm weights
# are tensor-parallel so must be converted to sharded tensors
if 'q_layernorm' in prefix or 'k_layernorm' in prefix:
state_dict = self.state_dict(prefix='', keep_vars=True)
return make_sharded_tensors_for_checkpoint(
state_dict, prefix, {'weight': 0}, sharded_offsets
)
else:
return super().sharded_state_dict(prefix, sharded_offsets, metadata)
def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec:
# Dense MLP w/ or w/o TE modules.
return ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear,
linear_fc2=TERowParallelLinear if use_te else RowParallelLinear,
),
)
# Override a few things that are special in InternViT and not supported by the SelfAttention class.
class InternViTSelfAttention(SelfAttention):
def __init__(
self, config: TransformerConfig, submodules: SelfAttentionSubmodules, *args, **kwargs
):
super().__init__(config=config, submodules=submodules, *args, **kwargs)
# Need to override linear_qkv, q_layernorm and k_layernorm.
qkv_bias = False
self.linear_qkv = build_module(
submodules.linear_qkv,
self.config.hidden_size,
self.query_projection_size + 2 * self.kv_projection_size,
config=self.config,
init_method=self.config.init_method,
gather_output=False,
bias=qkv_bias,
skip_bias_add=False,
is_expert=False,
tp_comm_buffer_name='qkv',
)
qk_layernorm_hidden_size = (
self.hidden_size_per_attention_head * self.num_attention_heads_per_partition
) # 512 for internvit
self.q_layernorm = build_module(
submodules.q_layernorm,
hidden_size=qk_layernorm_hidden_size,
config=self.config,
eps=self.config.layernorm_epsilon,
compute_var=True,
)
self.k_layernorm = build_module(
submodules.k_layernorm,
hidden_size=qk_layernorm_hidden_size,
config=self.config,
eps=self.config.layernorm_epsilon,
compute_var=True,
)
class InternViTTEDotProductAttention(TEDotProductAttention):
"""Adjusted Attention for InternViT"""
def forward(self, *args, **kwargs):
"""Regular TEDotProductAttention + zero-out dummy attention heads."""
out = super().forward(*args, **kwargs)
# This makes sure the dummy attention heads are zeroed out.
mask = torch.ones_like(out, dtype=out.dtype, device=out.device)
rank = get_tensor_model_parallel_rank()
max_dim = out.shape[-1] # 128
valid_ranks = 6
if rank == valid_ranks:
mask[..., max_dim:] *= 0.0
elif rank > valid_ranks:
mask *= 0.0
out *= mask
return out
def get_internvit_layer_spec(use_te) -> ModuleSpec:
mlp = get_mlp_module_spec(use_te) # no norm
return ModuleSpec(
module=LayerScalingTransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=InternViTRMSNorm,
self_attention=ModuleSpec(
module=InternViTSelfAttention,
params={"attn_mask_type": AttnMaskType.no_mask},
submodules=SelfAttentionSubmodules(
linear_qkv=TEColumnParallelLinear if use_te else ColumnParallelLinear,
core_attention=TEDotProductAttention if use_te else DotProductAttention,
linear_proj=TERowParallelLinear if use_te else RowParallelLinear,
q_layernorm=InternViTRMSNorm,
k_layernorm=InternViTRMSNorm,
),
),
self_attn_bda=get_bias_dropout_add_layer_scaling,
pre_mlp_layernorm=InternViTRMSNorm,
mlp=mlp,
mlp_bda=get_bias_dropout_add_layer_scaling,
),
)
def get_internvit300M_layer_spec(use_te) -> ModuleSpec:
mlp = get_mlp_module_spec(use_te) # no norm
return ModuleSpec(
module=LayerScalingTransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=LNImpl,
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": AttnMaskType.no_mask},
submodules=SelfAttentionSubmodules(
linear_qkv=TEColumnParallelLinear if use_te else ColumnParallelLinear,
core_attention=TEDotProductAttention if use_te else DotProductAttention,
linear_proj=TERowParallelLinear if use_te else RowParallelLinear,
q_layernorm=None,
k_layernorm=None,
),
),
self_attn_bda=get_bias_dropout_add_layer_scaling,
pre_mlp_layernorm=LNImpl,
mlp=mlp,
mlp_bda=get_bias_dropout_add_layer_scaling,
),
)
{
"COMMENT": "Mixture of our own custom prompts and some prompts from https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain/viewer and https://huggingface.co/datasets/HuggingFaceM4/M3IT",
"Captioning": {
"raw": [
"Can you briefly explain what you see in the image?",
"Describe what's happening in this image in one short sentence.",
"Write a short caption that accurately represents the content of this image.",
"Please generate a descriptive caption for the image provided.",
"How would you summarize the scene depicted in the picture in short?",
"Describe the image briefly.",
"Write a succinct description of the image, capturing its main components, the relationships between them, and any notable details.",
"Create a concise caption that accurately describes the main elements in the image provided.",
"Write a brief, yet comprehensive, description of the image.",
"Describe the image in a clear and concise manner.",
"For the given image, provide a one-sentence summary that captures the most important details.",
"Generate a short caption for the picture.",
"Write a short and informative description that highlights the primary subjects and actions occurring in the given image.",
"Provide a concise and informative caption for the image, focusing on the primary subjects.",
"Write a clear description of the image, make sure the key features are well covered.",
"Offer a succinct explanation of the picture presented."
]
},
"CaptioningPretraining": {
"raw": [
"Give a brief description of image.",
"Give a brief description of the image.",
"Provide a brief description of the given image.",
"Provide a one-sentence caption for the provided image.",
"Write a terse but informative summary of the picture.",
"Describe the image concisely.",
"Generate a clear and concise summary of the photo."
]
},
"CaptioningSFT": {
"raw": [
"Give a brief description of the image.",
"Give a short and clear explanation of the subsequent image.",
"Present a compact description of the photo's key features.",
"Provide a brief description of the given image.",
"Provide a one-sentence caption for the provided image.",
"Render a clear and concise summary of the photo.",
"Share a concise interpretation of the image provided.",
"Summarize the visual content of the image.",
"Write a terse but informative summary of the picture.",
"Describe the image concisely."
]
},
"VQAPretraining": {
"raw": [
"Question: {} Short answer:",
"Question: {} Answer:"
]
},
"VQASFT": {
"raw": [
"{}",
"{}\nAnswer the question using a single word or phrase."
],
"docvqa": [
"{}",
"{}\nAnswer this question using the text in the image directly."
]
},
"DocPretraining": {
"raw": [
"Retrieve the text from the given pdf image.",
"Extract the text from the provided document.",
"Transcribe the text displayed in the image."
],
"ocr_multi": [
"Apply grounded Optical Character Recognition (OCR) to the provided image.",
"Extract all texts and their bounding boxes from the given image using grounded OCR.",
"Extract and transcribe all visible text from the provided image, ensuring accurate spatial recognition.",
"Conduct a detailed optical character recognition analysis on this image, maintaining the text's original layout and positioning.",
"Execute a thorough text recognition procedure on this visual input, ensuring that the spatial arrangement of the text is accurately represented.",
"Perform an in-depth OCR scan of the image, capturing both the content and contextual positioning of all textual information.",
"OCR with grounding:"
],
"md": [
"Extract the text from the given image and format it in Markdown.",
"Convert the text from the provided image into Markdown format.",
"Transform the text from the given image into Markdown syntax.",
"Extract and convert the text from the image to Markdown.",
"Retrieve the text from the image and present it in Markdown format."
],
"grounded_ocr": [
"{}. Text:",
"Recognize the text in this region: {}.",
"Identify the text in this area: {}.",
"Detect the text within this section: {}."
],
"referring_grounding": [
"Region of \"{}\" is:",
"Locate the text \"{}\" in the image.",
"Identify the text \"{}\" in the image and provide the coordinates."
]
},
"CaptioningDetailed": {
"raw": [
"Create a comprehensive paragraph that captures the essence of the image while weaving a cohesive narrative around its elements.",
"Compose a paragraph that thoroughly describes the image's content, providing context and connections between different aspects of the scene.",
"Provide a detailed, paragraph-length description of the image that paints a vivid picture and tells a coherent story.",
"Write a rich and engaging paragraph that delves into the image's components, describing not only what is seen but also how the elements relate to one another.",
"Give a well-rounded, paragraph-length explanation of the image, describing the scene and its components while forming a complete and engaging narrative.",
"Produce a paragraph that not only describes the individual elements in the image but also weaves them together to form a cohesive, connected account.",
"Construct a paragraph that captures the image's details and context, offering a more in-depth and engaging story than a simple caption.",
"Compose a descriptive paragraph that brings the image to life through detailed storytelling, connecting the various visual elements into a unified narrative.",
"Create a paragraph that provides an extensive and interconnected description of the image, ensuring that the narrative is both detailed and cohesive.",
"Write a compelling and detailed paragraph that delves into the image's components, linking them together to create a unified and engaging story."
]
},
"OCR": {
"raw": [
"Can you read the text from image and output here?",
"Extract and document the text from the provided image.",
"Converting the text embedded in this image into a readable document.",
"Transcribe all the text you find.",
"Can you extract all visible text from the image here?"
],
"markdown": [
"Can you extract all visible text from the provided image?",
"Converting the text embedded in this image into a readable markdown document.",
"Can you read the text in the document as markdown?",
"Transcribe the document as markdown.",
"Extract and document the text from the provided image."
],
"table_markdown": [
"Can you extract all visible text from the provided table?",
"Can you read the text in the provided table as markdown?",
"Transcribe the table as markdown.",
"Extract and document the text from the provided table image."
],
"plain": [
"Transcribe the document as plain text.",
"Extract and document the text from the provided image.",
"Converting the text embedded in this image into a readable document.",
"Transcribe all the text you find.",
"Can you extract all visible text from the image here?"
],
"bbox_plain": [
"Transcribe the document as plain text along with bounding boxes.",
"Extract and document the text from the provided image along with bounding boxes.",
"Converting the text embedded in this image into a readable documen along with bounding boxes.",
"Can you extract all visible text with bounding boxes from the image here?"
]
},
"VQA": {
"raw": [
"Given the image, answer the following question with few words.",
"Answer the following question: ",
"What is the answer to this question?",
"Write the answer: ",
"Please answer this question: "
]
},
"Embedded": {
"raw": [
"Given the image, answer the following question with few words.",
"Answer the following question: ",
"What is the answer to this question?",
"Write the answer: ",
"Please answer this question: "
]
}
}
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import argparse
import os
import sys
import torch
# Add megatron to the path.
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir, os.path.pardir))
)
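# Example usage (pipeline parallel 1 -> 4, as in the NVLM README):
#   python examples/multimodal/nvlm/pp_checkpoint_converter.py --input <pretrained checkpoint directory> \
#       --input-pipeline-parallel 1 --output <some output dir> --output-pipeline-parallel 4 \
#       --tensor-parallel 8
# With 80 decoder layers and an even split, output pp rank 1 holds original layers 20-39,
# renumbered locally to 0-19 (each pp rank restarts layer numbering from 0).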
def split(input_dir, base_output_dir, input_pp, output_pp, num_tp, num_layers_per_pp_rank):
"""Split pipeline parallel size = 1 checkpoint to pipeline parallel size N."""
iter = args.iteration if args.iteration else 1
for tp in range(num_tp):
path = os.path.join(input_dir, f"mp_rank_0{tp}", "model_optim_rng.pt")
sd = torch.load(path)
if num_layers_per_pp_rank is None:
num_layers = sd["args"].num_layers
assert num_layers % output_pp == 0, "specify --num-layers-per-pp-rank for an uneven split"
num_layers_per_pp_rank = [num_layers // output_pp] * output_pp
layer_lb = 0
for pp in range(output_pp):
assert num_layers_per_pp_rank[pp] > 0, "each pp rank must have at least 1 layer"
layer_ub = layer_lb + num_layers_per_pp_rank[pp]
new_sd = sd.copy()
new_sd["model"] = dict()
for k, v in sd["model"].items():
# First pp rank has vision model.
if pp == 0 and ("vision_model" in k or "vision_projection" in k):
new_sd["model"][k] = v
continue
# Only the first pp rank has the word embeddings.
if "language_model.embedding.word_embeddings" in k and pp == 0:
new_sd["model"][k] = v
# Only the last pp rank has the output layer.
if "language_model.output_layer" in k and pp == output_pp - 1:
new_sd["model"][k] = v
# Only the last pp rank has final layer norm.
if pp == output_pp - 1 and (
"language_model.decoder.final_norm" in k # Mamba model
or "language_model.decoder.final_layernorm" in k # GPT model
):
new_sd["model"][k] = v
if "language_model.decoder.layers" in k:
layer_num = int(k.split(".")[3])
if layer_lb <= layer_num and layer_num < layer_ub:
# On all pp ranks, megatron starts layer nums from 0!
new_layer_num = int(layer_num - layer_lb)
k_splitted = k.split(".")
k_splitted[3] = str(new_layer_num)
new_k = ".".join(k_splitted)
new_sd["model"][new_k] = v
output_dir = os.path.join(base_output_dir, f"iter_{iter:0>7}/mp_rank_0{tp}_00{pp}")
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "model_optim_rng.pt")
torch.save(new_sd, output_path)
print(f"processed tp rank: {tp}/{num_tp - 1} and pp rank: {pp}/{output_pp - 1}")
layer_lb = layer_ub
# This is needed for megatron checkpoint loading.
with open(os.path.join(base_output_dir, "latest_checkpointed_iteration.txt"), "w") as f:
f.write(f"{iter}")
def combine(input_dir, base_output_dir, input_pp, output_pp, num_tp, num_layers_per_pp_rank):
"""Combine pipeline parallel size = N checkpoint to pipeline parallel size 1."""
iter = args.iteration if args.iteration else 1
for tp in range(num_tp):
new_sd = None
layer_num_offset = 0
max_layer_num = 0
for pp in range(input_pp):
path = os.path.join(input_dir, f"mp_rank_0{tp}_00{pp}", "model_optim_rng.pt")
sd = torch.load(path)
if pp == 0:
new_sd = sd.copy()
new_sd["model"] = dict()
new_sd["args"].pipeline_model_parallel_size = 1
assert new_sd is not None
for k, v in sd["model"].items():
# First pp rank has vision model.
if pp == 0 and ("vision_model" in k or "vision_projection" in k):
new_sd["model"][k] = v
continue
# Only the first pp rank has the word embeddings.
if "language_model.embedding.word_embeddings" in k and pp == 0:
new_sd["model"][k] = v
# Only the last pp rank has the output layer.
if "language_model.output_layer" in k and pp == input_pp - 1:
new_sd["model"][k] = v
# Only the last pp rank has final layer norm.
if pp == input_pp - 1 and (
"language_model.decoder.final_norm" in k # Mamba model
or "language_model.decoder.final_layernorm" in k # GPT model
):
new_sd["model"][k] = v
if "language_model.decoder.layers" in k:
layer_num = int(k.split(".")[3])
# On all pp ranks, megatron starts layer nums from 0!
new_layer_num = layer_num_offset + layer_num
if new_layer_num > max_layer_num:
max_layer_num = new_layer_num
k_splitted = k.split(".")
k_splitted[3] = str(new_layer_num)
new_k = ".".join(k_splitted)
new_sd["model"][new_k] = v
print(f"processed tp rank: {tp}/{num_tp - 1} and pp rank: {pp}/{input_pp - 1}")
layer_num_offset = max_layer_num + 1
output_dir = os.path.join(base_output_dir, f"iter_{iter:0>7}/mp_rank_0{tp}")
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "model_optim_rng.pt")
torch.save(new_sd, output_path)
# This is needed for megatron checkpoint loading.
with open(os.path.join(base_output_dir, "latest_checkpointed_iteration.txt"), "w") as f:
f.write(f"{iter}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Change pipeline parallelism for a model",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"--input", type=str, required=True, help="Input model directory"
)
parser.add_argument(
"--input-pipeline-parallel", type=int, required=True, help="Input model pipeline parallelism"
)
parser.add_argument(
"--output", type=str, required=True, help="Output model directory"
)
parser.add_argument(
"--output-pipeline-parallel", type=int, required=True, help="Output model pipeline parallelism"
)
parser.add_argument(
"--tensor-parallel", type=int, required=True, help="Model tensor parallel size",
)
parser.add_argument(
"--num-layers-per-pp-rank", type=int, default=None, nargs="*", help="Specify this for uneven pipeline parallel split",
)
parser.add_argument(
"--iteration", type=int, default=None, help="Specify checkpoint iteration",
)
args = parser.parse_args()
f = None
if args.input_pipeline_parallel == 1 and args.output_pipeline_parallel > 1:
f = split
elif args.input_pipeline_parallel > 1 and args.output_pipeline_parallel == 1:
f = combine
else:
raise NotImplementedError("Only pipeline parallel 1 to N and N to 1 are supported")
f(args.input, args.output, args.input_pipeline_parallel, args.output_pipeline_parallel, args.tensor_parallel, args.num_layers_per_pp_rank)
print("done.")
__module__: megatron.energon
__class__: Metadataset
splits:
train:
datasets:
- weight: 0.579 # Datasets are weighted according to their size. Weights sum up to 1.
path: <path to laion dataset>
subflavors:
augmentation: False
- weight: 0.02
path: <path to coco>
subflavors:
augmentation: False
- weight: 0.01
path: <path to vqav2 dataset>
subflavors:
augmentation: False
# Please refer to Table 4 in https://arxiv.org/pdf/2409.11402 for full list of pretrain datasets.
# Please refer to https://nvidia.github.io/Megatron-Energon/data_prep.html on preparing datasets in the Megatron Energon format.
val:
datasets:
- weight: 1.
path: <path to validation dataset>
subflavors:
augmentation: False
#!/bin/bash
# Your SBATCH commands here if using SLURM.
# Please launch this script from megatron-lm root.
# Train a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export TOKENIZERS_PARALLELISM="false"
DEBUG=0
if [[ $BATCH -eq 0 ]]; then
DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
MODEL_NAME="mcore-qwen20-72b-internvit-${DATETIME}"
else
MODEL_NAME="mcore-qwen20-72b-internvit"
fi
WORKSPACE="<some dir>"
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
CHECKPOINT_DIR="${WORKSPACE}/combined-qwen2.0-72b-instruct-internvit-6b-448px-1.5-tp8-te"
DATA_TRAIN="${SOURCE}/examples/multimodal/nvlm/pretrain_blend.yaml"
if [[ $DEBUG -eq 1 ]]; then
MBZ=1
BZ=1
NW=0
AD=0.0
HD=0.0
LI=1
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
else
MBZ=1
BZ=2048
NW=8
AD=0.1
HD=0.1
LI=5
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
fi
SEQ_LEN=256 # Image embeddings sequence length.
DECODER_SEQ_LEN=512 # Language model sequence length.
MAX_POS_EMBED=512
OPTIONS=" \
--use-checkpoint-args \
--exit-duration-in-mins 230 \
--disable-bias-linear \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2-72B-Instruct \
--tokenizer-prompt-format qwen2p0 \
--transformer-impl transformer_engine \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--untie-embeddings-and-output-weights \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--num-layers 80 \
--hidden-size 8192 \
--ffn-hidden-size 29568 \
--add-qkv-bias \
--num-attention-heads 64 \
--use-distributed-optimizer \
--use-te \
--num-workers ${NW} \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings 32768 \
--train-samples 122880000 \
--lr-decay-samples 25600000 \
--lr-warmup-samples 83200 \
--micro-batch-size ${MBZ} \
--global-batch-size ${BZ} \
--lr 1e-4 \
--min-lr 2.5e-5 \
--lr-decay-style cosine \
--log-interval ${LI} \
--eval-iters 10 \
--eval-interval 500 \
--data-path ${DATA_TRAIN} \
--prompt-path ${SOURCE}/examples/multimodal/nvlm/nvlm_prompts.json \
--save-interval 5000 \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--split 100,0,0 \
--clip-grad 10.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--bf16 \
--eod-mask-loss \
--freeze-ViT \
--freeze-LM \
--patch-dim 14 \
--img-h 448 \
--img-w 448 \
--dataloader-type external \
--tensorboard-dir ${TENSORBOARD_DIR} \
--language-model-type qwen2.0_72B \
${EXTRA_ARGS} \
--allow-missing-vision-projection-checkpoint \
--vision-model-type internvit \
--disable-vision-class-token \
--log-params-norm \
--log-num-zeros-in-grad \
--ckpt-format torch \
--pixel-shuffle \
--image-tag-type nvlm
"
export NVTE_APPLY_QK_LAYER_SCALING=0
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${ALLOW_NONDETERMINISTIC}
# Interactive or batch mode
if [[ $BATCH -eq 0 ]]; then
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
else
run_cmd="python -u ${SOURCE}/examples/multimodal/train.py ${OPTIONS}"
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
srun -l --verbose \
--container-image <path to docker image> \
--container-mounts "<some mount>" \
--output=${LOGS_DIR}/%x_%j_$DATETIME.log \
sh -c "${run_cmd}"
set +x
fi
#!/bin/bash
# Your SBATCH commands here if using SLURM.
# Please launch this script from megatron-lm root.
# Train a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export TOKENIZERS_PARALLELISM="false"
DEBUG=0
if [[ $BATCH -eq 0 ]]; then
DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
MODEL_NAME="mcore-nous-yi34b-internvit-mlp-${DATETIME}"
else
MODEL_NAME="mcore-nous-yi34b-internvit-mlp"
fi
WORKSPACE="<some dir>"
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
LOAD_NAME="combined-yi-34b-internvit-tp8-mcore"
CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}"
DATA_TRAIN="${SOURCE}/examples/multimodal/nvlm/pretrain_blend.yaml"
if [[ $DEBUG -eq 1 ]]; then
MBZ=1
BZ=1
NW=0
LI=1
AD=0.0
HD=0.0
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
else
MBZ=1
BZ=2048
NW=8
LI=5
AD=0.1
HD=0.1
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
fi
SEQ_LEN=256 # Image embeddings sequence length.
DECODER_SEQ_LEN=512 # Language model sequence length.
MAX_POS_EMBED=512
OPTIONS=" \
--swiglu \
--use-distributed-optimizer \
--num-workers ${NW} \
--num-layers 60 \
--hidden-size 7168 \
--normalization RMSNorm \
--num-attention-heads 56 \
--exit-duration-in-mins 230 \
--group-query-attention \
--num-query-groups 8 \
--ffn-hidden-size 20480 \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \
--tokenizer-prompt-format nvlm-yi-34b \
--vocab-size 64000 \
--make-vocab-size-divisible-by 1 \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 5000000 \
--disable-bias-linear \
--tensor-model-parallel-size 8 \
--language-model-type yi-34b \
--vision-model-type internvit \
--micro-batch-size ${MBZ} \
--global-batch-size ${BZ} \
--train-samples 122880000 \
--lr-decay-samples 25600000 \
--lr-warmup-samples 83200 \
--lr 1e-4 \
--min-lr 2.5e-5 \
--lr-decay-style cosine \
--clip-grad 10.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--untie-embeddings-and-output-weights \
--eod-mask-loss \
--bf16 \
--tensorboard-dir=${TENSORBOARD_DIR} \
--freeze-LM \
--freeze-ViT \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--data-path ${DATA_TRAIN} \
--dataloader-type external \
--split 100,0,0 \
--prompt-path ${SOURCE}/examples/multimodal/nvlm/nvlm_prompts.json \
--log-interval ${LI} \
--save-interval 2000 \
--eval-interval 500 \
--eval-iters 10 \
--log-params-norm \
--log-num-zeros-in-grad \
${EXTRA_ARGS} \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--allow-missing-vision-projection-checkpoint \
--disable-vision-class-token \
--use-te \
--use-checkpoint-args \
--ckpt-format torch \
--pixel-shuffle \
--image-tag-type nvlm
"
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${ALLOW_NONDETERMINISTIC}
export NVTE_APPLY_QK_LAYER_SCALING=0
# Interactive or batch mode
if [[ $BATCH -eq 0 ]]; then
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
else
run_cmd="python -u ${SOURCE}/examples/multimodal/train.py ${OPTIONS}"
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
srun -l --verbose \
--container-image <path to docker image> \
--container-mounts "<some mount>" \
--output=${LOGS_DIR}/%x_%j_$DATETIME.log \
sh -c "${run_cmd}"
set +x
fi
#!/bin/bash
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
export TOKENIZERS_PARALLELISM="false"
INPUT_IMAGE_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"
USE_TILING=0
USE_PIXEL_SHUFFLE_ONLY=0
while [[ $# -gt 0 ]]; do
case $1 in
--input-image-path)
INPUT_IMAGE_PATH="$2"
shift
shift
;;
-o|--output-path)
OUTPUT_PATH="$2"
shift
shift
;;
-m|--model-path)
MODEL_PATH="$2"
shift
shift
;;
--task)
TASK="$2"
shift
shift
;;
-g|--gt-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
--use-tiling)
USE_TILING=1
shift
;;
--use-pixel-shuffle-only)
USE_PIXEL_SHUFFLE_ONLY=1
shift
;;
-*|--*)
echo "Invalid option $1"
exit 1
;;
esac
done
# Please modify these as needed.
NUM_PARTITIONS=0
START=0
END=0
SEQ_LEN=1024 # Image embeddings sequence length.
DECODER_SEQ_LEN=8192 # Language model sequence length.
MAX_POS_EMBED=8192
# Additional arguments.
EXTRA_ARGS=""
if [[ $USE_TILING -eq 1 ]]; then
EXTRA_ARGS+=" --pixel-shuffle --use-tiling --max-num-tiles 6 --use-thumbnail --use-tile-tags"
SEQ_LEN=261 # Image embeddings sequence length (256 image embeddings + 5 tile tag embeddings).
fi
if [[ $USE_PIXEL_SHUFFLE_ONLY -eq 1 ]]; then
EXTRA_ARGS+=" --pixel-shuffle"
SEQ_LEN=256
fi
for PARTITION_ID in $( eval echo {$START..$END} )
do
torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \
--attention-softmax-in-fp32 \
--no-masked-softmax-fusion \
--swiglu \
--num-layers 80 \
--hidden-size 8192 \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--num-attention-heads 64 \
--exit-on-missing-checkpoint \
--group-query-attention \
--num-query-groups 8 \
--ffn-hidden-size 29568 \
--load ${MODEL_PATH} \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2-72B-Instruct \
--tokenizer-prompt-format qwen2p0 \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--disable-bias-linear \
--add-qkv-bias \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--language-model-type qwen2.0_72B \
--vision-model-type internvit \
--micro-batch-size 1 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--bf16 \
--freeze-LM \
--freeze-ViT \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--use-te \
--transformer-impl transformer_engine \
--use-checkpoint-args \
--out-seq-length 16 \
--temperature 1.0 \
--seed 1234 \
--top_k 1 \
--no-load-rng \
--no-load-optim \
--num-partitions ${NUM_PARTITIONS} \
--partition-id ${PARTITION_ID} \
--output-path ${OUTPUT_PATH} \
--gt-path ${GROUNDTRUTH_PATH} \
--disable-vision-class-token \
--input-image-path ${INPUT_IMAGE_PATH} \
${EXTRA_ARGS} \
--task ${TASK} \
--image-tag-type nvlm \
--ckpt-format torch
done
#!/bin/bash
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
export TOKENIZERS_PARALLELISM="false"
INPUT_IMAGE_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"
while [[ $# -gt 0 ]]; do
case $1 in
--input-image-path)
INPUT_IMAGE_PATH="$2"
shift
shift
;;
--input-metadata-path)
INPUT_METADATA_PATH="$2"
shift
shift
;;
--num-frames)
NUM_FRAMES="$2"
shift
shift
;;
-g|--groundtruth-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
-o|--output-path)
OUTPUT_PATH="$2"
shift
shift
;;
-m|--model-path)
MODEL_PATH="$2"
shift
shift
;;
--task)
TASK="$2"
shift
shift
;;
-g|--gt-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
-*|--*)
echo "Invalid option $1"
exit 1
;;
esac
done
# Please modify these as needed.
NUM_PARTITIONS=0
START=0
END=0
SEQ_LEN=256
DECODER_SEQ_LEN=16384
EXTRA_ARGS=" --pixel-shuffle"
for PARTITION_ID in $( eval echo {$START..$END} )
do
torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \
--attention-softmax-in-fp32 \
--transformer-impl transformer_engine \
--use-te \
--use-checkpoint-args \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--language-model-type=qwen2.5_7B \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--group-query-attention \
--num-query-groups 4 \
--num-layers 28 \
--hidden-size 3584 \
--ffn-hidden-size 18944 \
--add-qkv-bias \
--num-attention-heads 28 \
--max-position-embeddings 32768 \
--no-masked-softmax-fusion \
--load ${MODEL_PATH} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2.5-7B-Instruct \
--tokenizer-prompt-format qwen2p5 \
--bf16 \
--micro-batch-size 1 \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--out-seq-length 128 \
--temperature 1.0 \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--seed 153 \
--top_k 1 \
--no-load-rng \
--no-load-optim \
--input-image-path ${INPUT_IMAGE_PATH} \
--num-partitions ${NUM_PARTITIONS} \
--partition-id ${PARTITION_ID} \
--output-path ${OUTPUT_PATH} \
--gt-path ${GROUNDTRUTH_PATH} \
--task ${TASK} \
${EXTRA_ARGS} \
--special-tokens "<image>" "<img>" "</img>" \
--vision-model-type internvit \
--num-frames ${NUM_FRAMES} \
--ckpt-format torch
done
#!/bin/bash
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
export TOKENIZERS_PARALLELISM="false"
INPUT_IMAGE_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"
while [[ $# -gt 0 ]]; do
case $1 in
-i|--input-image-path)
INPUT_IMAGE_PATH="$2"
shift
shift
;;
-o|--output-path)
OUTPUT_PATH="$2"
shift
shift
;;
-m|--model-path)
MODEL_PATH="$2"
shift
shift
;;
-t|--task)
TASK="$2"
shift
shift
;;
-g|--gt-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
-*|--*)
echo "Invalid option $1"
exit 1
;;
esac
done
# Please modify these as needed.
NUM_PARTITIONS=0
START=0
END=0
SEQ_LEN=256
DECODER_SEQ_LEN=8192
EXTRA_ARGS=" --pixel-shuffle --use-tiling --max-num-tiles 12 --use-thumbnail"
for PARTITION_ID in $( eval echo {$START..$END} )
do
torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \
--attention-softmax-in-fp32 \
--transformer-impl transformer_engine \
--use-te \
--use-checkpoint-args \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--language-model-type=qwen2.5_7B \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--group-query-attention \
--num-query-groups 4 \
--num-layers 28 \
--hidden-size 3584 \
--ffn-hidden-size 18944 \
--add-qkv-bias \
--num-attention-heads 28 \
--max-position-embeddings 32768 \
--no-masked-softmax-fusion \
--load ${MODEL_PATH} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2.5-7B-Instruct \
--tokenizer-prompt-format qwen2p5 \
--bf16 \
--micro-batch-size 1 \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--out-seq-length 128 \
--temperature 1.0 \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--seed 153 \
--top_k 1 \
--no-load-rng \
--no-load-optim \
--input-image-path ${INPUT_IMAGE_PATH} \
--num-partitions ${NUM_PARTITIONS} \
--partition-id ${PARTITION_ID} \
--output-path ${OUTPUT_PATH} \
--gt-path ${GROUNDTRUTH_PATH} \
--task ${TASK} \
${EXTRA_ARGS} \
--special-tokens "<image>" "<img>" "</img>" \
--vision-model-type siglip \
--ckpt-format torch
done
#!/bin/bash
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
INPUT_IMAGE_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"
USE_TILING=0
USE_PIXEL_SHUFFLE_ONLY=0
while [[ $# -gt 0 ]]; do
case $1 in
--input-image-path)
INPUT_IMAGE_PATH="$2"
shift
shift
;;
-o|--output-path)
OUTPUT_PATH="$2"
shift
shift
;;
-m|--model-path)
MODEL_PATH="$2"
shift
shift
;;
--task)
TASK="$2"
shift
shift
;;
-g|--gt-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
--use-tiling)
USE_TILING=1
shift
;;
--use-pixel-shuffle-only)
USE_PIXEL_SHUFFLE_ONLY=1
shift
;;
-*|--*)
echo "Invalid option $1"
exit 1
;;
esac
done
# Please modify these as needed.
NUM_PARTITIONS=0
START=0
END=0
SEQ_LEN=1024 # Image embeddings sequence length.
DECODER_SEQ_LEN=8192 # Language model sequence length.
MAX_POS_EMBED=8192
# Additional arguments.
EXTRA_ARGS=""
if [[ $USE_TILING -eq 1 ]]; then
EXTRA_ARGS+=" --pixel-shuffle --use-tiling --max-num-tiles 6 --use-thumbnail --use-tile-tags"
SEQ_LEN=261 # Image embeddings sequence length (256 image embeddings + 5 tile tag embeddings).
fi
if [[ $USE_PIXEL_SHUFFLE_ONLY -eq 1 ]]; then
EXTRA_ARGS+=" --pixel-shuffle"
SEQ_LEN=256
fi
for PARTITION_ID in $( eval echo {$START..$END} )
do
torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \
--attention-softmax-in-fp32 \
--no-masked-softmax-fusion \
--swiglu \
--num-layers 60 \
--hidden-size 7168 \
--normalization RMSNorm \
--num-attention-heads 56 \
--exit-on-missing-checkpoint \
--group-query-attention \
--num-query-groups 8 \
--ffn-hidden-size 20480 \
--load ${MODEL_PATH} \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \
--tokenizer-prompt-format nvlm-yi-34b \
--vocab-size 64000 \
--make-vocab-size-divisible-by 1 \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 5000000 \
--disable-bias-linear \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--language-model-type yi-34b \
--vision-model-type internvit \
--micro-batch-size 1 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--bf16 \
--freeze-LM \
--freeze-ViT \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--use-te \
--transformer-impl transformer_engine \
--use-checkpoint-args \
--out-seq-length 16 \
--temperature 1.0 \
--seed 1234 \
--top_k 1 \
--no-load-rng \
--no-load-optim \
--num-partitions ${NUM_PARTITIONS} \
--partition-id ${PARTITION_ID} \
--output-path ${OUTPUT_PATH} \
--gt-path ${GROUNDTRUTH_PATH} \
--disable-vision-class-token \
--input-image-path ${INPUT_IMAGE_PATH} \
${EXTRA_ARGS} \
--task ${TASK} \
--image-tag-type nvlm \
--ckpt-format torch
done
#!/bin/bash
# Your SBATCH commands here if using SLURM.
# Please launch this script from megatron-lm root.
# Train a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_ALGO=^NVLS
export TOKENIZERS_PARALLELISM="false"
DEBUG=0
if [[ $BATCH -eq 0 ]]; then
DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
MODEL_NAME="mcore-nous-yi34b-internvit-mlp-sft-${DATETIME}"
else
MODEL_NAME="mcore-nous-yi34b-internvit-mlp-sft"
fi
WORKSPACE="<some dir>"
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
LOAD_NAME="mcore-nous-yi34b-internvit-mlp" # From pretraining
CHECKPOINT_DIR="${WORKSPACE}/output/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/nvlm/sft_blend.yaml"
if [[ $DEBUG -eq 1 ]]; then
MBZ=1
BZ=1
NW=0
LI=1
AD=0.0
HD=0.0
ALLOW_NONDETERMINISTIC=1
# Can run out of GPU memory in interactive mode without this.
# This is just for interactive testing purposes. Do not use for proper training.
EXTRA_ARGS=" --freeze-LM"
else
MBZ=1
BZ=128
NW=2
LI=5
AD=0.0
HD=0.0
ALLOW_NONDETERMINISTIC=1
EXTRA_ARGS=""
fi
SEQ_LEN=261 # Image embeddings sequence length (256 image embeddings + 5 tile tag embeddings).
DECODER_SEQ_LEN=3200 # Language model sequence length.
MAX_POS_EMBED=3200
OPTIONS=" \
--swiglu \
--use-distributed-optimizer \
--num-workers ${NW} \
--num-layers 60 \
--hidden-size 7168 \
--normalization RMSNorm \
--num-attention-heads 56 \
--exit-duration-in-mins 230 \
--group-query-attention \
--num-query-groups 8 \
--ffn-hidden-size 20480 \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \
--tokenizer-prompt-format nvlm-yi-34b \
--vocab-size 64000 \
--make-vocab-size-divisible-by 1 \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 5000000 \
--disable-bias-linear \
--tensor-model-parallel-size 8 \
--language-model-type yi-34b \
--vision-model-type internvit \
--micro-batch-size ${MBZ} \
--global-batch-size ${BZ} \
--train-samples 30000000 \
--lr-decay-samples 25600000 \
--lr-warmup-samples 83200 \
--lr 2e-6 \
--min-lr 2.5e-7 \
--lr-decay-style cosine \
--split 100,0,0 \
--clip-grad 10 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--untie-embeddings-and-output-weights \
--eod-mask-loss \
--bf16 \
--tensorboard-dir=${TENSORBOARD_DIR} \
--freeze-ViT \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--data-path ${DATA_TRAIN} \
--dataloader-type external \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--prompt-path ${SOURCE}/examples/multimodal/nvlm/nvlm_prompts.json \
--log-interval ${LI} \
--load ${FINETUNE_DIR} \
--save ${FINETUNE_DIR} \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--save-interval 5000 \
--eval-interval 500 \
--eval-iters 10 \
--log-params-norm \
--log-num-zeros-in-grad \
${EXTRA_ARGS} \
--disable-vision-class-token \
--use-te \
--ckpt-format torch \
--pixel-shuffle \
--use-tiling \
--max-num-tiles 6 \
--use-thumbnail \
--use-tile-tags \
--image-tag-type nvlm
"
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${ALLOW_NONDETERMINISTIC}
export NVTE_APPLY_QK_LAYER_SCALING=0
# Interactive or batch mode
if [[ $BATCH -eq 0 ]]; then
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
else
run_cmd="python -u ${SOURCE}/examples/multimodal/train.py ${OPTIONS}"
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
srun -l --verbose \
--container-image <path to docker image> \
--container-mounts "<some mount>" \
--output=${LOGS_DIR}/%x_%j_$DATETIME.log \
sh -c "${run_cmd}"
set +x
fi
__module__: megatron.energon
__class__: Metadataset
splits:
train:
datasets:
- weight: 0.01 # Datasets are weighted according to their size. Weights sum up to 1.
path: <path to coco>
subflavors:
augmentation: False
- weight: 0.02
path: <path to clevr-math dataset>
subflavors:
augmentation: False
# Please refer to Table 6 in https://arxiv.org/pdf/2409.11402 for full list of SFT datasets.
# Please refer to https://nvidia.github.io/Megatron-Energon/data_prep.html on preparing datasets in the Megatron Energon format.
val:
datasets:
- weight: 1.
path: <path to validation dataset>
subflavors:
augmentation: False
#!/bin/bash
# Your SBATCH commands here if using SLURM.
# Please launch this script from megatron-lm root.
# Train a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_ALGO=^NVLS
export TOKENIZERS_PARALLELISM="false"
DEBUG=0
if [[ $BATCH -eq 0 ]]; then
DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
MODEL_NAME="mcore-qwen20-72b-internvit-sft-${DATETIME}"
else
MODEL_NAME="mcore-qwen20-72b-internvit-sft"
fi
WORKSPACE="<some dir>"
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR="${OUTPUT}/checkpoints"
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
# From pretraining. The pretraining checkpoint must be manually split to 4 pipeline parallel stages.
# Please refer to README.md and run examples/multimodal/nvlm/pp_checkpoint_converter.py.
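# Illustrative invocation only; the flag names below are assumptions, check the converter's --help for its actual interface:
#   python examples/multimodal/nvlm/pp_checkpoint_converter.py \
#     --input <pretraining checkpoint dir> --input-pipeline-parallel 1 \
#     --output <output checkpoint dir> --output-pipeline-parallel 4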
LOAD_NAME="mcore-qwen20-72b-internvit-pp4"
CHECKPOINT_DIR="${WORKSPACE}/output/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/nvlm/sft_blend.yaml"
if [[ $DEBUG -eq 1 ]]; then
MBZ=1
BZ=1
NW=0
AD=0.0
HD=0.0
LI=1
# This is just for interactive testing purposes. Do not use for proper training.
EXTRA_ARGS="--freeze-LM"
ALLOW_NONDETERMINISTIC=1
else
MBZ=1
BZ=256
NW=8
AD=0.0
HD=0.0
LI=5
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
fi
SEQ_LEN=261 # Image embeddings sequence length (256 image embeddings + 5 tile tag embeddings).
DECODER_SEQ_LEN=3200 # Language model sequence length.
MAX_POS_EMBED=8192
OPTIONS=" \
--use-checkpoint-args \
--exit-duration-in-mins 230 \
--disable-bias-linear \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2-72B-Instruct \
--tokenizer-prompt-format qwen2p0 \
--transformer-impl transformer_engine \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--untie-embeddings-and-output-weights \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 4 \
--num-layers 80 \
--hidden-size 8192 \
--ffn-hidden-size 29568 \
--add-qkv-bias \
--num-attention-heads 64 \
--use-distributed-optimizer \
--use-te \
--num-workers ${NW} \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings 32768 \
--train-samples 122880000 \
--lr-decay-samples 25600000 \
--lr-warmup-samples 83200 \
--micro-batch-size ${MBZ} \
--global-batch-size ${BZ} \
--lr 2e-6 \
--min-lr 2.5e-7 \
--lr-decay-style cosine \
--log-interval ${LI} \
--eval-iters 10 \
--eval-interval 500 \
--data-path ${DATA_TRAIN} \
--prompt-path ${SOURCE}/examples/multimodal/nvlm/nvlm_prompts.json \
--save-interval 10000 \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--split 100,0,0 \
--clip-grad 10.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--bf16 \
--eod-mask-loss \
--freeze-ViT \
--patch-dim 14 \
--img-h 448 \
--img-w 448 \
--dataloader-type external \
--tensorboard-dir ${TENSORBOARD_DIR} \
--language-model-type qwen2.0_72B \
${EXTRA_ARGS} \
--vision-model-type internvit \
--disable-vision-class-token \
--log-params-norm \
--log-num-zeros-in-grad \
--ckpt-format torch \
--pixel-shuffle \
--use-tiling \
--max-num-tiles 6 \
--use-thumbnail \
--use-tile-tags \
--image-tag-type nvlm
"
export NVTE_APPLY_QK_LAYER_SCALING=0
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${ALLOW_NONDETERMINISTIC}
# Interactive or batch mode
if [[ $BATCH -eq 0 ]]; then
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
else
run_cmd="python -u ${SOURCE}/examples/multimodal/train.py ${OPTIONS}"
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
srun -l --verbose \
--container-image <path to docker image> \
--container-mounts "<some mount>" \
--output=${LOGS_DIR}/%x_%j_$DATETIME.log \
sh -c "${run_cmd}"
set +x
fi
#!/bin/bash
# Your SBATCH commands here if using SLURM.
# Please launch this script from megatron-lm root.
# Train a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_ALGO=^NVLS
export TOKENIZERS_PARALLELISM=false
USER=$SLURM_JOB_USER
# Auto-detect batch or interactive mode.
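# 'which srun' exits with status 0 when srun is available, so BATCH becomes 1 on SLURM clusters and 0 for interactive runs.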
which srun
BATCH=$((1-$?))
DEBUG=0
if [[ $BATCH -eq 0 ]]; then
DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
MODEL_NAME="qwen2.5-7B-internvit-video-sft-nvlm-${DATETIME}"
else
MODEL_NAME="qwen2.5-7B-internvitp-video-sft-nvlm"
DEBUG=0
fi
WORKSPACE="<some dir>"
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR="${OUTPUT}/checkpoints"
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
# From pretraining. The pretraining checkpoint should use tensor parallel size 4.
LOAD_NAME="mcore-qwen2p5-7b-internvit-tp4"
CHECKPOINT_DIR="${WORKSPACE}/output/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/nvlm/sft_blend.yaml"
if [[ $DEBUG -eq 1 ]]; then
MBZ=1
BZ=1
NW=0
AD=0.0
HD=0.0
LI=1
# This is just for interactive testing purposes. Do not use for proper training.
EXTRA_ARGS="--freeze-LM"
ALLOW_NONDETERMINISTIC=1
else
MBZ=1
BZ=256
NW=8
AD=0.0
HD=0.0
LI=5
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
fi
USE_TILING=1
SEQ_LEN=1024
DECODER_SEQ_LEN=16384
MAX_POS_EMBED=32768
TRAIN_SAMPLES=6602173
WARMUP_SAMPLES=198065
if [[ $BATCH -eq 0 ]]; then
# Runs out of GPU memory in interactive mode without this.
EXTRA_ARGS+=" --freeze-LM"
fi
if [[ $USE_TILING -eq 1 ]]; then
EXTRA_ARGS+=" --pixel-shuffle --use-tiling --max-num-tiles 12 --use-thumbnail"
SEQ_LEN=256
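# 256 image embeddings per 448x448 tile: (448/14)^2 = 1024 patches, reduced 4x by pixel shuffle.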
fi
OPTIONS=" \
--swiglu \
--use-distributed-optimizer \
--num-workers ${NW} \
--num-layers 28 \
--hidden-size 3584 \
--norm-epsilon 1e-06 \
--normalization RMSNorm \
--num-attention-heads 28 \
--exit-duration-in-mins 110 \
--group-query-attention \
--num-query-groups 4 \
--ffn-hidden-size 18944 \
--add-qkv-bias \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--dataloader-seq-length ${DECODER_SEQ_LEN} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2.5-7B-Instruct \
--tokenizer-prompt-format qwen2p5 \
--pixel-shuffle \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--disable-bias-linear \
--pipeline-model-parallel-size 1 \
--tensor-model-parallel-size 4 \
--language-model-type qwen2.5_7B \
--vision-model-type internvit \
--micro-batch-size ${MBZ} \
--global-batch-size ${BZ} \
--lr 2e-6 \
--min-lr 2.5e-7 \
--train-samples ${TRAIN_SAMPLES} \
--lr-warmup-samples ${WARMUP_SAMPLES} \
--lr-decay-style cosine \
--clip-grad 10 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--eod-mask-loss \
--bf16 \
--tensorboard-dir ${TENSORBOARD_DIR} \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--data-path ${DATA_TRAIN} \
--dataloader-type external \
--split 100,0,0 \
--prompt-path ${SOURCE}/examples/multimodal/nvlm/nvlm_prompts.json \
--log-interval ${LI} \
--save-interval 500 \
--eval-interval 500 \
--eval-iters 10 \
--log-params-norm \
--log-num-zeros-in-grad \
${EXTRA_ARGS} \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--distributed-timeout-minutes 60 \
--allow-missing-vision-projection-checkpoint \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--disable-vision-class-token \
--use-te \
--ckpt-format torch \
--num-frames 32 \
--use-checkpoint-args \
--image-tag-type internvl \
--recompute-granularity full \
--recompute-method block \
--recompute-num-layers 28 \
--recompute-vision \
"
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${ALLOW_NONDETERMINISTIC}
export NVTE_APPLY_QK_LAYER_SCALING=0
# Interactive or batch mode
if [[ $BATCH -eq 0 ]]; then
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
else
run_cmd="python -u ${SOURCE}/examples/multimodal/train.py ${OPTIONS}"
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
srun -l --verbose \
--container-image <path to docker image> \
--container-mounts "<some mount>" \
--output=${LOGS_DIR}/%x_%j_$DATETIME.log \
sh -c "${run_cmd}"
set +x
fi
__module__: megatron.energon
__class__: Metadataset
splits:
train:
datasets:
- weight: 1.
path: <path_to_pretraining_dataset_in_energon_format>
subflavors:
augmentation: false
val:
datasets:
- weight: 1.
path: <path_to_pretraining_dataset_in_energon_format>
subflavors:
augmentation: false
#!/bin/bash
# Pretrain a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
MODEL_NAME="mcore-llava-mistral-7b-instruct-clip336-pretraining"
# Check that the user has set an output path for model checkpoints.
if [[ -z $WORKSPACE ]]; then
echo "Please set WORKSPACE for storing your model checkpoints."
exit 1
fi
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
if [[ -z $LOAD_NAME ]]; then
echo "Please set LOAD_NAME for input model name."
exit 1
fi
CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/pretrain_dataset.yaml"
DEBUG=0
if [[ $DEBUG -eq 1 ]]; then
BZ=32
NW=2
HD=0.0
LI=1
EXTRA_ARGS=""
NONDETERMINISTIC_ATTN=1
else
BZ=256
NW=2
HD=0.1
LI=10
EXTRA_ARGS=""
NONDETERMINISTIC_ATTN=1
fi
OPTIONS=" \
--apply-layernorm-1p \
--attention-softmax-in-fp32 \
--use-checkpoint-args \
--use-distributed-optimizer \
--transformer-impl transformer_engine \
--use-te \
--normalization RMSNorm \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--num-workers ${NW} \
--exit-duration-in-mins 230 \
--use-flash-attn \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout ${HD} \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--num-attention-heads 32 \
--seq-length 576 \
--decoder-seq-length 1024 \
--max-position-embeddings 4096 \
--ffn-hidden-size 14336 \
--train-iters 20000 \
--micro-batch-size 1 \
--global-batch-size ${BZ} \
--lr-decay-iters 20000 \
--lr-warmup-fraction .01 \
--lr 0.00015 \
--min-lr 1.0e-5 \
--lr-decay-style cosine \
--log-interval ${LI} \
--eval-iters 10 \
--eval-interval 1000 \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model mistralai/Mistral-7B-Instruct-v0.3 \
--tokenizer-prompt-format mistral \
--data-path ${DATA_TRAIN} \
--prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \
--save-interval 1000 \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--split 100,0,0 \
--clip-grad 1.0 \
--weight-decay 1e-2 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--log-params-norm \
--log-num-zeros-in-grad \
--bf16 \
--eod-mask-loss \
--freeze-LM \
--freeze-ViT \
--patch-dim 14 \
--img-h 336 \
--img-w 336 \
--dataloader-type external \
--tensorboard-dir ${TENSORBOARD_DIR} \
--language-model-type=mistral_7b \
--disable-vision-class-token \
${EXTRA_ARGS} \
--distributed-timeout-minutes 60 \
--allow-missing-vision-projection-checkpoint \
--ckpt-format torch
"
export NVTE_APPLY_QK_LAYER_SCALING=0
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${NONDETERMINISTIC_ATTN}
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
import warnings
from functools import partial
import torch
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.utils import is_torch_min_version
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
from examples.multimodal.layer_scaling import LayerScalingTransformerLayer, get_bias_dropout_add_layer_scaling
try:
from megatron.core.extensions.transformer_engine import (
TEColumnParallelLinear,
TEDotProductAttention,
TELayerNormColumnParallelLinear,
TENorm,
TERowParallelLinear,
)
HAVE_TE = True
except ImportError:
HAVE_TE = False
try:
import apex
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
HAVE_APEX = True
LNImpl = FusedLayerNorm
except ImportError:
import warnings
from megatron.core.transformer.torch_norm import WrappedTorchNorm
warnings.warn('Apex is not installed. Falling back to Torch Norm')
LNImpl = WrappedTorchNorm
def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec:
# Dense MLP w/ or w/o TE modules.
return ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear,
linear_fc2=TERowParallelLinear if use_te else RowParallelLinear,
),
)
def get_norm_mlp_module_spec_te() -> ModuleSpec:
return ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear
),
)
def get_radio_g_layer_spec(normalization) -> ModuleSpec:
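"""Layer spec for the RADIO-g vision encoder using local (non-TE) attention and MLP modules wrapped in LayerScalingTransformerLayer."""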
attn_mask_type = AttnMaskType.no_mask
if normalization == "LayerNorm":
norm = LNImpl
elif normalization == "RMSNorm":
if HAVE_TE:
norm = TENorm
else:
assert is_torch_min_version("2.4.0"), "Torch version >= 2.4.0 is required for RMSNorm"
if HAVE_APEX:
warnings.warn('Apex does not support RMSNorm. Falling back to Torch Norm')
from megatron.core.transformer.torch_norm import WrappedTorchNorm
norm = WrappedTorchNorm
else:
raise RuntimeError("unknown normalization", normalization)
mlp = get_mlp_module_spec(use_te=False) # doesn't include norm.
return ModuleSpec(
module=LayerScalingTransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=norm,
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": attn_mask_type},
submodules=SelfAttentionSubmodules(
linear_qkv=ColumnParallelLinear,
core_attention=DotProductAttention,
linear_proj=RowParallelLinear,
q_layernorm=IdentityOp,
k_layernorm=IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add_layer_scaling,
pre_mlp_layernorm=norm,
mlp=mlp,
mlp_bda=get_bias_dropout_add_layer_scaling,
),
)
def get_radio_g_layer_spec_te() -> ModuleSpec:
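"""Layer spec for the RADIO-g vision encoder using Transformer Engine fused attention and linear modules."""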
attn_mask_type = AttnMaskType.no_mask
mlp = get_norm_mlp_module_spec_te()
return ModuleSpec(
module=LayerScalingTransformerLayer,
submodules=TransformerLayerSubmodules(
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": attn_mask_type},
submodules=SelfAttentionSubmodules(
linear_qkv=TELayerNormColumnParallelLinear,
core_attention=TEDotProductAttention,
linear_proj=TERowParallelLinear,
q_layernorm=IdentityOp,
k_layernorm=IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add_layer_scaling,
pre_mlp_layernorm=IdentityOp,
mlp=mlp,
mlp_bda=get_bias_dropout_add_layer_scaling,
),
)
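# Minimal usage sketch (illustrative only, not part of this module): choose the TE spec when
# Transformer Engine is available and fall back to the local-module spec otherwise; the
# resulting ModuleSpec is then passed to the vision transformer builder.
#
#   spec = get_radio_g_layer_spec_te() if HAVE_TE else get_radio_g_layer_spec("LayerNorm")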