Commit 688448db authored by silencealiang

Update code

parent a02a5490
Pipeline #2503 passed with stage
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import argparse
import os

import torch


def convert(output_path, tensor_parallel_size, use_te, version):
    device = "cuda"

    model = torch.hub.load('NVlabs/RADIO', 'radio_model', version=version, progress=True)

    state_dict = model.state_dict()
    new_state_dicts = [{"model": dict()} for _ in range(tensor_parallel_size)]

    # Indices from mapping pytorch multihead attention to megatron.
    kv_channels = 80
    hidden_dim = 1280
    num_heads = 16
    indices = []
    for i in range(num_heads):
        lb = i * kv_channels
        ub = (i + 1) * kv_channels
        indices.append(torch.arange(lb, ub, dtype=torch.int))
        indices.append(torch.arange(hidden_dim + lb, hidden_dim + ub, dtype=torch.int))
        indices.append(torch.arange(2 * hidden_dim + lb, 2 * hidden_dim + ub, dtype=torch.int))

    indices = torch.cat(indices)

    for name, tensor in state_dict.items():
        # Map parameter names to ones used in megatron.
        new_name = ""
        new_tensor = tensor
        if new_tensor.dtype == torch.float16:
            new_tensor = new_tensor.to(torch.float32)

        # This is used for chunking some tensors to target tensor parallel size.
        chunk_dim = None

        if "summary_idxs" in name:
            continue
        elif "patch_generator" in name:
            if "embedder" in name:
                new_name = "embedder.weight"
                chunk_dim = 0
            elif "cls_token" in name:
                new_name = "class_token"
            elif "pos_embed" in name:
                new_name = "position_embeddings"
        elif "input_conditioner" in name:
            continue
        elif "blocks" in name:
            layer_idx = name.split(".")[2]
            base = f"decoder.layers.{layer_idx}"

            if "attn.qkv.weight" in name:
                new_name = f"{base}.self_attention.linear_qkv.weight"
                new_tensor = new_tensor[indices]
                chunk_dim = 0
            elif "attn.qkv.bias" in name:
                new_name = f"{base}.self_attention.linear_qkv.bias"
                new_tensor = new_tensor[indices]
                chunk_dim = 0
            elif "attn.proj.weight" in name:
                new_name = f"{base}.self_attention.linear_proj.weight"
                chunk_dim = 1
            elif "attn.proj.bias" in name:
                new_name = f"{base}.self_attention.linear_proj.bias"
            elif "norm1.weight" in name:
                new_name = f"{base}.input_layernorm.weight"
                if use_te:
                    new_name = f"{base}.self_attention.linear_qkv.layer_norm_weight"
            elif "norm1.bias" in name:
                new_name = f"{base}.input_layernorm.bias"
                if use_te:
                    new_name = f"{base}.self_attention.linear_qkv.layer_norm_bias"
            elif "mlp.fc1.weight" in name:
                new_name = f"{base}.mlp.linear_fc1.weight"
                chunk_dim = 0
            elif "mlp.fc1.bias" in name:
                new_name = f"{base}.mlp.linear_fc1.bias"
                chunk_dim = 0
            elif "mlp.fc2.weight" in name:
                new_name = f"{base}.mlp.linear_fc2.weight"
                chunk_dim = 1
            elif "mlp.fc2.bias" in name:
                new_name = f"{base}.mlp.linear_fc2.bias"
            elif "norm2.weight" in name:
                new_name = f"{base}.pre_mlp_layernorm.weight"
                if use_te:
                    new_name = f"{base}.mlp.linear_fc1.layer_norm_weight"
            elif "norm2.bias" in name:
                new_name = f"{base}.pre_mlp_layernorm.bias"
                if use_te:
                    new_name = f"{base}.mlp.linear_fc1.layer_norm_bias"

        assert new_name != "", f"unexpected layer name {name}"

        if chunk_dim is None:
            new_tensors = [new_tensor for _ in range(tensor_parallel_size)]
        else:
            new_tensors = torch.chunk(new_tensor, tensor_parallel_size, dim=chunk_dim)

        for i in range(tensor_parallel_size):
            # chunk() creates a view of a bigger tensor. clone() is used here to avoid excessive storage.
            new_state_dicts[i]["model"][new_name] = new_tensors[i].clone()

            # TE sets _extra_state (for FP8 purposes), so set an empty one here for compatibility.
            extra_state_layers = ("linear_qkv", "linear_proj", "linear_fc1", "linear_fc2")
            is_extra_state_layer = any([l in new_name for l in extra_state_layers])
            if use_te and is_extra_state_layer:
                layer = new_name.split(".")[-2]
                if layer in extra_state_layers:
                    extra_state_name = (
                        new_name[: new_name.rfind(".") + 1] + "_extra_state"
                    )  # Replace the weight name.
                    new_state_dicts[i]["model"][extra_state_name] = None

    for i in range(tensor_parallel_size):
        output_dir_tp = os.path.join(output_path, "iter_0000001", f"mp_rank_0{i}")
        os.makedirs(output_dir_tp)
        output_path_tp = os.path.join(output_dir_tp, "model_optim_rng.pt")
        torch.save(new_state_dicts[i], output_path_tp)

    with open(os.path.join(output_path, "latest_checkpointed_iteration.txt"), "w") as f:
        f.write("1")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="""
Convert RADIO weights to megatron format.

Example usage:
python radio_converter.py --output /some/output/folder --tensor-parallel-size 4
""",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    parser.add_argument(
        "--output", type=str, required=True, help="output directory for megatron state dict file(s)"
    )
    parser.add_argument(
        "--tensor-parallel-size", type=int, default=1, help="model tensor parallel size"
    )
    parser.add_argument("--use-te", action="store_true", help="Use Transformer Engine")
    parser.add_argument("--version", type=str, default="radio_v2.5-h", help="Version of radio to load for conversion")

    args = parser.parse_args()

    convert(args.output, args.tensor_parallel_size, args.use_te, args.version)

    print("done.")
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN


def add_multimodal_extra_args(parser):
    """Extra arguments."""
    group = parser.add_argument_group(title='multimodal arguments')
    group.add_argument('--dataset-config', type=str, default=None)
    group.add_argument("--prompt-path", type=str, default=None)
    group.add_argument('--freeze-LM', action='store_true', default=False)
    group.add_argument('--freeze-ViT', action='store_true', default=False)
    group.add_argument('--language-model-type', type=str, required=True)
    group.add_argument('--language-huggingface-model-name-or-path', type=str)
    group.add_argument('--vision-model-type', type=str, default="clip")
    group.add_argument('--vision-huggingface-model-name-or-path', type=str)
    group.add_argument("--disable-vision-class-token", action="store_true", default=False)
    group.add_argument(
        "--allow-missing-vision-projection-checkpoint", action="store_true", default=False
    )
    group.add_argument("--use-te", action="store_true", default=False)
    group.add_argument(
        "--dataloader-save", type=str, default=None, help="Energon dataloader state save path"
    )
    group.add_argument(
        "--use-tiling", action="store_true", default=False, help="Use input image tiling"
    )
    group.add_argument("--max-num-tiles", type=int, default=1, help="Maximum number of image tiles")
    group.add_argument(
        "--use-thumbnail", action="store_true", default=False, help="Add image thumbnail as a tile"
    )
    group.add_argument(
        "--dataloader-seq-length",
        type=int,
        help="Make dataloader to produce sequences of specific length.",
    )
    group.add_argument(
        "--num-frames",
        type=int,
        default=1,
        help="Number of frames to regularly sample from the video as input to the model.",
    )
    group.add_argument(
        "--online-evaluation-config", type=str, help="Config file for online evaluation."
    )
    group.add_argument(
        "--special-tokens",
        nargs="*",
        default=[IMAGE_TOKEN],
        help="Special tokens used in the multimodal model",
    )
    group.add_argument(
        "--tokenizer-prompt-format",
        type=str,
        choices=["mistral", "llama3", "llama3p1", "chatml", "nvlm-yi-34b", "qwen2p0", "qwen2p5"],
        required=True,
        help="Prompt format to use with the tokenizer.",
    )
    group.add_argument("--pixel-shuffle", action="store_true", default=False)
    group.add_argument(
        "--image-tag-type",
        type=str,
        choices=["nvlm", "internvl", ""],
        default="",  # Default: Image tag not used.
        help="Surround image tokens with tags.",
    )
    group.add_argument("--use-tile-tags", action="store_true", default=False, help="Use tile tags")
    group.add_argument(
        "--packing-buffer-size",
        type=int,
        default=None,  # Packing is disabled by default.
        help="Enable sample packing by setting the buffer size to > 0",
    )
    group.add_argument(
        "--packing-seq-length", type=int, default=0, help="Packing sequence length. Must be > 0 if using packing."
    )
    group.add_argument(
        "--recompute-vision", action="store_true", default=False, help="Enable activation checkpointing in the vision model"
    )
    group.add_argument(
        "--use-loss-scaling", action="store_true", default=False, help="Scale loss based on conversation turn length (in tokens)."
    )
    group.add_argument(
        "--use-area-weighted-aspect-ratio", action="store_true", default=False,
        help=(
            "When --use-tiling is True, find the aspect ratio to use based on the original "
            "image aspect ratio and the area covered by the tiles."
        )
    )

    return parser
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
""""
NOTE: NVLM uses InternViT with tensor parallel (TP) size = 8.
Since InternViT has 25 attention heads and Megatron currently requires the number of attention heads
to be divisible by the TP size, we add 7 dummy zero attention heads to have 32 attention heads.
This workaround requires some changes to how we compute RMSNorm, Attention etc.
Additionally, InternViT introduces some unique features like Layer Scaling.
Those code changes are gathered here.
"""
from functools import partial

import torch

from megatron.core.utils import divide
from megatron.core.extensions.transformer_engine import (
    TEColumnParallelLinear,
    TEDotProductAttention,
    TERowParallelLinear,
)
from megatron.core.parallel_state import (
    get_tensor_model_parallel_group,
    get_tensor_model_parallel_rank,
    get_tensor_model_parallel_world_size,
)
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.spec_utils import ModuleSpec, build_module
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint


class InternViTRMSNorm(MegatronModule):

    def __init__(
        self,
        config,
        hidden_size: int,
        eps: float = 1e-6,
        sequence_parallel: bool = False,
        compute_var: bool = False,
    ):
        """Custom RMSNorm for InternViT.

        Args:
            config (TransformerConfig): Config.
            hidden_size (int): Input hidden size.
            eps (float): epsilon to use for the norm, default to 1e-6
            sequence_parallel (bool): Set to true if sequence parallelism is being used,
                this marks the weights as needing to be allreduced.
            compute_var (bool): Indicator to compute statistic manually.
        """
        super().__init__(config=config)
        self.config = config
        self.eps = eps
        self.weight = torch.nn.Parameter(torch.ones(hidden_size))
        self._compute_var = compute_var

        assert not sequence_parallel, "Sequence parallelism is not supported with InternViT."

        setattr(self.weight, 'sequence_parallel', sequence_parallel)

    def _norm(self, x, var):
        if var is None:
            var = x.pow(2).mean(-1, keepdim=True)

        return x * torch.rsqrt(var + self.eps)

    def forward(self, x):
        """Run RMSNorm with an option to compute custom statistic."""
        var = None
        if self._compute_var:
            unpadded_hidden_size = self.config.hidden_size  # 3200
            max_dim = x.shape[-1]  # 128

            x = x.reshape(x.size(0), x.size(1), -1)
            var = self._gather_var(x.float().pow(2), max_dim) / unpadded_hidden_size

        output = self._norm(x.float(), var).type_as(x)
        output = output * self.weight

        if self._compute_var:
            output = output.reshape(output.size(0), output.size(1), -1, max_dim)

        return output

    def _gather_var(self, input_, max_dim):
        """Compute statistic across the non-dummy heads."""
        world_size = get_tensor_model_parallel_world_size()

        # Size and dimension.
        last_dim = input_.dim() - 1
        rank = get_tensor_model_parallel_rank()

        num_attention_heads_per_partition = divide(self.config.num_attention_heads, world_size)
        valid_ranks = 24 // num_attention_heads_per_partition

        residual_heads = 25 % num_attention_heads_per_partition
        if residual_heads == 0:
            residual_heads = num_attention_heads_per_partition
        max_dim = max_dim * residual_heads

        if rank < valid_ranks:  # Ranks without any dummy attention heads.
            var = input_.sum(-1, keepdim=True)
        elif rank == valid_ranks:  # The only rank which may contain 'residual_heads' dummy attention heads.
            var = input_[..., :max_dim].sum(-1, keepdim=True)
        else:
            var = input_.sum(-1, keepdim=True) * 0.0  # All heads in these ranks are dummy heads: Zero-out.

        tensor_list = [torch.empty_like(var) for _ in range(world_size)]
        tensor_list[rank] = var
        torch.distributed.all_gather(tensor_list, var, group=get_tensor_model_parallel_group())

        output = torch.cat(tensor_list, dim=last_dim).contiguous()

        return output.sum(-1, keepdim=True)

    def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata={}):
        # in InternVitSelfAttention the q_layernorm and k_layernorm weights
        # are tensor-parallel so must be converted to sharded tensors
        if 'q_layernorm' in prefix or 'k_layernorm' in prefix:
            state_dict = self.state_dict(prefix='', keep_vars=True)
            return make_sharded_tensors_for_checkpoint(
                state_dict, prefix, {'weight': 0}, sharded_offsets
            )
        else:
            return super().sharded_state_dict(prefix, sharded_offsets, metadata)


def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec:
    # Dense MLP w/ or w/o TE modules.
    return ModuleSpec(
        module=MLP,
        submodules=MLPSubmodules(
            linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear,
            linear_fc2=TERowParallelLinear if use_te else RowParallelLinear,
        ),
    )


# Handle InternViT's layer scaling.
def _bias_dropout_add_func_internvit(ls, x_with_bias, residual, prob, training):
    x, bias = x_with_bias  # unpack
    residual = residual if residual.dtype == x.dtype else residual.to(x.dtype)
    if bias is not None:
        x = x + bias
        out = torch.nn.functional.dropout(x, p=prob, training=training)
        out = residual + out * ls
        return out
    else:
        out = torch.nn.functional.dropout(x, p=prob, training=training)
        out = residual + out * ls
        return out


def bias_dropout_add_unfused_internvit(ls, training):
    """Bias-dropout-add as in Megatron but with added LayerScaling handling."""

    def _bias_dropout_add(x_with_bias, residual, prob):
        return _bias_dropout_add_func_internvit(ls, x_with_bias, residual, prob, training)

    return _bias_dropout_add


def get_bias_dropout_add_internvit(ls, training, fused):
    """Bias-dropout-add as in Megatron but with added LayerScaling handling."""
    assert not fused, "Fused bias-dropout-add not implemented for InternViT."
    return bias_dropout_add_unfused_internvit(ls, training)


# Add InternViT specialties to our default TransformerLayer.
class InternViTTransformerLayer(TransformerLayer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.ls1 = torch.nn.Parameter(torch.ones(self.config.hidden_size))
        self.ls2 = torch.nn.Parameter(torch.ones(self.config.hidden_size))

        self.self_attn_bda = partial(self.self_attn_bda, self.ls1)
        self.mlp_bda = partial(self.mlp_bda, self.ls2)


# Override a few things that are special in InternViT and not supported by the SelfAttention class.
class InternViTSelfAttention(SelfAttention):
    def __init__(
        self, config: TransformerConfig, submodules: SelfAttentionSubmodules, *args, **kwargs
    ):
        super().__init__(config=config, submodules=submodules, *args, **kwargs)

        # Need to override linear_qkv, q_layernorm and k_layernorm.
        qkv_bias = False

        self.linear_qkv = build_module(
            submodules.linear_qkv,
            self.config.hidden_size,
            self.query_projection_size + 2 * self.kv_projection_size,
            config=self.config,
            init_method=self.config.init_method,
            gather_output=False,
            bias=qkv_bias,
            skip_bias_add=False,
            is_expert=False,
            tp_comm_buffer_name='qkv',
        )

        qk_layernorm_hidden_size = (
            self.hidden_size_per_attention_head * self.num_attention_heads_per_partition
        )  # 512 for internvit

        self.q_layernorm = build_module(
            submodules.q_layernorm,
            hidden_size=qk_layernorm_hidden_size,
            config=self.config,
            eps=self.config.layernorm_epsilon,
            compute_var=True,
        )

        self.k_layernorm = build_module(
            submodules.k_layernorm,
            hidden_size=qk_layernorm_hidden_size,
            config=self.config,
            eps=self.config.layernorm_epsilon,
            compute_var=True,
        )


class InternViTTEDotProductAttention(TEDotProductAttention):
    """Adjusted Attention for InternViT"""

    def forward(self, *args, **kwargs):
        """Regular TEDotProductAttention + zero-out dummy attention heads."""
        out = super().forward(*args, **kwargs)

        # This makes sure the dummy attention heads are zeroed out.
        mask = torch.ones_like(out, dtype=out.dtype, device=out.device)
        rank = get_tensor_model_parallel_rank()
        max_dim = out.shape[-1]  # 128
        valid_ranks = 6

        if rank == valid_ranks:
            mask[..., max_dim:] *= 0.0
        elif rank > valid_ranks:
            mask *= 0.0

        out *= mask

        return out


def get_internvit_layer_spec(use_te) -> ModuleSpec:
    mlp = get_mlp_module_spec(use_te)  # no norm

    return ModuleSpec(
        module=InternViTTransformerLayer,
        submodules=TransformerLayerSubmodules(
            input_layernorm=InternViTRMSNorm,
            self_attention=ModuleSpec(
                module=InternViTSelfAttention,
                params={"attn_mask_type": AttnMaskType.no_mask},
                submodules=SelfAttentionSubmodules(
                    linear_qkv=TEColumnParallelLinear if use_te else ColumnParallelLinear,
                    core_attention=TEDotProductAttention if use_te else DotProductAttention,
                    linear_proj=TERowParallelLinear if use_te else RowParallelLinear,
                    q_layernorm=InternViTRMSNorm,
                    k_layernorm=InternViTRMSNorm,
                ),
            ),
            self_attn_bda=get_bias_dropout_add_internvit,
            pre_mlp_layernorm=InternViTRMSNorm,
            mlp=mlp,
            mlp_bda=get_bias_dropout_add_internvit,
        ),
    )
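As a plain-Python cross-check (not part of this file) of the bookkeeping done by the updated _gather_var above, here are the numbers for the configuration described in the module docstring, 25 real heads padded to 32 with TP=8; other TP sizes follow the same arithmetic:

# Toy check of the rank/head bookkeeping in InternViTRMSNorm._gather_var (illustration only).
num_attention_heads = 32                                   # padded head count (25 real + 7 dummy)
world_size = 8                                             # tensor parallel size
heads_per_partition = num_attention_heads // world_size    # 4 heads per rank
valid_ranks = 24 // heads_per_partition                    # 6: ranks 0-5 hold only real heads
residual_heads = 25 % heads_per_partition                  # 1 real head left over
if residual_heads == 0:
    residual_heads = heads_per_partition
print(valid_ranks, residual_heads)  # 6 1 -> rank 6 keeps 1 real head, rank 7 holds only dummy heads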
File mode changed from 100644 to 100755
#!/bin/bash
# Your SBATCH commands here if using SLURM.
# Please launch this script from megatron-lm root.
# Train a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export TOKENIZERS_PARALLELISM="false"
DEBUG=0
if [[ $BATCH -eq 0 ]]; then
DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
MODEL_NAME="mcore-nous-yi34b-internvit-mlp-${DATETIME}"
else
MODEL_NAME="mcore-nous-yi34b-internvit-mlp"
fi
WORKSPACE="<some dir>"
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
LOAD_NAME="combined-yi-34b-internvit-tp8-mcore"
CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}"
DATA_TRAIN="${SOURCE}/examples/multimodal/nvlm/pretrain_blend.yaml"
if [[ $DEBUG -eq 1 ]]; then
MBZ=1
BZ=1
NW=0
LI=1
AD=0.0
HD=0.0
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
else
MBZ=1
BZ=2048
NW=8
LI=5
AD=0.1
HD=0.1
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
fi
SEQ_LEN=256 # Image embeddings sequence length.
DECODER_SEQ_LEN=512 # Language model sequence length.
MAX_POS_EMBED=512
OPTIONS=" \
--swiglu \
--use-distributed-optimizer \
--num-workers ${NW} \
--num-layers 60 \
--hidden-size 7168 \
--normalization RMSNorm \
--num-attention-heads 56 \
--exit-duration-in-mins 230 \
--group-query-attention \
--num-query-groups 8 \
--ffn-hidden-size 20480 \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \
--tokenizer-prompt-format nvlm-yi-34b \
--vocab-size 64000 \
--make-vocab-size-divisible-by 1 \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 5000000 \
--disable-bias-linear \
--tensor-model-parallel-size 8 \
--language-model-type yi-34b \
--vision-model-type internvit \
--micro-batch-size ${MBZ} \
--global-batch-size ${BZ} \
--train-samples 122880000 \
--lr-decay-samples 25600000 \
--lr-warmup-samples 83200 \
--lr 1e-4 \
--min-lr 2.5e-5 \
--lr-decay-style cosine \
--clip-grad 10.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--untie-embeddings-and-output-weights \
--eod-mask-loss \
--bf16 \
--tensorboard-dir=${TENSORBOARD_DIR} \
--freeze-LM \
--freeze-ViT \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--data-path ${DATA_TRAIN} \
--dataloader-type external \
--split 100,0,0 \
--prompt-path ${SOURCE}/examples/multimodal/nvlm/nvlm_prompts.json \
--log-interval ${LI} \
--save-interval 2000 \
--eval-interval 500 \
--eval-iters 10 \
--log-params-norm \
--log-num-zeros-in-grad \
${EXTRA_ARGS} \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--allow-missing-vision-projection-checkpoint \
--disable-vision-class-token \
--use-te \
--use-checkpoint-args \
--ckpt-format torch \
--pixel-shuffle \
--image-tag-type nvlm
"
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${ALLOW_NONDETERMINISTIC}
export NVTE_APPLY_QK_LAYER_SCALING=0
# Interactive or batch mode
if [[ $BATCH -eq 0 ]]; then
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
else
run_cmd="python -u ${SOURCE}/examples/multimodal/train.py ${OPTIONS}"
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
srun -l --verbose \
--container-image <path to docker image> \
--container-mounts "<some mount>" \
--output=${LOGS_DIR}/%x_%j_$DATETIME.log \
sh -c "${run_cmd}"
set +x
fi
#!/bin/bash
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
export TOKENIZERS_PARALLELISM="false"
INPUT_IMAGE_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"
USE_TILING=0
USE_PIXEL_SHUFFLE_ONLY=0
while [[ $# -gt 0 ]]; do
case $1 in
--input-image-path)
INPUT_IMAGE_PATH="$2"
shift
shift
;;
-o|--output-path)
OUTPUT_PATH="$2"
shift
shift
;;
-m|--model-path)
MODEL_PATH="$2"
shift
shift
;;
--task)
TASK="$2"
shift
shift
;;
-g|--gt-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
--use-tiling)
USE_TILING=1
shift
shift
;;
--use-pixel-shuffle-only)
USE_PIXEL_SHUFFLE_ONLY=1
shift
shift
;;
-*|--*)
echo "Invalid option $1"
exit 1
;;
esac
done
# Please modify these as needed.
NUM_PARTITIONS=0
START=0
END=0
SEQ_LEN=1024 # Image embeddings sequence length.
DECODER_SEQ_LEN=8192 # Language model sequence length.
MAX_POS_EMBED=8192
# Additional arguments.
EXTRA_ARGS=""
if [[ $USE_TILING -eq 1 ]]; then
EXTRA_ARGS+=" --pixel-shuffle --use-tiling --max-num-tiles 6 --use-thumbnail --use-tile-tags"
SEQ_LEN=261 # Image embeddings sequence length (256 image embeddings + 5 tile tag embeddings).
fi
if [[ $USE_PIXEL_SHUFFLE_ONLY -eq 1 ]]; then
EXTRA_ARGS+=" --pixel-shuffle"
SEQ_LEN=256
fi
for PARTITION_ID in $( eval echo {$START..$END} )
do
torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \
--attention-softmax-in-fp32 \
--no-masked-softmax-fusion \
--swiglu \
--num-layers 80 \
--hidden-size 8192 \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--num-attention-heads 64 \
--exit-on-missing-checkpoint \
--group-query-attention \
--num-query-groups 8 \
--ffn-hidden-size 29568 \
--load ${MODEL_PATH} \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2-72B-Instruct \
--tokenizer-prompt-format qwen2p0 \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--disable-bias-linear \
--add-qkv-bias \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--language-model-type qwen2.0_72B \
--vision-model-type internvit \
--micro-batch-size 1 \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--bf16 \
--freeze-LM \
--freeze-ViT \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--use-te \
--transformer-impl transformer_engine \
--use-checkpoint-args \
--out-seq-length 16 \
--temperature 1.0 \
--seed 1234 \
--top_k 1 \
--no-load-rng \
--no-load-optim \
--num-partitions ${NUM_PARTITIONS} \
--partition-id ${PARTITION_ID} \
--output-path ${OUTPUT_PATH} \
--gt-path ${GROUNDTRUTH_PATH} \
--disable-vision-class-token \
--input-image-path ${INPUT_IMAGE_PATH} \
${EXTRA_ARGS} \
--task ${TASK} \
--image-tag-type nvlm \
--ckpt-format torch
done
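The SEQ_LEN values used by these inference scripts (1024 without pixel shuffle, 256 with it, 261 with tile tags) follow from the image and patch sizes; a small worked sketch, with the tile-tag token count of 5 taken from the comment above:

# Worked arithmetic for the image-embedding sequence lengths (illustration only).
img_h = img_w = 448
patch_dim = 14
patches_per_side = img_h // patch_dim          # 32
vit_tokens = patches_per_side ** 2             # 1024 image embeddings per image/tile
pixel_shuffled = vit_tokens // 4               # 256: pixel shuffle merges each 2x2 patch group into one token
tile_tag_tokens = 5                            # per the "+ 5 tile tag embeddings" comment above
print(vit_tokens, pixel_shuffled, pixel_shuffled + tile_tag_tokens)  # 1024 256 261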
#!/bin/bash
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
export TOKENIZERS_PARALLELISM="false"
INPUT_IMAGE_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"
while [[ $# -gt 0 ]]; do
case $1 in
--input-image-path)
INPUT_IMAGE_PATH="$2"
shift
shift
;;
--input-metadata-path)
INPUT_METADATA_PATH="$2"
shift
shift
;;
--num-frames)
NUM_FRAMES="$2"
shift
shift
;;
-g|--groundtruth-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
-o|--output-path)
OUTPUT_PATH="$2"
shift
shift
;;
-m|--model-path)
MODEL_PATH="$2"
shift
shift
;;
--task)
TASK="$2"
shift
shift
;;
-g|--gt-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
-*|--*)
echo "Invalid option $1"
exit 1
;;
esac
done
# Please modify these as needed.
NUM_PARTITIONS=0
START=0
END=0
SEQ_LEN=256
DECODER_SEQ_LEN=16384
EXTRA_ARGS=" --pixel-shuffle"
for PARTITION_ID in $( eval echo {$START..$END} )
do
torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \
--attention-softmax-in-fp32 \
--transformer-impl transformer_engine \
--use-te \
--use-checkpoint-args \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--language-model-type=qwen2.5_7B \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--group-query-attention \
--num-query-groups 4 \
--num-layers 28 \
--hidden-size 3584 \
--ffn-hidden-size 18944 \
--add-qkv-bias \
--num-attention-heads 28 \
--max-position-embeddings 32768 \
--no-masked-softmax-fusion \
--load ${MODEL_PATH} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2.5-7B-Instruct \
--tokenizer-prompt-format qwen2p5 \
--bf16 \
--micro-batch-size 1 \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--out-seq-length 128 \
--temperature 1.0 \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--seed 153 \
--top_k 1 \
--no-load-rng \
--no-load-optim \
--input-image-path ${INPUT_IMAGE_PATH} \
--num-partitions ${NUM_PARTITIONS} \
--partition-id ${PARTITION_ID} \
--output-path ${OUTPUT_PATH} \
--gt-path ${GROUNDTRUTH_PATH} \
--task ${TASK} \
${EXTRA_ARGS} \
--special-tokens "<image>" "<img>" "</img>" \
--vision-model-type internvit \
--num-frames ${NUM_FRAMES} \
--ckpt-format torch
done
#!/bin/bash
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
export TOKENIZERS_PARALLELISM="false"
INPUT_IMAGE_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"
while [[ $# -gt 0 ]]; do
case $1 in
-i|--input-image-path)
INPUT_IMAGE_PATH="$2"
shift
shift
;;
-o|--output-path)
OUTPUT_PATH="$2"
shift
shift
;;
-m|--model-path)
MODEL_PATH="$2"
shift
shift
;;
-t|--task)
TASK="$2"
shift
shift
;;
-g|--gt-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
-*|--*)
echo "Invalid option $1"
exit 1
;;
esac
done
# Please modify these as needed.
NUM_PARTITIONS=0
START=0
END=0
SEQ_LEN=256
DECODER_SEQ_LEN=8192
EXTRA_ARGS=" --pixel-shuffle --use-tiling --max-num-tiles 12 --use-thumbnail"
for PARTITION_ID in $( eval echo {$START..$END} )
do
torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \
--attention-softmax-in-fp32 \
--transformer-impl transformer_engine \
--use-te \
--use-checkpoint-args \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--language-model-type=qwen2.5_7B \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--group-query-attention \
--num-query-groups 4 \
--num-layers 28 \
--hidden-size 3584 \
--ffn-hidden-size 18944 \
--add-qkv-bias \
--num-attention-heads 28 \
--max-position-embeddings 32768 \
--no-masked-softmax-fusion \
--load ${MODEL_PATH} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2.5-7B-Instruct \
--tokenizer-prompt-format qwen2p5 \
--bf16 \
--micro-batch-size 1 \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--out-seq-length 128 \
--temperature 1.0 \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--seed 153 \
--top_k 1 \
--no-load-rng \
--no-load-optim \
--input-image-path ${INPUT_IMAGE_PATH} \
--num-partitions ${NUM_PARTITIONS} \
--partition-id ${PARTITION_ID} \
--output-path ${OUTPUT_PATH} \
--gt-path ${GROUNDTRUTH_PATH} \
--task ${TASK} \
${EXTRA_ARGS} \
--special-tokens "<image>" "<img>" "</img>" \
--vision-model-type siglip \
--ckpt-format torch
done
#!/bin/bash
# Your SBATCH commands here if using SLURM.
# Please launch this script from megatron-lm root.
# Train a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_ALGO=^NVLS
export TOKENIZERS_PARALLELISM="false"
DEBUG=0
if [[ $BATCH -eq 0 ]]; then
DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
MODEL_NAME="mcore-nous-yi34b-internvit-mlp-sft-${DATETIME}"
else
MODEL_NAME="mcore-nous-yi34b-internvit-mlp-sft"
fi
WORKSPACE="<some dir>"
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
LOAD_NAME="mcore-nous-yi34b-internvit-mlp" # From pretraining
CHECKPOINT_DIR="${WORKSPACE}/output/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/nvlm/sft_blend.yaml"
if [[ $DEBUG -eq 1 ]]; then
MBZ=1
BZ=1
NW=0
LI=1
AD=0.0
HD=0.0
ALLOW_NONDETERMINISTIC=1
# Can run out of GPU memory in interactive mode without this.
# This is just for interactive testing purposes. Do not use for proper training.
EXTRA_ARGS=" --freeze-LM"
else
MBZ=1
BZ=128
NW=2
LI=5
AD=0.0
HD=0.0
ALLOW_NONDETERMINISTIC=1
EXTRA_ARGS=""
fi
SEQ_LEN=261 # Image embeddings sequence length (256 image embeddings + 5 tile tag embeddings).
DECODER_SEQ_LEN=3200 # Language model sequence length.
MAX_POS_EMBED=3200
OPTIONS=" \
--swiglu \
--use-distributed-optimizer \
--num-workers ${NW} \
--num-layers 60 \
--hidden-size 7168 \
--normalization RMSNorm \
--num-attention-heads 56 \
--exit-duration-in-mins 230 \
--group-query-attention \
--num-query-groups 8 \
--ffn-hidden-size 20480 \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \
--tokenizer-prompt-format nvlm-yi-34b \
--vocab-size 64000 \
--make-vocab-size-divisible-by 1 \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 5000000 \
--disable-bias-linear \
--tensor-model-parallel-size 8 \
--language-model-type yi-34b \
--vision-model-type internvit \
--micro-batch-size ${MBZ} \
--global-batch-size ${BZ} \
--train-samples 30000000 \
--lr-decay-samples 25600000 \
--lr-warmup-samples 83200 \
--lr 2e-6 \
--min-lr 2.5e-7 \
--lr-decay-style cosine \
--split 100,0,0 \
--clip-grad 10 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--untie-embeddings-and-output-weights \
--eod-mask-loss \
--bf16 \
--tensorboard-dir=${TENSORBOARD_DIR} \
--freeze-ViT \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--data-path ${DATA_TRAIN} \
--dataloader-type external \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--prompt-path ${SOURCE}/examples/multimodal/nvlm/nvlm_prompts.json \
--log-interval ${LI} \
--load ${FINETUNE_DIR} \
--save ${FINETUNE_DIR} \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--save-interval 5000 \
--eval-interval 500 \
--eval-iters 10 \
--log-params-norm \
--log-num-zeros-in-grad \
${EXTRA_ARGS} \
--disable-vision-class-token \
--use-te \
--ckpt-format torch \
--pixel-shuffle \
--use-tiling \
--max-num-tiles 6 \
--use-thumbnail \
--use-tile-tags \
--image-tag-type nvlm
"
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${ALLOW_NONDETERMINISTIC}
export NVTE_APPLY_QK_LAYER_SCALING=0
# Interactive or batch mode
if [[ $BATCH -eq 0 ]]; then
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
else
run_cmd="python -u ${SOURCE}/examples/multimodal/train.py ${OPTIONS}"
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
srun -l --verbose \
--container-image <path to docker image> \
--container-mounts "<some mount>" \
--output=${LOGS_DIR}/%x_%j_$DATETIME.log \
sh -c "${run_cmd}"
set +x
fi
File mode changed from 100644 to 100755
#!/bin/bash
# Your SBATCH commands here if using SLURM.
# Please launch this script from megatron-lm root.
# Train a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_ALGO=^NVLS
export TOKENIZERS_PARALLELISM=false
USER=$SLURM_JOB_USER
# Auto-detect batch or interactive mode.
which srun
BATCH=$((1-$?))
DEBUG=0
if [[ $BATCH -eq 0 ]]; then
DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
MODEL_NAME="qwen2.5-7B-internvit-video-sft-nvlm-${DATETIME}"
else
MODEL_NAME="qwen2.5-7B-internvitp-video-sft-nvlm"
DEBUG=0
fi
WORKSPACE="<some dir>"
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR="${OUTPUT}/checkpoints"
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
# From pretraining. The pretraining checkpoint should have tensor parallel size 4.
LOAD_NAME="mcore-qwen2p5-7b-internvit-tp4"
CHECKPOINT_DIR="${WORKSPACE}/output/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/nvlm/sft_blend.yaml"
if [[ $DEBUG -eq 1 ]]; then
MBZ=1
BZ=1
NW=0
AD=0.0
HD=0.0
LI=1
# This is just for interactive testing purposes. Do not use for proper training.
EXTRA_ARGS="--freeze-LM"
ALLOW_NONDETERMINISTIC=1
else
MBZ=1
BZ=256
NW=8
AD=0.0
HD=0.0
LI=5
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
fi
USE_TILING=1
SEQ_LEN=1024
DECODER_SEQ_LEN=16384
MAX_POS_EMBED=32768
TRAIN_SAMPLES=6602173
WARMUP_SAMPLES=198065
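# Warmup covers roughly 3% of the total training samples (198065 / 6602173).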
if [[ $BATCH -eq 0 ]]; then
# Runs out of GPU memory in interactive mode without this.
EXTRA_ARGS+=" --freeze-LM"
fi
if [[ $USE_TILING -eq 1 ]]; then
EXTRA_ARGS+=" --pixel-shuffle --use-tiling --max-num-tiles 12 --use-thumbnail"
SEQ_LEN=256
fi
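# Note: with --pixel-shuffle, each 448x448 tile (patch size 14 -> 32x32 = 1024 patches) is
# reduced 4x to 256 vision embeddings, which is why SEQ_LEN is set to 256 above.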
OPTIONS=" \
--swiglu \
--use-distributed-optimizer \
--num-workers ${NW} \
--num-layers 28 \
--hidden-size 3584 \
--norm-epsilon 1e-06 \
--normalization RMSNorm \
--num-attention-heads 28 \
--exit-duration-in-mins 110 \
--group-query-attention \
--num-query-groups 4 \
--ffn-hidden-size 18944 \
--add-qkv-bias \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--dataloader-seq-length ${DECODER_SEQ_LEN} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2.5-7B-Instruct \
--tokenizer-prompt-format qwen2p5 \
--pixel-shuffle \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--disable-bias-linear \
--pipeline-model-parallel-size 1 \
--tensor-model-parallel-size 4 \
--language-model-type qwen2.5_7B \
--vision-model-type internvit \
--micro-batch-size ${MBZ} \
--global-batch-size ${BZ} \
--lr 2e-6 \
--min-lr 2.5e-7 \
--train-samples ${TRAIN_SAMPLES} \
--lr-warmup-samples ${WARMUP_SAMPLES} \
--lr-decay-style cosine \
--clip-grad 10 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--eod-mask-loss \
--bf16 \
--tensorboard-dir ${TENSORBOARD_DIR} \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--data-path ${DATA_TRAIN} \
--dataloader-type external \
--split 100,0,0 \
--prompt-path ${SOURCE}/examples/multimodal/nvlm/nvlm_prompts.json \
--log-interval ${LI} \
--save-interval 500 \
--eval-interval 500 \
--eval-iters 10 \
--log-params-norm \
--log-num-zeros-in-grad \
${EXTRA_ARGS} \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--distributed-timeout-minutes 60 \
--allow-missing-vision-projection-checkpoint \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--disable-vision-class-token \
--use-te \
--ckpt-format torch \
--num-frames 32 \
--use-checkpoint-args \
--image-tag-type internvl \
--recompute-granularity full \
--recompute-method block \
--recompute-num-layers 28 \
--recompute-vision \
"
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${ALLOW_NONDETERMINISTIC}
export NVTE_APPLY_QK_LAYER_SCALING=0
# Interactive or batch mode
if [[ $BATCH -eq 0 ]]; then
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
else
run_cmd="python -u ${SOURCE}/examples/multimodal/train.py ${OPTIONS}"
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
srun -l --verbose \
--container-image <path to docker image> \
--container-mounts "<some mount>" \
--output=${LOGS_DIR}/%x_%j_$DATETIME.log \
sh -c "${run_cmd}"
set +x
fi
#!/bin/bash
# Pretrain a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
MODEL_NAME="mcore-llava-mistral-7b-instruct-clip336-pretraining"
# Check that the user has set an output path for model checkpoints.
if [[ -z $WORKSPACE ]]; then
echo "Please set WORKSPACE for storing your model checkpoints."
exit 1
fi
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
if [[ -z $LOAD_NAME ]]; then
echo "Please set LOAD_NAME for input model name."
exit 1
fi
CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/pretrain_dataset.yaml"
DEBUG=0
if [[ $DEBUG -eq 1 ]]; then
BZ=32
NW=2
HD=0.0
LI=1
EXTRA_ARGS=""
NONDETERMINISTIC_ATTN=1
else
BZ=256
NW=2
HD=0.1
LI=10
EXTRA_ARGS=""
NONDETERMINISTIC_ATTN=1
fi
OPTIONS=" \
--apply-layernorm-1p \
--attention-softmax-in-fp32 \
--use-checkpoint-args \
--use-distributed-optimizer \
--transformer-impl transformer_engine \
--use-te \
--normalization RMSNorm \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--num-workers ${NW} \
--exit-duration-in-mins 230 \
--use-flash-attn \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout ${HD} \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--num-attention-heads 32 \
--seq-length 576 \
--decoder-seq-length 1024 \
--max-position-embeddings 4096 \
--ffn-hidden-size 14336 \
--train-iters 20000 \
--micro-batch-size 1 \
--global-batch-size ${BZ} \
--lr-decay-iters 20000 \
--lr-warmup-fraction .01 \
--lr 0.00015 \
--min-lr 1.0e-5 \
--lr-decay-style cosine \
--log-interval ${LI} \
--eval-iters 10 \
--eval-interval 1000 \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model mistralai/Mistral-7B-Instruct-v0.3 \
--tokenizer-prompt-format mistral \
--data-path ${DATA_TRAIN} \
--prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \
--save-interval 1000 \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--split 100,0,0 \
--clip-grad 1.0 \
--weight-decay 1e-2 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--log-params-norm \
--log-num-zeros-in-grad \
--bf16 \
--eod-mask-loss \
--freeze-LM \
--freeze-ViT \
--patch-dim 14 \
--img-h 336 \
--img-w 336 \
--dataloader-type external \
--tensorboard-dir ${TENSORBOARD_DIR} \
--language-model-type=mistral_7b \
--disable-vision-class-token \
${EXTRA_ARGS} \
--distributed-timeout-minutes 60 \
--allow-missing-vision-projection-checkpoint \
--ckpt-format torch
"
export NVTE_APPLY_QK_LAYER_SCALING=0
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${NONDETERMINISTIC_ATTN}
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Generate text using a vision language model."""
import json
import logging
import os
import sys
from functools import partial
from typing import List

# Add megatron to the path.
sys.path.append(
    os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
)

import torch
import yaml
from config import EvaluationConfig
from evaluation.evaluation_datasets import get_evaluation_dataset
from model import model_provider
from multimodal_args import add_multimodal_extra_args

from megatron.core import parallel_state
from megatron.core.enums import ModelType
from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN
from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings
from megatron.inference.text_generation.api import generate_and_post_process
from megatron.inference.text_generation.forward_step import ForwardStep
from megatron.inference.text_generation.communication import broadcast_int_list
from megatron.core.inference.sampling_params import SamplingParams
from megatron.core.inference.engines.mcore_engine import MCoreEngine
from megatron.core.inference.inference_request import InferenceRequest, VLMInferenceRequest
from megatron.core.inference.text_generation_controllers.vlm_text_generation_controller import (
    VLMTextGenerationController,
)
from megatron.core.inference.model_inference_wrappers.inference_wrapper_config import (
    InferenceWrapperConfig,
)
from megatron.core.inference.model_inference_wrappers.multimodal.vlm_inference_wrapper import (
    VLMInferenceWrapper,
)
from megatron.training import get_args, get_model, get_tokenizer, print_rank_0
from megatron.training.checkpointing import load_checkpoint
from megatron.training.initialize import initialize_megatron


def add_text_generation_args(parser):
    """Text generation arguments."""
    group = parser.add_argument_group(title='Vision language model text generation arguments')

    group.add_argument("--temperature", type=float, default=1.0, help='Sampling temperature.')
    group.add_argument("--top_p", type=float, default=0.0, help='Top p sampling.')
    group.add_argument("--top_k", type=int, default=0, help='Top k sampling.')
    group.add_argument(
        "--out-seq-length", type=int, default=128, help='Length of the output generated text.'
    )
    group.add_argument("--output-path", type=str, help='Output file path')
    group.add_argument('--input-image-path', type=str, help="Input image directory")
    group.add_argument(
        '--num-partitions', type=int, default=0, help="Number of partitions for inputs."
    )
    group.add_argument('--partition-id', type=int, default=0, help="Partition index")
    group.add_argument("--gt-path", type=str, help="Optional ground truth file")
    group.add_argument(
        "--task",
        type=str,
        choices=[
            "captioning",
            "TextVQA",
            "VQAv2",
            "ChartQA",
            "MMMU",
            "VideoMME",
            "OCRBench",
            "MathVista",
            "AI2D",
            "InfoVQA",
            "SPDocVQA",
        ],
        help="Generation task to run",
    )
    group.add_argument(
        "--num-samples-per-partition", type=int, default=0, help="Number of samples per partition"
    )
    group.add_argument("--config-path", type=str, help="Evaluation config file to use.")

    group.add_argument("--use-mcore-inference", action="store_true", default=False, help="Use the MCore inference API")

    # Add common multimodal arguments needed for e.g. building the model.
    parser = add_multimodal_extra_args(parser)

    return parser


def get_evaluation_dataloader(
    task,
    input_image_path,
    gt_path,
    img_h,
    img_w,
    use_tiling,
    max_num_tiles,
    use_thumbnail,
    num_samples_per_partition,
    num_partitions,
    partition_id,
    num_frames,
    num_workers,
    vision_model_type,
):
    """Build evaluation dataset."""
    dataset = get_evaluation_dataset(
        task,
        input_image_path,
        gt_path,
        img_h,
        img_w,
        use_tiling,
        max_num_tiles,
        use_thumbnail,
        num_samples_per_partition,
        num_partitions,
        partition_id,
        num_frames,
        vision_model_type,
    )

    dp_rank = parallel_state.get_data_parallel_rank()
    dp_world_size = parallel_state.get_data_parallel_world_size()

    sampler = torch.utils.data.DistributedSampler(
        dataset, shuffle=False, num_replicas=dp_world_size, rank=dp_rank
    )
    # TODO: Batched inference is not supported yet.
    dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=None, num_workers=num_workers, sampler=sampler, pin_memory=True
    )

    return dataloader


def generate_samples(model, config: EvaluationConfig, print_output):
    """Text generation using a trained vision language model."""
    args = get_args()

    dataloader = get_evaluation_dataloader(
        config.task,
        config.input_image_path,
        config.gt_path,
        args.img_h,
        args.img_w,
        args.use_tiling,
        args.max_num_tiles,
        args.use_thumbnail,
        config.num_samples_per_partition,
        config.num_partitions,
        config.partition_id,
        args.num_frames,
        args.num_workers,
        args.vision_model_type,
    )

    num_img_embeddings_per_tile = get_num_image_embeddings(
        args.img_h,
        args.img_w,
        args.patch_dim,
        args.vision_model_type,
        args.disable_vision_class_token,
        1,
        args.pixel_shuffle,
        args.use_tile_tags,
    )

    if args.use_mcore_inference:
        inference_wrapper_config = InferenceWrapperConfig(
            hidden_size=args.hidden_size,
            inference_batch_times_seqlen_threshold=args.inference_batch_times_seqlen_threshold,
            fp32_residual_connection=args.fp32_residual_connection,
            params_dtype=args.params_dtype,
            padded_vocab_size=args.padded_vocab_size,
        )
        inference_wrapped_model = VLMInferenceWrapper(model, inference_wrapper_config)
        tokenizer = get_tokenizer()
        controller = VLMTextGenerationController(
            inference_wrapped_model=inference_wrapped_model, tokenizer=tokenizer
        )
        inference_engine = MCoreEngine(
            controller, max_batch_size=1, random_seed=args.seed
        )
        sampling_params = SamplingParams(
            temperature=config.temperature,
            top_k=config.top_k,
            top_p=config.top_p,
            num_tokens_to_generate=config.out_seq_length,
        )

    for idx, (imgs, num_tiles, sample_id, question, answers, metadata) in enumerate(dataloader):
        imgs = imgs.to("cuda")
        num_tiles = num_tiles.to("cuda")

        conv = get_conversation(config.task, question)

        if not args.use_mcore_inference:
            forward_step = partial(VLMForwardStep, num_img_embeddings_per_tile, imgs, num_tiles, args.decoder_seq_length)

        if is_first_rank():

            if args.use_mcore_inference:
                inference_request = VLMInferenceRequest(
                    request_id=inference_engine.get_new_request_id(),
                    prompt=conv,
                    prompt_tokens=controller.tokenize_prompt(conv),
                    inference_parameters=sampling_params,
                    num_img_embeddings_per_tile=num_img_embeddings_per_tile,
                    imgs=imgs,
                    num_tiles=num_tiles,
                    decoder_seq_length=args.decoder_seq_length,
                )
                results: List[InferenceRequest] = inference_engine.generate(
                    inference_requests=[inference_request]
                )

                resp_sentences = [
                    tokenizer.detokenize(result.prompt_tokens) + result.generated_text
                    for result in results
                ]
            else:
                resp_sentences, _, _, _ = generate_and_post_process(
                    model,
                    forward_step=forward_step,
                    prompts=[conv],
                    tokens_to_generate=config.out_seq_length,
                    top_k_sampling=config.top_k,
                    top_p_sampling=config.top_p,
                    add_BOS=False,
                    temperature=config.temperature,
                    random_seed=args.seed,
                    detokenize_segments=False,
                    data_parallel=True,
                )

            for generation in resp_sentences:
                if isinstance(sample_id, torch.Tensor):
                    sample_id = sample_id.item()

                output = {"sample_id": sample_id}

                output_name = ""
                if config.task == "captioning":
                    output_name = "caption"
                elif config.task in (
                    "TextVQA",
                    "VQAv2",
                    "ChartQA",
                    "OCRBench",
                    "MathVista",
                    "AI2D",
                    "InfoVQA",
                    "SPDocVQA",
                ):
                    output_name = "answer"
                elif config.task in ("MMMU"):
                    output_name = "text"
                elif config.task == "VideoMME":
                    output_name = "response"
                    output = question
                else:
                    raise NotImplementedError("no output name defined for", config.task)

                prompt, generated = get_prompt_and_generated(
                    generation, args.tokenizer_prompt_format
                )
                if config.task == "VideoMME":
                    output["questions"][0][output_name] = generated
                else:
                    output["prompt"] = prompt
                    output[output_name] = generated

                if config.task == "captioning":
                    output["ground_truth"] = answers
                elif config.task in (
                    "TextVQA",
                    "VQAv2",
                    "ChartQA",
                    "OCRBench",
                    "MathVista",
                    "AI2D",
                    "InfoVQA",
                    "SPDocVQA",
                ):
                    if isinstance(answers, str):
                        answers = [answers]
                    output["gt_answer"] = answers

                    if len(metadata) > 0:
                        output.update(metadata)
                elif config.task == "MMMU":
                    output["prediction"] = generated
                    output.update(metadata)
                else:
                    raise NotImplementedError("no output processing defined for", config.task)

                if print_output:
                    print(output)

                yield output
                idx += 1
        else:
            if args.use_mcore_inference:
                inference_request = VLMInferenceRequest(
                    request_id=inference_engine.get_new_request_id(),
                    prompt=conv,
                    prompt_tokens=controller.tokenize_prompt(conv),
                    inference_parameters=sampling_params,
                    num_img_embeddings_per_tile=num_img_embeddings_per_tile,
                    imgs=imgs,
                    num_tiles=num_tiles,
                    decoder_seq_length=args.decoder_seq_length,
                )
                inference_engine.generate(
                    inference_requests=[inference_request]
                )
            else:
                generate_and_post_process(
                    model, forward_step=forward_step, detokenize_segments=False, data_parallel=True
                )

            idx += 1


def get_evaluation_config():
    """Get evaluation config from a config file or command-line arguments."""
    args = get_args()
    if args.config_path:
        with open(args.config_path, "r") as f:
            config_dict = yaml.safe_load(f)

        config = EvaluationConfig(**config_dict)
    else:
        config = EvaluationConfig(
            task=args.task,
            temperature=args.temperature,
            top_p=args.top_p,
            top_k=args.top_k,
            out_seq_length=args.out_seq_length,
            output_path=args.output_path,
            input_image_path=args.input_image_path,
            gt_path=args.gt_path,
            num_partitions=args.num_partitions,
            partition_id=args.partition_id,
            num_samples_per_partition=args.num_samples_per_partition,
        )

    # Default output path if not defined.
    if not config.output_path:
        os.makedirs("generated", exist_ok=True)
        config.output_path = "generated/" + args.language_model_type

    return config


def is_first_rank():
    """First tensor and pipeline parallel rank."""
    return (
        parallel_state.is_pipeline_first_stage(ignore_virtual=True)
        and parallel_state.get_tensor_model_parallel_rank() == 0
    )


def get_output_path(config, dp_rank):
    """Generation output path."""
    return (
        f"{config.output_path}-{config.task}-dprank={dp_rank}-partition={config.partition_id}.jsonl"
    )


def generate_and_write_samples(model, config, print_output=True):
    """Generate text and write to an output file."""
    dp_rank = parallel_state.get_data_parallel_rank()

    if is_first_rank():
        output_path = get_output_path(config, dp_rank)
        output_file = open(output_path, "w")
        print(f"output path: {output_file.name}")

    with torch.no_grad():
        for output in generate_samples(model, config, print_output):
            if is_first_rank():
                output_file.write(json.dumps(output) + "\n")
                output_file.flush()

    if is_first_rank():
        output_file.close()


class VLMForwardStep(ForwardStep):
    """Inference forward step for a multimodal model."""

    def __init__(
        self,
        num_img_embeddings_per_tile,
        images,
        num_tiles,
        decoder_seq_length,
        model,
        max_batch_size,
        max_sequence_length,
    ):
        """Create multimodal forward step."""
        total_num_tiles = torch.sum(num_tiles).item()
        num_img_embeddings = num_img_embeddings_per_tile * total_num_tiles

        super().__init__(model, max_batch_size, max_sequence_length + num_img_embeddings)
        self._images = images
        self._num_tiles = num_tiles
        self._num_img_embeddings = num_img_embeddings
        self.decoder_seq_length = decoder_seq_length

        self._recv_only_vision_embeds = False
        pp_rank = parallel_state.get_pipeline_model_parallel_rank()
        # Checks if the previous stage only has a vision encoder, and that the current stage has part of the LM decoder.
        # In this case, the current stage should only receive vision embeddings.
        if pp_rank > 0:
            self._recv_only_vision_embeds = parallel_state.is_inside_encoder(pp_rank - 1) and (not parallel_state.is_inside_decoder(pp_rank - 1)) and parallel_state.is_inside_decoder()

        # Checks if the current stage only has a vision encoder.
        self._encoder_only = parallel_state.is_inside_encoder() and not parallel_state.is_inside_decoder()

    def _forward(self, tokens, position_ids, attention_mask):
        return self.model(
            self._images,
            tokens,
            position_ids,
            attention_mask=None,
            inference_params=self.inference_params,
            num_image_tiles=self._num_tiles,
            runtime_gather_output=True,
        )

    def __call__(self, tokens, position_ids, attention_mask):
        num_image_tokens = (tokens == self.model.module.image_token_index).sum().item()
        num_tokens = tokens.size(1)
        recv_buffer_seq_length = None
        if num_image_tokens > 0:
            # When there are image tokens and this stage only receives vision embeddings, adjust the recv buffer seq length to match the image embeddings sequence length.
            # If there are image tokens and this stage receives full embeddings, make sure we compensate for expansion of image tokens.
            # Note that this will set a recv_buffer_seq_length for the encoder stage, this length is irrelevant since that recv buffer is never allocated.
            if self._recv_only_vision_embeds:
                recv_buffer_seq_length = self._num_img_embeddings
            else:
                recv_buffer_seq_length = min(self._num_img_embeddings + num_tokens - num_image_tokens, self.decoder_seq_length)
        elif self._recv_only_vision_embeds:
            # If this stage only receives vision embeddings and there are no image tokens we won't run the encoder and therefore shouldn't try to recv.
            recv_buffer_seq_length = 0

        # If the pipeline stage only has a vision encoder, then it only needs to run when there are image tokens.
        if not (self._encoder_only and num_image_tokens == 0):
            output = super().__call__(tokens, position_ids, attention_mask, recv_buffer_seq_length=recv_buffer_seq_length)
        else:
            output = None
        if isinstance(output, tuple):
            logits, _ = output
        else:
            logits = output

        # On the first inference iteration, we compute image tokens.
        # On every PP stage (although inference params should only matter for decoder),
        # update the sequence length offset by the number of image tokens.
        if num_tokens > 1 and num_image_tokens > 0:
            if "image_tokens_count" not in self.inference_params.key_value_memory_dict:
                self.inference_params.key_value_memory_dict["image_tokens_count"] = self._num_img_embeddings

            if self._num_img_embeddings + num_tokens - num_image_tokens > self.decoder_seq_length:
                self.inference_params.sequence_len_offset += self.decoder_seq_length - num_tokens
            else:
                self.inference_params.sequence_len_offset += (
                    self.inference_params.key_value_memory_dict["image_tokens_count"] - num_image_tokens
                )

        return logits


def get_conversation(task, question):
    """Get a conversation for a given task and evaluation question."""
    conversation = []

    # In all cases, the tokenizer adds possible header tokens for the assistant.
    if task == "captioning":
        conversation = [
            {"role": "system", "content": "Answer the questions."},
            {
                "role": "user",
                "content": f"{IMAGE_TOKEN}\nProvide a one-sentence caption for provided image.",
            },
        ]
    elif task in ("TextVQA", "VQAv2", "ChartQA", "InfoVQA", "SPDocVQA"):
        conversation = [
            {"role": "system", "content": "Answer the questions."},
            {
                "role": "user",
                "content": f"{IMAGE_TOKEN}\n{question}\nAnswer the question using a single word or phrase.",
            },
        ]
    elif task in ("OCRBench", "MathVista", "AI2D"):
        conversation = [
            {"role": "system", "content": "Answer the questions."},
            {"role": "user", "content": f"{IMAGE_TOKEN}\n{question}"},
        ]
    elif task == "MMMU":
        conversation = [
            {"role": "system", "content": "Answer the questions."},
            {"role": "user", "content": question},
        ]
    elif task == "VideoMME":
        q = (
            "Select the best answer to the following multiple-choice "
            "question based on the video. Respond with only the letter "
            "(A, B, C, or D) of the correct option.\n"
        )
        q += question["questions"][0]["question"] + "\n"
        q += question["questions"][0]["choices"][0] + "\n"
        q += question["questions"][0]["choices"][1] + "\n"
        q += question["questions"][0]["choices"][2] + "\n"
        q += question["questions"][0]["choices"][3] + "\n"

        conversation = [
            {"role": "system", "content": "Answer the questions."},
            {"role": "user", "content": f"{IMAGE_TOKEN}\n{q}"},
        ]

    return conversation


def get_prompt_and_generated(prompt_and_generation, prompt_format):
    """Strip prompt and other unnecessary text from generation."""
    if prompt_format in ("llama3", "llama3p1"):
        splitted = prompt_and_generation.split("<|start_header_id|>assistant<|end_header_id|>\n\n")
        prompt = splitted[0]
        generated = splitted[1]
        generated = generated.split("<|eot_id|>")[0]
    elif prompt_format == "mistral":
        splitted = prompt_and_generation.split("[/INST]")
        prompt = splitted[0]
        generated = splitted[1]
        generated = generated.split("</s>")[0]
    elif prompt_format == "chatml":
        splitted = prompt_and_generation.split("<|im_start|> assistant\n")
        prompt = splitted[0]
        generated = splitted[1]
        generated = generated.split("<|im_end|>")[0]
    elif prompt_format in ("nvlm-yi-34b", "qwen2p0", "qwen2p5"):
        splitted = prompt_and_generation.split("<|im_start|>assistant\n")
        prompt = splitted[0]
        generated = splitted[1]
        generated = generated.split("<|im_end|>")[0]
    else:
        raise ValueError(f"Prompt format {prompt_format} is not supported.")

    # Remove possible garbage.
    generated = generated.strip()
    generated = generated.split("\n\n")[0]
    generated = generated.split("\n")[0]

    return prompt, generated


def main():
    """Vision language model text generation."""
    initialize_megatron(extra_args_provider=add_text_generation_args)

    if torch.distributed.get_rank() == 0:
        logging.getLogger(__name__).warning(
            "Models using pipeline parallelism are not supported yet."
        )

    args = get_args()

    def wrapped_model_provider(pre_process, post_process, add_encoder, add_decoder):
        return model_provider(pre_process, post_process, add_encoder, add_decoder, parallel_output=False)

    # Set up model and load checkpoint.
    model = get_model(wrapped_model_provider, model_type=ModelType.encoder_and_decoder, wrap_with_ddp=False)

    if args.load is not None:
        _ = load_checkpoint(model, None, None)

    model = model[0]

    model.eval()

    config = get_evaluation_config()

    generate_and_write_samples(model, config)


if __name__ == "__main__":
    main()
#!/bin/bash
# Run SFT on a pretrained multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
MODEL_NAME="mcore-llava-mistral-7b-instruct-clip336-sft"
# Check that the user has set an output path for model checkpoints.
if [[ -z $WORKSPACE ]]; then
echo "Please set WORKSPACE for storing your model checkpoints."
exit 1
fi
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
if [[ -z $LOAD_NAME ]]; then
echo "Please set LOAD_NAME for input model name."
exit 1
fi
if [[ -z $LOAD_ITER ]]; then
echo "Please set LOAD_ITER for pre-trained input model iteration."
exit 1
fi
CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}/checkpoints"
DATA_TRAIN="${SOURCE}/examples/multimodal/sft_dataset.yaml"
DEBUG=0
if [[ $DEBUG -eq 1 ]]; then
BZ=8
NW=1
HD=0.0
LI=1
EXTRA_ARGS=""
NONDETERMINISTIC_ATTN=1
else
BZ=128
NW=2
HD=0.1
LI=10
EXTRA_ARGS=""
NONDETERMINISTIC_ATTN=1
fi
OPTIONS=" \
--apply-layernorm-1p \
--attention-softmax-in-fp32 \
--use-checkpoint-args \
--use-distributed-optimizer \
--transformer-impl transformer_engine \
--use-te \
--normalization RMSNorm \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--num-workers ${NW} \
--exit-duration-in-mins 230 \
--use-flash-attn \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout ${HD} \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--num-layers 32 \
--hidden-size 4096 \
--num-attention-heads 32 \
--seq-length 576 \
--decoder-seq-length 2048 \
--max-position-embeddings 4096 \
--ffn-hidden-size 14336 \
--train-iters 20000 \
--micro-batch-size 1 \
--global-batch-size ${BZ} \
--lr-decay-iters 20000 \
--lr-warmup-fraction .01 \
--lr 1e-6 \
--min-lr 1e-7 \
--lr-decay-style cosine \
--log-interval ${LI} \
--eval-iters 10 \
--eval-interval 500 \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model mistralai/Mistral-7B-Instruct-v0.3 \
--tokenizer-prompt-format mistral \
--data-path ${DATA_TRAIN} \
--prompt-path ${SOURCE}/examples/multimodal/manual_prompts.json \
--save-interval 500 \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--split 100,0,0 \
--clip-grad 0.5 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--log-params-norm \
--log-num-zeros-in-grad \
--eod-mask-loss \
--freeze-ViT \
--patch-dim 14 \
--img-h 336 \
--img-w 336 \
--dataloader-type external \
--tensorboard-dir ${TENSORBOARD_DIR} \
--language-model-type=mistral_7b \
--disable-vision-class-token \
${EXTRA_ARGS} \
--distributed-timeout-minutes 60 \
--ckpt-format torch
"
export NVTE_APPLY_QK_LAYER_SCALING=0
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${NONDETERMINISTIC_ATTN}
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
#!/bin/bash
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
INPUT_IMAGE_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"
NUM_FRAMES=1
while [[ $# -gt 0 ]]; do
case $1 in
-i|--input-image-path)
INPUT_IMAGE_PATH="$2"
shift
shift
;;
--num-frames)
NUM_FRAMES="$2"
shift
shift
;;
-o|--output-path)
OUTPUT_PATH="$2"
shift
shift
;;
-m|--model-path)
MODEL_PATH="$2"
shift
shift
;;
-t|--task)
TASK="$2"
shift
shift
;;
-g|--gt-path)
GROUNDTRUTH_PATH="$2"
shift
shift
;;
-*|--*)
echo "Invalid option $1"
exit 1
;;
esac
done
# Please modify these as needed.
NUM_PARTITIONS=0
START=0
END=0
for PARTITION_ID in $( eval echo {$START..$END} )
do
torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \
--apply-layernorm-1p \
--attention-softmax-in-fp32 \
--use-flash-attn \
--transformer-impl transformer_engine \
--use-te \
--use-checkpoint-args \
--normalization RMSNorm \
--language-model-type mistral_7b \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--group-query-attention \
--num-query-groups 8 \
--num-layers 32 \
--hidden-size 4096 \
--ffn-hidden-size 14336 \
--num-attention-heads 32 \
--max-position-embeddings 4096 \
--no-masked-softmax-fusion \
--load ${MODEL_PATH} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model mistralai/Mistral-7B-Instruct-v0.3 \
--tokenizer-prompt-format mistral \
--bf16 \
--micro-batch-size 1 \
--seq-length 2048 \
--out-seq-length 12 \
--temperature 1.0 \
--img-h 336 \
--img-w 336 \
--patch-dim 14 \
--seed 153 \
--top_k 1 \
--no-load-rng \
--no-load-optim \
--input-image-path ${INPUT_IMAGE_PATH} \
--num-partitions ${NUM_PARTITIONS} \
--partition-id ${PARTITION_ID} \
--output-path ${OUTPUT_PATH} \
--gt-path ${GROUNDTRUTH_PATH} \
--task ${TASK} \
--disable-vision-class-token \
--num-frames ${NUM_FRAMES} \
--ckpt-format torch
done
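# Example invocation (all paths below are placeholders, not part of this repo):
#   bash <this script> -m <checkpoint dir> -i <image dir> -g <ground truth file> -o <output prefix> -t TextVQA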
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Pretrain or SFT multimodal.""" """Pretrain or SFT multimodal."""
import os import math
import sys import os
from functools import partial import sys
from functools import partial
import torch
import yaml import torch
import yaml
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir)) sys.path.append(
) os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
)
from dataloader_provider import train_valid_test_dataloaders_provider, is_first_or_last_stage
from model import model_provider from dataloader_provider import train_valid_test_dataloaders_provider, is_first_or_last_stage
from multimodal_args import add_multimodal_extra_args from model import model_provider
from multimodal_args import add_multimodal_extra_args
from megatron.core import mpu, tensor_parallel
from megatron.core.enums import ModelType from megatron.core import mpu, tensor_parallel
from megatron.core.models.multimodal.llava_model import IGNORE_INDEX, LLaVAModel from megatron.core.enums import ModelType
from megatron.core.packed_seq_params import PackedSeqParams from megatron.core.models.multimodal import context_parallel
from megatron.core.parallel_state import ( from megatron.core.models.multimodal.llava_model import IGNORE_INDEX, LLaVAModel
get_tensor_model_parallel_rank, from megatron.core.packed_seq_params import PackedSeqParams
get_pipeline_model_parallel_world_size, from megatron.core.parallel_state import (
is_pipeline_last_stage, get_tensor_model_parallel_rank,
) get_pipeline_model_parallel_world_size,
from megatron.training import get_args, get_timers, get_tokenizer, pretrain is_pipeline_last_stage,
from megatron.training.utils import is_last_rank )
from megatron.training import get_args, get_timers, get_tokenizer, pretrain
from megatron.training.utils import is_last_rank, get_batch_on_this_cp_rank

def get_batch(data_iterator, image_token_index, img_seq_len):
    """Generate a batch

    Note: attn_mask_type in layer_specs.py sets the attention mask. Attention mask is None here.
    """
    imgs = None
    tokens = None
    labels = None
    loss_mask = None
    attention_mask = None
    position_ids = None
    num_tiles = None
    packed_seq_params = None

    args = get_args()

    # Dataloader doesn't run on the middle stages in a pipeline parallel model.
    pp_size = get_pipeline_model_parallel_world_size()
    if not is_first_or_last_stage(pp_size, args.encoder_pipeline_model_parallel_size):
        # Note these are all set to None above.
        return tokens, labels, loss_mask, attention_mask, position_ids, imgs, num_tiles, packed_seq_params

    # Broadcast data.
    torch.cuda.nvtx.range_push("get_data")
    if data_iterator is not None and get_tensor_model_parallel_rank() == 0:
        data = next(data_iterator)
    else:
        data = None

    data_text = tensor_parallel.broadcast_data(["tokens"], data, torch.int64)["tokens"]
    labels = tensor_parallel.broadcast_data(["labels"], data, torch.int64)["labels"]

    imgs = tensor_parallel.broadcast_data(["imgs"], data, torch.float32)["imgs"]
    num_tiles = tensor_parallel.broadcast_data(["num_tiles"], data, torch.int32)["num_tiles"]

    cu_lengths = tensor_parallel.broadcast_data(["cu_lengths"], data, torch.int32)["cu_lengths"]
    max_lengths = tensor_parallel.broadcast_data(["max_lengths"], data, torch.int32)["max_lengths"]

    # No image input (text-only sample) if the dataloader returned a size 1 image.
    if imgs.shape == torch.Size([1, 1]):
        # FSDP can hang with text-only samples. A workaround is to run a valid dummy image through the vision
        # model and then add image embeddings with a zero multiplier.
        if args.use_torch_fsdp2:
            imgs = torch.zeros((1, 3, args.img_h, args.img_w), dtype=torch.float32, device=data_text.device)
            num_tiles = torch.tensor([], dtype=torch.int, device=data_text.device)
        else:
            # A similar workaround is not needed without FSDP, and we can use an empty image.
            # FIXME: text-only data can still cause a hang in the special case where
            # the vision model is on its own pipeline rank and --freeze-ViT is enabled.
            imgs = torch.tensor([], dtype=torch.float32, device=data_text.device)
            num_tiles = torch.tensor([], dtype=torch.int, device=data_text.device)

    # Last pipeline parallel stage doesn't need images.
    if pp_size > 1 and is_pipeline_last_stage():
        imgs = None

    # If cu_lengths and max_lengths are non-dummy, construct PackedSeqParams. Otherwise, leave it at None.
    if cu_lengths.shape != torch.Size([1, 1]):
        assert (
            cu_lengths.shape[0] == max_lengths.shape[0] == 1
        ), "micro-batch-size must be 1 for packing"
        cu_lengths = cu_lengths[0]
        max_lengths = max_lengths[0]

        packed_seq_params = PackedSeqParams(
            qkv_format="thd",
            cu_seqlens_q=cu_lengths,
            cu_seqlens_kv=cu_lengths,
            max_seqlen_q=max_lengths,
            max_seqlen_kv=max_lengths,
        )

    torch.cuda.nvtx.range_pop()

    tokens_ = data_text.long()

    torch.cuda.nvtx.range_push("index tokens")
    tokenizer = get_tokenizer()
    text_length = tokens_.shape[1]
    tokens = tokens_[:, :text_length].contiguous()
    labels = labels[:, 1 : text_length + 1].contiguous()

    assert tokens.shape == labels.shape, f"tokens: {tokens.shape} != labels: {labels.shape}"
    torch.cuda.nvtx.range_pop()

    torch.cuda.nvtx.range_push("get_ltor_masks_and_position_ids")
    loss_mask, position_ids = get_ltor_masks_and_position_ids(tokens, labels, tokenizer.pad)
    torch.cuda.nvtx.range_pop()

    # If context parallel is enabled, must shard inputs to CP ranks.
    if args.context_parallel_size > 1 or args.sequence_parallel:
        assert tokens.shape[0] == 1, "micro-batch-size > 1 not supported yet with CP"

        num_image_tokens = torch.sum(tokens == image_token_index).item()
        num_image_embeddings = num_image_tokens * img_seq_len - num_image_tokens
        seq_len = text_length + num_image_embeddings

        # CP expects the sequence length to be divisible by the CP size, so apply padding.
        mp_padding_needed = context_parallel.get_padding(
            seq_len, args.context_parallel_size,
            args.tensor_model_parallel_size, args.sequence_parallel,
        )
        tokens, position_ids, labels, loss_mask = [
            torch.nn.functional.pad(item, (0, mp_padding_needed))
            for item in (tokens, position_ids, labels, loss_mask)
        ]
        # Get PackedSeqParams that indicate the amount of padding for TransformerEngine.
        packed_seq_params = context_parallel.get_packed_seq_params(
            tokens, num_image_embeddings, mp_padding_needed, args.context_parallel_size, True
        )

    return (
        tokens,
        labels,
        loss_mask,
        attention_mask,
        position_ids,
        imgs,
        num_tiles,
        packed_seq_params,
    )
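
# Illustrative note (not from the original source): each image token in `tokens` is later
# expanded to `img_seq_len` image embeddings, so the effective decoder sequence length is
#   seq_len = text_length + num_image_tokens * (img_seq_len - 1).
# For example, with text_length=1024, two image tokens and img_seq_len=576, the combined
# length is 1024 + 2 * 575 = 2174; context_parallel.get_padding then returns however many
# pad positions are needed to make that length shard evenly across the CP (and TP/SP) ranks.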

def get_ltor_masks_and_position_ids(input_ids, target, pad_token):
    """Build masks and position id for left to right model."""
    seq_length = input_ids.shape[1]

    # Position ids.
    position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
    position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

    # Loss mask.
    loss_mask = torch.ones(target.size(), dtype=torch.float, device=input_ids.device)
    loss_mask[target == pad_token] = 0.0  # mask paddings
    loss_mask[target == IGNORE_INDEX] = 0.0  # mask prompts

    return loss_mask, position_ids
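
# Example (illustrative): for a label row [IGNORE_INDEX, 5, 7, <pad>, <pad>], the loss mask
# produced above is [0., 1., 1., 0., 0.], since prompt tokens and padding contribute no loss,
# and position_ids are simply [0, 1, 2, 3, 4] broadcast to the batch.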

def get_mask_start_and_end_idx(arr):
    """
    Returns a list of tuples holding the start and end index in arr of the non-zero contiguous
    sub-arrays.

    For instance, if arr = [0, 1, 0, 0, 1, 1]
    get_mask_start_and_end_idx(arr) = [(1, 1), (4, 5)]
    such that arr[1:1+1] = [1] and arr[4:5+1] = [1, 1]
    """
    mask = (arr != 0)

    mask_int = mask.int()

    diff = mask_int[1:] - mask_int[:-1]
    start_indices = (diff == 1).nonzero(as_tuple=False).flatten() + 1
    end_indices = (diff == -1).nonzero(as_tuple=False).flatten()
    if len(mask) == 0:
        return []
    if mask[0]:
        start_indices = torch.cat((torch.tensor([0], device=arr.device), start_indices))
    if mask[-1]:
        end_indices = torch.cat((end_indices, torch.tensor([len(arr) - 1], device=arr.device)))
    sequences = list(zip(start_indices.tolist(), end_indices.tolist()))
    return sequences

def scaled_loss_func(loss_mask, output_tensor):
    """
    Scaled loss function.

    Scale the loss for each conversation turn using the formula:

        1 / sum_j[ sqrt(length(loss_turn_j)) ] * sum_i[ sum(loss_turn_i) / sqrt(length(loss_turn_i)) ]

    where the loss mask is used to infer the start / end of the conversation turns.
    """
    losses = output_tensor.float()
    loss_list = []
    num_valid_labels_list = []
    for idx in range(losses.shape[0]):
        loss_this_sample = losses[idx]
        turn_start_end_list = get_mask_start_and_end_idx(loss_mask[idx])
        for turn_start, turn_end in turn_start_end_list:
            # compute loss for each turn
            loss_this_turn = loss_this_sample[turn_start:turn_end + 1].sum()
            assert (1 - loss_mask)[idx][turn_start:turn_end + 1].sum() < 1.0
            num_valid_labels_this_turn = turn_end - turn_start + 1
            loss_this_turn = loss_this_turn / num_valid_labels_this_turn
            loss_list.append(loss_this_turn)
            # append num of valid labels for each turn
            num_valid_labels_list.append(num_valid_labels_this_turn)
    base_num = sum([math.sqrt(each) for each in num_valid_labels_list])
    for idx in range(len(loss_list)):
        # normalize loss for each turn
        loss_list[idx] = loss_list[idx] * math.sqrt(num_valid_labels_list[idx]) / base_num

    total_loss = torch.stack(loss_list).sum()
    total_tokens = torch.ones_like(total_loss)

    loss = torch.cat([total_loss.view(1), total_tokens.view(1)])

    reporting_loss = loss.clone().detach()
    torch.distributed.all_reduce(reporting_loss, group=mpu.get_data_parallel_group())

    local_num_tokens = loss[1].clone().detach().to(torch.int)

    return (
        total_loss,
        local_num_tokens,
        {'lm loss': (reporting_loss[0], reporting_loss[1])},
    )
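
# Worked example (illustrative): with two conversation turns of 4 and 9 valid label
# positions and per-turn mean losses m1 and m2, the scaled loss computed above is
#   (sqrt(4) * m1 + sqrt(9) * m2) / (sqrt(4) + sqrt(9)) = (2 * m1 + 3 * m2) / 5,
# i.e. longer turns get more weight, but only in proportion to the sqrt of their length.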

def loss_func(loss_mask, output_tensor):
    args = get_args()

    losses = output_tensor.float()

    loss_mask = loss_mask.contiguous().view(-1).float()

    total_tokens = loss_mask.sum()
    total_loss = torch.sum(losses.view(-1) * loss_mask)
    loss = torch.cat([total_loss.view(1), total_tokens.view(1)])

    if args.context_parallel_size > 1:
        torch.distributed.all_reduce(loss, group=mpu.get_context_parallel_group())

    reporting_loss = loss.clone().detach()
    torch.distributed.all_reduce(reporting_loss, group=mpu.get_data_parallel_group())

    local_num_tokens = loss[1].clone().detach().to(torch.int)

    # We multiply by context parallel size because later there will be a divide by CP(+DP) size.
    return (
        loss[0] * args.context_parallel_size,
        local_num_tokens,
        {'lm loss': (reporting_loss[0], reporting_loss[1])},
    )

def forward_step(data_iterator, model: LLaVAModel):
    """Forward training step.

    Args:
        data_iterator (torch.utils.data.dataloader): Input data iterator
        model: Multimodal model

    Returns:
        output_tensor (torch.Tensor): Loss of shape [b, s] if labels are provided, otherwise logits of shape [b, s, vocab_size].
        loss_func (callable): Loss function with a loss mask specified.
    """
    timers = get_timers()

    # Get the batch.
    timers('batch-generator', log_level=2).start()
    (
        tokens,
        labels,
        loss_mask,
        attention_mask,
        position_ids,
        images,
        num_image_tiles,
        packed_seq_params,
    ) = get_batch(data_iterator, model.module.module.image_token_index, model.module.module.img_seq_len)
    timers('batch-generator').stop()

    output_tensor, loss_mask = model(
        images,
        tokens,
        position_ids,
        attention_mask,
        labels,
        loss_mask,
        num_image_tiles=num_image_tiles,
        packed_seq_params=packed_seq_params,
    )

    args = get_args()
    if args.use_loss_scaling:
        loss_function = partial(scaled_loss_func, loss_mask)
    else:
        loss_function = partial(loss_func, loss_mask)

    return output_tensor, loss_function

def llava_embedding_ranks(pp_ranks):
    """LLava's embedding ranks consist of the decoder's first and last ranks (ie, the ViT has no embeddings).

    Args:
        pp_ranks: A list of global ranks that constitute a pipeline group.
    """
    args = get_args()

    # encoder size is also the index to the first rank of the decoder.
    epp = args.encoder_pipeline_model_parallel_size

    last_rank = pp_ranks[-1]
    if len(pp_ranks) == 1 or pp_ranks[epp] == last_rank:
        return [last_rank]
    else:
        return [pp_ranks[epp], last_rank]


def llava_position_embedding_ranks(pp_ranks):
    """LLava's embedding ranks consist of the singular rank of the model or the decoder's first rank.

    Args:
        pp_ranks: A list of global ranks that constitute a pipeline group.
    """
    args = get_args()

    # encoder size is also the index to the first rank of the decoder.
    epp = args.encoder_pipeline_model_parallel_size

    last_rank = pp_ranks[-1]
    if len(pp_ranks) == 1:
        return [last_rank]
    else:
        return [pp_ranks[epp]]
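
# Illustrative example (not part of the training flow): with a pipeline group [0, 1, 2, 3]
# and --encoder-pipeline-model-parallel-size 1, the decoder starts at rank 1, so
# llava_embedding_ranks returns [1, 3] (first and last decoder ranks) and
# llava_position_embedding_ranks returns [1] (first decoder rank only).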

def run_online_eval(model):
    """Run an evaluation benchmark during training."""
    args = get_args()

    # Online evaluation config is not defined. Do nothing.
    if not args.online_evaluation_config:
        return []

    from config import EvaluationConfig
    from run_text_generation import generate_and_write_samples

    with open(args.online_evaluation_config, "r") as f:
        config_dict = yaml.safe_load(f)

    config = EvaluationConfig(**config_dict)

    # The inference code assumes the first rank is the leader.
    # Tensorboard writer is on the last rank.
    # We must write to a storage space that all ranks see.
    output_dir = os.path.join(args.save, "online_eval")
    os.makedirs(output_dir, exist_ok=True)
    config.output_path = os.path.join(output_dir, args.language_model_type)

    # The actual generation.
    generate_and_write_samples(model[0].module, config, print_output=False)

    # Make sure the first rank is done writing so that the last rank can run eval.
    torch.distributed.barrier()

    if not is_last_rank():
        return []

    # Run evaluation.
    if config.task == "TextVQA":
        from evaluate_textvqa import textvqa_eval

        avg_acc = textvqa_eval(config.output_path)

        return [{"TextVQA accuracy": avg_acc}]
    else:
        raise NotImplementedError(f"online evaluation of {config.task} not implemented yet")


def write_online_eval_to_tensorboard(data, iteration, writer):
    """Write online evaluation data to Tensorboard."""
    if not writer:
        return

    for item in data:
        for k, v in item.items():
            writer.add_scalar(k, v, iteration)

if __name__ == "__main__":
    train_valid_test_dataloaders_provider.is_distributed = True

    pretrain(
        train_valid_test_dataloaders_provider,
        model_provider,
        ModelType.encoder_and_decoder,
        forward_step,
        args_defaults={'tokenizer_type': 'GPT2BPETokenizer'},
        extra_args_provider=add_multimodal_extra_args,
        process_non_loss_data_func=write_online_eval_to_tensorboard,
        get_embedding_ranks=llava_embedding_ranks,
        get_position_embedding_ranks=llava_position_embedding_ranks,
        non_loss_data_func=run_online_eval,
    )
#!/bin/bash

# Runs the "220M" parameter model

export CUDA_DEVICE_MAX_CONNECTIONS=1

GPUS_PER_NODE=8
# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000
NUM_NODES=1
NODE_RANK=0
WORLD_SIZE=$(($GPUS_PER_NODE*$NUM_NODES))

CHECKPOINT_PATH=$1 #<Specify path>
TENSORBOARD_DIR=$2 #<Specify path>
VOCAB_FILE=$3 #<Specify path to file>/bert-large-cased-vocab.txt
DATA_PATH=$4 #<Specify path and file prefix>_text_document
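
# Usage note (illustrative; the script name and paths below are placeholders):
#   bash train_t5_220m_distributed.sh /ckpts/t5_220m /tb/t5_220m \
#        /data/bert-large-cased-vocab.txt /data/my-t5_text_document
# The four positional arguments are the checkpoint dir, Tensorboard dir,
# vocab file, and data path prefix consumed above.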
DISTRIBUTED_ARGS="
    --nproc_per_node $GPUS_PER_NODE \
    --nnodes $NUM_NODES \
    --node_rank $NODE_RANK \
    --master_addr $MASTER_ADDR \
    --master_port $MASTER_PORT
"

T5_ARGS="
    --encoder-num-layers 12 \
    --decoder-num-layers 12 \
    --hidden-size 768 \
    --num-attention-heads 12 \
    --kv-channels 64 \
    --ffn-hidden-size 3072 \
    --encoder-seq-length 512 \
    --decoder-seq-length 128 \
    --max-position-embeddings 512 \
    --micro-batch-size 64 \
    --global-batch-size 512 \
    --lr 0.0001 \
    --train-iters 1000000 \
    --lr-decay-iters 1000000 \
    --lr-decay-style linear \
    --min-lr 0.00001 \
    --weight-decay 1e-2 \
    --lr-warmup-fraction .01 \
    --clip-grad 1.0 \
    --bf16 \
    --vocab-extra-ids 100 \
    --init-method-std 0.015 \
    --transformer-impl transformer_engine \
    --tensor-model-parallel-size 1 \
    --pipeline-model-parallel-size 1 \
    --attention-backend auto \
"

DATA_ARGS="
    --data-path $DATA_PATH \
    --vocab-file $VOCAB_FILE \
    --tokenizer-type BertWordPieceCase \
    --split 99982,9,9 \
"

OUTPUT_ARGS="
    --log-interval 100 \
    --tensorboard-dir ${TENSORBOARD_DIR} \
    --save-interval 500 \
    --eval-interval 1000 \
    --eval-iters 10
"

torchrun $DISTRIBUTED_ARGS pretrain_t5.py \
    $T5_ARGS \
    $DATA_ARGS \
    $OUTPUT_ARGS \
    --distributed-backend nccl \
    --save $CHECKPOINT_PATH \
    --load $CHECKPOINT_PATH \