Commit d444a97a authored by yangzhong

Initial upload

import argparse
import json
from evaluate_mmmu import get_input_output_paths
def merge_input_files(input_path):
"""Merge input files to a format compatible with the evaluator."""
input_file_paths, output_file_path = get_input_output_paths(input_path, task="OCRBench")
results = dict()
for input_file_path in input_file_paths:
with open(input_file_path, "r") as input_file:
for line in input_file:
res = json.loads(line)
sample_id = res["sample_id"]
# Remove possible duplicates.
if sample_id in results:
continue
results[sample_id] = res
results = list(results.values())
with open(output_file_path, "w") as output_file:
json.dump(results, output_file)
return output_file_path
def compute_ocrbench_score(result_file):
"""Compute OCRBench score."""
merged_results = json.load(open(result_file))
# OCRBench score calculation is adopted from https://github.com/Yuliang-Liu/MultimodalOCR/blob/1b7713f44c91f30f64efb6d3e494c416861ef15f/example.py#L1
# MIT License. Copyright (c) 2023 Yuliang Liu
score = {
"Regular Text Recognition": 0,
"Irregular Text Recognition": 0,
"Artistic Text Recognition": 0,
"Handwriting Recognition": 0,
"Digit String Recognition": 0,
"Non-Semantic Text Recognition": 0,
"Scene Text-centric VQA": 0,
"Doc-oriented VQA": 0,
"Doc-oriented VQA": 0,
"Key Information Extraction": 0,
"Handwritten Mathematical Expression Recognition": 0,
}
for res in merged_results:
predict = res["answer"]
answers = res["gt_answer"]
dataset_name = res["dataset_name"]
ocr_type = res["data_type"]
if dataset_name == "HME100k":
if isinstance(answers, list):
for j in range(len(answers)):
answer = answers[j].strip().replace("\n", " ").replace(" ", "")
predict = predict.strip().replace("\n", " ").replace(" ", "")
if answer in predict:
score[ocr_type] += 1
else:
answers = answers.strip().replace("\n", " ").replace(" ", "")
predict = predict.strip().replace("\n", " ").replace(" ", "")
if answers in predict:
score[ocr_type] += 1
else:
if isinstance(answers, list):
for j in range(len(answers)):
answer = answers[j].lower().strip().replace("\n", " ")
predict = predict.lower().strip().replace("\n", " ")
if answer in predict:
score[ocr_type] += 1
else:
answers = answers.lower().strip().replace("\n", " ")
predict = predict.lower().strip().replace("\n", " ")
if answers in predict:
score[ocr_type] += 1
recognition_score = (
score['Regular Text Recognition']
+ score['Irregular Text Recognition']
+ score['Artistic Text Recognition']
+ score['Handwriting Recognition']
+ score['Digit String Recognition']
+ score['Non-Semantic Text Recognition']
)
final_score = (
recognition_score
+ score['Scene Text-centric VQA']
+ score['Doc-oriented VQA']
+ score['Key Information Extraction']
+ score['Handwritten Mathematical Expression Recognition']
)
result_log = f"""###########################OCRBench##############################
Text Recognition(Total 300): {recognition_score}
------------------Details of Recognition Score-------------------
Regular Text Recognition(Total 50): {score['Regular Text Recognition']}
Irregular Text Recognition(Total 50): {score['Irregular Text Recognition']}
Artistic Text Recognition(Total 50): {score['Artistic Text Recognition']}
Handwriting Recognition(Total 50): {score['Handwriting Recognition']}
Digit String Recognition(Total 50): {score['Digit String Recognition']}
Non-Semantic Text Recognition(Total 50): {score['Non-Semantic Text Recognition']}
----------------------------------------------------------------
Scene Text-centric VQA(Total 200): {score['Scene Text-centric VQA']}
----------------------------------------------------------------
Doc-oriented VQA(Total 200): {score['Doc-oriented VQA']}
----------------------------------------------------------------
Key Information Extraction(Total 200): {score['Key Information Extraction']}
----------------------------------------------------------------
Handwritten Mathematical Expression Recognition(Total 100): {score['Handwritten Mathematical Expression Recognition']}
----------------------Final Score-------------------------------
Final Score(Total 1000): {final_score}"""
return result_log, final_score
def ocrbench_eval(input_path):
"""Run OCRBench evaluation."""
result_file_path = merge_input_files(input_path)
result_log, score = compute_ocrbench_score(result_file_path)
return result_log, score
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--input-path', type=str, help="Path to input file(s)")
args = parser.parse_args()
result_log, _ = ocrbench_eval(args.input_path)
print(result_log)
import argparse
import json
from evaluate_mmmu import get_input_output_paths
from evaluate_vqav2 import compute_vqa_accuracy
def merge_input_files(input_path):
"""Merge input files to a format compatible with the evaluator."""
input_file_paths, output_file_path = get_input_output_paths(input_path, task="TextVQA")
results = dict()
for input_file_path in input_file_paths:
with open(input_file_path, "r") as input_file:
for line in input_file:
res = json.loads(line)
sample_id = res["sample_id"]
# Remove possible duplicates.
if sample_id in results:
continue
results[sample_id] = {
"question_id": sample_id,
"answer": res["answer"],
"gt_answer": res["gt_answer"],
}
results = list(results.values())
with open(output_file_path, "w") as output_file:
json.dump(results, output_file)
return output_file_path
def textvqa_eval(input_path):
"""Run TextVQA evaluation."""
result_file_path = merge_input_files(input_path)
avg_acc = compute_vqa_accuracy(result_file_path, task="TextVQA")
return avg_acc
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--input-path', type=str, help="Path to input file(s)")
args = parser.parse_args()
avg_acc = textvqa_eval(args.input_path)
print(f"===== TextVQA Accuracy {avg_acc:.2f}% =====")
import argparse
import json
from evaluate_mmmu import get_input_output_paths
from open_flamingo.eval.vqa_metric import VQAEval
def merge_input_files(input_path):
"""Merge input files to a format compatible with the evaluator."""
input_file_paths, output_file_path = get_input_output_paths(input_path, task="VQAv2")
results = dict()
for input_file_path in input_file_paths:
with open(input_file_path, "r") as input_file:
for line in input_file:
res = json.loads(line)
sample_id = res["sample_id"]
# Skip possible duplicates.
if sample_id in results:
continue
res["question_id"] = sample_id
results[sample_id] = res
results = list(results.values())
with open(output_file_path, "w") as output_file:
json.dump(results, output_file)
return output_file_path
def is_number(n: str):
"""Check if input is a number."""
try:
float(n)
return True
except ValueError:
return False
def compute_vqa_accuracy(result_file, task):
"""Compute VQA accuracy."""
merged_results = json.load(open(result_file))
vqa = VQAEval(vqa=None, vqaRes=None)
all_acc = []
for res in merged_results:
pred = res["answer"]
pred = vqa.processPunctuation(pred)
pred = vqa.processDigitArticle(pred)
gt = res["gt_answer"]
gt = [vqa.processPunctuation(ans) for ans in gt]
gt = [vqa.processDigitArticle(ans) for ans in gt]
# ChartQA uses relaxed accuracy:
# "We consider an answer to be correct if it is within 5% of the gold answer.
# For non-numeric answers, we still need an exact match to consider an answer to be correct."
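# For example (illustrative), pred="42%" and gt="40" are both stripped of a trailing "%"
# and parsed as numbers; 42.0 falls inside [38.0, 42.0] (gt within +/-5%), so acc becomes 1.0.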
if task == "ChartQA":
acc = 0.0
assert len(gt) == 1, "expected exactly one groundtruth answer."
gt = gt[0]
pred = pred.rstrip("%")
gt = gt.rstrip("%")
if is_number(pred) and is_number(gt):
pred = float(pred)
gt = float(gt)
if pred >= (gt * 0.95) and pred <= (gt * 1.05):
acc = 1.0
elif pred == gt:
acc = 1.0
all_acc.append(acc)
elif task in ("VQAv2", "TextVQA"):
num_match = sum([pred == ans for ans in gt])
acc = min(1.0, num_match / 3.0)
all_acc.append(acc)
elif task == "AI2D":
assert len(gt) == 1, f"Expected exactly 1 GT, got {gt}"
acc = pred == gt[0]
all_acc.append(acc)
else:
raise NotImplementedError(f"unknown task {task}")
acc_avg = sum(all_acc) / len(all_acc) * 100
return acc_avg
def vqav2_eval(input_path):
"""Run VQAv2 evaluation."""
result_file = merge_input_files(input_path)
avg_acc = compute_vqa_accuracy(result_file, task="VQAv2")
return avg_acc
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--input-path', type=str, help="Path to input file(s)")
args = parser.parse_args()
avg_acc = vqav2_eval(args.input_path)
print(f"===== VQAv2 Accuracy {avg_acc:.2f}% =====")
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
"""Evaluation datasets."""
import glob
import itertools
import json
import os
import re
from collections import defaultdict
import numpy as np
import torch
from image_processing import get_visual_transform
from PIL import Image
from megatron.training import print_rank_0
def _get_partition_bounds(
total_num_samples, num_samples_per_partition, num_partitions, partition_id
):
if num_samples_per_partition == 0:
samples_per_partition = [
int(x) for x in np.linspace(0, total_num_samples, num_partitions + 1)
]
return samples_per_partition[partition_id], samples_per_partition[partition_id + 1]
return num_samples_per_partition * partition_id, num_samples_per_partition * (partition_id + 1)
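# Illustrative example: with total_num_samples=10, num_partitions=2 and
# num_samples_per_partition=0, np.linspace gives bounds [0, 5, 10], so partition 0
# covers samples [0, 5) and partition 1 covers [5, 10). With a fixed
# num_samples_per_partition, each partition simply spans that many consecutive samples.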
class VQADataset(torch.utils.data.Dataset):
"""VQA evaluation dataset."""
def __init__(
self,
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
keys,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
):
samples = json.load(open(gt_path, encoding='utf-8'))
if "data" in samples:
samples = samples["data"]
# Optionally, process only a subset of the input files.
if num_partitions > 0:
lb, ub = _get_partition_bounds(
len(samples), num_samples_per_partition, num_partitions, partition_id
)
samples = samples[lb:ub]
self._keys = keys
self._samples = samples
self._input_image_path = input_image_path
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._samples)
def __getitem__(self, idx):
sample = self._samples[idx]
img_file = "{}/{}".format(self._input_image_path, sample[self._keys["image_id"]])
if not os.path.exists(img_file):
img_file += ".jpg"
if not os.path.exists(img_file):
img_file = img_file.replace('.jpg', '.png')
img = Image.open(img_file)
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
tile_count = torch.tensor([len(imgs)], dtype=torch.int)
sample_id = idx
if "sample_id" in self._keys:
sample_id = sample[self._keys["sample_id"]]
metadata = "" # Not used.
return (
torch.stack(imgs),
tile_count,
sample_id,
sample[self._keys["question"]],
sample[self._keys["answer"]],
metadata,
)
class CaptioningDataset(torch.utils.data.Dataset):
"""Captioning evaluation dataset."""
def __init__(
self,
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
):
image_files = sorted(glob.glob(input_image_path + "/*"))
# Optionally, process only a subset of the input files.
if num_partitions > 0:
lb, ub = _get_partition_bounds(
len(image_files), num_samples_per_partition, num_partitions, partition_id
)
image_files = image_files[lb:ub]
gts = json.load(open(gt_path))
answers = defaultdict(list)
for gt in gts["annotations"]:
answers[gt["image_id"]].append(gt['caption'])
self._image_files = image_files
self._answers = answers
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._image_files)
def __getitem__(self, idx):
img_file = self._image_files[idx]
image_id = int(img_file.split("_")[-1].split(".")[0])
img = Image.open(img_file)
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
tile_count = torch.tensor([len(imgs)], dtype=torch.int)
question = "" # Fixed for all samples.
metadata = "" # Not used.
return torch.stack(imgs), tile_count, image_id, question, self._answers[image_id], metadata
class MMMUDataset(torch.utils.data.Dataset):
"""MMMU evaluation dataset."""
def __init__(
self,
input_image_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
prompt_style,
vision_model_type,
):
import datasets
from MMMU.mmmu.utils.data_utils import CAT_SHORT2LONG, load_yaml
# The following downloads the MMMU dataset from HuggingFace and uses the API from the MMMU github repo to run MMMU evaluation.
all_mmmu_datasets = []
hf_datasets_cache = os.environ["HF_DATASETS_CACHE"]
assert hf_datasets_cache != "", "Please set the environment variable HF_DATASETS_CACHE."
for subject in CAT_SHORT2LONG.values():
# Use a local copy of the dataset if it exists (can be faster); otherwise use the HF one.
if os.path.exists(input_image_path):
subject_dataset = datasets.load_dataset(
os.path.join(input_image_path, subject),
split=datasets.Split.VALIDATION,
cache_dir=hf_datasets_cache,
verification_mode="no_checks",
)
else:
subject_dataset = datasets.load_dataset(
"MMMU/MMMU",
subject,
split=datasets.Split.VALIDATION,
cache_dir=hf_datasets_cache,
)
all_mmmu_datasets.append(subject_dataset)
dataset = datasets.concatenate_datasets(all_mmmu_datasets)
dataset = [s for s in dataset if s['id'].startswith("val")]
# Optionally, process only a subset of the input files.
if num_partitions > 0:
lb, ub = _get_partition_bounds(
len(dataset), num_samples_per_partition, num_partitions, partition_id
)
dataset = dataset[lb:ub]
# Using the LLaVA config from the MMMU repo.
config = load_yaml("examples/multimodal/MMMU/mmmu/configs/llava1.5.yaml")
for k, v in config.items():
if isinstance(v, list):
assert len(v) == 1, "only one value supported."
config[k] = v[0]
self._config = config
self._dataset = dataset
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._prompt_style = prompt_style
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._dataset)
def __getitem__(self, idx):
from MMMU.mmmu.utils.data_utils import construct_prompt, process_single_sample
sample = self._dataset[idx]
# Use the single image approach from the MMMU repo.
if self._prompt_style == "single_image":
sample = process_single_sample(sample)
sample = construct_prompt(sample, self._config)
img = sample["image"]
sample_imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
sample_num_tiles = [len(sample_imgs)]
prompt = sample["final_input_prompt"]
for i in range(8):
prompt = prompt.replace(f"<image {i}>", "")
sample["final_input_prompt"] = f"<image>\n{prompt}"
elif self._prompt_style == "vlmevalkit":
sample = construct_prompt(sample, self._config)
if sample["question_type"] == "multiple-choice":
question = sample["question"]
options = ""
for k, v in sample["index2ans"].items():
options += f"{k}. {v}\n"
final_prompt = f"{question}\n"
if "hint" in sample:
final_prompt += f"Hint: {sample['hint']}\n"
if "task_instructions" in sample:
final_prompt += f"Task instructions: {sample['task_instructions']}\n"
final_prompt += options
final_prompt += "Answer with the option's letter from the given choices directly."
sample["final_input_prompt"] = final_prompt.rstrip()
else:
question = sample["question"]
final_prompt = f"{question}\n"
final_prompt += "Answer the question directly."
sample["final_input_prompt"] = final_prompt.rstrip()
sample_imgs = []
sample_num_tiles = []
img_indices = sorted(list(set(re.findall(r"<image (\d+)", sample["final_input_prompt"]))))
# If there are multiple input images, we need to avoid the number of image embeddings getting too large.
adjusted_max_num_tiles = max(1, self._max_num_tiles // len(img_indices))
adjusted_max_num_tiles = min(adjusted_max_num_tiles, self._max_num_tiles)
for img_idx in img_indices:
img_key = f"image_{img_idx}"
img_str = f"<image {img_idx}>"
img = sample[img_key]
assert img is not None, f"{img_str} is in prompt but not in sample images"
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
adjusted_max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
) # List of tiles.
sample_imgs.extend(imgs)
sample_num_tiles.append(len(imgs))
sample["final_input_prompt"] = " ".join([f'<image {i + 1}><image>' for i in range(len(img_indices))]) + "\n" + sample["final_input_prompt"]
elif self._prompt_style == "multi_image":
sample = construct_prompt(sample, self._config)
sample_imgs = []
sample_num_tiles = []
img_indices = re.findall(r"<image (\d+)", sample["final_input_prompt"])
# If there are multiple input images, we need to avoid the number of image embeddings getting too large.
adjusted_max_num_tiles = max(1, self._max_num_tiles // len(img_indices))
for img_idx in img_indices:
img_key = f"image_{img_idx}"
img_str = f"<image {img_idx}>"
img = sample[img_key]
assert img is not None, f"{img_str} is in prompt but not in sample images"
# Note: Only replace the current image tag.
sample["final_input_prompt"] = sample["final_input_prompt"].replace(
img_str, "<image>", 1
)
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
adjusted_max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
) # List of tiles.
sample_imgs.extend(imgs)
sample_num_tiles.append(len(imgs))
# Sanity check.
for i in range(1, 8):
assert (
f"<image {i}>" not in sample["final_input_prompt"]
), "prompt contains unhandled image tags"
else:
raise ValueError(f"unknown prompt style {self._prompt_style}")
# MMMU specific metadata.
metadata = {"question_type": sample["question_type"]}
if sample["question_type"] == "multiple-choice":
metadata["index2ans"] = sample["index2ans"]
metadata["all_choices"] = sample["all_choices"]
prompt = sample['final_input_prompt']
tile_count = torch.tensor(sample_num_tiles, dtype=torch.int)
return (
torch.stack(sample_imgs),
tile_count,
sample["id"],
prompt,
sample["answer"],
metadata,
)
class VideoMMMEDataset(torch.utils.data.Dataset):
"Video MME evaluation dataset."
def __init__(
self,
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
num_frames,
vision_model_type,
):
ground_truth_original = json.load(open(gt_path))
ground_truth = []
for gt in ground_truth_original:
video_path = gt["url"]
video_path = video_path.replace("https://www.youtube.com/watch?v=", "")
video_path = video_path.replace("https://m.youtube.com/watch?v=", "")
video_path = os.path.join(input_image_path, video_path + ".mp4")
if not os.path.exists(video_path):
continue
gt["video_path"] = video_path
ground_truth.append(gt)
ground_truth = sorted(ground_truth, key=lambda gt: gt["video_path"])
print_rank_0(f"Found {len(ground_truth)} videos to process.")
if num_partitions > 0:
start_idx, end_idx = _get_partition_bounds(
len(ground_truth), num_samples_per_partition, num_partitions, partition_id
)
ground_truth = ground_truth[start_idx:end_idx]
self._ground_truth = ground_truth
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._num_frames = num_frames
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._ground_truth)
def __getitem__(self, idx):
from torchvision.io import read_video
gt = self._ground_truth[idx]
video, _, _ = read_video(gt["video_path"], start_pts=0, end_pts=None, pts_unit='sec')
video = video.numpy()
selected_frames = torch.linspace(0, video.shape[0] - 1, self._num_frames).long()
video_frames = video[selected_frames]
if self._num_frames == 1:
video_frames = video_frames[None]
imgs = list(
itertools.chain.from_iterable(
get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
for img in video_frames
)
)
for question in gt["questions"]:
# Very hacky, but we essentially re-create gt holding only the
# question of interest. This is to make this generation script
# compatible with the Video MME evaluation script.
question_dict = {
"video_id": gt["video_id"],
"duration_category": gt["duration_category"],
"video_category": gt["video_category"],
"video_subcategory": gt["video_subcategory"],
"url": gt["url"],
"questions": [question],
}
num_tiles = torch.tensor([len(imgs)], dtype=torch.int)
answer = ""
metadata = ""
return (
torch.stack(imgs),
num_tiles,
question["question_id"],
question_dict,
answer,
metadata,
)
class OCRBenchDataset(torch.utils.data.Dataset):
"""OCRBench evaluation dataset."""
def __init__(
self,
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
):
gt = json.load(open(gt_path, encoding='utf-8'))
if num_partitions > 0:
start_idx, end_idx = _get_partition_bounds(
len(gt), num_samples_per_partition, num_partitions, partition_id
)
gt = gt[start_idx:end_idx]
self._input_image_path = input_image_path
self._gt = gt
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._gt)
def __getitem__(self, idx):
img_path = os.path.join(self._input_image_path, self._gt[idx]['image_path'])
img = Image.open(img_path)
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
tile_count = torch.tensor([len(imgs)], dtype=torch.int)
metadata = {
"dataset_name": self._gt[idx]["dataset_name"],
"data_type": self._gt[idx]["type"],
}
return (
torch.stack(imgs),
tile_count,
idx,
self._gt[idx]["question"],
self._gt[idx]["answers"],
metadata,
)
class MathVistaDataset(torch.utils.data.Dataset):
"""MathVista evaluation dataset."""
def __init__(
self,
input_image_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
):
import datasets
hf_datasets_cache = os.environ["HF_DATASETS_CACHE"]
assert hf_datasets_cache != "", "Please set the environment variable HF_DATASETS_CACHE."
if os.path.exists(input_image_path):
dataset = datasets.load_dataset(
input_image_path, cache_dir=hf_datasets_cache, verification_mode="no_checks"
)
else:
dataset = datasets.load_dataset(
"AI4Math/MathVista", split="testmini", cache_dir=hf_datasets_cache
)
if num_partitions > 0:
start_idx, end_idx = _get_partition_bounds(
len(dataset), num_samples_per_partition, num_partitions, partition_id
)
dataset = dataset[start_idx:end_idx]
self._dataset = dataset
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._dataset["pid"])
def __getitem__(self, idx):
# Already a PIL object.
img = self._dataset['decoded_image'][idx]
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
tile_count = torch.tensor([len(imgs)], dtype=torch.int)
question_id = self._dataset["pid"][idx]
question = self._dataset["question"][idx]
question_type = self._dataset["question_type"][idx] # free_form or multi_choice
query = self._dataset["query"][idx]
choices = self._dataset["choices"][idx]
answer = self._dataset["answer"][idx]
if question_type == 'multi_choice':
start_chr = 'A'
choices_str = ''
index2ans = {}
all_choices = []
for choice in choices:
all_choices.append(start_chr)
index2ans[start_chr] = choice
choices_str += f"{start_chr}. {choice}\n"
start_chr = chr(ord(start_chr) + 1)
question = question + '\n' + choices_str
question = question + "Answer with the option's letter from the given choices directly."
answer = chr(ord('A') + choices.index(answer))
else:
question = query.replace("Hint: ", "")
index2ans = {}
all_choices = []
metadata = {
"question_type": question_type,
"index2ans": index2ans,
"all_choices": all_choices,
}
return torch.stack(imgs), tile_count, question_id, question, answer, metadata
class AI2DDataset(torch.utils.data.Dataset):
"""AI2D evaluation dataset."""
def __init__(
self,
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
no_mask,
vision_model_type,
):
with open(gt_path, 'r') as f:
jsonl = list(f)
gt = [json.loads(json_str) for json_str in jsonl]
if num_partitions > 0:
start_idx, end_idx = _get_partition_bounds(
len(gt), num_samples_per_partition, num_partitions, partition_id
)
gt = gt[start_idx:end_idx]
self._gt = gt
self._input_image_path = input_image_path
self._img_h = img_h
self._img_w = img_w
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._no_mask = no_mask
self._vision_model_type = vision_model_type
def __len__(self):
return len(self._gt)
def __getitem__(self, idx):
img_path = os.path.join(self._input_image_path, self._gt[idx]['image'])
if self._no_mask:
img_path = img_path.replace("AI2D_TEST", "AI2D_TEST_NO_MASK_IMAGES")
img = Image.open(img_path)
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
self._max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
)
tile_count = torch.tensor([len(imgs)], dtype=torch.int)
metadata = "" # Not used.
return (
torch.stack(imgs),
tile_count,
self._gt[idx]["question_id"],
self._gt[idx]["question"],
self._gt[idx]["answer"],
metadata,
)
def get_evaluation_dataset(
task,
input_image_path,
gt_path,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
num_samples_per_partition,
num_partitions,
partition_id,
num_frames,
vision_model_type,
):
"""Get an evaluation dataset."""
if task == "TextVQA":
keys = {
"image_id": "image_id",
"sample_id": "question_id",
"question": "question",
"answer": "answers",
}
dataset = VQADataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
keys,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == "VQAv2":
keys = {
"image_id": "image",
"sample_id": "question_id",
"question": "question",
"answer": "answer",
}
dataset = VQADataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
keys,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == "ChartQA":
keys = {"image_id": "imgname", "question": "query", "answer": "label"}
dataset = VQADataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
keys,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == "captioning":
dataset = CaptioningDataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == 'MMMU':
# Note:
# - prompt_style="single_image" uses only one image like in the MMMU repo example.
# - prompt_style="multi_image" uses multiple input images.
# - prompt_style="vlmevalkit" is similar to https://github.com/open-compass/VLMEvalKit/blob/5d3cebcf18ef4bfbadc3bd3ef80bdc7aad2c6557/vlmeval/vlm/internvl_chat.py#L499
dataset = MMMUDataset(
input_image_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
prompt_style="single_image",
vision_model_type=vision_model_type,
)
elif task == "VideoMME":
dataset = VideoMMMEDataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
num_frames,
vision_model_type,
)
elif task == "OCRBench":
dataset = OCRBenchDataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == "MathVista":
dataset = MathVistaDataset(
input_image_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
vision_model_type,
)
elif task == "AI2D":
dataset = AI2DDataset(
input_image_path,
gt_path,
num_samples_per_partition,
num_partitions,
partition_id,
img_h,
img_w,
use_tiling,
max_num_tiles,
use_thumbnail,
no_mask=False,
vision_model_type=vision_model_type,
)
else:
raise NotImplementedError(f"unsupported task {task}")
return dataset
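# Minimal usage sketch (illustrative only; the paths and sizes below are assumptions,
# not values required by this module):
# dataset = get_evaluation_dataset(
#     task="TextVQA",
#     input_image_path="/data/textvqa/train_images",   # hypothetical path
#     gt_path="/data/textvqa/val_annotations.json",    # hypothetical path
#     img_h=448, img_w=448, use_tiling=True, max_num_tiles=4, use_thumbnail=True,
#     num_samples_per_partition=0, num_partitions=1, partition_id=0,
#     num_frames=1, vision_model_type="internvit",
# )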
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. Except portions as noted which are Copyright (c) 2023 OpenGVLab and licensed under the MIT license found in LICENSE.
from torchvision import transforms as T
from torchvision.transforms import Compose
from torchvision.transforms.functional import InterpolationMode
IMAGENET_PIXEL_MEAN = [0.485, 0.456, 0.406]
IMAGENET_PIXEL_STD = [0.229, 0.224, 0.225]
SIGLIP_PIXEL_MEAN = [0.5, 0.5, 0.5]
SIGLIP_PIXEL_STD = [0.5, 0.5, 0.5]
CLIP_PIXEL_MEAN = [0.48145466, 0.4578275, 0.40821073]
CLIP_PIXEL_STD = [0.26862954, 0.26130258, 0.27577711]
pixel_statistics = {
"clip": (CLIP_PIXEL_MEAN, CLIP_PIXEL_STD),
"siglip": (SIGLIP_PIXEL_MEAN, SIGLIP_PIXEL_STD),
"internvit": (IMAGENET_PIXEL_MEAN, IMAGENET_PIXEL_STD),
}
def get_visual_transform(img, img_h, img_w, use_tiling=False, max_num_tiles=1, use_thumbnail=False, augment=False, vision_model_type="clip"):
pixel_mean, pixel_std = pixel_statistics[vision_model_type]
assert not augment, "Image augmentation not implemented."
transform = build_transform(img_h, pixel_mean, pixel_std, vision_model_type)
if use_tiling:
assert img_h == img_w, "dynamic tiling expects equal tile height and width"
imgs = dynamic_preprocess(img, min_num=1, max_num=max_num_tiles, image_size=img_h, use_thumbnail=use_thumbnail)
imgs = [transform(img) for img in imgs]
else:
imgs = [transform(img)]
return imgs
# From https://github.com/OpenGVLab/InternVL/blob/c62fa4f7c850165d7386bdc48ac6bc5a6fab0864/internvl_chat/internvl/train/dataset.py#L685
# Copyright (c) 2023 OpenGVLab.
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
# print(f'width: {width}, height: {height}, best_ratio: {best_ratio}')
return best_ratio
# From https://github.com/OpenGVLab/InternVL/blob/c62fa4f7c850165d7386bdc48ac6bc5a6fab0864/internvl_chat/internvl/train/dataset.py#L702
# Copyright (c) 2023 OpenGVLab.
def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
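# Illustrative example: an 800x400 input with image_size=448 and max_num=6 has
# aspect ratio 2.0, so the closest grid is 2x1; the image is resized to 896x448 and
# split into two 448x448 tiles, and with use_thumbnail=True a third 448x448
# thumbnail of the whole image is appended.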
# Based on https://github.com/openai/CLIP/blob/dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1/clip/clip.py#L79
# and https://github.com/OpenGVLab/InternVL/blob/aa521e6eb1df4cf153aa4118fcf13e673c055d46/internvl_chat/internvl/train/dataset.py#L276
def build_transform(input_size, pixel_mean, pixel_std, vision_model_type):
if vision_model_type in ("siglip", "internvit"):
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=pixel_mean, std=pixel_std)
])
elif vision_model_type == "clip":
transform = Compose([
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.ToTensor(),
T.Normalize(mean=pixel_mean, std=pixel_std),
])
else:
raise NotImplementedError(f"image processing not defined for vision model {vision_model_type}")
return transform
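# Minimal usage sketch (illustrative; the image path is a placeholder):
# from PIL import Image
# img = Image.open("example.jpg")
# tiles = get_visual_transform(
#     img, 448, 448, use_tiling=True, max_num_tiles=6, use_thumbnail=True,
#     augment=False, vision_model_type="siglip",
# )
# "tiles" is a list of [3, 448, 448] tensors: the image tiles plus an optional thumbnail.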
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import warnings
import torch
from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.identity_op import IdentityOp
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.spec_utils import ModuleSpec
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
try:
from megatron.core.extensions.transformer_engine import (
TEColumnParallelLinear,
TEDotProductAttention,
TELayerNormColumnParallelLinear,
TENorm,
TERowParallelLinear,
)
HAVE_TE = True
except ImportError:
HAVE_TE = False
try:
import apex
from megatron.core.fusions.fused_layer_norm import FusedLayerNorm
from megatron.core.transformer.torch_norm import WrappedTorchNorm
HAVE_APEX = True
LNImpl = FusedLayerNorm
except ImportError:
import warnings
from megatron.core.transformer.torch_norm import WrappedTorchNorm
warnings.warn(f'Apex is not installed. Falling back to Torch Norm')
LNImpl = WrappedTorchNorm
def get_layer_spec(is_vit, normalization) -> ModuleSpec:
attn_mask_type = AttnMaskType.no_mask if is_vit else AttnMaskType.causal
if normalization == "LayerNorm":
norm = LNImpl
elif normalization == "RMSNorm":
if HAVE_TE:
norm = TENorm
else:
version = torch.__version__.split('.')
version_geq_2_4 = (
int(version[0]) > 2
or (
int(version[0]) == 2
and int(version[1]) >= 4
)
)
assert version_geq_2_4, "Torch version >= 2.4.0 is required for RMSNorm"
if HAVE_APEX:
warnings.warn(f'Apex does not support RMSNorm. Falling back to Torch Norm')
norm = WrappedTorchNorm
else:
raise RuntimeError("unknown normalization", normalization)
mlp = get_mlp_module_spec(use_te=False) # doesn't include norm.
return ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=norm,
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": attn_mask_type},
submodules=SelfAttentionSubmodules(
linear_qkv=ColumnParallelLinear,
core_attention=DotProductAttention,
linear_proj=RowParallelLinear,
q_layernorm=IdentityOp,
k_layernorm=IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add,
pre_mlp_layernorm=norm,
mlp=mlp,
mlp_bda=get_bias_dropout_add,
),
)
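# Illustrative usage: get_layer_spec(is_vit=True, normalization="LayerNorm") returns a
# TransformerLayer spec built from plain megatron-core modules (no Transformer Engine),
# while get_layer_spec_te below builds the Transformer Engine based equivalent.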
def get_layer_spec_te(is_vit=False) -> ModuleSpec:
attn_mask_type = AttnMaskType.no_mask if is_vit else AttnMaskType.causal
mlp = get_norm_mlp_module_spec_te()
return ModuleSpec(
module=TransformerLayer,
submodules=TransformerLayerSubmodules(
self_attention=ModuleSpec(
module=SelfAttention,
params={"attn_mask_type": attn_mask_type},
submodules=SelfAttentionSubmodules(
linear_qkv=TELayerNormColumnParallelLinear,
core_attention=TEDotProductAttention,
linear_proj=TERowParallelLinear,
q_layernorm=IdentityOp,
k_layernorm=IdentityOp,
),
),
self_attn_bda=get_bias_dropout_add,
pre_mlp_layernorm=IdentityOp,
mlp=mlp,
mlp_bda=get_bias_dropout_add,
),
)
def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec:
# Dense MLP w/ or w/o TE modules.
return ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear,
linear_fc2=TERowParallelLinear if use_te else RowParallelLinear,
),
)
def get_norm_mlp_module_spec_te() -> ModuleSpec:
return ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=TELayerNormColumnParallelLinear, linear_fc2=TERowParallelLinear
),
)
{
"COMMENT": "Sources for these prompts include https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain/viewer and https://huggingface.co/datasets/HuggingFaceM4/M3IT",
"Captioning": {
"raw": [
"Can you briefly explain what you see in the image?",
"Describe what's happening in this image in one short sentence.",
"Write a short caption that accurately represents the content of this image.",
"Please generate a descriptive caption for the image provided.",
"How would you summarize the scene depicted in the picture in short?",
"Describe the image briefly.",
"Write a succinct description of the image, capturing its main components, the relationships between them, and any notable details.",
"Create a concise caption that accurately describes the main elements in the image provided.",
"Write a brief, yet comprehensive, description of the image.",
"Describe the image in a clear and concise manner.",
"For the given image, provide a one-sentence summary that captures the most important details.",
"Generate a short caption for the picture.",
"Write a short and informative description that highlights the primary subjects and actions occurring in the given image.",
"Provide a concise and informative caption for the image, focusing on the primary subjects.",
"Write a clear description of the image, make sure the key features are well covered.",
"Offer a succinct explanation of the picture presented."
]
},
"CaptioningPretraining": {
"raw": [
"Generate a short caption of the image.",
"Describe the image concisely.",
"Provide a brief description of the given image."
],
"llava": [
"Give a brief description of image.",
"Give a brief description of the image.",
"Provide a brief description of the given image.",
"Provide a one-sentence caption for the provided image.",
"Write a terse but informative summary of the picture.",
"Describe the image concisely.",
"Generate a clear and concise summary of the photo."
]
},
"OCR": {
"raw": [
"Can you read the text from image and output here?",
"Extract and document the text from the provided image.",
"Converting the text embedded in this image into a readable document.",
"Transcribe all the text you find.",
"Can you extract all visible text from the image here?"
]
}
}
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import warnings
from copy import deepcopy
import torch
from config import get_language_model_config, get_vision_model_config, get_vision_projection_config
from layer_specs import get_layer_spec, get_layer_spec_te, get_mlp_module_spec, get_norm_mlp_module_spec_te
from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN, LLaVAModel
from megatron.core.models.vision.clip_vit_model import get_num_image_embeddings
from megatron.training import get_args, get_tokenizer, print_rank_0
from megatron.training.arguments import core_transformer_config_from_args
def model_provider(
pre_process=True, post_process=True, add_encoder=True, add_decoder=True, parallel_output=True
) -> LLaVAModel:
"""Builds the model.
Args:
pre_process (bool): Include the embedding layer in the gpt decoder (used with pipeline parallelism). Defaults to True.
post_process (bool): Include an output layer and a layernorm in the gpt decoder (used with pipeline parallelism). Defaults to True.
add_encoder (bool): Construct the encoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the encoder
will live on only a subset of the pipeline stages (specifically, only the first stage).
add_decoder (bool): Construct the decoder module (used with pipeline parallelism). Defaults to True. When we use pipelining, the decoder
will live on only a subset of the pipeline stages (specifically, every stage after the first one).
parallel_output (bool): Enable parallel model output.
Returns:
model: A multimodal model.
"""
args = get_args()
assert args.ckpt_format == 'torch', "Only ckpt-format torch is supported for VLM training currently."
assert args.encoder_pipeline_model_parallel_size <= 1, "LLaVA does not support pp>1 for encoder on its own pipeline rank"
use_te = args.use_te
print_rank_0('building a multimodal model ...')
num_image_embeddings = get_num_image_embeddings(
args.img_h,
args.img_w,
args.patch_dim,
args.vision_model_type,
args.disable_vision_class_token,
1,
args.pixel_shuffle,
args.use_tile_tags,
)
old_seq_length = args.seq_length
args.seq_length = args.encoder_seq_length = num_image_embeddings
if torch.distributed.get_rank() == 0 and old_seq_length != args.seq_length:
warnings.warn(
f"Changed seq_length and encoder_seq_length (vision model sequence length) from {old_seq_length} to num_image_tokens ({num_image_embeddings})"
)
max_num_image_embeddings = (args.max_num_tiles + int(args.use_thumbnail)) * num_image_embeddings
assert (
args.decoder_seq_length is not None
), "Please provide --decoder-seq-length to set the language model sequence length"
assert (
args.decoder_seq_length > max_num_image_embeddings
), "Language model sequence length must be greater than the maximum number of image embeddings"
if args.decoder_seq_length > args.max_position_embeddings:
args.max_position_embeddings = args.decoder_seq_length
warnings.warn(
f"Expanded max_position_embeddings to {args.max_position_embeddings} to accommodate the maximum language model sequence length"
)
base_config = core_transformer_config_from_args(get_args())
base_config.language_model_type = args.language_model_type
base_config.vision_model_type = args.vision_model_type
base_config.calculate_per_token_loss = True
language_config = deepcopy(base_config)
language_config = get_language_model_config(language_config)
if use_te:
language_transformer_layer_spec = get_layer_spec_te(
is_vit=False
) # TENorm detects LayerNorm/RMS automatically.
else:
language_transformer_layer_spec = get_layer_spec(
is_vit=False, normalization=language_config.normalization
)
vision_config = deepcopy(base_config)
vision_config = get_vision_model_config(
vision_config, apply_query_key_layer_scaling=args.apply_query_key_layer_scaling
)
vision_model_type = args.vision_model_type
if vision_model_type in ["clip", "siglip"]:
if use_te:
vision_transformer_layer_spec = get_layer_spec_te(
is_vit=True
) # TENorm detects LayerNorm/RMS automatically.
else:
vision_transformer_layer_spec = get_layer_spec(
is_vit=True, normalization=vision_config.normalization
)
elif vision_model_type == "internvit":
from nvlm.internvit import get_internvit_layer_spec
vision_transformer_layer_spec = get_internvit_layer_spec(use_te=use_te)
else:
raise RuntimeError("unsupported vision model type", vision_model_type)
vision_projection_config = deepcopy(base_config)
vision_projection_config = get_vision_projection_config(
vision_projection_config, language_config.hidden_size
)
# --encoder-pipeline-model-parallel-size 1 will enable a separate pipeline stage for the vision model.
if args.encoder_pipeline_model_parallel_size > 0:
assert (
args.encoder_pipeline_model_parallel_size == 1
), "vision model and projection can only live on 1 pipeline stage."
if args.encoder_tensor_model_parallel_size > 0:
vision_config.tensor_model_parallel_size = args.encoder_tensor_model_parallel_size
vision_projection_config.tensor_model_parallel_size = (
args.encoder_tensor_model_parallel_size
)
# Make sure vision model pipeline parallel size is not inherited from the language model pipeline parallel size.
# 0 is not a valid value for the config, hence max(1, ...).
vision_config.pipeline_model_parallel_size = max(1, args.encoder_pipeline_model_parallel_size)
vision_projection_config.pipeline_model_parallel_size = vision_config.pipeline_model_parallel_size
# Make sure the vision model does not inherit first and last pipeline num layers from the language model.
vision_config.first_pipeline_num_layers = vision_config.last_pipeline_num_layers = None
if vision_projection_config.normalization:
vision_projection_layer_spec = get_norm_mlp_module_spec_te().submodules
else:
vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te).submodules
# Toggle --recompute* for the vision and language model separately.
if args.recompute_vision:
if vision_config.recompute_method is not None and vision_config.recompute_granularity is not None:
vision_config.recompute_num_layers = vision_config.num_layers
else:
vision_config.recompute_granularity = None
vision_config.recompute_method = None
vision_config.recompute_num_layers = None
vision_projection_config.recompute_granularity = None
vision_projection_config.recompute_method = None
vision_projection_config.recompute_num_layers = None
tokenizer = get_tokenizer()
image_token_index = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
tile_tags = _get_tile_tags(args, tokenizer)
model = LLaVAModel(
language_transformer_config=language_config,
language_transformer_layer_spec=language_transformer_layer_spec,
language_vocab_size=args.padded_vocab_size,
language_max_sequence_length=args.decoder_seq_length,
vision_transformer_config=vision_config,
vision_transformer_layer_spec=vision_transformer_layer_spec,
drop_vision_class_token=args.disable_vision_class_token,
vision_projection_config=vision_projection_config,
vision_projection_layer_spec=vision_projection_layer_spec,
vision_projection_type="mlp",
allow_missing_vision_projection_checkpoint=args.allow_missing_vision_projection_checkpoint,
parallel_output=parallel_output,
language_position_embedding_type=args.position_embedding_type,
language_rotary_percent=args.rotary_percent,
pre_process=pre_process,
post_process=post_process,
add_encoder=add_encoder,
add_decoder=add_decoder,
img_h=args.img_h,
img_w=args.img_w,
patch_dim=args.patch_dim,
language_rotary_base=args.rotary_base,
language_rope_scaling=args.use_rope_scaling,
image_token_index=image_token_index,
pixel_shuffle=args.pixel_shuffle,
tile_tags=tile_tags,
)
model.freeze(
freeze_language_model=args.freeze_LM,
freeze_vision_model=args.freeze_ViT,
freeze_vision_projection=False,
)
return model
def _get_tile_tags(args, tokenizer):
"""Tile tags are used in NVLM to surround image tiles with text tags."""
if not args.use_tile_tags:
return None
# We expect all tags to have the same tokenized length.
thumbnail_tag_text = "<tile_global_thumbnail>"
if args.tokenizer_prompt_format == "nvlm-yi-34b":
thumbnail_tag_text = "<tile_global>"
assert args.max_num_tiles <= 6, "Up to 6 tile tags used"
tile_tags_text = [f"<tile_{i}>" for i in range(1, args.max_num_tiles + 1)] + [thumbnail_tag_text]
start_idx = 0
if tokenizer._prompt_config.has_bos:
start_idx = 1
# Convert to tokens [num_tiles, tile_seq_len].
tile_tags = [tokenizer.tokenize(t)[start_idx:] for t in tile_tags_text]
return tile_tags
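# For example, with --max-num-tiles 4 and the default prompt format, the tag texts are
# ["<tile_1>", "<tile_2>", "<tile_3>", "<tile_4>", "<tile_global_thumbnail>"], and each
# entry of the returned list holds that tag's token ids (with any BOS token stripped).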
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import argparse
import os
import torch
import clip
def convert(download_root, output_path, tensor_parallel_size, use_te):
device = "cuda"
model, _ = clip.load("ViT-L/14@336px", device=device, download_root=download_root)
state_dict = model.state_dict()
new_state_dicts = [{"model": dict()} for _ in range(tensor_parallel_size)]
# Indices from mapping pytorch multihead attention to megatron.
kv_channels = 64
hidden_dim = 1024
num_heads = 16
indices = []
for i in range(num_heads):
lb = i * kv_channels
ub = (i + 1) * kv_channels
indices.append(torch.arange(lb, ub, dtype=torch.int))
indices.append(torch.arange(hidden_dim + lb, hidden_dim + ub, dtype=torch.int))
indices.append(torch.arange(2 * hidden_dim + lb, 2 * hidden_dim + ub, dtype=torch.int))
indices = torch.cat(indices)
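# The concatenated index tensor permutes the rows of PyTorch's stacked [Q; K; V]
# in_proj weight/bias into the per-head interleaved [q_i, k_i, v_i] layout expected by
# megatron's fused linear_qkv.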
for name, tensor in state_dict.items():
# Skip text model.
if "visual" not in name:
continue
# Skip final layers not used in our model.
if name == "visual.proj" or "ln_post" in name:
continue
# Map parameter names to ones used in megatron.
new_name = ""
new_tensor = tensor
if new_tensor.dtype == torch.float16:
new_tensor = new_tensor.to(torch.float32)
# This is used for chunking some tensors to target tensor parallel size.
chunk_dim = None
if "class_embedding" in name:
new_name = "class_token"
# Our model uses a class token that is already expanded to the input dimensions.
new_tensor = new_tensor.expand(1, 1, -1)
elif "positional_embedding" in name:
new_name = "position_embeddings.weight"
elif "conv1" in name:
new_name = "conv1.weight"
elif "ln_pre.weight" in name:
new_name = "ln_pre.weight"
elif "ln_pre.bias" in name:
new_name = "ln_pre.bias"
elif "transformer.resblocks" in name:
layer_idx = name.split(".")[3]
base = f"decoder.layers.{layer_idx}"
if "attn.in_proj_weight" in name:
new_name = f"{base}.self_attention.linear_qkv.weight"
new_tensor = new_tensor[indices]
chunk_dim = 0
elif "attn.in_proj_bias" in name:
new_name = f"{base}.self_attention.linear_qkv.bias"
new_tensor = new_tensor[indices]
chunk_dim = 0
elif "attn.out_proj.weight" in name:
new_name = f"{base}.self_attention.linear_proj.weight"
chunk_dim = 1
elif "attn.out_proj.bias" in name:
new_name = f"{base}.self_attention.linear_proj.bias"
elif "ln_1.weight" in name:
new_name = f"{base}.input_layernorm.weight"
if use_te:
new_name = f"{base}.self_attention.linear_qkv.layer_norm_weight"
elif "ln_1.bias" in name:
new_name = f"{base}.input_layernorm.bias"
if use_te:
new_name = f"{base}.self_attention.linear_qkv.layer_norm_bias"
elif "mlp.c_fc.weight" in name:
new_name = f"{base}.mlp.linear_fc1.weight"
chunk_dim = 0
elif "mlp.c_fc.bias" in name:
new_name = f"{base}.mlp.linear_fc1.bias"
chunk_dim = 0
elif "mlp.c_proj.weight" in name:
new_name = f"{base}.mlp.linear_fc2.weight"
chunk_dim = 1
elif "mlp.c_proj.bias" in name:
new_name = f"{base}.mlp.linear_fc2.bias"
elif "ln_2.weight" in name:
new_name = f"{base}.pre_mlp_layernorm.weight"
if use_te:
new_name = f"{base}.mlp.linear_fc1.layer_norm_weight"
elif "ln_2.bias" in name:
new_name = f"{base}.pre_mlp_layernorm.bias"
if use_te:
new_name = f"{base}.mlp.linear_fc1.layer_norm_bias"
assert new_name != "", f"unexpected layer name {name}"
if chunk_dim is None:
new_tensors = [new_tensor for _ in range(tensor_parallel_size)]
else:
new_tensors = torch.chunk(new_tensor, tensor_parallel_size, dim=chunk_dim)
for i in range(tensor_parallel_size):
# chunk() creates a view of a bigger tensor. clone() is used here to avoid excessive storage.
new_state_dicts[i]["model"][new_name] = new_tensors[i].clone()
# TE sets _extra_state (for FP8 purposes), so set an empty one here for compatibility.
extra_state_layers = ("linear_qkv", "linear_proj", "linear_fc1", "linear_fc2")
is_extra_state_layer = any([l in new_name for l in extra_state_layers])
if use_te and is_extra_state_layer:
layer = new_name.split(".")[-2]
if layer in extra_state_layers:
extra_state_name = (
new_name[: new_name.rfind(".") + 1] + "_extra_state"
) # Replace the weight name.
new_state_dicts[i]["model"][extra_state_name] = None
for i in range(tensor_parallel_size):
output_dir_tp = os.path.join(output_path, "iter_0000001", f"mp_rank_0{i}")
os.makedirs(output_dir_tp)
output_path_tp = os.path.join(output_dir_tp, "model_optim_rng.pt")
torch.save(new_state_dicts[i], output_path_tp)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="""
Convert OpenAI CLIP VIT weights to megatron format.
Example usage:
python clip_converter.py --download-root /some/download/folder --output /some/output/folder --tensor-parallel-size 4
""",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"--download-root", type=str, required=True, help="Download folder for OpenAI CLIP weights"
)
parser.add_argument(
"--output", type=str, required=True, help="output directory for megatron state dict file(s)"
)
parser.add_argument(
"--tensor-parallel-size", type=int, default=1, help="model tensor parallel size"
)
parser.add_argument("--use-te", action="store_true", help="Use Transformer Engine")
args = parser.parse_args()
convert(args.download_root, args.output, args.tensor_parallel_size, args.use_te)
print("done.")
import argparse
import os
import torch
from transformers import AutoModel
def convert(model_name, output_path, tensor_parallel_size, use_te):
"""Convert InternViT HF checkpoint to mcore."""
hf_model = AutoModel.from_pretrained(
model_name,
trust_remote_code=True
)
hf_state_dict = hf_model.state_dict()
new_state_dicts = [{"model": dict()} for _ in range(tensor_parallel_size)]
hidden_size = 3200
num_heads = 25
dim = 128
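# Build a row permutation that regroups the HF checkpoint's stacked [Q; K; V] qkv weight
# (each block num_heads * dim rows) into the per-head interleaved [q_j, k_j, v_j] layout
# used by megatron's fused linear_qkv.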
order = torch.ones(3 * hidden_size).long()
for j in range(num_heads):
for i in range(dim):
order[i + dim*3*j] = j*dim+i
order[dim + i + dim*3*j] = j*dim+i+num_heads*dim
order[dim*2 + i + dim*3*j] = j*dim+i+num_heads*dim*2
for name, tensor in hf_state_dict.items():
# Map parameter names to ones used in megatron.
new_name = ""
new_tensor = tensor
# This is used for chunking some tensors to target tensor parallel size.
chunk_dim = None
if "embeddings.class_embedding" in name:
new_name = "class_token"
elif "embeddings.patch_embedding.weight" in name:
new_name = "conv1.weight"
elif "embeddings.patch_embedding.bias" in name:
new_name = "conv1.bias"
elif "embeddings.position_embedding" in name:
new_name = "position_embeddings.weight"
new_tensor = new_tensor.squeeze(0)
elif "encoder.layers" in name:
layer_idx = name.split(".")[2]
base = f"decoder.layers.{layer_idx}"
head_dim = 128
if tensor_parallel_size == 1:
num_padded_heads = 25
elif tensor_parallel_size == 8:
# Note: 25 is not divisible by 8 and we don't currently support uneven heads split with tensor parallelism.
# So we pad with dummy all-zero heads. Please use a nice even number of attention heads in your model.
num_padded_heads = 32
else:
raise NotImplementedError("invalid tensor parallel size value:", tensor_parallel_size)
if "ls1" in name:
new_name = f"{base}.ls1"
elif "ls2" in name:
new_name = f"{base}.ls2"
elif "attn.qkv.weight" in name:
new_name = f"{base}.self_attention.linear_qkv.weight"
num_tensors = 3
padded_dim = head_dim * num_padded_heads * num_tensors
padded_tensor = torch.zeros((padded_dim, new_tensor.shape[-1]), dtype=new_tensor.dtype, device=new_tensor.device)
padded_tensor[:new_tensor.shape[0], :] = new_tensor[order]
new_tensor = padded_tensor
chunk_dim = 0
elif "attn.q_norm.weight" in name:
new_name = f"{base}.self_attention.q_layernorm.weight"
num_tensors = 1
padded_dim = head_dim * num_padded_heads * num_tensors
padded_tensor = torch.zeros(padded_dim, dtype=new_tensor.dtype, device=new_tensor.device)
padded_tensor[:new_tensor.shape[0]] = new_tensor
new_tensor = padded_tensor
chunk_dim = 0
elif "attn.k_norm.weight" in name:
new_name = f"{base}.self_attention.k_layernorm.weight"
num_tensors = 1
padded_dim = head_dim * num_padded_heads * num_tensors
padded_tensor = torch.zeros(padded_dim, dtype=new_tensor.dtype, device=new_tensor.device)
padded_tensor[:new_tensor.shape[0]] = new_tensor
new_tensor = padded_tensor
chunk_dim = 0
elif "attn.proj.weight" in name:
new_name = f"{base}.self_attention.linear_proj.weight"
num_tensors = 1
padded_dim = head_dim * num_padded_heads * num_tensors
padded_tensor = torch.zeros((new_tensor.shape[0], padded_dim), dtype=new_tensor.dtype, device=new_tensor.device)
padded_tensor[:, :new_tensor.shape[-1]] = new_tensor
new_tensor = padded_tensor
chunk_dim = 1
elif "attn.proj.bias" in name:
new_name = f"{base}.self_attention.linear_proj.bias"
elif "mlp.fc1.weight" in name:
new_name = f"{base}.mlp.linear_fc1.weight"
chunk_dim = 0
elif "mlp.fc1.bias" in name:
new_name = f"{base}.mlp.linear_fc1.bias"
chunk_dim = 0
elif "mlp.fc2.weight" in name:
new_name = f"{base}.mlp.linear_fc2.weight"
chunk_dim = 1
elif "mlp.fc2.bias" in name:
new_name = f"{base}.mlp.linear_fc2.bias"
elif "norm1" in name:
new_name = f"{base}.input_layernorm.weight"
elif "norm2" in name:
new_name = f"{base}.pre_mlp_layernorm.weight"
else:
raise RuntimeError("unexpected transformer layer name", name)
else:
raise RuntimeError("unexpected layer name", name)
assert new_name != "", f"unexpected layer name {name}"
# TE sets _extra_state (for FP8 purposes), so set an empty one here for compatibility.
extra_state_layers = ("linear_qkv", "linear_proj", "linear_fc1", "linear_fc2")
is_extra_state_layer = any([l in new_name for l in extra_state_layers])
if use_te and is_extra_state_layer:
layer = new_name.split(".")[-2]
if layer in extra_state_layers:
extra_state_name = (
new_name[: new_name.rfind(".") + 1] + "_extra_state"
) # Replace the weight name.
for i in range(tensor_parallel_size):
new_state_dicts[i]["model"][extra_state_name] = None
if chunk_dim is None:
new_tensors = [new_tensor for _ in range(tensor_parallel_size)]
else:
new_tensors = torch.chunk(new_tensor, tensor_parallel_size, dim=chunk_dim)
for i in range(tensor_parallel_size):
new_state_dicts[i]["model"][new_name] = new_tensors[i].clone()
for i in range(tensor_parallel_size):
output_dir_tp = os.path.join(output_path, f"iter_0000001/mp_rank_0{i}")
os.makedirs(output_dir_tp, exist_ok=True)
output_path_tp = os.path.join(output_dir_tp, "model_optim_rng.pt")
torch.save(new_state_dicts[i], output_path_tp)
print("saved file", output_path_tp)
print("done")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="InternVIT HuggingFace to Mcore converter")
parser.add_argument("--model-name", type=str, default="OpenGVLab/InternViT-6B-448px-V1-5", help="Model name in HuggingFace")
parser.add_argument("--output-dir", type=str, required=True, help="Output directory for the mcore model.")
parser.add_argument("--use-te", action="store_true", default=True)
parser.add_argument("--tensor-parallel-size", type=int, required=True)
args = parser.parse_args()
convert(args.model_name, args.output_dir, args.tensor_parallel_size, args.use_te)
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import argparse
import os
from transformers import PaliGemmaForConditionalGeneration
import torch
def convert(output_path, tensor_parallel_size, use_te):
device = "cuda"
model_id = "google/paligemma-3b-pt-448"
model = PaliGemmaForConditionalGeneration.from_pretrained(model_id).eval()
model = model.to(device)
print(model.config)
for name, tensor in model.state_dict().items():
if "vision_model" not in name:
continue
shape_str = "(" + ", ".join([str(x) for x in tensor.shape]) + ")"
print(f"{name:<75} {shape_str:>20}")
state_dict = model.state_dict()
new_state_dicts = [{"model": dict()} for _ in range(tensor_parallel_size)]
def add_chunck_tensor(new_tensor, new_name, chunk_dim=None):
if chunk_dim is None:
new_tensors = [new_tensor for _ in range(tensor_parallel_size)]
else:
new_tensors = torch.chunk(new_tensor, tensor_parallel_size, dim=chunk_dim)
for i in range(tensor_parallel_size):
# chunk() creates a view of a bigger tensor. clone() is used here to avoid excessive storage.
new_state_dicts[i]["model"][new_name] = new_tensors[i].clone()
# TE sets _extra_state (for FP8 purposes), so set an empty one here for compatibility.
extra_state_layers = ("linear_qkv", "linear_proj", "linear_fc1", "linear_fc2")
is_extra_state_layer = any([l in new_name for l in extra_state_layers])
if use_te and is_extra_state_layer:
layer = new_name.split(".")[-2]
if layer in extra_state_layers:
extra_state_name = (
new_name[: new_name.rfind(".") + 1] + "_extra_state"
) # Replace the weight name.
new_state_dicts[i]["model"][extra_state_name] = None
for name, tensor in state_dict.items():
if tensor.dtype == torch.float16:
state_dict[name] = tensor.to(torch.float32)
add_chunck_tensor(
state_dict["vision_tower.vision_model.embeddings.position_embedding.weight"],
"position_embeddings.weight")
add_chunck_tensor(
state_dict["vision_tower.vision_model.embeddings.patch_embedding.weight"],
"conv1.weight")
add_chunck_tensor(
state_dict["vision_tower.vision_model.embeddings.patch_embedding.bias"],
"conv1.bias")
head_dim = 72
num_head = 16
for layer_idx in range(27):
origin_base = f"vision_tower.vision_model.encoder.layers.{layer_idx}"
target_base = f"decoder.layers.{layer_idx}"
for param_type in ["weight", "bias"]:
# QKV
q_proj_params = state_dict[f"{origin_base}.self_attn.q_proj.{param_type}"]
k_proj_params = state_dict[f"{origin_base}.self_attn.k_proj.{param_type}"]
v_proj_params = state_dict[f"{origin_base}.self_attn.v_proj.{param_type}"]
# Do some tensor manipulation because megatron expects a single fused QKV
# projection tensor in the order
# [(Q1, K1, V1), (Q2, K2, V2), ...] where Qi is the query projection of the
# i-th head, each slice having head_dim rows.
new_tensor = torch.concatenate([
q_proj_params.view(num_head, head_dim, -1),
k_proj_params.view(num_head, head_dim, -1),
v_proj_params.view(num_head, head_dim, -1)], axis=1).view(
3*head_dim*num_head, -1)
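# Sanity check on shapes: with num_head=16 and head_dim=72 the per-head views are (16, 72, hidden);
# concatenating Q/K/V along dim 1 and flattening yields a (3*16*72, hidden) = (3456, hidden) weight
# (the bias becomes a length-3456 vector after taking column 0 below).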
if param_type == "bias":
new_tensor = new_tensor[:, 0]
new_name = f"{target_base}.self_attention.linear_qkv.{param_type}"
add_chunck_tensor(new_tensor, new_name, chunk_dim=0)
# linear_proj
add_chunck_tensor(
state_dict[f"{origin_base}.self_attn.out_proj.{param_type}"],
f"{target_base}.self_attention.linear_proj.{param_type}",
chunk_dim=1 if param_type == "weight" else None)
# layer_norm
new_name = f"{target_base}.input_layernorm.{param_type}"
if use_te:
new_name = f"{target_base}.self_attention.linear_qkv.layer_norm_{param_type}"
add_chunck_tensor(
state_dict[f"{origin_base}.layer_norm1.{param_type}"],
new_name)
# FC 1
add_chunck_tensor(
state_dict[f"{origin_base}.mlp.fc1.{param_type}"],
f"{target_base}.mlp.linear_fc1.{param_type}",
chunk_dim=0)
# FC 2
add_chunck_tensor(
state_dict[f"{origin_base}.mlp.fc2.{param_type}"],
f"{target_base}.mlp.linear_fc2.{param_type}",
chunk_dim=1 if param_type=="weight" else None)
# layer_norm
new_name = f"{target_base}.pre_mlp_layernorm.{param_type}"
if use_te:
new_name = f"{target_base}.mlp.linear_fc1.layer_norm_{param_type}"
add_chunck_tensor(
state_dict[f"{origin_base}.layer_norm2.{param_type}"],
new_name)
add_chunck_tensor(
state_dict["vision_tower.vision_model.post_layernorm.weight"],
"ln_post.weight")
add_chunck_tensor(
state_dict["vision_tower.vision_model.post_layernorm.bias"],
"ln_post.bias")
for i in range(tensor_parallel_size):
output_dir_tp = os.path.join(output_path, "iter_0000001", f"mp_rank_0{i}")
os.makedirs(output_dir_tp, exist_ok=True)
output_path_tp = os.path.join(output_dir_tp, "model_optim_rng.pt")
torch.save(new_state_dicts[i], output_path_tp)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="""
Convert SigLIP weights to megatron format.
Example usage:
python siglip_converter.py --tensor-parallel-size 4 --output google_paligemma_3b_pt_44_mcore_tp_4 --use-te
examples/multimodal/combine_mistral_clip.sh Mistral-7B-Instruct-v0.3-mcore-tp4 google_paligemma_3b_pt_44_mcore_tp_4 mistral_7b_instruct_v0p3_google_paligemma_3b_pt_44_mcore_tp_4
""",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"--output", type=str, required=True, help="output directory for megatron state dict file(s)"
)
parser.add_argument(
"--tensor-parallel-size", type=int, default=1, help="model tensor parallel size"
)
parser.add_argument("--use-te", action="store_true", help="Use Transformer Engine")
args = parser.parse_args()
convert(args.output, args.tensor_parallel_size, args.use_te)
print("done.")
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import argparse
import os
import sys
# Add megatron and the multimodal example to the path.
sys.path.append(
os.path.abspath(
os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir, os.path.pardir)
)
)
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir)))
import torch
from transformers import AutoModel
from examples.multimodal.model import model_provider
from examples.multimodal.multimodal_args import add_multimodal_extra_args
from megatron.training import get_model
from megatron.training.checkpointing import load_checkpoint
from megatron.training.initialize import initialize_megatron
def run_mcore_vision(model_path):
"""Run mcore vision model."""
os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "1"
# Megatron has some mandatory flags.
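# The language-model sizes below (num-layers, hidden-size, vocab-size, etc.) are tiny placeholders:
# only the vision model is constructed and loaded here, and its architecture is selected by --vision-model-type.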
sys.argv = [
"ignore_me.py",
"--micro-batch-size=1",
"--num-layers=2",
"--vision-model-type=internvit",
"--language-model-type=mistral_7b",
"--tokenizer-prompt-format=mistral",
"--tokenizer-type=MultimodalTokenizer",
"--tokenizer-model=mistralai/Mistral-7B-Instruct-v0.3",
"--vocab-size=1024",
"--hidden-size=64",
"--num-attention-heads=8",
"--seq-length=1024",
"--decoder-seq-length=2048",
"--max-position-embeddings=2048",
"--bf16",
"--img-h=448",
"--img-w=448",
"--patch-dim=14",
"--tensor-model-parallel-size=8",
"--use-te",
f"--pretrained-checkpoint={model_path}",
]
initialize_megatron(extra_args_provider=add_multimodal_extra_args)
def wrapped_model_provider(pre_process, post_process):
return model_provider(pre_process, post_process, parallel_output=False)
# Set up model and load checkpoint.
model = get_model(wrapped_model_provider, wrap_with_ddp=False)
vision_model = model[0].module.vision_model
load_checkpoint([vision_model], None, None)
vision_model.eval()
images = torch.ones((1, 3, 448, 448), dtype=torch.bfloat16, device="cuda")
output = vision_model(images)
return output
def run_hf_vision(model_name):
"""Run HF vision model."""
model = (
AutoModel.from_pretrained(model_name, torch_dtype=torch.bfloat16, trust_remote_code=True)
.cuda()
.eval()
)
images = torch.ones((1, 3, 448, 448), dtype=torch.bfloat16, device="cuda")
outputs = model(images, return_dict=True)
return outputs
def main(mcore_model, hf_model):
"""Compare vision model outputs between mcore and HF given the same fixed input."""
mcore = run_mcore_vision(mcore_model)
if torch.distributed.get_rank() == 0:
hf = run_hf_vision(hf_model)
hf = hf["last_hidden_state"]
# Compare logits. Due to different attention implementations and other details,
# there will be numerical differences.
diff = (mcore - hf).abs()
mean_diff = diff.mean().item()
max_diff = diff.max().item()
print(f"mean diff {mean_diff}, max diff {max_diff}")
assert mean_diff < 0.1, "mean output difference is greater than expected"
assert max_diff < 50, "max output difference is greater than expected"
print("lgtm")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Check mcore vision model output vs. HF numerically.",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"--mcore-model", type=str, required=True, help="directory for mcore model weights"
)
parser.add_argument("--hf-model", type=str, required=True, help="Model name in HF")
args = parser.parse_args()
main(args.mcore_model, args.hf_model)
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
from megatron.core.models.multimodal.llava_model import IMAGE_TOKEN
def add_multimodal_extra_args(parser):
"""Extra arguments."""
group = parser.add_argument_group(title='multimodal arguments')
group.add_argument('--dataset-config', type=str, default=None)
group.add_argument("--prompt-path", type=str, default=None)
group.add_argument('--freeze-LM', action='store_true', default=False)
group.add_argument('--freeze-ViT', action='store_true', default=False)
group.add_argument('--language-model-type', type=str, required=True)
group.add_argument('--vision-model-type', type=str, default="clip")
group.add_argument("--disable-vision-class-token", action="store_true", default=False)
group.add_argument(
"--allow-missing-vision-projection-checkpoint", action="store_true", default=False
)
group.add_argument("--use-te", action="store_true", default=False)
group.add_argument(
"--dataloader-save", type=str, default=None, help="Energon dataloader state save path"
)
group.add_argument(
"--use-tiling", action="store_true", default=False, help="Use input image tiling"
)
group.add_argument("--max-num-tiles", type=int, default=1, help="Maximum number of image tiles")
group.add_argument(
"--use-thumbnail", action="store_true", default=False, help="Add image thumbnail as a tile"
)
group.add_argument(
"--dataloader-seq-length",
type=int,
help="Make dataloader to produce sequences of specific length.",
)
group.add_argument(
"--num-frames",
type=int,
default=1,
help="Number of frames to regularly sample from the video as input to the model.",
)
group.add_argument(
"--online-evaluation-config", type=str, help="Config file for online evaluation."
)
group.add_argument(
"--special-tokens",
nargs="*",
default=[IMAGE_TOKEN],
help="Special tokens used in the multimodal model",
)
group.add_argument(
"--tokenizer-prompt-format",
type=str,
choices=["mistral", "llama3", "chatml", "nvlm-yi-34b", "qwen2p0", "qwen2p5"],
required=True,
help="Prompt format to use with the tokenizer.",
)
group.add_argument("--pixel-shuffle", action="store_true", default=False)
group.add_argument(
"--image-tag-type",
type=str,
choices=["nvlm", "internvl", ""],
default="", # Default: Image tag not used.
help="Surround image tokens with tags.",
)
group.add_argument("--use-tile-tags", action="store_true", default=False, help="Use tile tags")
group.add_argument(
"--packing-buffer-size",
type=int,
default=None, # Packing is disabled by default.
help="Enable sample packing by setting the buffer size to > 0",
)
group.add_argument(
"--packing-seq-length", type=int, default=0, help="Packing sequence length. Must be > 0 if using packing."
)
group.add_argument(
"--recompute-vision", action="store_true", default=False, help="Enable activation checkpointing in the vision model"
)
return parser
NVLM
====
Please refer to the [NVLM paper](https://arxiv.org/pdf/2409.11402) for details.
*NOTE: VLMs in Megatron are under active development and are expected to change.*
# Checkpoints
NVLM 1.0 model weights are publicly available in HuggingFace and Megatron format.
- NVLM-1.0-D 72B [HuggingFace version](https://huggingface.co/nvidia/NVLM-D-72B)
- NVLM-1.0-D 72B [Megatron-Core version](https://huggingface.co/nvidia/NVLM-D-72B-mcore)
# Setup
## Docker image
Please use `examples/multimodal/Dockerfile`.
## Dataset preparation
Please refer to Tables 4 and 6 in the [NVLM paper](https://arxiv.org/pdf/2409.11402) for the full list of pretraining and SFT datasets.
Please refer to https://nvidia.github.io/Megatron-Energon/data_prep.html for instructions on preparing datasets in the Megatron Energon format.
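For example, assuming the `energon` CLI that ships with the Megatron-Energon package, a dataset stored as WebDataset shards can typically be prepared interactively with:
```
energon prepare <path to dataset>
```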
## Model conversion
### Vision model
NVLM 1.0 models use [OpenGVLab/InternViT-6B-448px-V1-5](https://huggingface.co/OpenGVLab/InternViT-6B-448px-V1-5) from HuggingFace.
Please download it and run the following command to convert it to Megatron format.
```
python examples/multimodal/model_converter/internvit_converter.py --output-dir <some output dir> --use-te --tensor-parallel-size 8
```
### 34B Language model
NVLM 1.0 34B starts from [NousResearch/Nous-Hermes-2-Yi-34B](https://huggingface.co/NousResearch/Nous-Hermes-2-Yi-34B) from HuggingFace.
Please download it and run the following command to convert it to Megatron format.
```
python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \
--load-dir <hf model directory> --save-dir <output dir> --tokenizer-model <hf model name/directory> \
--saver-transformer-impl transformer_engine --model-size yi-34B --make-vocab-size-divisible-by 1
```
### 72B Language model
NVLM 1.0 72B starts from [Qwen/Qwen2-72B-Instruct](https://huggingface.co/Qwen/Qwen2-72B-Instruct) from HuggingFace.
Please download it and run the following command to convert it to Megatron format.
```
python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \
--load-dir <hf model directory> --save-dir <output directory> --tokenizer-model <hf model name/directory> \
--saver-transformer-impl transformer_engine --model-size qwen2.5-72Bf
```
### Combined checkpoint
Combine the vision model checkpoint from [InternViT](#vision-model) with the [34B](#34b-language-model) or [72B](#72b-language-model) language model by running:
```
examples/multimodal/combine_lm_vision_checkpoints.sh <language model directory> <vision model directory> <output directory> nvlm
```
# Training
## 34B
1. Pretraining: please run `examples/multimodal/nvlm/pretrain_yi_34b_internvit_6b.sh`. Please use the InternViT + 34B [combined checkpoint](#combined-checkpoint) and tokenizer from HuggingFace.
2. SFT: please run `examples/multimodal/nvlm/sft_34b_internvit.sh` using the checkpoint from 1.
## 72B
1. Pretraining: please run `examples/multimodal/nvlm/pretrain_qwen20_72b_internvit_6b.sh`. Please use the InternViT + 72B [combined checkpoint](#combined-checkpoint) and tokenizer from HuggingFace.
2. Convert the pretraining checkpoint from 1. to have pipeline parallel size = 4 for SFT. Please run
```
python examples/multimodal/nvlm/pp_checkpoint_converter.py --input <pretrained checkpoint directory> \
--input-pipeline-parallel 1 --output <some output dir> --output-pipeline-parallel 4 \
--tensor-parallel 8
```
3. SFT: please run `examples/multimodal/nvlm/sft_qwen20_72b_internvit_6b.sh` using the checkpoint from 2.
4. To convert the checkpoint with pipeline parallel size = 4 back to 1 for evaluation, please run
```
python examples/multimodal/nvlm/pp_checkpoint_converter.py --input <sft checkpoint directory> \
--input-pipeline-parallel 4 --output <some output dir> --output-pipeline-parallel 1 \
--tensor-parallel 8
```
# Evaluation
Run the text generation script.
- 34B
```
examples/multimodal/nvlm/run_text_generation_yi_34b_internvit_6b.sh --input-image-path /path/to/input/images --output-path /some/output/directory \
--model-path /path/to/model.pt --gt-path /path/to/groundtruth/file --task generation-task-name --use-tiling
```
- 72B
```
examples/multimodal/nvlm/run_text_generation_qwen20_72b_internvit_6b.sh --input-image-path /path/to/input/images --output-path /some/output/directory \
--model-path /path/to/model.pt --gt-path /path/to/groundtruth/file --task generation-task-name --use-tiling
```
where `--task generation-task-name` is the name of the evaluation benchmark such as `captioning`, `MMMU` or `TextVQA`.
Then, run one of the evaluation scripts from `examples/multimodal`. For example
```
python examples/multimodal/evaluate_mmmu.py --input-path /output/directory/from/generation
```
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
""""
NOTE: NVLM uses InternViT with tensor parallel (TP) size = 8.
Since InternViT has 25 attention heads and Megatron currently requires the number of attention heads
to be divisible by the TP size, we add 7 dummy zero attention heads to have 32 attention heads.
This workaround requires some changes to how we compute RMSNorm, Attention etc.
Additionally, InternViT introduces some unique features like Layer Scaling.
Those code changes are gathered here.
"""
from functools import partial
from typing import Dict
import torch
from megatron.core.dist_checkpointing.mapping import ShardedStateDict
from megatron.core.extensions.transformer_engine import (
TEColumnParallelLinear,
TEDotProductAttention,
TERowParallelLinear,
)
from megatron.core.parallel_state import (
get_tensor_model_parallel_group,
get_tensor_model_parallel_rank,
get_tensor_model_parallel_world_size,
)
from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear
from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules
from megatron.core.transformer.dot_product_attention import DotProductAttention
from megatron.core.transformer.enums import AttnMaskType
from megatron.core.transformer.mlp import MLP, MLPSubmodules
from megatron.core.transformer.module import MegatronModule
from megatron.core.transformer.spec_utils import ModuleSpec, build_module
from megatron.core.transformer.transformer_config import TransformerConfig
from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules
from megatron.core.transformer.utils import make_sharded_tensors_for_checkpoint
class InternViTRMSNorm(MegatronModule):
def __init__(
self,
config,
hidden_size: int,
eps: float = 1e-6,
sequence_parallel: bool = False,
compute_var: bool = False,
):
"""Custom RMSNorm for InternViT.
Args:
config (TransformerConfig): Config.
hidden_size (int): Input hidden size.
eps (float): epsilon to use for the norm, default to 1e-6
sequence_parallel (bool): Set to true if sequence parallelism is being used,
this marks the weights as needing to be allreduced.
compute_var (bool): If True, compute the variance statistic manually (gathered across tensor-parallel ranks while excluding the dummy heads).
"""
super().__init__(config=config)
self.config = config
self.eps = eps
self.weight = torch.nn.Parameter(torch.ones(hidden_size))
self._compute_var = compute_var
assert not sequence_parallel, "Sequence parallelism is not supported with InternViT."
setattr(self.weight, 'sequence_parallel', sequence_parallel)
def _norm(self, x, var):
if var is None:
var = x.pow(2).mean(-1, keepdim=True)
return x * torch.rsqrt(var + self.eps)
def forward(self, x):
"""Run RMSNorm with an option to compute custom statistic."""
var = None
if self._compute_var:
unpadded_hidden_size = self.config.hidden_size # 3200
max_dim = x.shape[-1] # 128
x = x.reshape(x.size(0), x.size(1), -1)
var = self._gather_var(x.float().pow(2), max_dim) / unpadded_hidden_size
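# _gather_var sums the squared values over the real (non-dummy) channels across TP ranks;
# dividing by the unpadded hidden size (3200) gives the variance over real channels only,
# so the zero-padded dummy heads do not dilute the statistic.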
output = self._norm(x.float(), var).type_as(x)
output = output * self.weight
if self._compute_var:
output = output.reshape(output.size(0), output.size(1), -1, max_dim)
return output
def _gather_var(self, input_, max_dim, valid_ranks=6):
"""Compute statistic across the non-dummy heads."""
world_size = get_tensor_model_parallel_world_size()
assert world_size == 8, "tested only with TP=8"
# Size and dimension.
last_dim = input_.dim() - 1
rank = get_tensor_model_parallel_rank()
if rank < valid_ranks: # Ranks 0-5 have 24 non-dummy attention heads.
var = input_.sum(-1, keepdim=True)
elif rank == valid_ranks: # Rank 6 has 1 non-dummy attention head.
var = input_[..., :max_dim].sum(-1, keepdim=True)
else:
var = input_.sum(-1, keepdim=True) * 0.0 # Zero-out the dummy heads.
tensor_list = [torch.empty_like(var) for _ in range(world_size)]
tensor_list[rank] = var
torch.distributed.all_gather(tensor_list, var, group=get_tensor_model_parallel_group())
output = torch.cat(tensor_list, dim=last_dim).contiguous()
return output.sum(-1, keepdim=True)
def sharded_state_dict(self, prefix='', sharded_offsets=(), metadata={}):
# in InternVitSelfAttention the q_layernorm and k_layernorm weights
# are tensor-parallel so must be converted to sharded tensors
if 'q_layernorm' in prefix or 'k_layernorm' in prefix:
state_dict = self.state_dict(prefix='', keep_vars=True)
return make_sharded_tensors_for_checkpoint(
state_dict, prefix, {'weight': 0}, sharded_offsets
)
else:
return super().sharded_state_dict(prefix, sharded_offsets, metadata)
def get_mlp_module_spec(use_te: bool = True) -> ModuleSpec:
# Dense MLP w/ or w/o TE modules.
return ModuleSpec(
module=MLP,
submodules=MLPSubmodules(
linear_fc1=TEColumnParallelLinear if use_te else ColumnParallelLinear,
linear_fc2=TERowParallelLinear if use_te else RowParallelLinear,
),
)
# Handle InternViT's layer scaling.
def _bias_dropout_add_func_internvit(ls, x_with_bias, residual, prob, training):
x, bias = x_with_bias # unpack
residual = residual if residual.dtype == x.dtype else residual.to(x.dtype)
if bias is not None:
x = x + bias
out = torch.nn.functional.dropout(x, p=prob, training=training)
out = residual + out * ls
return out
else:
out = torch.nn.functional.dropout(x, p=prob, training=training)
out = residual + out * ls
return out
def bias_dropout_add_unfused_internvit(ls, training):
"""Bias-dropout-add as in Megatron but with added LayerScaling handling."""
def _bias_dropout_add(x_with_bias, residual, prob):
return _bias_dropout_add_func_internvit(ls, x_with_bias, residual, prob, training)
return _bias_dropout_add
def get_bias_dropout_add_internvit(ls, training, fused):
"""Bias-dropout-add as in Megatron but with added LayerScaling handling."""
assert not fused, "Fused bias-dropout-add not implemented for InternViT."
return bias_dropout_add_unfused_internvit(ls, training)
# Add InternViT specialties to our default TransformerLayer.
class InternViTTransformerLayer(TransformerLayer):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.ls1 = torch.nn.Parameter(torch.ones(self.config.hidden_size))
self.ls2 = torch.nn.Parameter(torch.ones(self.config.hidden_size))
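# Bind the learnable per-channel layer-scale vectors as the first argument of the
# bias-dropout-add functions; get_bias_dropout_add_internvit below receives them as `ls`
# and scales the attention/MLP branch output before it is added to the residual.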
self.self_attn_bda = partial(self.self_attn_bda, self.ls1)
self.mlp_bda = partial(self.mlp_bda, self.ls2)
# Override a few things that are special in InternViT and not supported by the SelfAttention class.
class InternViTSelfAttention(SelfAttention):
def __init__(
self, config: TransformerConfig, submodules: SelfAttentionSubmodules, *args, **kwargs
):
super().__init__(config=config, submodules=submodules, *args, **kwargs)
# Need to override linear_qkv, q_layernorm and k_layernorm.
qkv_bias = False
self.linear_qkv = build_module(
submodules.linear_qkv,
self.config.hidden_size,
self.query_projection_size + 2 * self.kv_projection_size,
config=self.config,
init_method=self.config.init_method,
gather_output=False,
bias=qkv_bias,
skip_bias_add=False,
is_expert=False,
tp_comm_buffer_name='qkv',
)
qk_layernorm_hidden_size = (
self.hidden_size_per_attention_head * self.num_attention_heads_per_partition
) # 512 for internvit
self.q_layernorm = build_module(
submodules.q_layernorm,
hidden_size=qk_layernorm_hidden_size,
config=self.config,
eps=self.config.layernorm_epsilon,
compute_var=True,
)
self.k_layernorm = build_module(
submodules.k_layernorm,
hidden_size=qk_layernorm_hidden_size,
config=self.config,
eps=self.config.layernorm_epsilon,
compute_var=True,
)
class InternViTTEDotProductAttention(TEDotProductAttention):
"""Adjusted Attention for InternViT"""
def forward(self, *args, **kwargs):
"""Regular TEDotProductAttention + zero-out dummy attention heads."""
out = super().forward(*args, **kwargs)
# This makes sure the dummy attention heads are zeroed out.
mask = torch.ones_like(out, dtype=out.dtype, device=out.device)
rank = get_tensor_model_parallel_rank()
max_dim = out.shape[-1] # 128
valid_ranks = 6
if rank == valid_ranks:
mask[..., max_dim:] *= 0.0
elif rank > valid_ranks:
mask *= 0.0
out *= mask
return out
def get_internvit_layer_spec(use_te) -> ModuleSpec:
mlp = get_mlp_module_spec(use_te) # no norm
return ModuleSpec(
module=InternViTTransformerLayer,
submodules=TransformerLayerSubmodules(
input_layernorm=InternViTRMSNorm,
self_attention=ModuleSpec(
module=InternViTSelfAttention,
params={"attn_mask_type": AttnMaskType.no_mask},
submodules=SelfAttentionSubmodules(
linear_qkv=TEColumnParallelLinear if use_te else ColumnParallelLinear,
core_attention=TEDotProductAttention if use_te else DotProductAttention,
linear_proj=TERowParallelLinear if use_te else RowParallelLinear,
q_layernorm=InternViTRMSNorm,
k_layernorm=InternViTRMSNorm,
),
),
self_attn_bda=get_bias_dropout_add_internvit,
pre_mlp_layernorm=InternViTRMSNorm,
mlp=mlp,
mlp_bda=get_bias_dropout_add_internvit,
),
)
{
"COMMENT": "Mixture of our own custom prompts and some prompts from https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain/viewer and https://huggingface.co/datasets/HuggingFaceM4/M3IT",
"Captioning": {
"raw": [
"Can you briefly explain what you see in the image?",
"Describe what's happening in this image in one short sentence.",
"Write a short caption that accurately represents the content of this image.",
"Please generate a descriptive caption for the image provided.",
"How would you summarize the scene depicted in the picture in short?",
"Describe the image briefly.",
"Write a succinct description of the image, capturing its main components, the relationships between them, and any notable details.",
"Create a concise caption that accurately describes the main elements in the image provided.",
"Write a brief, yet comprehensive, description of the image.",
"Describe the image in a clear and concise manner.",
"For the given image, provide a one-sentence summary that captures the most important details.",
"Generate a short caption for the picture.",
"Write a short and informative description that highlights the primary subjects and actions occurring in the given image.",
"Provide a concise and informative caption for the image, focusing on the primary subjects.",
"Write a clear description of the image, make sure the key features are well covered.",
"Offer a succinct explanation of the picture presented."
]
},
"CaptioningPretraining": {
"raw": [
"Give a brief description of image.",
"Give a brief description of the image.",
"Provide a brief description of the given image.",
"Provide a one-sentence caption for the provided image.",
"Write a terse but informative summary of the picture.",
"Describe the image concisely.",
"Generate a clear and concise summary of the photo."
]
},
"CaptioningSFT": {
"raw": [
"Give a brief description of the image.",
"Give a short and clear explanation of the subsequent image.",
"Present a compact description of the photo's key features.",
"Provide a brief description of the given image.",
"Provide a one-sentence caption for the provided image.",
"Render a clear and concise summary of the photo.",
"Share a concise interpretation of the image provided.",
"Summarize the visual content of the image.",
"Write a terse but informative summary of the picture.",
"Describe the image concisely."
]
},
"VQAPretraining": {
"raw": [
"Question: {} Short answer:",
"Question: {} Answer:"
]
},
"VQASFT": {
"raw": [
"{}",
"{}\nAnswer the question using a single word or phrase."
],
"docvqa": [
"{}",
"{}\nAnswer this question using the text in the image directly."
]
},
"DocPretraining": {
"raw": [
"Retrieve the text from the given pdf image.",
"Extract the text from the provided document.",
"Transcribe the text displayed in the image."
],
"ocr_multi": [
"Apply grounded Optical Character Recognition (OCR) to the provided image.",
"Extract all texts and their bounding boxes from the given image using grounded OCR.",
"Extract and transcribe all visible text from the provided image, ensuring accurate spatial recognition.",
"Conduct a detailed optical character recognition analysis on this image, maintaining the text's original layout and positioning.",
"Execute a thorough text recognition procedure on this visual input, ensuring that the spatial arrangement of the text is accurately represented.",
"Perform an in-depth OCR scan of the image, capturing both the content and contextual positioning of all textual information.",
"OCR with grounding:"
],
"md": [
"Extract the text from the given image and format it in Markdown.",
"Convert the text from the provided image into Markdown format.",
"Transform the text from the given image into Markdown syntax.",
"Extract and convert the text from the image to Markdown.",
"Retrieve the text from the image and present it in Markdown format."
],
"grounded_ocr": [
"{}. Text:",
"Recognize the text in this region: {}.",
"Identify the text in this area: {}.",
"Detect the text within this section: {}."
],
"referring_grounding": [
"Region of \"{}\" is:",
"Locate the text \"{}\" in the image.",
"Identify the text \"{}\" in the image and provide the coordinates."
]
},
"CaptioningDetailed": {
"raw": [
"Create a comprehensive paragraph that captures the essence of the image while weaving a cohesive narrative around its elements.",
"Compose a paragraph that thoroughly describes the image's content, providing context and connections between different aspects of the scene.",
"Provide a detailed, paragraph-length description of the image that paints a vivid picture and tells a coherent story.",
"Write a rich and engaging paragraph that delves into the image's components, describing not only what is seen but also how the elements relate to one another.",
"Give a well-rounded, paragraph-length explanation of the image, describing the scene and its components while forming a complete and engaging narrative.",
"Produce a paragraph that not only describes the individual elements in the image but also weaves them together to form a cohesive, connected account.",
"Construct a paragraph that captures the image's details and context, offering a more in-depth and engaging story than a simple caption.",
"Compose a descriptive paragraph that brings the image to life through detailed storytelling, connecting the various visual elements into a unified narrative.",
"Create a paragraph that provides an extensive and interconnected description of the image, ensuring that the narrative is both detailed and cohesive.",
"Write a compelling and detailed paragraph that delves into the image's components, linking them together to create a unified and engaging story."
]
},
"OCR": {
"raw": [
"Can you read the text from image and output here?",
"Extract and document the text from the provided image.",
"Converting the text embedded in this image into a readable document.",
"Transcribe all the text you find.",
"Can you extract all visible text from the image here?"
],
"markdown": [
"Can you extract all visible text from the provided image?",
"Converting the text embedded in this image into a readable markdown document.",
"Can you read the text in the document as markdown?",
"Transcribe the document as markdown.",
"Extract and document the text from the provided image."
],
"table_markdown": [
"Can you extract all visible text from the provided table?",
"Can you read the text in the provided table as markdown?",
"Transcribe the table as markdown.",
"Extract and document the text from the provided table image."
],
"plain": [
"Transcribe the document as plain text.",
"Extract and document the text from the provided image.",
"Converting the text embedded in this image into a readable document.",
"Transcribe all the text you find.",
"Can you extract all visible text from the image here?"
],
"bbox_plain": [
"Transcribe the document as plain text along with bounding boxes.",
"Extract and document the text from the provided image along with bounding boxes.",
"Converting the text embedded in this image into a readable documen along with bounding boxes.",
"Can you extract all visible text with bounding boxes from the image here?"
]
},
"VQA": {
"raw": [
"Given the image, answer the following question with few words.",
"Answer the following question: ",
"What is the answer to this question?",
"Write the answer: ",
"Please answer this question: "
]
},
"Embedded": {
"raw": [
"Given the image, answer the following question with few words.",
"Answer the following question: ",
"What is the answer to this question?",
"Write the answer: ",
"Please answer this question: "
]
}
}
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
import argparse
import os
import sys
import torch
# Add megatron to the path.
sys.path.append(
os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir, os.path.pardir))
)
def split(input_dir, base_output_dir, input_pp, output_pp, num_tp, num_layers_per_pp_rank):
"""Split pipeline parallel size = 1 checkpoint to pipeline parallel size N."""
for tp in range(num_tp):
path = os.path.join(input_dir, f"mp_rank_0{tp}", "model_optim_rng.pt")
sd = torch.load(path)
if num_layers_per_pp_rank is None:
num_layers = sd["args"].num_layers
assert num_layers % output_pp == 0, "specify --num-layers-per-pp-rank for an uneven split"
num_layers_per_pp_rank = [num_layers // output_pp] * output_pp
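# Example: 80 decoder layers with output_pp = 4 gives [20, 20, 20, 20]; pp rank 0 keeps layers 0-19,
# pp rank 1 gets layers 20-39 renumbered to start from 0, and so on.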
layer_lb = 0
for pp in range(output_pp):
assert num_layers_per_pp_rank[pp] > 0, "each pp rank must have at least 1 layer"
layer_ub = layer_lb + num_layers_per_pp_rank[pp]
new_sd = sd.copy()
new_sd["model"] = dict()
for k, v in sd["model"].items():
# First pp rank has vision model.
if pp == 0 and ("vision_model" in k or "vision_projection" in k):
new_sd["model"][k] = v
continue
# Only the first pp rank has the word embeddings.
if "language_model.embedding.word_embeddings" in k and pp == 0:
new_sd["model"][k] = v
# Only the last pp rank has the output layer.
if "language_model.output_layer" in k and pp == output_pp - 1:
new_sd["model"][k] = v
# Only the last pp rank has final layer norm.
if "language_model.decoder.final_layernorm" in k and pp == output_pp - 1:
new_sd["model"][k] = v
if "language_model.decoder.layers" in k:
layer_num = int(k.split(".")[3])
if layer_lb <= layer_num and layer_num < layer_ub:
# On all pp ranks, megatron starts layer nums from 0!
new_layer_num = int(layer_num - layer_lb)
k_splitted = k.split(".")
k_splitted[3] = str(new_layer_num)
new_k = ".".join(k_splitted)
new_sd["model"][new_k] = v
output_dir = os.path.join(base_output_dir, f"iter_0000001/mp_rank_0{tp}_00{pp}")
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "model_optim_rng.pt")
torch.save(new_sd, output_path)
print(f"processed tp rank: {tp}/{num_tp - 1} and pp rank: {pp}/{output_pp - 1}")
layer_lb = layer_ub
# This is needed for megatron checkpoint loading.
with open(os.path.join(base_output_dir, "latest_checkpointed_iteration.txt"), "w") as f:
f.write("1")
def combine(input_dir, base_output_dir, input_pp, output_pp, num_tp, num_layers_per_pp_rank):
"""Combine pipeline parallel size = N checkpoint to pipeline parallel size 1."""
for tp in range(num_tp):
new_sd = None
layer_num_offset = 0
max_layer_num = 0
for pp in range(input_pp):
path = os.path.join(input_dir, f"mp_rank_0{tp}_00{pp}", "model_optim_rng.pt")
sd = torch.load(path)
if pp == 0:
new_sd = sd.copy()
new_sd["model"] = dict()
new_sd["args"].pipeline_model_parallel_size = 1
assert new_sd is not None
for k, v in sd["model"].items():
# First pp rank has vision model.
if pp == 0 and ("vision_model" in k or "vision_projection" in k):
new_sd["model"][k] = v
continue
# Only the first pp rank has the word embeddings.
if "language_model.embedding.word_embeddings" in k and pp == 0:
new_sd["model"][k] = v
# Only the last pp rank has the output layer.
if "language_model.output_layer" in k and pp == input_pp - 1:
new_sd["model"][k] = v
# Only the last pp rank has final layer norm.
if "language_model.decoder.final_layernorm" in k and pp == input_pp - 1:
new_sd["model"][k] = v
if "language_model.decoder.layers" in k:
layer_num = int(k.split(".")[3])
# On all pp ranks, megatron starts layer nums from 0!
new_layer_num = layer_num_offset + layer_num
if new_layer_num > max_layer_num:
max_layer_num = new_layer_num
k_splitted = k.split(".")
k_splitted[3] = str(new_layer_num)
new_k = ".".join(k_splitted)
new_sd["model"][new_k] = v
print(f"processed tp rank: {tp}/{num_tp - 1} and pp rank: {pp}/{input_pp - 1}")
layer_num_offset = max_layer_num + 1
output_dir = os.path.join(base_output_dir, f"iter_0000001/mp_rank_0{tp}")
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, "model_optim_rng.pt")
torch.save(new_sd, output_path)
# This is needed for megatron checkpoint loading.
with open(os.path.join(base_output_dir, "latest_checkpointed_iteration.txt"), "w") as f:
f.write("1")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Change pipeline parallelism for a model",
formatter_class=argparse.RawDescriptionHelpFormatter,
)
parser.add_argument(
"--input", type=str, required=True, help="Input model directory"
)
parser.add_argument(
"--input-pipeline-parallel", type=int, required=True, help="Input model pipeline parallelism"
)
parser.add_argument(
"--output", type=str, required=True, help="Output model directory"
)
parser.add_argument(
"--output-pipeline-parallel", type=int, required=True, help="Output model pipeline parallelism"
)
parser.add_argument(
"--tensor-parallel", type=int, required=True, help="Model tensor parallel size",
)
parser.add_argument(
"--num-layers-per-pp-rank", type=int, default=None, nargs="*", help="Specify this for uneven pipeline parallel split",
)
args = parser.parse_args()
f = None
if args.input_pipeline_parallel == 1 and args.output_pipeline_parallel > 1:
f = split
elif args.input_pipeline_parallel > 1 and args.output_pipeline_parallel == 1:
f = combine
else:
raise NotImplementedError("Only pipeline parallel 1 to N and N to 1 are supported")
f(args.input, args.output, args.input_pipeline_parallel, args.output_pipeline_parallel, args.tensor_parallel, args.num_layers_per_pp_rank)
print("done.")
__module__: megatron.energon
__class__: Metadataset
splits:
train:
datasets:
- weight: 0.579 # Datasets are weighted according to their size. Weights sum up to 1.
path: <path to laion dataset>
subflavors:
augmentation: False
- weight: 0.02
path: <path to coco>
subflavors:
augmentation: False
- weight: 0.01
path: <path to vqav2 dataset>
subflavors:
augmentation: False
# Please refer to Table 4 in https://arxiv.org/pdf/2409.11402 for full list of pretrain datasets.
# Please refer to https://nvidia.github.io/Megatron-Energon/data_prep.html on preparing datasets in the Megatron Energon format.
val:
datasets:
- weight: 1.
path: <path to validation dataset>
subflavors:
augmentation: False
#!/bin/bash
# Your SBATCH commands here if using SLURM.
# Please launch this script from megatron-lm root.
# Train a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export TOKENIZERS_PARALLELISM="false"
DEBUG=0
if [[ $BATCH -eq 0 ]]; then
DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
MODEL_NAME="mcore-qwen20-72b-internvit-${DATETIME}"
else
MODEL_NAME="mcore-qwen20-72b-internvit"
fi
WORKSPACE="<some dir>"
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
CHECKPOINT_DIR="${WORKSPACE}/combined-qwen2.0-72b-instruct-internvit-6b-448px-1.5-tp8-te"
DATA_TRAIN="${SOURCE}/examples/multimodal/nvlm/pretrain_blend.yaml"
if [[ $DEBUG -eq 1 ]]; then
MBZ=1
BZ=1
NW=0
AD=0.0
HD=0.0
LI=1
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
else
MBZ=1
BZ=2048
NW=8
AD=0.1
HD=0.1
LI=5
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
fi
SEQ_LEN=256 # Image embeddings sequence length.
DECODER_SEQ_LEN=512 # Language model sequence length.
MAX_POS_EMBED=512
OPTIONS=" \
--use-checkpoint-args \
--exit-duration-in-mins 230 \
--disable-bias-linear \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2-72B-Instruct \
--tokenizer-prompt-format qwen2p0 \
--transformer-impl transformer_engine \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--group-query-attention \
--num-query-groups 8 \
--no-masked-softmax-fusion \
--attention-softmax-in-fp32 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--untie-embeddings-and-output-weights \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--tensor-model-parallel-size 8 \
--pipeline-model-parallel-size 1 \
--num-layers 80 \
--hidden-size 8192 \
--ffn-hidden-size 29568 \
--add-qkv-bias \
--num-attention-heads 64 \
--use-distributed-optimizer \
--use-te \
--num-workers ${NW} \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings 32768 \
--train-samples 122880000 \
--lr-decay-samples 25600000 \
--lr-warmup-samples 83200 \
--micro-batch-size ${MBZ} \
--global-batch-size ${BZ} \
--lr 1e-4 \
--min-lr 2.5e-5 \
--lr-decay-style cosine \
--log-interval ${LI} \
--eval-iters 10 \
--eval-interval 500 \
--data-path ${DATA_TRAIN} \
--prompt-path ${SOURCE}/examples/multimodal/nvlm/nvlm_prompts.json \
--save-interval 5000 \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--split 100,0,0 \
--clip-grad 10.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--bf16 \
--eod-mask-loss \
--freeze-ViT \
--freeze-LM \
--patch-dim 14 \
--img-h 448 \
--img-w 448 \
--dataloader-type external \
--tensorboard-dir ${TENSORBOARD_DIR} \
--language-model-type qwen2.0_72B \
${EXTRA_ARGS} \
--allow-missing-vision-projection-checkpoint \
--vision-model-type internvit \
--disable-vision-class-token \
--log-params-norm \
--log-num-zeros-in-grad \
--ckpt-format torch \
--pixel-shuffle \
--image-tag-type nvlm
"
export NVTE_APPLY_QK_LAYER_SCALING=0
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${ALLOW_NONDETERMINISTIC}
# Interactive or batch mode
if [[ $BATCH -eq 0 ]]; then
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
else
run_cmd="python -u ${SOURCE}/examples/multimodal/train.py ${OPTIONS}"
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
srun -l --verbose \
--container-image <path to docker image> \
--container-mounts "<some mount>" \
--output=${LOGS_DIR}/%x_%j_$DATETIME.log \
sh -c "${run_cmd}"
set +x
fi
#!/bin/bash
# Your SBATCH commands here if using SLURM.
# Please launch this script from megatron-lm root.
# Train a multimodal model.
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export TOKENIZERS_PARALLELISM="false"
DEBUG=0
if [[ $BATCH -eq 0 ]]; then
DATETIME=`date +'%y-%m-%d-%H-%M-%S'`
MODEL_NAME="mcore-nous-yi34b-internvit-mlp-${DATETIME}"
else
MODEL_NAME="mcore-nous-yi34b-internvit-mlp"
fi
WORKSPACE="<some dir>"
SOURCE=`pwd`
OUTPUT_BASE="${WORKSPACE}/output"
OUTPUT="${OUTPUT_BASE}/${MODEL_NAME}"
FINETUNE_DIR=${OUTPUT}/checkpoints
LOGS_DIR="${OUTPUT}/logs"
TENSORBOARD_DIR="${OUTPUT}/tensorboard"
LOAD_NAME="combined-yi-34b-internvit-tp8-mcore"
CHECKPOINT_DIR="${WORKSPACE}/${LOAD_NAME}"
DATA_TRAIN="${SOURCE}/examples/multimodal/nvlm/pretrain_blend.yaml"
if [[ $DEBUG -eq 1 ]]; then
MBZ=1
BZ=1
NW=0
LI=1
AD=0.0
HD=0.0
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
else
MBZ=1
BZ=2048
NW=8
LI=5
AD=0.1
HD=0.1
EXTRA_ARGS=""
ALLOW_NONDETERMINISTIC=1
fi
SEQ_LEN=256 # Image embeddings sequence length.
DECODER_SEQ_LEN=512 # Language model sequence length.
MAX_POS_EMBED=512
OPTIONS=" \
--swiglu \
--use-distributed-optimizer \
--num-workers ${NW} \
--num-layers 60 \
--hidden-size 7168 \
--normalization RMSNorm \
--num-attention-heads 56 \
--exit-duration-in-mins 230 \
--group-query-attention \
--num-query-groups 8 \
--ffn-hidden-size 20480 \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--max-position-embeddings ${MAX_POS_EMBED} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \
--tokenizer-prompt-format nvlm-yi-34b \
--vocab-size 64000 \
--make-vocab-size-divisible-by 1 \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 5000000 \
--disable-bias-linear \
--tensor-model-parallel-size 8 \
--language-model-type yi-34b \
--vision-model-type internvit \
--micro-batch-size ${MBZ} \
--global-batch-size ${BZ} \
--train-samples 122880000 \
--lr-decay-samples 25600000 \
--lr-warmup-samples 83200 \
--lr 1e-4 \
--min-lr 2.5e-5 \
--lr-decay-style cosine \
--clip-grad 10.0 \
--weight-decay 0.1 \
--adam-beta1 0.9 \
--adam-beta2 0.95 \
--init-method-std 0.014 \
--attention-dropout ${AD} \
--hidden-dropout ${HD} \
--eod-mask-loss \
--bf16 \
--tensorboard-dir=${TENSORBOARD_DIR} \
--freeze-LM \
--freeze-ViT \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--data-path ${DATA_TRAIN} \
--dataloader-type external \
--split 100,0,0 \
--prompt-path ${SOURCE}/examples/multimodal/nvlm/nvlm_prompts.json \
--log-interval ${LI} \
--save-interval 2000 \
--eval-interval 500 \
--eval-iters 10 \
--log-params-norm \
--log-num-zeros-in-grad \
${EXTRA_ARGS} \
--save ${FINETUNE_DIR} \
--load ${FINETUNE_DIR} \
--dataloader-save ${FINETUNE_DIR}/dataloader \
--pretrained-checkpoint ${CHECKPOINT_DIR} \
--allow-missing-vision-projection-checkpoint \
--disable-vision-class-token \
--use-te \
--use-checkpoint-args \
--ckpt-format torch \
--pixel-shuffle \
--image-tag-type nvlm
"
export NVTE_ALLOW_NONDETERMINISTIC_ALGO=${ALLOW_NONDETERMINISTIC}
export NVTE_APPLY_QK_LAYER_SCALING=0
# Interactive or batch mode
if [[ $BATCH -eq 0 ]]; then
torchrun --nproc_per_node 8 examples/multimodal/train.py ${OPTIONS}
else
run_cmd="python -u ${SOURCE}/examples/multimodal/train.py ${OPTIONS}"
DATETIME=`date +'date_%y-%m-%d_time_%H-%M-%S'`
srun -l --verbose \
--container-image <path to docker image> \
--container-mounts "<some mount>" \
--output=${LOGS_DIR}/%x_%j_$DATETIME.log \
sh -c "${run_cmd}"
set +x
fi