Commit 99a0c39e authored by xingjinliang

Sync latest code

parent 50fe58fa
Pipeline #2152 passed
......@@ -2,9 +2,15 @@ import argparse
import glob
import json
import os
import sys
import re
import subprocess
# Get the absolute path of the parent directory
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))
# Add the parent directory to sys.path
sys.path.insert(0, parent_dir)
from run_text_generation import get_output_path
from config import EvaluationConfig
......@@ -48,6 +54,10 @@ def convert_to_mmmu_format(input_path):
)
# MMMU eval script expects just a sample_id to prediction mapping.
# Skip possible duplicates.
if sample_id in output:
continue
output[sample_id] = prediction
with open(output_file_path, "w") as output_file:
......
......@@ -8,13 +8,21 @@ def merge_input_files(input_path):
"""Merge input files to a format compatible with the evaluator."""
input_file_paths, output_file_path = get_input_output_paths(input_path, task="OCRBench")
results = []
results = dict()
for input_file_path in input_file_paths:
with open(input_file_path, "r") as input_file:
for line in input_file:
res = json.loads(line)
results.append(res)
sample_id = res["sample_id"]
# Remove possible duplicates.
if sample_id in results:
continue
results[sample_id] = res
results = list(results.values())
with open(output_file_path, "w") as output_file:
json.dump(results, output_file)
......
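
The sample_id-keyed dedup pattern introduced above is reused across these merge scripts. A minimal standalone sketch of the idea (the function name and surrounding file handling are illustrative, not taken from the repo):

```
import json

def merge_jsonl_files(input_file_paths, output_file_path):
    """Merge per-rank JSONL result files, dropping duplicate sample_ids."""
    results = dict()
    for input_file_path in input_file_paths:
        with open(input_file_path, "r") as input_file:
            for line in input_file:
                res = json.loads(line)
                sample_id = res["sample_id"]
                # Keep only the first occurrence of each sample_id.
                if sample_id in results:
                    continue
                results[sample_id] = res

    # The evaluators expect a plain list, so drop the keys before writing.
    with open(output_file_path, "w") as output_file:
        json.dump(list(results.values()), output_file)
```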
......@@ -9,22 +9,25 @@ def merge_input_files(input_path):
"""Merge input files to a format compatible with the evaluator."""
input_file_paths, output_file_path = get_input_output_paths(input_path, task="TextVQA")
results = []
results = dict()
for input_file_path in input_file_paths:
with open(input_file_path, "r") as input_file:
for line in input_file:
res = json.loads(line)
results.append(
{
"question_id": res["sample_id"],
"answer": res["answer"],
"gt_answer": res["gt_answer"],
}
)
# Make order deterministic.
# results = sorted(results, key=lambda d: d["question_id"])
sample_id = res["sample_id"]
# Remove possible duplicates.
if sample_id in results:
continue
results[sample_id] = {
"question_id": sample_id,
"answer": res["answer"],
"gt_answer": res["gt_answer"],
}
results = list(results.values())
with open(output_file_path, "w") as output_file:
json.dump(results, output_file)
......
......@@ -9,15 +9,22 @@ def merge_input_files(input_path):
"""Merge input files to a format compatible with the evaluator."""
input_file_paths, output_file_path = get_input_output_paths(input_path, task="VQAv2")
results = []
results = dict()
for input_file_path in input_file_paths:
with open(input_file_path, "r") as input_file:
for line in input_file:
res = json.loads(line)
res["question_id"] = res["sample_id"]
sample_id = res["sample_id"]
results.append(res)
# Skip possible duplicates.
if sample_id in results:
continue
res["question_id"] = sample_id
results[sample_id] = res
results = list(results.values())
with open(output_file_path, "w") as output_file:
json.dump(results, output_file)
......@@ -57,6 +64,9 @@ def compute_vqa_accuracy(result_file, task):
assert len(gt) == 1, "expected exactly one groundtruth answer."
gt = gt[0]
pred = pred.rstrip("%")
gt = gt.rstrip("%")
if is_number(pred) and is_number(gt):
pred = float(pred)
gt = float(gt)
......
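
A hedged sketch of the numeric normalization these new lines add to compute_vqa_accuracy; the is_number helper below is a stand-in for whatever the evaluation code already defines, and answers_match is just an illustration of the comparison, not the repo's function:

```
def is_number(value):
    # Stand-in helper: True if value parses as a float.
    try:
        float(value)
        return True
    except ValueError:
        return False

def answers_match(pred, gt):
    """Treat '50%' and '50.0' as the same answer."""
    pred = pred.rstrip("%")
    gt = gt.rstrip("%")
    if is_number(pred) and is_number(gt):
        return float(pred) == float(gt)
    return pred == gt

assert answers_match("50%", "50.0")
```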
......@@ -188,7 +188,7 @@ class MMMUDataset(torch.utils.data.Dataset):
use_tiling,
max_num_tiles,
use_thumbnail,
single_image,
prompt_style,
vision_model_type,
):
import datasets
......@@ -246,7 +246,7 @@ class MMMUDataset(torch.utils.data.Dataset):
self._use_tiling = use_tiling
self._max_num_tiles = max_num_tiles
self._use_thumbnail = use_thumbnail
self._single_image = single_image
self._prompt_style = prompt_style
self._vision_model_type = vision_model_type
def __len__(self):
......@@ -258,7 +258,7 @@ class MMMUDataset(torch.utils.data.Dataset):
sample = self._dataset[idx]
# Use the single image approach from the MMMU repo.
if self._single_image:
if self._prompt_style == "single_image":
sample = process_single_sample(sample)
sample = construct_prompt(sample, self._config)
......@@ -274,7 +274,69 @@ class MMMUDataset(torch.utils.data.Dataset):
vision_model_type=self._vision_model_type,
)
sample_num_tiles = [len(sample_imgs)]
else:
prompt = sample["final_input_prompt"]
for i in range(8):
prompt = prompt.replace(f"<image {i}>", "")
sample["final_input_prompt"] = f"<image>\n{prompt}"
elif self._prompt_style == "vlmevalkit":
sample = construct_prompt(sample, self._config)
if sample["question_type"] == "multiple-choice":
question = sample["question"]
options = ""
for k, v in sample["index2ans"].items():
options += f"{k}. {v}\n"
final_prompt = f"{question}\n"
if "hint" in sample:
final_prompt += f"Hint: {sample['hint']}\n"
if "task_instructions" in sample:
final_prompt += f"Task instructions: {sample['task_instructions']}\n"
final_prompt += options
final_prompt += "Answer with the option's letter from the given choices directly."
sample["final_input_prompt"] = final_prompt.rstrip()
else:
question = sample["question"]
final_prompt = f"{question}\n"
final_prompt += "Answer the question directly."
sample["final_input_prompt"] = final_prompt.rstrip()
sample_imgs = []
sample_num_tiles = []
img_indices = sorted(list(set(re.findall(r"<image (\d+)", sample["final_input_prompt"]))))
# If there are multiple input images, we need to avoid the number of image embeddings getting too large.
adjusted_max_num_tiles = max(1, self._max_num_tiles // len(img_indices))
adjusted_max_num_tiles = min(adjusted_max_num_tiles, self._max_num_tiles)
for img_idx in img_indices:
img_key = f"image_{img_idx}"
img_str = f"<image {img_idx}>"
img = sample[img_key]
assert img is not None, f"{img_str} is in prompt but not in sample images"
imgs = get_visual_transform(
img,
self._img_h,
self._img_w,
self._use_tiling,
adjusted_max_num_tiles,
self._use_thumbnail,
augment=False,
vision_model_type=self._vision_model_type,
) # List of tiles.
sample_imgs.extend(imgs)
sample_num_tiles.append(len(imgs))
sample["final_input_prompt"] = " ".join([f'<image {i + 1}><image>' for i in range(len(img_indices))]) + "\n" + sample["final_input_prompt"]
elif self._prompt_style == "multi_image":
sample = construct_prompt(sample, self._config)
sample_imgs = []
......@@ -315,6 +377,8 @@ class MMMUDataset(torch.utils.data.Dataset):
assert (
f"<image {i}>" not in sample["final_input_prompt"]
), "prompt contains unhandled image tags"
else:
raise ValueError(f"unknown prompt style {self._prompt_style}")
# MMMU specific metadata.
metadata = {"question_type": sample["question_type"]}
......@@ -323,10 +387,6 @@ class MMMUDataset(torch.utils.data.Dataset):
metadata["all_choices"] = sample["all_choices"]
prompt = sample['final_input_prompt']
if self._single_image:
for i in range(8):
prompt = prompt.replace(f"<image {i}>", "")
prompt = f"<image>\n{prompt}"
tile_count = torch.tensor(sample_num_tiles, dtype=torch.int)
......@@ -780,8 +840,10 @@ def get_evaluation_dataset(
vision_model_type,
)
elif task == 'MMMU':
# Note: single_image=True uses only one image like in the MMMU repo example.
# single_image=False uses all images in the sample.
# Note:
# - prompt_style="single_image" uses only one image like in the MMMU repo example.
# - prompt_style="multi_image" uses multiple input images.
# - prompt_style="vlmevalkit" is similar to https://github.com/open-compass/VLMEvalKit/blob/5d3cebcf18ef4bfbadc3bd3ef80bdc7aad2c6557/vlmeval/vlm/internvl_chat.py#L499
dataset = MMMUDataset(
input_image_path,
num_samples_per_partition,
......@@ -792,7 +854,7 @@ def get_evaluation_dataset(
use_tiling,
max_num_tiles,
use_thumbnail,
single_image=True,
prompt_style="single_image",
vision_model_type=vision_model_type,
)
elif task == "VideoMME":
......
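
For reference, the prompt_style == "vlmevalkit" branch above assembles a multiple-choice prompt roughly as in the following sketch; the sample values are made up purely for illustration:

```
# Hypothetical sample, only to show the prompt shape built by the
# vlmevalkit-style branch.
sample = {
    "question": "What is shown in the image?",
    "index2ans": {"A": "A cat", "B": "A dog", "C": "A bird", "D": "A fish"},
    "hint": "Look at the animal's ears.",
}

options = ""
for k, v in sample["index2ans"].items():
    options += f"{k}. {v}\n"

final_prompt = f"{sample['question']}\n"
if "hint" in sample:
    final_prompt += f"Hint: {sample['hint']}\n"
final_prompt += options
final_prompt += "Answer with the option's letter from the given choices directly."
print(final_prompt.rstrip())
```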
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
......@@ -136,6 +136,20 @@ def model_provider(
else:
vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te).submodules
# Toggle --recompute* for the vision and language model separately.
if args.recompute_vision:
if vision_config.recompute_method is not None and vision_config.recompute_granularity is not None:
vision_config.recompute_num_layers = vision_config.num_layers
else:
vision_config.recompute_granularity = None
vision_config.recompute_method = None
vision_config.recompute_num_layers = None
vision_projection_config.recompute_granularity = None
vision_projection_config.recompute_method = None
vision_projection_config.recompute_num_layers = None
tokenizer = get_tokenizer()
image_token_index = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
......
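
The --recompute-vision toggle above either recomputes activations for every vision layer (when recompute is already configured) or disables recomputation for the vision model and projection entirely. A minimal sketch of that branching, using a simple stand-in object instead of Megatron's TransformerConfig:

```
from dataclasses import dataclass
from typing import Optional

@dataclass
class StubConfig:
    # Stand-in for the relevant fields of Megatron's TransformerConfig.
    num_layers: int
    recompute_granularity: Optional[str] = None
    recompute_method: Optional[str] = None
    recompute_num_layers: Optional[int] = None

def apply_vision_recompute(recompute_vision, vision_config, vision_projection_config):
    if recompute_vision:
        # Recompute all vision layers if a recompute scheme is configured.
        if vision_config.recompute_method is not None and vision_config.recompute_granularity is not None:
            vision_config.recompute_num_layers = vision_config.num_layers
    else:
        # Otherwise turn recomputation off for the vision model and projection.
        for config in (vision_config, vision_projection_config):
            config.recompute_granularity = None
            config.recompute_method = None
            config.recompute_num_layers = None
```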
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
......@@ -49,7 +49,7 @@ def add_multimodal_extra_args(parser):
group.add_argument(
"--tokenizer-prompt-format",
type=str,
choices=["mistral", "llama3", "chatml", "nvlm-yi-34b", "qwen2p0"],
choices=["mistral", "llama3", "chatml", "nvlm-yi-34b", "qwen2p0", "qwen2p5"],
required=True,
help="Prompt format to use with the tokenizer.",
)
......@@ -71,5 +71,9 @@ def add_multimodal_extra_args(parser):
group.add_argument(
"--packing-seq-length", type=int, default=0, help="Packing sequence length. Must be > 0 if using packing."
)
group.add_argument(
"--recompute-vision", action="store_true", default=False, help="Enable activation checkpointing in the vision model"
)
return parser
......@@ -5,6 +5,13 @@ Please refer to the [NVLM paper](https://arxiv.org/pdf/2409.11402) for details.
*NOTE: VLMs in Megatron are under active development and are expected to change.*
# Checkpoints
NVLM 1.0 model weights are publicly available in both HuggingFace and Megatron formats.
- NVLM-1.0-D 72B [HuggingFace version](https://huggingface.co/nvidia/NVLM-D-72B)
- NVLM-1.0-D 72B [Megatron-Core version](https://huggingface.co/nvidia/NVLM-D-72B-mcore)
# Setup
## Docker image
......@@ -32,7 +39,7 @@ NVLM 1.0 34B starts from [NousResearch/Nous-Hermes-2-Yi-34B](https://huggingface
Please download it and run the following command to convert it to Megatron format.
```
python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \
--load-dir <hf model directory> --save-dir <output dir> --tokenizer-model <hf model directory> \
--load-dir <hf model directory> --save-dir <output dir> --tokenizer-model <hf model name/directory> \
--saver-transformer-impl transformer_engine --model-size yi-34B --make-vocab-size-divisible-by 1
```
......@@ -42,7 +49,7 @@ NVLM 1.0 72B starts from [Qwen/Qwen2-72B-Instruct](https://huggingface.co/Qwen/Q
Please download it and run the following command to convert it to Megatron format.
```
python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \
--load-dir <hf model directory> --save-dir <output directory> --tokenizer-model <hf model directory> \
--load-dir <hf model directory> --save-dir <output directory> --tokenizer-model <hf model name/directory> \
--saver-transformer-impl transformer_engine --model-size qwen2.5-72Bf
```
......
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
......@@ -62,7 +62,7 @@ OPTIONS=" \
--exit-duration-in-mins 230 \
--disable-bias-linear \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model ${WORKSPACE}/<path to tokenizer model> \
--tokenizer-model Qwen/Qwen2-72B-Instruct \
--tokenizer-prompt-format qwen2p0 \
--transformer-impl transformer_engine \
--normalization RMSNorm \
......