Commit 0d99ae1f authored by silencealiang

add

parent c271aaae
@@ -9,15 +9,22 @@ def merge_input_files(input_path):
     """Merge input files to a format compatible with the evaluator."""
     input_file_paths, output_file_path = get_input_output_paths(input_path, task="VQAv2")
-    results = []
+    results = dict()
     for input_file_path in input_file_paths:
         with open(input_file_path, "r") as input_file:
             for line in input_file:
                 res = json.loads(line)
-                res["question_id"] = res["sample_id"]
+                sample_id = res["sample_id"]
-                results.append(res)
+                # Skip possible duplicates.
+                if sample_id in results:
+                    continue
+                res["question_id"] = sample_id
+                results[sample_id] = res
+    results = list(results.values())
     with open(output_file_path, "w") as output_file:
         json.dump(results, output_file)
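For illustration, a minimal standalone sketch of the deduplicating merge introduced above. The records are made up; only `sample_id` and `question_id` come from the diff, the `answer` field is a placeholder.

```python
import json

# Hypothetical per-rank result lines; sample_id 1 appears twice (a duplicate).
lines = [
    '{"sample_id": 1, "answer": "cat"}',
    '{"sample_id": 2, "answer": "dog"}',
    '{"sample_id": 1, "answer": "cat"}',
]

results = dict()
for line in lines:
    res = json.loads(line)
    sample_id = res["sample_id"]
    # Skip possible duplicates, keeping the first occurrence.
    if sample_id in results:
        continue
    res["question_id"] = sample_id
    results[sample_id] = res

merged = list(results.values())
print(len(merged))  # 2 -> the duplicate sample_id was dropped
```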
@@ -57,6 +64,9 @@ def compute_vqa_accuracy(result_file, task):
             assert len(gt) == 1, "expected exactly one groundtruth answer."
             gt = gt[0]
+            pred = pred.rstrip("%")
+            gt = gt.rstrip("%")
             if is_number(pred) and is_number(gt):
                 pred = float(pred)
                 gt = float(gt)
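A small sketch (not part of the commit) of why the added `rstrip("%")` calls matter: a predicted `"50%"` and a groundtruth `"50"` now normalize to the same float before the numeric comparison. The `is_number` helper below is a hypothetical stand-in for the one used by the evaluator.

```python
def is_number(s: str) -> bool:
    """Hypothetical stand-in for the evaluator's helper."""
    try:
        float(s)
        return True
    except ValueError:
        return False

pred, gt = "50%", "50"
pred = pred.rstrip("%")
gt = gt.rstrip("%")

if is_number(pred) and is_number(gt):
    pred, gt = float(pred), float(gt)
    # With the "%" stripped, "50%" vs "50" compares as 50.0 vs 50.0.
    print(pred == gt)  # True
```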
......
@@ -188,7 +188,7 @@ class MMMUDataset(torch.utils.data.Dataset):
         use_tiling,
         max_num_tiles,
         use_thumbnail,
-        single_image,
+        prompt_style,
         vision_model_type,
     ):
         import datasets
@@ -246,7 +246,7 @@ class MMMUDataset(torch.utils.data.Dataset):
         self._use_tiling = use_tiling
         self._max_num_tiles = max_num_tiles
         self._use_thumbnail = use_thumbnail
-        self._single_image = single_image
+        self._prompt_style = prompt_style
         self._vision_model_type = vision_model_type
     def __len__(self):
@@ -258,7 +258,7 @@ class MMMUDataset(torch.utils.data.Dataset):
         sample = self._dataset[idx]
         # Use the single image approach from the MMMU repo.
-        if self._single_image:
+        if self._prompt_style == "single_image":
             sample = process_single_sample(sample)
             sample = construct_prompt(sample, self._config)
@@ -274,7 +274,69 @@ class MMMUDataset(torch.utils.data.Dataset):
                 vision_model_type=self._vision_model_type,
             )
             sample_num_tiles = [len(sample_imgs)]
-        else:
+            prompt = sample["final_input_prompt"]
+            for i in range(8):
+                prompt = prompt.replace(f"<image {i}>", "")
+            sample["final_input_prompt"] = f"<image>\n{prompt}"
+        elif self._prompt_style == "vlmevalkit":
+            sample = construct_prompt(sample, self._config)
+            if sample["question_type"] == "multiple-choice":
+                question = sample["question"]
+                options = ""
+                for k, v in sample["index2ans"].items():
+                    options += f"{k}. {v}\n"
+                final_prompt = f"{question}\n"
+                if "hint" in sample:
+                    final_prompt += f"Hint: {sample['hint']}\n"
+                if "task_instructions" in sample:
+                    final_prompt += f"Task instructions: {sample['task_instructions']}\n"
+                final_prompt += options
+                final_prompt += "Answer with the option's letter from the given choices directly."
+                sample["final_input_prompt"] = final_prompt.rstrip()
+            else:
+                question = sample["question"]
+                final_prompt = f"{question}\n"
+                final_prompt += "Answer the question directly."
+                sample["final_input_prompt"] = final_prompt.rstrip()
+            sample_imgs = []
+            sample_num_tiles = []
+            img_indices = sorted(list(set(re.findall(r"<image (\d+)", sample["final_input_prompt"]))))
+            # If there are multiple input images, we need to avoid the number of image embeddings getting too large.
+            adjusted_max_num_tiles = max(1, self._max_num_tiles // len(img_indices))
+            adjusted_max_num_tiles = min(adjusted_max_num_tiles, self._max_num_tiles)
+            for img_idx in img_indices:
+                img_key = f"image_{img_idx}"
+                img_str = f"<image {img_idx}>"
+                img = sample[img_key]
+                assert img is not None, f"{img_str} is in prompt but not in sample images"
+                imgs = get_visual_transform(
+                    img,
+                    self._img_h,
+                    self._img_w,
+                    self._use_tiling,
+                    adjusted_max_num_tiles,
+                    self._use_thumbnail,
+                    augment=False,
+                    vision_model_type=self._vision_model_type,
+                ) # List of tiles.
+                sample_imgs.extend(imgs)
+                sample_num_tiles.append(len(imgs))
+            sample["final_input_prompt"] = " ".join([f'<image {i + 1}><image>' for i in range(len(img_indices))]) + "\n" + sample["final_input_prompt"]
+        elif self._prompt_style == "multi_image":
             sample = construct_prompt(sample, self._config)
             sample_imgs = []
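A worked example (not from the commit) of the tile-budget adjustment in the `vlmevalkit` branch above: with `--max-num-tiles 12` and a prompt that references three images, each image is capped at 4 tiles so the total number of image embeddings stays roughly constant.

```python
max_num_tiles = 12             # e.g. the value passed via --max-num-tiles
img_indices = ["1", "2", "3"]  # image tags found in the prompt

adjusted_max_num_tiles = max(1, max_num_tiles // len(img_indices))
adjusted_max_num_tiles = min(adjusted_max_num_tiles, max_num_tiles)
print(adjusted_max_num_tiles)  # 4 -> at most 4 tiles per image, ~12 tiles total
```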
@@ -315,6 +377,8 @@ class MMMUDataset(torch.utils.data.Dataset):
                 assert (
                     f"<image {i}>" not in sample["final_input_prompt"]
                 ), "prompt contains unhandled image tags"
+        else:
+            raise ValueError(f"unknown prompt style {self._prompt_style}")
         # MMMU specific metadata.
         metadata = {"question_type": sample["question_type"]}
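To make the new `vlmevalkit` prompt style concrete, here is a toy multiple-choice sample (invented for illustration) pushed through the same string-building steps as the branch added above:

```python
# Toy sample; field names mirror the MMMU sample dict used in the diff.
sample = {
    "question_type": "multiple-choice",
    "question": "Which animal is shown in the picture?",
    "index2ans": {"A": "Cat", "B": "Dog"},
    "hint": "Look at the ears.",
}

question = sample["question"]
options = ""
for k, v in sample["index2ans"].items():
    options += f"{k}. {v}\n"

final_prompt = f"{question}\n"
if "hint" in sample:
    final_prompt += f"Hint: {sample['hint']}\n"
if "task_instructions" in sample:
    final_prompt += f"Task instructions: {sample['task_instructions']}\n"
final_prompt += options
final_prompt += "Answer with the option's letter from the given choices directly."

print(final_prompt.rstrip())
# Which animal is shown in the picture?
# Hint: Look at the ears.
# A. Cat
# B. Dog
# Answer with the option's letter from the given choices directly.
```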
@@ -323,10 +387,6 @@ class MMMUDataset(torch.utils.data.Dataset):
             metadata["all_choices"] = sample["all_choices"]
         prompt = sample['final_input_prompt']
-        if self._single_image:
-            for i in range(8):
-                prompt = prompt.replace(f"<image {i}>", "")
-            prompt = f"<image>\n{prompt}"
         tile_count = torch.tensor(sample_num_tiles, dtype=torch.int)
@@ -780,8 +840,10 @@ def get_evaluation_dataset(
             vision_model_type,
         )
     elif task == 'MMMU':
-        # Note: single_image=True uses only one image like in the MMMU repo example.
-        # single_image=False uses all images in the sample.
+        # Note:
+        # - prompt_style="single_image" uses only one image like in the MMMU repo example.
+        # - prompt_style="multi_image" uses multiple input images.
+        # - prompt_style="vlmevalkit" is similar to https://github.com/open-compass/VLMEvalKit/blob/5d3cebcf18ef4bfbadc3bd3ef80bdc7aad2c6557/vlmeval/vlm/internvl_chat.py#L499
         dataset = MMMUDataset(
             input_image_path,
             num_samples_per_partition,
@@ -792,7 +854,7 @@ def get_evaluation_dataset(
             use_tiling,
             max_num_tiles,
             use_thumbnail,
-            single_image=True,
+            prompt_style="single_image",
             vision_model_type=vision_model_type,
         )
     elif task == "VideoMME":
......
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
@@ -136,6 +136,20 @@ def model_provider(
     else:
         vision_projection_layer_spec = get_mlp_module_spec(use_te=use_te).submodules
+    # Toggle --recompute* for the vision and language model separately.
+    if args.recompute_vision:
+        if vision_config.recompute_method is not None and vision_config.recompute_granularity is not None:
+            vision_config.recompute_num_layers = vision_config.num_layers
+    else:
+        vision_config.recompute_granularity = None
+        vision_config.recompute_method = None
+        vision_config.recompute_num_layers = None
+    vision_projection_config.recompute_granularity = None
+    vision_projection_config.recompute_method = None
+    vision_projection_config.recompute_num_layers = None
     tokenizer = get_tokenizer()
     image_token_index = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
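A minimal sketch (not from the commit) of the toggle above, using hypothetical config objects, to spell out the behavior: with `--recompute-vision` and a recompute method/granularity configured, every vision layer is checkpointed; without the flag, recompute is disabled for the vision model. The reconstructed indentation above is read as clearing the vision projection settings unconditionally, which this sketch assumes.

```python
from types import SimpleNamespace

def toggle_vision_recompute(recompute_vision, vision_config, vision_projection_config):
    """Hypothetical helper mirroring the branch added in model_provider."""
    if recompute_vision:
        if vision_config.recompute_method is not None and vision_config.recompute_granularity is not None:
            vision_config.recompute_num_layers = vision_config.num_layers
    else:
        vision_config.recompute_granularity = None
        vision_config.recompute_method = None
        vision_config.recompute_num_layers = None

    # Assumed: the vision projection never uses activation recompute.
    vision_projection_config.recompute_granularity = None
    vision_projection_config.recompute_method = None
    vision_projection_config.recompute_num_layers = None

vision_config = SimpleNamespace(
    num_layers=24, recompute_method="uniform", recompute_granularity="full", recompute_num_layers=1
)
vision_projection_config = SimpleNamespace(
    recompute_method="uniform", recompute_granularity="full", recompute_num_layers=1
)
toggle_vision_recompute(True, vision_config, vision_projection_config)
print(vision_config.recompute_num_layers)  # 24 -> checkpoint every vision layer
```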
......
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
@@ -49,7 +49,7 @@ def add_multimodal_extra_args(parser):
     group.add_argument(
         "--tokenizer-prompt-format",
         type=str,
-        choices=["mistral", "llama3", "chatml", "nvlm-yi-34b", "qwen2p0"],
+        choices=["mistral", "llama3", "chatml", "nvlm-yi-34b", "qwen2p0", "qwen2p5"],
         required=True,
         help="Prompt format to use with the tokenizer.",
     )
@@ -71,5 +71,9 @@ def add_multimodal_extra_args(parser):
     group.add_argument(
         "--packing-seq-length", type=int, default=0, help="Packing sequence length. Must be > 0 if using packing."
     )
+    group.add_argument(
+        "--recompute-vision", action="store_true", default=False, help="Enable activation checkpointing in the vision model"
+    )
     return parser
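A quick parse of the two additions above (the `qwen2p5` prompt format and the `--recompute-vision` switch), shown with a bare `argparse` parser rather than Megatron's argument groups; the standalone parser here is only for illustration.

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--tokenizer-prompt-format",
    type=str,
    choices=["mistral", "llama3", "chatml", "nvlm-yi-34b", "qwen2p0", "qwen2p5"],
    required=True,
)
parser.add_argument("--recompute-vision", action="store_true", default=False)

args = parser.parse_args(["--tokenizer-prompt-format", "qwen2p5", "--recompute-vision"])
print(args.tokenizer_prompt_format, args.recompute_vision)  # qwen2p5 True
```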
@@ -5,6 +5,13 @@ Please refer to the [NVLM paper](https://arxiv.org/pdf/2409.11402) for details.
 *NOTE: VLMs in Megatron are under active development and are expected to change.*
+# Checkpoints
+NVLM 1.0 model weights are publicly available in HuggingFace and Megatron format.
+- NVLM-1.0-D 72B [HuggingFace version](https://huggingface.co/nvidia/NVLM-D-72B)
+- NVLM-1.0-D 72B [Megatron-Core version](https://huggingface.co/nvidia/NVLM-D-72B-mcore)
 # Setup
 ## Docker image
@@ -32,7 +39,7 @@ NVLM 1.0 34B starts from [NousResearch/Nous-Hermes-2-Yi-34B](https://huggingface
 Please download it and run the following command to convert it to Megatron format.
 ```
 python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \
-    --load-dir <hf model directory> --save-dir <output dir> --tokenizer-model <hf model directory> \
+    --load-dir <hf model directory> --save-dir <output dir> --tokenizer-model <hf model name/directory> \
     --saver-transformer-impl transformer_engine --model-size yi-34B --make-vocab-size-divisible-by 1
 ```
@@ -42,7 +49,7 @@ NVLM 1.0 72B starts from [Qwen/Qwen2-72B-Instruct](https://huggingface.co/Qwen/Q
 Please download it and run the following command to convert it to Megatron format.
 ```
 python tools/checkpoint/convert.py --bf16 --model-type GPT --loader llama_mistral --saver mcore --target-tensor-parallel-size 8 --checkpoint-type hf \
-    --load-dir <hf model directory> --save-dir <output directory> --tokenizer-model <hf model directory> \
+    --load-dir <hf model directory> --save-dir <output directory> --tokenizer-model <hf model name/directory> \
     --saver-transformer-impl transformer_engine --model-size qwen2.5-72Bf
 ```
......
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
File mode changed from 100755 to 100644
@@ -62,7 +62,7 @@ OPTIONS=" \
     --exit-duration-in-mins 230 \
     --disable-bias-linear \
     --tokenizer-type MultimodalTokenizer \
-    --tokenizer-model ${WORKSPACE}/<path to tokenizer model> \
+    --tokenizer-model Qwen/Qwen2-72B-Instruct \
     --tokenizer-prompt-format qwen2p0 \
     --transformer-impl transformer_engine \
     --normalization RMSNorm \
......
@@ -75,7 +75,7 @@ OPTIONS=" \
     --decoder-seq-length ${DECODER_SEQ_LEN} \
     --max-position-embeddings ${MAX_POS_EMBED} \
     --tokenizer-type MultimodalTokenizer \
-    --tokenizer-model ${WORKSPACE}/<path to tokenizer> \
+    --tokenizer-model NousResearch/Nous-Hermes-2-Yi-34B \
     --tokenizer-prompt-format nvlm-yi-34b \
     --vocab-size 64000 \
     --make-vocab-size-divisible-by 1 \
......
@@ -97,7 +97,7 @@ do
         --decoder-seq-length ${DECODER_SEQ_LEN} \
         --max-position-embeddings ${MAX_POS_EMBED} \
         --tokenizer-type MultimodalTokenizer \
-        --tokenizer-model <tokenizer model path> \
+        --tokenizer-model Qwen/Qwen2-72B-Instruct \
        --tokenizer-prompt-format qwen2p0 \
         --position-embedding-type rope \
         --rotary-percent 1.0 \
......
#!/bin/bash
export NCCL_IB_SL=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NVTE_APPLY_QK_LAYER_SCALING=0
export TOKENIZERS_PARALLELISM="false"
INPUT_IMAGE_PATH="placeholder"
GROUNDTRUTH_PATH="placeholder"
while [[ $# -gt 0 ]]; do
    case $1 in
        -i|--input-image-path)
            INPUT_IMAGE_PATH="$2"
            shift
            shift
            ;;
        -o|--output-path)
            OUTPUT_PATH="$2"
            shift
            shift
            ;;
        -m|--model-path)
            MODEL_PATH="$2"
            shift
            shift
            ;;
        -t|--task)
            TASK="$2"
            shift
            shift
            ;;
        -g|--gt-path)
            GROUNDTRUTH_PATH="$2"
            shift
            shift
            ;;
        -*|--*)
            echo "Invalid option $1"
            exit 1
            ;;
    esac
done
# Please modify these as needed.
NUM_PARTITIONS=0
START=0
END=0
SEQ_LEN=256
DECODER_SEQ_LEN=8192
EXTRA_ARGS=" --pixel-shuffle --use-tiling --max-num-tiles 12 --use-thumbnail"
for PARTITION_ID in $( eval echo {$START..$END} )
do
torchrun --nproc_per_node 8 examples/multimodal/run_text_generation.py \
--attention-softmax-in-fp32 \
--transformer-impl transformer_engine \
--use-te \
--use-checkpoint-args \
--normalization RMSNorm \
--norm-epsilon 1e-06 \
--language-model-type=qwen2.5_7B \
--untie-embeddings-and-output-weights \
--disable-bias-linear \
--position-embedding-type rope \
--rotary-percent 1.0 \
--rotary-base 1000000 \
--swiglu \
--attention-dropout 0.0 \
--hidden-dropout 0.0 \
--tensor-model-parallel-size 4 \
--pipeline-model-parallel-size 1 \
--group-query-attention \
--num-query-groups 4 \
--num-layers 28 \
--hidden-size 3584 \
--ffn-hidden-size 18944 \
--add-qkv-bias \
--num-attention-heads 28 \
--max-position-embeddings 32768 \
--no-masked-softmax-fusion \
--load ${MODEL_PATH} \
--tokenizer-type MultimodalTokenizer \
--tokenizer-model Qwen/Qwen2.5-7B-Instruct \
--tokenizer-prompt-format qwen2p5 \
--bf16 \
--micro-batch-size 1 \
--seq-length ${SEQ_LEN} \
--decoder-seq-length ${DECODER_SEQ_LEN} \
--out-seq-length 128 \
--temperature 1.0 \
--img-h 448 \
--img-w 448 \
--patch-dim 14 \
--seed 153 \
--top_k 1 \
--no-load-rng \
--no-load-optim \
--input-image-path ${INPUT_IMAGE_PATH} \
--num-partitions ${NUM_PARTITIONS} \
--partition-id ${PARTITION_ID} \
--output-path ${OUTPUT_PATH} \
--gt-path ${GROUNDTRUTH_PATH} \
--task ${TASK} \
${EXTRA_ARGS} \
--special-tokens "<image>" "<img>" "</img>" \
--vision-model-type siglip \
--ckpt-format torch
done