#!/bin/bash
# RealWorldQA Inference Script (Instruct Model)
# This script runs inference on the RealWorldQA dataset using vLLM
python run_realworldqa.py infer \
--model-path /path/to/Qwen3-VL-Instruct \
--dataset RealWorldQA \
--data-dir /path/to/data \
--output-file results/RealWorldQA_results.jsonl \
--tensor-parallel-size 1 \
--gpu-memory-utilization 0.9 \
--max-model-len 128000 \
--max-images-per-prompt 10 \
--max-new-tokens 32768 \
--temperature 0.7 \
--top-p 0.8 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 1.5
#!/bin/bash
# RealWorldQA Inference Script (Thinking Model)
# This script runs inference on the RealWorldQA dataset using vLLM with thinking mode parameters
python run_realworldqa.py infer \
--model-path /path/to/Qwen3-VL-Thinking \
--dataset RealWorldQA \
--data-dir /path/to/data \
--output-file results/RealWorldQA_results_thinking.jsonl \
--tensor-parallel-size 1 \
--gpu-memory-utilization 0.9 \
--max-model-len 128000 \
--max-images-per-prompt 10 \
--max-new-tokens 32768 \
--temperature 0.6 \
--top-p 0.95 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 0.0
vllm
transformers
qwen_vl_utils
pandas
numpy
Pillow
tqdm
requests
validators
torch
torchvision
accelerate
sentencepiece
flash_attn
import os
import sys
import json
import argparse
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
from typing import List, Dict, Any
import torch
import warnings
import string
# vLLM imports
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor
# Local imports from refactored files
from dataset_utils import load_dataset, dump_image, build_realworldqa_prompt
from eval_utils import build_judge, eval_single_sample
# Set vLLM multiprocessing method
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
def prepare_inputs_for_vllm(messages, processor):
"""
Prepare inputs for vLLM.
Args:
messages: List of messages in standard conversation format
processor: AutoProcessor instance
Returns:
dict: Input format required by vLLM
"""
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# qwen_vl_utils 0.0.14+ required
image_inputs, video_inputs, video_kwargs = process_vision_info(
messages,
image_patch_size=processor.image_processor.patch_size,
return_video_kwargs=True,
return_video_metadata=True
)
mm_data = {}
if image_inputs is not None:
mm_data['image'] = image_inputs
if video_inputs is not None:
mm_data['video'] = video_inputs
return {
'prompt': text,
'multi_modal_data': mm_data,
'mm_processor_kwargs': video_kwargs
}
def run_inference(args):
"""Run inference on the RealWorldQA dataset using vLLM."""
print("\n" + "="*80)
print("🚀 RealWorldQA Inference with vLLM (High-Speed Mode)")
print("="*80 + "\n")
# Set up data directory
if args.data_dir:
os.environ['LMUData'] = args.data_dir
elif 'LMUData' not in os.environ:
raise ValueError("Please specify --data-dir or set LMUData environment variable")
print(f"✓ Data directory: {os.environ['LMUData']}")
# Load dataset
print(f"Loading dataset: {args.dataset}")
data = load_dataset(args.dataset)
print(f"✓ Loaded {len(data)} samples from {args.dataset}")
# DEBUG: Process only first N samples if specified
if os.getenv('DEBUG_SAMPLE_SIZE'):
debug_size = int(os.getenv('DEBUG_SAMPLE_SIZE'))
data = data.iloc[:debug_size]
print(f"⚠️ DEBUG MODE: Only processing {len(data)} samples")
# Set up image root directory
img_root = os.path.join(os.environ['LMUData'], 'images', args.dataset)
os.makedirs(img_root, exist_ok=True)
# Set up dump_image function
def dump_image_func(line):
return dump_image(line, img_root)
# Create output directory
os.makedirs(os.path.dirname(args.output_file), exist_ok=True)
# Set resolution parameters
min_pixels = args.min_pixels if args.min_pixels is not None else 768*28*28
max_pixels = args.max_pixels if args.max_pixels is not None else 5120*28*28
print(f"✓ Image resolution: min_pixels={min_pixels}, max_pixels={max_pixels}")
# Set up generation parameters (vLLM SamplingParams format)
sampling_params = SamplingParams(
temperature=args.temperature,
top_p=args.top_p,
top_k=args.top_k,
max_tokens=args.max_new_tokens,
repetition_penalty=args.repetition_penalty,
presence_penalty=args.presence_penalty,
stop_token_ids=[],
)
print(f"\n⚙️ Generation parameters (vLLM SamplingParams):")
print(f" max_tokens={sampling_params.max_tokens}")
print(f" temperature={sampling_params.temperature}, top_p={sampling_params.top_p}, top_k={sampling_params.top_k}")
print(f" repetition_penalty={sampling_params.repetition_penalty}")
print(f" presence_penalty={sampling_params.presence_penalty}")
if sampling_params.presence_penalty > 0:
print(f" ✅ Anti-repetition enabled (presence_penalty={sampling_params.presence_penalty})")
if sampling_params.temperature <= 0.02 and sampling_params.top_k == 1:
print(f" ✅ Using FAST greedy-like decoding")
else:
print(f" ⚠️ Using sampling decoding (slower but more diverse)")
print()
# Load processor for input preparation
print(f"Loading processor from {args.model_path}")
processor = AutoProcessor.from_pretrained(args.model_path)
print("✓ Processor loaded\n")
# Initialize vLLM
print(f"Initializing vLLM with model: {args.model_path}")
print(f" GPU count: {torch.cuda.device_count()}")
print(f" Tensor parallel size: {args.tensor_parallel_size}")
llm = LLM(
model=args.model_path,
tensor_parallel_size=args.tensor_parallel_size,
gpu_memory_utilization=args.gpu_memory_utilization,
trust_remote_code=True,
max_model_len=args.max_model_len,
limit_mm_per_prompt={"image": args.max_images_per_prompt},
seed=42,
)
print("✓ vLLM initialized successfully\n")
# Prepare all inputs
print("Preparing inputs for vLLM...")
all_inputs = []
all_line_dicts = []
all_messages = []
for idx, (_, line) in enumerate(tqdm(data.iterrows(), total=len(data), desc="Building prompts")):
# Convert line to dict
line_dict = line.to_dict()
for k, v in line_dict.items():
if isinstance(v, np.integer):
line_dict[k] = int(v)
elif isinstance(v, np.floating):
line_dict[k] = float(v)
# Build prompt
messages = build_realworldqa_prompt(line, dump_image_func, min_pixels, max_pixels)
# Prepare input for vLLM
vllm_input = prepare_inputs_for_vllm(messages, processor)
all_inputs.append(vllm_input)
all_line_dicts.append(line_dict)
all_messages.append(messages)
print(f"✓ Prepared {len(all_inputs)} inputs\n")
# Batch inference (vLLM automatic optimization)
print("="*80)
print("🚀 Running vLLM batch inference (automatic optimization)")
print("="*80)
start_time = time.time()
outputs = llm.generate(all_inputs, sampling_params=sampling_params)
end_time = time.time()
total_time = end_time - start_time
print(f"\n✓ Inference completed in {total_time:.2f} seconds")
print(f" Average: {total_time/len(data):.2f} seconds/sample")
print(f" Throughput: {len(data)/total_time:.2f} samples/second\n")
# Save results
print("Saving results...")
results = []
for idx, (line_dict, messages, output) in enumerate(zip(all_line_dicts, all_messages, outputs)):
response = output.outputs[0].text
index = line_dict['index']
# Handle </think> tag
response_final = str(response).split("</think>")[-1].strip()
result = {
"question_id": int(index) if isinstance(index, (int, np.integer)) else index,
"annotation": line_dict,
"task": args.dataset,
"result": {"gen": response_final, "gen_raw": response},
"messages": messages
}
results.append(result)
# Write final results
with open(args.output_file, 'w') as f:
for res in results:
f.write(json.dumps(res) + '\n')
print(f"\n✓ Results saved to {args.output_file}")
print(f"✓ Total samples processed: {len(results)}")
def run_evaluation(args):
"""Run evaluation on inference results."""
print("\n" + "="*80)
print("📊 RealWorldQA Evaluation")
print("="*80 + "\n")
# Set up data directory
if args.data_dir:
os.environ['LMUData'] = args.data_dir
elif 'LMUData' not in os.environ:
raise ValueError("Please specify --data-dir or set LMUData environment variable")
# Load results
results = []
with open(args.input_file, 'r') as f:
for line in f:
job = json.loads(line)
annotation = job["annotation"]
annotation["prediction"] = job["result"]["gen"]
results.append(annotation)
data = pd.DataFrame.from_records(results)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
# Convert column names to lowercase
for k in list(data.keys()):
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
print(f"✓ Loaded {len(data)} results from {args.input_file}")
# Create output directory
output_dir = os.path.dirname(args.output_file)
os.makedirs(output_dir, exist_ok=True)
# Build judge model (if specified)
model = None
if args.eval_model:
model = build_judge(
model=args.eval_model,
api_type=getattr(args, 'api_type', 'dash')
)
print(f"✓ Evaluation model: {args.eval_model}")
else:
print("⚠️ No evaluation model specified, using rule-based extraction only")
# Prepare evaluation tasks
items = []
for i in range(len(data)):
item = data.iloc[i].to_dict()
items.append(item)
eval_tasks = []
for item in items:
eval_tasks.append((model, item))
# Run evaluation
eval_results = []
# Debug mode: process single-threaded with first few samples
debug = os.environ.get('DEBUG', '').lower() == 'true'
if debug:
print("Running in debug mode with first 5 samples...")
for task in eval_tasks[:5]:
try:
result = eval_single_sample(task)
eval_results.append(result)
except Exception as e:
print(f"Error processing task: {e}")
raise
else:
# Normal mode: process all samples with threading
from concurrent.futures import ThreadPoolExecutor
nproc = getattr(args, 'nproc', 4)
print(f"✓ Using {nproc} parallel processes")
with ThreadPoolExecutor(max_workers=nproc) as executor:
for result in tqdm(executor.map(eval_single_sample, eval_tasks),
total=len(eval_tasks), desc="Evaluating"):
eval_results.append(result)
# Calculate overall accuracy
accuracy = sum(r['hit'] for r in eval_results) / len(eval_results)
# Save results
output_df = pd.DataFrame(eval_results)
output_df.to_csv(args.output_file, index=False)
# Save accuracy to JSON
acc_file = args.output_file.replace('.csv', '_acc.json')
with open(acc_file, 'w') as f:
json.dump({
"overall_accuracy": accuracy,
"task_samples": len(results),
"correct": sum(r['hit'] for r in eval_results),
"total": len(eval_results)
}, f, indent=2)
print(f"\n{'='*50}")
print(f"Evaluation Results:")
print(f"{'='*50}")
print(f"Overall accuracy: {accuracy:.4f} ({sum(r['hit'] for r in eval_results)}/{len(eval_results)})")
print(f"{'='*50}\n")
print(f"✓ Detailed results saved to {args.output_file}")
print(f"✓ Accuracy saved to {acc_file}")
def main():
parser = argparse.ArgumentParser(description="RealWorldQA Evaluation with vLLM")
subparsers = parser.add_subparsers(dest='command', help='Command to run')
# Inference parser
infer_parser = subparsers.add_parser("infer", help="Run inference with vLLM")
infer_parser.add_argument("--model-path", type=str, required=True, help="Path to the model")
infer_parser.add_argument("--dataset", type=str, default="RealWorldQA", help="Dataset name")
infer_parser.add_argument("--data-dir", type=str, help="Data directory (LMUData)")
infer_parser.add_argument("--output-file", type=str, required=True, help="Output file path")
# Image resolution parameters
infer_parser.add_argument("--min-pixels", type=int, default=None,
help="Minimum pixels for image (default: 768*28*28)")
infer_parser.add_argument("--max-pixels", type=int, default=None,
help="Maximum pixels for image (default: 5120*28*28)")
# vLLM specific parameters
infer_parser.add_argument("--tensor-parallel-size", type=int, default=None,
help="Tensor parallel size (default: number of GPUs)")
infer_parser.add_argument("--gpu-memory-utilization", type=float, default=0.9,
help="GPU memory utilization (0.0-1.0, default: 0.9)")
infer_parser.add_argument("--max-model-len", type=int, default=128000,
help="Maximum model context length (default: 128000)")
infer_parser.add_argument("--max-images-per-prompt", type=int, default=10,
help="Maximum images per prompt (default: 10)")
# Generation parameters
infer_parser.add_argument("--max-new-tokens", type=int, default=32768,
help="Maximum number of tokens to generate (default: 32768)")
infer_parser.add_argument("--temperature", type=float, default=0.7,
help="Temperature for sampling (default: 0.7)")
infer_parser.add_argument("--top-p", type=float, default=0.8,
help="Top-p for sampling (default: 0.8)")
infer_parser.add_argument("--top-k", type=int, default=20,
help="Top-k for sampling (default: 20)")
infer_parser.add_argument("--repetition-penalty", type=float, default=1.0,
help="Repetition penalty (default: 1.0)")
infer_parser.add_argument("--presence-penalty", type=float, default=1.5,
help="Presence penalty (default: 1.5)")
# Evaluation parser
eval_parser = subparsers.add_parser("eval", help="Run evaluation")
eval_parser.add_argument("--data-dir", type=str, help="Data directory (LMUData)")
eval_parser.add_argument("--input-file", type=str, required=True, help="Input file with inference results")
eval_parser.add_argument("--output-file", type=str, required=True, help="Output file path")
eval_parser.add_argument("--dataset", type=str, default="RealWorldQA", help="Dataset name")
eval_parser.add_argument("--eval-model", type=str, default=None,
help="Model to use for evaluation (default: None, use rule-based only)")
eval_parser.add_argument("--api-type", type=str, default="dash", choices=["dash", "mit"],
help="API type for evaluation")
eval_parser.add_argument("--nproc", type=int, default=4, help="Number of processes to use")
args = parser.parse_args()
# Automatically set tensor_parallel_size
if args.command == 'infer' and args.tensor_parallel_size is None:
args.tensor_parallel_size = torch.cuda.device_count()
print(f"Auto-set tensor_parallel_size to {args.tensor_parallel_size}")
if args.command == 'infer':
run_inference(args)
elif args.command == 'eval':
run_evaluation(args)
else:
parser.print_help()
if __name__ == "__main__":
main()
# MMMU Benchmark Evaluation
This directory contains the implementation for evaluating vision-language models on the MMMU (Massive Multi-discipline Multimodal Understanding) benchmark using vLLM for high-speed inference.
## Overview
The MMMU benchmark evaluates models across diverse disciplines with multi-modal questions. This implementation provides:
- **High-speed inference** using vLLM with automatic batch optimization
- **Flexible evaluation** using GPT-based judge models
- **Support for thinking models** with extended reasoning
- **Modular code structure** for easy maintenance and extension
## Project Structure
```
mmmu/
├── run_mmmu.py # Main script for inference and evaluation
├── dataset_utils.py # Dataset loading and preprocessing utilities
├── eval_utils.py # Evaluation logic and judge model wrappers
├── common_utils.py # Common utilities for image processing, file I/O
├── infer_instruct.sh # Inference script for instruct models
├── infer_think.sh # Inference script for thinking models
├── eval_instruct.sh # Evaluation script for instruct model results
├── eval_think.sh # Evaluation script for thinking model results
├── requirements.txt # Python dependencies
└── README.md # This file
```
## Requirements
### Python Dependencies
```bash
pip install -r requirements.txt
```
Key dependencies:
- `vllm` - High-speed LLM inference engine
- `transformers` - HuggingFace transformers
- `qwen_vl_utils` - Qwen VL utilities for vision processing
- `pandas`, `numpy` - Data processing
- `Pillow` - Image processing
- `requests` - API calls for evaluation
### Environment Variables
For evaluation, you need to set up API credentials for the judge model:
**Option 1: DashScope API (Recommended)**
```bash
export CHATGPT_DASHSCOPE_API_KEY="your-api-key"
export DASHSCOPE_API_BASE="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions"
```
**Option 2: Custom OpenAI-compatible API**
```bash
export MIT_SPIDER_TOKEN="your-api-key"
export MIT_SPIDER_URL="your-api-endpoint"
```
## Quick Start
### 1. Inference
Run inference on MMMU dataset using an instruct model:
```bash
bash infer_instruct.sh
```
Or customize the inference:
```bash
python run_mmmu.py infer \
--model-path /path/to/Qwen3-VL-Instruct \
--data-dir /path/to/data \
--dataset MMMU_DEV_VAL \
--output-file results/predictions.jsonl \
--max-new-tokens 32768 \
--temperature 0.7 \
--top-p 0.8 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 1.5
```
For thinking models with extended reasoning:
```bash
bash infer_think.sh
```
### 2. Evaluation
Evaluate the inference results using a judge model:
```bash
bash eval_instruct.sh
```
Or customize the evaluation:
```bash
python run_mmmu.py eval \
--data-dir /path/to/data \
--input-file results/predictions.jsonl \
--output-file results/evaluation.csv \
--dataset MMMU_DEV_VAL \
--eval-model gpt-3.5-turbo-0125 \
--api-type dash \
--nproc 16
```
## Detailed Usage
### Inference Mode
**Basic Arguments:**
- `--model-path`: Path to the Qwen3-VL model directory (required)
- `--data-dir`: Directory to store/load MMMU dataset (required)
- `--dataset`: Dataset name, default: `MMMU_DEV_VAL`
- `--output-file`: Path to save inference results in JSONL format (required)
**vLLM Arguments:**
- `--tensor-parallel-size`: Number of GPUs for tensor parallelism (default: auto-detect)
- `--gpu-memory-utilization`: GPU memory utilization ratio, 0.0-1.0 (default: 0.9)
- `--max-model-len`: Maximum model context length (default: 128000)
- `--max-images-per-prompt`: Maximum images per prompt (default: 10)
**Generation Parameters:**
- `--max-new-tokens`: Maximum tokens to generate (default: 32768)
- `--temperature`: Sampling temperature (default: 0.7)
- `--top-p`: Top-p sampling (default: 0.8)
- `--top-k`: Top-k sampling (default: 20)
- `--repetition-penalty`: Repetition penalty (default: 1.0)
- `--presence-penalty`: Presence penalty to reduce repetition (default: 1.5)
**Advanced Options:**
- `--use-cot`: Enable Chain-of-Thought prompting for better reasoning
- `--cot-prompt`: Custom CoT prompt (optional)
### Evaluation Mode
**Basic Arguments:**
- `--data-dir`: Directory containing MMMU dataset (required)
- `--input-file`: Inference results file in JSONL format (required)
- `--output-file`: Path to save evaluation results in CSV format (required)
- `--dataset`: Dataset name, must match inference (default: `MMMU_DEV_VAL`)
**Judge Model Arguments:**
- `--eval-model`: Judge model name (default: `gpt-3.5-turbo-0125`)
- Options: `gpt-3.5-turbo-0125`, `gpt-4-0125-preview`, `gpt-4o`, etc.
- `--api-type`: API service type (default: `dash`)
- `dash`: DashScope API (Alibaba Cloud)
- `mit`: Custom OpenAI-compatible API
- `--nproc`: Number of parallel workers for evaluation (default: 4)
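If you prefer to drive the judge pipeline programmatically rather than through `run_mmmu.py eval`, here is a minimal sketch using the helpers from `eval_utils.py` in this directory; the `item` fields mirror one row of the merged prediction DataFrame, and the values are made up for illustration:
```python
from eval_utils import build_judge, eval_single_sample

# Judge wrapper; the API is only queried when rule-based extraction fails,
# so the credentials above are needed only for ambiguous predictions.
judge = build_judge(model="gpt-3.5-turbo-0125", api_type="dash")

# One evaluation item, shaped like a row of the merged prediction DataFrame.
item = {
    "index": 0,
    "split": "validation",
    "question": "What is the main object in the image?",
    "A": "teddy bear", "B": "rabbit", "C": "cat", "D": "dog",
    "prediction": "The main object is a teddy bear.",
    "GT": "A",
}
print(eval_single_sample((judge, item)))  # dict with extracted_answer, hit, ...
```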
## Output Files
### Inference Output
The inference script generates a JSONL file where each line contains:
```json
{
"question_id": 123,
"annotation": {
"index": 123,
"question": "What is shown in the image?",
"A": "Option A",
"B": "Option B",
"answer": "A",
...
},
"task": "MMMU_DEV_VAL",
"result": {
"gen": "The final answer",
"gen_raw": "Raw model output including thinking process"
},
"messages": [...]
}
```
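To inspect the predictions before running the full evaluation, a minimal sketch that loads the JSONL the same way the `eval` command does (the file name is taken from the example commands above):
```python
import json
import pandas as pd

# Keep the annotation fields and attach the generated answer as `prediction`,
# mirroring the loading step in run_mmmu.py's eval mode.
records = []
with open("results/mmmu_dev_val_predictions.jsonl") as f:
    for line in f:
        job = json.loads(line)
        annotation = job["annotation"]
        annotation["prediction"] = job["result"]["gen"]
        records.append(annotation)

df = pd.DataFrame.from_records(records).sort_values(by="index")
print(df[["index", "question", "prediction"]].head())
```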
### Evaluation Output
The evaluation script generates two files:
1. **CSV file** (`*_eval_results.csv`): Detailed results for each sample
- Columns: `index`, `question`, `prediction`, `extracted_answer`, `extraction_method`, `gt`, `hit`, `split`
2. **JSON file** (`*_eval_results_acc.json`): Accuracy summary
```json
{
"overall_accuracy": 0.7234,
"accuracy_by_split": {
"validation": 0.7234
}
}
```
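As a quick sanity check, the accuracy summary can be recomputed from the per-sample CSV with a few lines of pandas (column names as listed above):
```python
import pandas as pd

# Recompute overall and per-split accuracy from the detailed results CSV.
df = pd.read_csv("results/mmmu_dev_val_eval_results.csv")
print("overall_accuracy:", round(df["hit"].mean(), 4))
print(df.groupby("split")["hit"].mean().round(4))
```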
## Model-Specific Configurations
### Instruct Models (e.g., Qwen3-VL-2B-Instruct, Qwen3-VL-30B-Instruct)
Use standard parameters for balanced performance:
```bash
--max-new-tokens 32768
--temperature 0.7
--top-p 0.8
--top-k 20
--repetition-penalty 1.0
--presence-penalty 1.5
```
### Thinking Models (e.g., Qwen3-VL-4B-Thinking)
Use extended parameters for deeper reasoning:
```bash
--max-new-tokens 40960
--temperature 1.0
--top-p 0.95
--top-k 20
--repetition-penalty 1.0
--presence-penalty 0.0
```
**Note:** Thinking models output reasoning steps wrapped in `<think>...</think>` tags. The evaluation automatically extracts the final answer after `</think>`.
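A minimal sketch of that extraction step, using the same `split` on `</think>` that `run_mmmu.py` applies to each output:
```python
# The final answer is everything after the last </think> tag; instruct models
# emit no tag, so their raw output passes through unchanged.
def strip_thinking(raw_output: str) -> str:
    return str(raw_output).split("</think>")[-1].strip()

print(strip_thinking("<think>step-by-step reasoning...</think>The answer is B."))
# -> "The answer is B."
```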
## Performance Tips
1. **GPU Memory**: Adjust `--gpu-memory-utilization` based on your GPU:
- 0.9: Recommended for most cases
- 0.95: For maximum throughput (may cause OOM)
- 0.7-0.8: If experiencing OOM errors
2. **Batch Size**: vLLM automatically optimizes batch size based on available memory
3. **Tensor Parallelism**: Use `--tensor-parallel-size` for large models:
- 2B/4B models: 1-2 GPUs
- 7B/14B models: 2-4 GPUs
- 30B+ models: 4-8 GPUs
4. **Context Length**: Reduce `--max-model-len` if memory is limited:
- 128000: Default, works well for most cases
- 64000: Reduces memory usage by ~40%
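These tips map directly onto the vLLM engine arguments that `run_mmmu.py` passes when constructing the engine; a minimal sketch of the same initialization (the model path and GPU count are placeholders):
```python
from vllm import LLM

# Engine arguments used by run_mmmu.py; lower gpu_memory_utilization and
# max_model_len first when running into out-of-memory errors.
llm = LLM(
    model="/path/to/Qwen3-VL-Instruct",   # placeholder path
    tensor_parallel_size=4,               # number of GPUs to shard across
    gpu_memory_utilization=0.9,           # drop to 0.7-0.8 on OOM
    max_model_len=128000,                 # reduce to 64000 to save memory
    limit_mm_per_prompt={"image": 10},    # images allowed per prompt
    trust_remote_code=True,
    seed=42,
)
```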
## Troubleshooting
### Common Issues
**1. CUDA Out of Memory**
```bash
# Reduce GPU memory utilization
--gpu-memory-utilization 0.7
# Or reduce context length
--max-model-len 64000
```
**2. vLLM Multiprocessing Issues**
The code automatically sets `VLLM_WORKER_MULTIPROC_METHOD=spawn`. If you still encounter issues:
```bash
export VLLM_WORKER_MULTIPROC_METHOD=spawn
```
**3. Evaluation API Errors**
- Verify API credentials are set correctly
- Check API endpoint connectivity
- Reduce `--nproc` if you are being rate-limited; increase it (up to ~32) only when the API can handle the extra load
**4. Dataset Download Issues**
The dataset is automatically downloaded from:
```
https://opencompass.openxlab.space/utils/VLMEval/MMMU_DEV_VAL.tsv
```
If the automatic download fails, download the file manually and place it in your `--data-dir`.
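If the automatic download keeps failing, a minimal sketch of fetching and verifying the file yourself with the helpers in `common_utils.py` and the constants in `dataset_utils.py` (assuming `LMUData` points at your `--data-dir`):
```python
import os
from common_utils import download_file, md5
from dataset_utils import MMMU_DATASET_URL, MMMU_DATASET_MD5

# Fetch the TSV into the LMUData directory and verify its checksum,
# mirroring what load_dataset() does automatically.
data_root = os.environ.get("LMUData", "/path/to/data")
os.makedirs(data_root, exist_ok=True)
tsv_path = os.path.join(data_root, "MMMU_DEV_VAL.tsv")

download_file(MMMU_DATASET_URL, tsv_path)
assert md5(tsv_path) == MMMU_DATASET_MD5, "checksum mismatch, re-download the file"
```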
## Advanced Usage
### Custom Image Resolution
Edit `run_mmmu.py` to modify image resolution:
```python
MIN_PIXELS = 1280*28*28 # ~1M pixels
MAX_PIXELS = 5120*28*28 # ~4M pixels
```
### Custom Evaluation Logic
The evaluation uses a two-stage approach:
1. **Rule-based extraction**: Fast pattern matching for clear answers
2. **Model-based extraction**: GPT judge for ambiguous answers
To customize, edit `eval_utils.py`:
- `can_infer_option()`: Modify option extraction rules
- `can_infer_text()`: Modify text matching logic
- `build_prompt()`: Customize judge prompt
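A minimal sketch of exercising the rule-based stage on a toy item (the fields are made up for illustration); `can_infer` returns the matched option letter, `'Z'` for an explicit refusal, or `False` when nothing can be extracted:
```python
from eval_utils import build_choices, can_infer

# Toy sample shaped like one evaluation row.
item = {"A": "teddy bear", "B": "rabbit", "C": "cat", "D": "dog",
        "prediction": "The picture shows a cute teddy bear."}

choices = build_choices(item)                  # {'A': 'teddy bear', ...}
print(can_infer(item["prediction"], choices))  # expected: 'A' (via text match)
```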
### Debugging
Enable debug mode to process only 5 samples:
```bash
DEBUG=true python run_mmmu.py eval ...
```
## Citation
If you use this code or the MMMU benchmark, please cite:
```bibtex
@article{yue2023mmmu,
title={Mmmu: A massive multi-discipline multimodal understanding and reasoning benchmark for expert agi},
author={Yue, Xiang and Ni, Yuansheng and Zhang, Kai and Zheng, Tianyu and Liu, Ruoqi and Zhang, Ge and Stevens, Samuel and Jiang, Dongfu and Ren, Weiming and Sun, Yuxuan and others},
journal={arXiv:2311.16502},
year={2023}
}
```
## License
This code is released under the same license as the Qwen3-VL model.
## Support
For issues and questions:
- GitHub Issues: [Qwen3-VL Repository](https://github.com/QwenLM/Qwen3-VL)
- Documentation: See inline code comments and docstrings
import os
import requests
import base64
import hashlib
import io
from PIL import Image
from typing import List, Union
def encode_image_to_base64(image, target_size=None):
"""Encode an image to base64 string."""
if target_size is not None:
width, height = image.size
# Resize the image while maintaining the aspect ratio
if width > height:
new_width = target_size
new_height = int(height * target_size / width)
else:
new_height = target_size
new_width = int(width * target_size / height)
image = image.resize((new_width, new_height))
buffer = io.BytesIO()
image.save(buffer, format="JPEG")
return base64.b64encode(buffer.getvalue()).decode('utf-8')
def decode_base64_to_image(base64_string):
"""Decode a base64 string to an image."""
image_data = base64.b64decode(base64_string)
return Image.open(io.BytesIO(image_data))
def decode_base64_to_image_file(base64_string, output_path):
"""Decode a base64 string and save it to a file."""
image = decode_base64_to_image(base64_string)
image.save(output_path)
def download_file(url, local_path):
"""Download a file from a URL to a local path."""
response = requests.get(url, stream=True)
response.raise_for_status()
with open(local_path, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
def md5(file_path):
"""Calculate the MD5 hash of a file."""
hash_md5 = hashlib.md5()
with open(file_path, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()
def toliststr(s):
if isinstance(s, str) and (s[0] == '[') and (s[-1] == ']'):
return [str(x) for x in eval(s)]
elif isinstance(s, str):
return [s]
elif isinstance(s, list):
return [str(x) for x in s]
raise NotImplementedError
import os
import pandas as pd
import numpy as np
from typing import Dict, Any
from common_utils import download_file, md5, toliststr, decode_base64_to_image_file
MMMU_DATASET_URL = 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_DEV_VAL.tsv'
MMMU_DATASET_MD5 = '521afc0f3bf341e6654327792781644d'
def load_dataset(dataset_name='MMMU_DEV_VAL'):
"""Load the MMMU dataset."""
data_root = os.path.join(os.environ['LMUData'])
os.makedirs(data_root, exist_ok=True)
file_name = f"{dataset_name}.tsv"
data_path = os.path.join(data_root, file_name)
# Download if not exists or MD5 doesn't match
if not os.path.exists(data_path) or md5(data_path) != MMMU_DATASET_MD5:
print(f"Downloading {dataset_name} dataset...")
download_file(MMMU_DATASET_URL, data_path)
# Load the dataset
data = pd.read_csv(data_path, sep='\t')
# Process the dataset
data['index'] = [str(x) for x in data['index']]
# Handle image data
if 'image' in data:
data['image'] = [str(x) for x in data['image']]
image_map = {x: y for x, y in zip(data['index'], data['image'])}
for k in image_map:
if len(image_map[k]) <= 64:
idx = image_map[k]
assert idx in image_map and len(image_map[idx]) > 64
image_map[k] = image_map[idx]
images = [toliststr(image_map[k]) for k in data['index']]
data['image'] = [x[0] if len(x) == 1 else x for x in images]
# Handle image paths
if 'image_path' in data:
paths = [toliststr(x) for x in data['image_path']]
data['image_path'] = [x[0] if len(x) == 1 else x for x in paths]
# Convert index to int if possible
if np.all([isinstance(x, int) or x.isdigit() for x in data['index']]):
data['index'] = [int(x) for x in data['index']]
return data
def dump_image(line, img_root):
"""Save image data to disk and return the path."""
os.makedirs(img_root, exist_ok=True)
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = os.path.join(img_root, im_name)
if not os.path.exists(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = os.path.join(img_root, f"{line['index']}.jpg")
if not os.path.exists(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
return tgt_path
def MMMU_preproc(data):
"""
Preprocess MMMU dataset to reformulate open questions to multi-choice ones.
This aligns with the implementation in multiple_choice.py
"""
print("Preprocessing MMMU dataset...")
cnt = 0
As, Bs, Ans = list(data['A']), list(data['B']), list(data['answer'])
lt = len(data)
for i in range(lt):
if pd.isna(As[i]):
As[i] = Ans[i]
Bs[i] = 'Other Answers'
cnt += 1
print(f'During MMMU_preproc in Evaluation, {cnt} open questions are re-formulated to multi-choice ones.')
data['A'] = As
data['B'] = Bs
return data
#!/bin/bash
# MMMU Evaluation Script (Instruct Model)
# This script evaluates the inference results using a judge model
python run_mmmu.py eval \
--data-dir /path/to/mmmu_data \
--input-file results/mmmu_dev_val_predictions.jsonl \
--output-file results/mmmu_dev_val_eval_results.csv \
--dataset MMMU_DEV_VAL \
--eval-model gpt-3.5-turbo-0125 \
--api-type dash \
--nproc 16
#!/bin/bash
# MMMU Evaluation Script (Thinking Model)
# This script evaluates the inference results from thinking model using a judge model
python run_mmmu.py eval \
--data-dir /path/to/mmmu_data \
--input-file results/mmmu_dev_val_predictions_thinking.jsonl \
--output-file results/mmmu_dev_val_eval_results_thinking.csv \
--dataset MMMU_DEV_VAL \
--eval-model gpt-3.5-turbo-0125 \
--api-type dash \
--nproc 16
import os
import requests
import time
import random
import string
import copy
import traceback
import pandas as pd
from PIL import Image
from typing import List, Dict, Tuple, Any
from common_utils import encode_image_to_base64
class OpenAIWrapper:
"""Wrapper for OpenAI API."""
def __init__(self, model, api_base, api_key, timeout=60, retry=5, wait=5):
self.model = model
self.api_base = api_base
self.api_key = api_key
self.timeout = timeout
self.retry = retry
self.wait = wait
self.fail_msg = 'Failed to obtain answer via API.'
def generate(self, messages):
"""Generate a response from the API."""
headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}'}
# Format messages for API
formatted_messages = []
for msg in messages:
if msg['type'] == 'text':
formatted_messages.append({"role": "user", "content": [{"type": "text", "text": msg['value']}]})
elif msg['type'] == 'image':
# Load and encode the image
image = Image.open(msg['value'])
image_data = encode_image_to_base64(image)
formatted_messages.append({
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
]
})
payload = {
"model": self.model,
"messages": formatted_messages,
"max_tokens": 4096,
"temperature": 0
}
for i in range(self.retry):
try:
response = requests.post(
self.api_base,
headers=headers,
json=payload,
timeout=self.timeout
)
if response.status_code == 200:
resp_json = response.json()
return resp_json['choices'][0]['message']['content'].strip()
time.sleep(self.wait)
except Exception as e:
print(f"API error: {e}")
time.sleep(self.wait)
return self.fail_msg
class DashScopeWrapper:
"""Wrapper for DashScope API."""
def __init__(self, model, api_base, api_key, timeout=60, retry=5, wait=5):
self.model = model
self.api_base = api_base
self.api_key = api_key
self.timeout = timeout
self.retry = retry
self.wait = wait
self.fail_msg = 'Failed to obtain answer via API.'
def generate(self, messages):
"""Generate a response from the API."""
headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}'}
# Format messages for API
formatted_messages = []
for msg in messages:
if msg['type'] == 'text':
formatted_messages.append({"role": "user", "content": [{"type": "text", "text": msg['value']}]})
elif msg['type'] == 'image':
# Load and encode the image
image = Image.open(msg['value'])
image_data = encode_image_to_base64(image)
formatted_messages.append({
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
]
})
payload = {
"model": self.model,
"messages": formatted_messages,
"max_completion_tokens": 4096,
"n": 1,
"temperature": 0,
"stream": False
}
for i in range(self.retry):
try:
response = requests.post(
self.api_base,
headers=headers,
json=payload,
timeout=self.timeout
)
if response.status_code == 200:
resp_json = response.json()
# Check finish reason
for output in resp_json['choices']:
if output['finish_reason'] not in ['stop', 'function_call']:
print(f"DashScope finished with error: {resp_json}")
time.sleep(self.wait)
continue
return resp_json['choices'][0]['message']['content']
else:
print(f"DashScope API error: HTTP {response.status_code}")
try:
error_content = response.json()
print(f"Error details: {error_content}")
except:
print(f"Raw error content: {response.content.decode('utf-8', errors='replace')}")
time.sleep(self.wait)
except requests.exceptions.ConnectionError as conn_err:
print(f"DashScope: Connection error occurred: {conn_err}")
time.sleep(self.wait)
except requests.exceptions.Timeout as timeout_err:
print(f"DashScope: Timeout error occurred: {timeout_err}")
time.sleep(self.wait)
except requests.exceptions.RequestException as req_err:
print(f"DashScope: Request exception occurred: {req_err}")
time.sleep(self.wait)
except Exception as e:
print(f"DashScope: An error occurred: {e}")
print(traceback.format_exc())
time.sleep(self.wait)
return self.fail_msg
def build_judge(model, api_type):
"""Build a judge model for evaluation."""
if api_type == 'mit':
api_key = os.environ.get('MIT_SPIDER_TOKEN', '')
api_base = os.environ.get('MIT_SPIDER_URL', '')
return OpenAIWrapper(model, api_base, api_key)
elif api_type == 'dash':
api_key = os.environ.get('CHATGPT_DASHSCOPE_API_KEY', '')
api_base = os.environ.get('DASHSCOPE_API_BASE', '')
return DashScopeWrapper(model, api_base, api_key)
else:
raise ValueError(f"Unsupported API type: {api_type}")
def can_infer_option(answer, choices):
"""Rule-based extraction of answer option."""
if 'Failed to obtain answer via API' in answer:
return False
reject_to_answer = [
"Sorry, I can't help with images of people yet.",
"I can't process this file.",
"I'm sorry, but without the image provided",
'Cannot determine the answer'
]
for err in reject_to_answer:
if err in answer:
return 'Z'
def count_choice(splits, choices, prefix='', suffix=''):
cnt = 0
for c in choices:
if prefix + c + suffix in splits:
cnt += 1
return cnt
answer_mod = copy.copy(answer)
chars = '.()[],:;!*#{}'
for c in chars:
answer_mod = answer_mod.replace(c, ' ')
splits = [x.strip() for x in answer_mod.split()]
count = count_choice(splits, choices)
if count == 1:
for ch in choices:
if 'A' in splits and len(splits) > 3:
# print(f'A might be a quantifier in the string: {answer}.')
return False
if ch in splits:
return ch
elif count == 0 and count_choice(splits, {'Z', ''}) == 1:
return 'Z'
return False
def can_infer_text(answer, choices):
"""Extract answer by matching text content."""
answer = answer.lower()
assert isinstance(choices, dict)
for k in choices:
assert k in string.ascii_uppercase
choices[k] = str(choices[k]).lower()
cands = []
for k in choices:
if choices[k] in answer:
cands.append(k)
if len(cands) == 1:
return cands[0]
return False
def can_infer(answer, choices):
"""Combined approach to infer answer choice."""
answer = str(answer)
copt = can_infer_option(answer, choices)
return copt if copt else can_infer_text(answer, choices)
def build_choices(item):
ret = {}
for ch in string.ascii_uppercase:
if ch in item and (not pd.isna(item[ch])):
ret[ch] = item[ch]
return ret
def build_option_str(option_dict):
s = 'There are several options: \n'
for c, content in option_dict.items():
if not pd.isna(content):
s += f'{c}. {content}\n'
return s
def build_prompt(question, options, prediction):
tmpl = (
'You are an AI assistant who will help me to match '
'an answer with several options of a single-choice question. '
'You are provided with a question, several options, and an answer, '
'and you need to find which option is most similar to the answer. '
'If the meaning of all options are significantly different from the answer, output Z. '
'You should output a single uppercase character in A, B, C, D (if they are valid options), and Z. \n'
'Example 1: \n'
'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n'
'Answer: a cute teddy bear\nYour output: A\n'
'Example 2: \n'
'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n'
'Answer: Spider\nYour output: Z\n'
'Example 3: \n'
'Question: {}?\nOptions: {}\nAnswer: {}\nYour output: '
)
return tmpl.format(question, options, prediction)
def extract_answer_from_item(model, item, wait=5):
"""Extract answer from model prediction using rule-based and model-based approaches."""
# Build choices dictionary
choices = build_choices(item)
option_str = build_option_str(choices)
prompt = build_prompt(item['question'], option_str, item['prediction'])
# Try rule-based extraction first
prediction = item['prediction']
ret = can_infer(prediction, choices)
if ret:
if ret == 'Z':
extract_flag = False
log = f"Rule extract failed with rule result: {ret} prediction: {prediction}"
else:
extract_flag = True
log = f"Rule extract success with rule result: {ret} prediction: {prediction}"
return dict(opt=ret, log=log, extract_model='rule', extract_flag=extract_flag)
# If rule-based extraction fails, use model-based extraction
print(f"Rule extract failed. Use model-based extraction.")
assert model is not None, 'Judge model is None for MMMU_DEV_VAL !!!'
# Try model-based extraction with retries
retry = 25
while retry:
ans = model.generate([{"type": "text", "value": prompt}])
if 'Failed to obtain answer via API' in ans:
print('API failed to answer.')
else:
ret = can_infer(ans, choices)
if ret and ret != 'Z':
log = f'{model.model} extract Succeed. {model.model}:{ans}\n'
return dict(opt=ret, log=log, extract_model=model.model, extract_flag=True)
else:
print(f'Output includes 0 / > 1 letter among candidates {set(choices)} and Z: {ans}')
retry -= 1
T = random.random() * wait * 2
time.sleep(T)
if retry == 0:
options = list(choices) + ['Z'] if 'Z' not in choices else list(choices)
log = f'{model.model} extract failed. randomly generate one. {model.model} response:{ans}\n'
return dict(opt=random.choice(options), log=log, extract_model=model.model, extract_flag=False)
def eval_single_sample(args):
"""Evaluate a single sample."""
model, item = args
# Extract answer using the combined approach
result = extract_answer_from_item(model, item)
# Determine if the answer is correct
hit = 1 if result['opt'] == item['GT'] else 0
return {
"index": item['index'],
"split": item['split'],
"question": item['question'],
"prediction": item['prediction'],
"extracted_answer": result['opt'],
"extraction_method": result['extract_model'],
"extraction_success": result['extract_flag'],
"extraction_log": result['log'],
"gt": item['GT'],
"hit": hit
}
#!/bin/bash
# MMMU Inference Script (Instruct Model)
# This script runs inference on the MMMU dataset using vLLM
python run_mmmu.py infer \
--model-path /path/to/Qwen3-VL-Instruct \
--data-dir /path/to/mmmu_data \
--dataset MMMU_DEV_VAL \
--output-file results/mmmu_dev_val_predictions.jsonl \
--max-new-tokens 32768 \
--temperature 0.7 \
--top-p 0.8 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 1.5
#!/bin/bash
# MMMU Inference Script (Thinking Model)
# This script runs inference on the MMMU dataset using vLLM with thinking mode parameters
python run_mmmu.py infer \
--model-path /path/to/Qwen3-VL-Thinking \
--data-dir /path/to/mmmu_data \
--dataset MMMU_DEV_VAL \
--output-file results/mmmu_dev_val_predictions_thinking.jsonl \
--max-new-tokens 40960 \
--temperature 1.0 \
--top-p 0.95 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 0.0 \
--tensor-parallel-size 4
vllm
transformers
qwen_vl_utils
pandas
numpy
Pillow
tqdm
requests
validators
torch
torchvision
accelerate
flash_attn
import os
import sys
import json
import argparse
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
from typing import List, Dict, Any
import torch
import warnings
import string
import traceback
# vLLM imports
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor
# Local imports from refactored files
from dataset_utils import load_dataset, dump_image, MMMU_preproc
from eval_utils import build_judge, eval_single_sample
# Set vLLM multiprocessing method
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
def build_mmmu_prompt(line, dump_image_func, dataset):
"""Build MMMU dataset prompt with standard resolution settings."""
# Standard resolution settings
MIN_PIXELS = 1280*28*28 # ~1M pixels
MAX_PIXELS = 5120*28*28 # ~4M pixels
tgt_path = dump_image_func(line)
question = line['question']
options = {cand: line[cand] for cand in string.ascii_uppercase if cand in line and not pd.isna(line[cand])}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'Question: {question}\n'
if len(options):
prompt += options_prompt
prompt += 'Please select the correct answer from the options above. \n'
prompt = prompt.rstrip()
# Build messages in standard conversation format
content = []
if isinstance(tgt_path, list):
for p in tgt_path:
content.append({
"type": "image",
"image": p,
"min_pixels": MIN_PIXELS,
"max_pixels": MAX_PIXELS
})
else:
content.append({
"type": "image",
"image": tgt_path,
"min_pixels": MIN_PIXELS,
"max_pixels": MAX_PIXELS
})
content.append({"type": "text", "text": prompt})
# Return messages in standard conversation format
messages = [{
"role": "user",
"content": content
}]
return messages
def prepare_inputs_for_vllm(messages, processor):
"""
Prepare inputs for vLLM (following the examples in README.md).
Args:
messages: List of messages in standard conversation format
processor: AutoProcessor instance
Returns:
dict: Input format required by vLLM
"""
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# qwen_vl_utils 0.0.14+ required
image_inputs, video_inputs, video_kwargs = process_vision_info(
messages,
image_patch_size=processor.image_processor.patch_size,
return_video_kwargs=True,
return_video_metadata=True
)
mm_data = {}
if image_inputs is not None:
mm_data['image'] = image_inputs
if video_inputs is not None:
mm_data['video'] = video_inputs
return {
'prompt': text,
'multi_modal_data': mm_data,
'mm_processor_kwargs': video_kwargs
}
def run_inference(args):
"""Run inference on the MMMU dataset using vLLM."""
print("\n" + "="*80)
print("🚀 MMMU Inference with vLLM (High-Speed Mode)")
print("="*80 + "\n")
# Load dataset
data = load_dataset(args.dataset)
print(f"✓ Loaded {len(data)} samples from {args.dataset}")
# Set up image root directory
img_root = os.path.join(os.environ['LMUData'], 'images', 'MMMU')
os.makedirs(img_root, exist_ok=True)
# Set up dump_image function
def dump_image_func(line):
return dump_image(line, img_root)
# Create output directory
os.makedirs(os.path.dirname(args.output_file), exist_ok=True)
# Set up CoT prompt if enabled
cot_prompt = ""
if args.use_cot:
cot_prompt = args.cot_prompt if args.cot_prompt else " If you are uncertain or the problem is too complex, make a reasoned guess based on the information provided. Avoid repeating steps indefinitely—provide your best guess even if unsure. Determine whether to think step by step based on the difficulty of the question, considering all relevant information before answering."
print(f"✓ Using CoT prompt: {cot_prompt[:50]}...")
# Set up generation parameters (vLLM SamplingParams format)
sampling_params = SamplingParams(
temperature=args.temperature,
top_p=args.top_p,
top_k=args.top_k,
max_tokens=args.max_new_tokens,
repetition_penalty=args.repetition_penalty,
presence_penalty=args.presence_penalty,
stop_token_ids=[],
)
print(f"\n⚙️ Generation parameters (vLLM SamplingParams):")
print(f" max_tokens={sampling_params.max_tokens}")
print(f" temperature={sampling_params.temperature}, top_p={sampling_params.top_p}, top_k={sampling_params.top_k}")
print(f" repetition_penalty={sampling_params.repetition_penalty}")
print(f" presence_penalty={sampling_params.presence_penalty}")
if sampling_params.presence_penalty > 0:
print(f" ✅ Anti-repetition enabled (presence_penalty={sampling_params.presence_penalty})")
if sampling_params.temperature <= 0.02 and sampling_params.top_k == 1:
print(f" ✅ Using FAST greedy-like decoding")
else:
print(f" ⚠️ Using sampling decoding (slower but more diverse)")
print()
# Load processor for input preparation
print(f"Loading processor from {args.model_path}")
processor = AutoProcessor.from_pretrained(args.model_path)
print("✓ Processor loaded\n")
# Initialize vLLM
print(f"Initializing vLLM with model: {args.model_path}")
print(f" GPU count: {torch.cuda.device_count()}")
print(f" Tensor parallel size: {args.tensor_parallel_size}")
llm = LLM(
model=args.model_path,
tensor_parallel_size=args.tensor_parallel_size,
gpu_memory_utilization=args.gpu_memory_utilization,
trust_remote_code=True,
max_model_len=args.max_model_len,
limit_mm_per_prompt={"image": args.max_images_per_prompt},
seed=42,
)
print("✓ vLLM initialized successfully\n")
# Prepare all inputs
print("Preparing inputs for vLLM...")
all_inputs = []
all_line_dicts = []
all_messages = []
for idx, (_, line) in enumerate(tqdm(data.iterrows(), total=len(data), desc="Building prompts")):
# Convert line to dict
line_dict = line.to_dict()
for k, v in line_dict.items():
if isinstance(v, np.integer):
line_dict[k] = int(v)
elif isinstance(v, np.floating):
line_dict[k] = float(v)
# Build prompt
messages = build_mmmu_prompt(line, dump_image_func, args.dataset)
# Add CoT prompt
if args.use_cot and len(messages) > 0 and len(messages[0]['content']) > 0:
last_content = messages[0]['content'][-1]
if last_content['type'] == 'text':
last_content['text'] += cot_prompt
# Prepare input for vLLM
vllm_input = prepare_inputs_for_vllm(messages, processor)
all_inputs.append(vllm_input)
all_line_dicts.append(line_dict)
all_messages.append(messages)
print(f"✓ Prepared {len(all_inputs)} inputs\n")
# Batch inference (vLLM automatic optimization)
print("="*80)
print("🚀 Running vLLM batch inference (automatic optimization)")
print("="*80)
start_time = time.time()
outputs = llm.generate(all_inputs, sampling_params=sampling_params)
end_time = time.time()
total_time = end_time - start_time
print(f"\n✓ Inference completed in {total_time:.2f} seconds")
print(f" Average: {total_time/len(data):.2f} seconds/sample")
print(f" Throughput: {len(data)/total_time:.2f} samples/second\n")
# Save results
print("Saving results...")
results = []
for idx, (line_dict, messages, output) in enumerate(zip(all_line_dicts, all_messages, outputs)):
response = output.outputs[0].text
index = line_dict['index']
response_final = str(response).split("</think>")[-1].strip()
result = {
"question_id": int(index) if isinstance(index, np.integer) else index,
"annotation": line_dict,
"task": args.dataset,
"result": {"gen": response_final, "gen_raw": response},
"messages": messages
}
results.append(result)
# Write final results
with open(args.output_file, 'w') as f:
for res in results:
f.write(json.dumps(res) + '\n')
print(f"\n✓ Results saved to {args.output_file}")
print(f"✓ Total samples processed: {len(results)}")
def run_evaluation(args):
"""Run evaluation on inference results."""
# Load results
results = []
with open(args.input_file, 'r') as f:
for line in f:
job = json.loads(line)
annotation = job["annotation"]
annotation["prediction"] = job["result"]["gen"]
results.append(annotation)
data = pd.DataFrame.from_records(results)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
# Lowercase column names, except single-letter choice labels (A, B, ...)
for k in list(data.keys()):
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
# Load dataset
meta = load_dataset(args.dataset)
# Validation
print(f"len(data): {len(data)}")
print(f"len(meta): {len(meta)}")
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
data_map = {x: y for x, y in zip(data['index'], data['question'])}
for k in data_map:
assert k in meta_q_map, (
f'eval_file should be the same as or a subset of dataset MMMU_DEV_VAL'
)
answer_map = {i: c for i, c in zip(meta['index'], meta['answer'])}
data = MMMU_preproc(data)
answer_map = {k: (v if v in list(string.ascii_uppercase) else 'A') for k, v in answer_map.items()}
data = data[data['index'].isin(answer_map)]
data['GT'] = [answer_map[idx] for idx in data['index']]
items = []
for i in range(len(data)):
item = data.iloc[i]
items.append(item)
# Build judge model
model = build_judge(
model=getattr(args, 'eval_model', 'gpt-3.5-turbo-0125'),
api_type=getattr(args, 'api_type', 'dash')
)
# Prepare evaluation tasks
eval_tasks = []
for item in items:
eval_tasks.append((model, item))
# Run evaluation
eval_results = []
# Debug mode: process single-threaded with first few samples
debug = os.environ.get('DEBUG', '').lower() == 'true'
if debug:
print("Running in debug mode with first 5 samples...")
for task in eval_tasks[:5]:
try:
result = eval_single_sample(task)
eval_results.append(result)
except Exception as e:
print(f"Error processing task: {e}")
print(f"Task details: {task}")
raise
else:
# Normal mode: process all samples with threading
from concurrent.futures import ThreadPoolExecutor
nproc = getattr(args, 'nproc', 4)
with ThreadPoolExecutor(max_workers=nproc) as executor:
for result in tqdm(executor.map(eval_single_sample, eval_tasks),
total=len(eval_tasks), desc="Evaluating"):
eval_results.append(result)
# Calculate overall accuracy
accuracy = sum(r['hit'] for r in eval_results) / len(eval_results)
# Calculate accuracy by split
results_by_split = {}
for result in eval_results:
split = result.get('split', 'unknown')
if split not in results_by_split:
results_by_split[split] = []
results_by_split[split].append(result)
accuracy_by_split = {}
for split, split_results in results_by_split.items():
split_accuracy = sum(r['hit'] for r in split_results) / len(split_results)
accuracy_by_split[split] = split_accuracy
print(f"Accuracy for {split} split: {split_accuracy:.4f} ({sum(r['hit'] for r in split_results)}/{len(split_results)})")
# Save results
output_df = pd.DataFrame(eval_results)
output_df.to_csv(args.output_file, index=False)
# Save accuracy
with open(args.output_file.replace('.csv', '_acc.json'), 'w') as f:
json.dump({
"overall_accuracy": accuracy,
"accuracy_by_split": accuracy_by_split
}, f, indent=2)
print(f"\n{'='*50}")
print(f"Evaluation Results:")
print(f"{'='*50}")
print(f"Overall accuracy: {accuracy:.4f}")
print(f"{'='*50}\n")
def main():
parser = argparse.ArgumentParser(description="MMMU Evaluation with vLLM")
subparsers = parser.add_subparsers(dest='command', help='Command to run')
# Inference parser
infer_parser = subparsers.add_parser("infer", help="Run inference with vLLM")
infer_parser.add_argument("--model-path", type=str, required=True, help="Path to the model")
infer_parser.add_argument("--dataset", type=str, default="MMMU_DEV_VAL", help="Dataset name")
infer_parser.add_argument("--data-dir", type=str, help="The absolute path of MMMU_DEV_VAL.tsv")
infer_parser.add_argument("--output-file", type=str, required=True, help="Output file path")
infer_parser.add_argument("--use-cot", action="store_true", help="Use Chain-of-Thought prompting")
infer_parser.add_argument("--cot-prompt", type=str, default="", help="Custom Chain-of-Thought prompt")
# vLLM specific parameters
infer_parser.add_argument("--tensor-parallel-size", type=int, default=None,
help="Tensor parallel size (default: number of GPUs)")
infer_parser.add_argument("--gpu-memory-utilization", type=float, default=0.9,
help="GPU memory utilization (0.0-1.0, default: 0.9)")
infer_parser.add_argument("--max-model-len", type=int, default=128000,
help="Maximum model context length (default: 128000, balance between performance and memory)")
infer_parser.add_argument("--max-images-per-prompt", type=int, default=10,
help="Maximum images per prompt (default: 10)")
# Generation parameters
infer_parser.add_argument("--max-new-tokens", type=int, default=32768,
help="Maximum number of tokens to generate (default: 2048)")
infer_parser.add_argument("--temperature", type=float, default=0.7,
help="Temperature for sampling (default: 0.7 for greedy-like decoding)")
infer_parser.add_argument("--top-p", type=float, default=0.8,
help="Top-p for sampling (default: 0.8 for greedy-like decoding)")
infer_parser.add_argument("--top-k", type=int, default=20,
help="Top-k for sampling (default: 20 for greedy decoding)")
infer_parser.add_argument("--repetition-penalty", type=float, default=1.0,
help="Repetition penalty (default: 1.0, increase to 1.2-1.5 to reduce repetition)")
infer_parser.add_argument("--presence-penalty", type=float, default=1.5,
help="Presence penalty (default: 1.5, range: 0.0-2.0, penalize tokens that have already appeared)")
# Evaluation parser
eval_parser = subparsers.add_parser("eval", help="Run evaluation")
eval_parser.add_argument("--data-dir", type=str, help="The absolute path of MMMU_DEV_VAL.tsv")
eval_parser.add_argument("--input-file", type=str, required=True, help="Input file with inference results")
eval_parser.add_argument("--output-file", type=str, required=True, help="Output file path")
eval_parser.add_argument("--dataset", type=str, default="MMMU_DEV_VAL", help="Dataset name")
eval_parser.add_argument("--eval-model", type=str, default="gpt-3.5-turbo-0125",
help="Model to use for evaluation (default: gpt-3.5-turbo-0125)")
eval_parser.add_argument("--api-type", type=str, default="dash", choices=["dash", "mit"],
help="API type for evaluation")
eval_parser.add_argument("--nproc", type=int, default=4, help="Number of processes to use")
args = parser.parse_args()
# Set data directory if provided
if hasattr(args, 'data_dir') and args.data_dir:
os.environ['LMUData'] = args.data_dir
# Automatically set tensor_parallel_size
if args.command == 'infer' and args.tensor_parallel_size is None:
args.tensor_parallel_size = torch.cuda.device_count()
print(f"Auto-set tensor_parallel_size to {args.tensor_parallel_size}")
if args.command == 'infer':
run_inference(args)
elif args.command == 'eval':
run_evaluation(args)
else:
parser.print_help()
if __name__ == "__main__":
main()
# Unique model identifier
modelCode=1858
# Model name
modelName=Qwen3-VL_pytorch
# Model description
modelDescription=Qwen3-VL is a comprehensive upgrade across the board and is, to date, the most capable vision-language model in the Qwen series.
# Run mode
processType=Inference
# Algorithm category
appCategory=Multimodal
# Framework type
frameType=pytorch
# Accelerator type
accelerateType=BW1000
# QwenVL Training Framework
This repository provides a training framework for Qwen VL models. There are two steps to using the repo:
1. Customize your dataset: download the data and implement the dataset config.
2. Modify the training scripts.
## Repository Structure
The `qwenvl` directory contains the following components:
### `train/`
- `trainer.py`: Main trainer updated from Huggingface Trainer
- `train_qwen.py`: Main file for training
- `argument.py`: Dataclasses for model, data and training arguments
### `data/`
- `__init__.py`: Contains datasets configs
- `data_processor.py`: Data processing module for QwenVL models
- `rope2d.py`: Provide RoPE implementation
### `tools`
- `process_bbox.ipynb`: Converts bounding boxes into the QwenVL format. If you have grounding data, refer to this notebook to transform it.
- `pack_data.py`: Pack data into even length buckets.
## Requirements
The following package versions are known to work:
- `torch==2.6.0`
- `torchvision==0.21.0`
- `transformers==4.57.0.dev0`
- `deepspeed==0.17.1`
- `flash_attn==2.7.4.post1`
- `triton==3.2.0`
- `accelerate==1.7.0`
- `torchcodec==0.2`
- `peft==0.17.1`
## Custom Dataset Configuration
Customized data should follow this format:
### JSON Data Structure
**Media Specification**:
- `image/video`: Contains path to the media file (required)
- Media tags in prompts:
- `<image>` for image understanding tasks
- `<video>` for video understanding tasks
- `conversations`: contains the questions and answers
### Example Instances:
1. **Single Image Example**:
```json
{
"image": "images/001.jpg",
"conversations": [
{
"from": "human",
"value": "<image>\nWhat's the main object in this picture?"
},
{
"from": "gpt",
"value": "A red apple on a wooden table"
}
]
}
```
2. **Multi-Image Example**:
```json
{
"image": ["cats/001.jpg", "cats/002.jpg"],
"conversations": [
{
"from": "human",
"value": "<image>\n<image>\nWhat are the differences between these two cats?"
},
{
"from": "gpt",
"value": "The first cat is an orange tabby with short fur and green eyes, while the second is a gray Siamese with blue eyes and pointed coloration. They also appear to be in different environments - the first is indoors on a couch, the second is outdoors in a garden."
}
]
}
```
3. **Video Example**:
```json
{
"video": "videos/005.mp4",
"conversations": [
{
"from": "human",
"value": "<video>\nWhat caused the blue object to move?\nOptions:\n(A) Gravity\n(B) Collision\n(C) Magnetic force"
},
{
"from": "gpt",
"value": "Answer: (B) Collision"
}
]
}
```
4. **Grounding Example**:
```json
{
"image": "demo/COCO_train2014_000000580957.jpg",
"conversations": [
{
"from": "human",
"value": "<image>\nLocate house in this image and output the bbox coordinates in JSON format."
},
{
"from": "gpt",
"value": "{\n"bbox_2d": [135, 114, 1016, 672]\n}"
}
]
}
```
5. **Packed Data Example**:
```json
[
{
"image": "images/001.jpg",
"conversations": [
{
"from": "human",
"value": "<image>\nWhat's the main object in this picture?"
},
{
"from": "gpt",
"value": "A red apple on a wooden table"
}
]
},
{
"image": "images/002.jpg",
"conversations": [
{
"from": "human",
"value": "<image>\nWhat's the main object in this picture?"
},
{
"from": "gpt",
"value": "A green orange on a plastic table"
}
]
}
]
```
Example annotation files are provided in `demo/single_images.json` and `demo/video.json`; these JSON files can be used directly for training.
### Dataset config for training
To add or modify datasets for training, follow these steps:
### Dataset Definition Structure
1. **Create a dataset dictionary** in `data/__init__.py`, following this format:
```python
DATASET_NAME = {
"annotation_path": "/path/to/annotations.json",
"data_path": "/path/to/image/data", # Can be empty if paths are in annotations
}
```
2. **Register your dataset** by adding it to the `data_dict`:
```python
data_dict = {
"your_dataset_name": DATASET_NAME,
# ... other datasets
}
```
### Sampling Rate Control
You can optionally specify sampling rates by appending `%X` to the dataset name:
- `"dataset_name%50"` will sample 50% of the data
- `"dataset_name%20"` will sample 20% of the data
### Usage Example
1. Define your dataset:
```python
MY_DATASET = {
"annotation_path": "/data/my_dataset/annotations.json",
"data_path": "/data/my_dataset/images/",
}
data_dict = {
"my_dataset": MY_DATASET,
"cambrian_737k": CAMBRIAN_737K, # existing dataset
}
```
2. Use it in training:
```python
dataset_names = ["my_dataset%50"] # Will use 50% of your dataset
configs = data_list(dataset_names)
```
### Notes
- The `annotation_path` should point to a JSON or JSONL file containing your dataset annotations.
- The `data_path` can be left empty if the image paths in the annotations are absolute.
- Sampling rates are applied per-dataset when multiple datasets are specified.
- Some datasets you can use directly: `nyu-visionx/Cambrian-10M`, `lmms-lab/LLaVA-NeXT-Data`, `FreedomIntelligence/ALLaVA-4V`, `TIGER-Lab/VisualWebInstruct`.
- The training data should strictly follow this format:
- One `<image>` tag in the question must correspond to exactly one image file
- Similarly, `<video>` tags must correspond to video files
- These special tokens should not appear in the answer text
- For open source data that might have missing images or other issues, you can verify data completeness using `tools/check_image.py`.
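As a rough illustration of the last two points, here is a hedged sketch (not part of the repo; `tools/check_image.py` is the real checker, and the annotation file name is taken from the demo examples above) that verifies the `<image>` tag count matches the number of image files for each sample:
```python
import json

def image_tags_match(sample: dict) -> bool:
    """Check that <image> tags in human turns match the number of image files."""
    images = sample.get("image", [])
    if isinstance(images, str):
        images = [images]
    n_tags = sum(turn["value"].count("<image>")
                 for turn in sample["conversations"] if turn["from"] == "human")
    return n_tags == len(images)

# Assumes the annotation file is a JSON list of samples, as in the examples above.
with open("demo/single_images.json") as f:
    samples = json.load(f)
print(all(image_tags_match(s) for s in samples))
```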
## Usage
To train a model:
```bash
#!/bin/bash
# Complete QwenVL Training Launch Script with Full Parameter Documentation
# ======================
# Distributed Configuration
# ======================
MASTER_ADDR="127.0.0.1" # [Required] Master node IP for multi-GPU training
MASTER_PORT=$(shuf -i 20000-29999 -n 1) # Random port to avoid conflicts
NPROC_PER_NODE=$(nvidia-smi --list-gpus | wc -l) # Automatically detects available GPUs
# ======================
# Path Configuration
# ======================
MODEL_PATH="/path/to/Qwen2.5-VL-3B-Instruct" # [ModelArguments] Pretrained model path
OUTPUT_DIR="./checkpoints" # Directory for saving checkpoints
CACHE_DIR="./cache" # [TrainingArguments] Cache directory for models
# ======================
# Model Configuration
# ======================
DATASETS="your_dataset%100" # [DataArguments] Dataset with sampling rate
# ======================
# Training Hyperparameters
# ======================
torchrun --nproc_per_node=$NPROC_PER_NODE \
--master_addr=$MASTER_ADDR \
--master_port=$MASTER_PORT \
qwenvl/train/train_qwen.py \
# Core Arguments
--model_name_or_path $MODEL_PATH \ # [ModelArguments] Model identifier
--tune_mm_llm True \ # [TrainingArguments] Train LLM or not
--tune_mm_vision False \ # [TrainingArguments] Train VIT or not
--tune_mm_mlp False \ # [TrainingArguments] Train MLP or not
--dataset_use $DATASETS \ # [DataArguments] Dataset specification
--output_dir $OUTPUT_DIR \ # Output directory for checkpoints
--cache_dir $CACHE_DIR \ # [TrainingArguments] Model cache location
# Precision & Memory
--bf16 \ # Use bfloat16 precision (Ampere+ GPUs)
--per_device_train_batch_size 4 \ # Batch size per GPU
--gradient_accumulation_steps 4 \ # Effective batch size multiplier
# Learning Rate Configuration
--learning_rate 2e-7 \ # Base learning rate
--mm_projector_lr 1e-5 \ # [TrainingArguments] Projector-specific LR
--vision_tower_lr 1e-6 \ # [TrainingArguments] Vision encoder LR
--optim adamw_torch \ # [TrainingArguments] Optimizer selection
# Sequence Configuration
--model_max_length 4096 \ # [TrainingArguments] Max sequence length
--data_flatten True \ # [DataArguments] Concatenate batch sequences
--data_packing True \ # [DataArguments] Using packing data
# Image Processing
--max_pixels 576\*28\*28 \ # [DataArguments] Max image pixels (H*W) for image
--min_pixels 16\*28\*28 \ # [DataArguments] Min image pixels for image
# Video Processing
--video_fps 2 \ # [DataArguments] video fps
--video_max_frames 8 \ # [DataArguments] Max frames per video
--video_min_frames 4 \ # [DataArguments] Min frames per video
--video_max_pixels 1664\*28\*28 \ # [DataArguments] Max pixels per video
--video_min_pixels 256\*28\*28 \ # [DataArguments] Min pixels per video
# Training Schedule
--num_train_epochs 3 \ # Total training epochs
--warmup_ratio 0.03 \ # LR warmup proportion
--lr_scheduler_type "cosine" \ # Learning rate schedule
--weight_decay 0.01 \ # L2 regularization strength
# Logging & Checkpoints
--logging_steps 10 \ # Log metrics interval
--save_steps 500 \ # Checkpoint save interval
--save_total_limit 3 \ # Max checkpoints to keep
# Lora Config
--lora_enable True \ # [TrainingArguments] Enable LoRA
--lora_r 8 \ # [TrainingArguments] LoRA r
--lora_alpha 16 \ # [TrainingArguments] LoRA alpha
--lora_dropout 0.0 \ # [TrainingArguments] LoRA dropout
# Advanced Options
--deepspeed zero3.json \ # DeepSpeed configuration
```
The script arguments fall into several categories; a few notes:
- Flags control which components are tuned (`tune_mm_vision`, `tune_mm_mlp`, `tune_mm_llm`). When training on both image and video data, set `tune_mm_vision=False`.
- The `data_flatten` flag concatenates the samples in a batch into a single sequence.
- `data_packing` requires preprocessing the data with `tools/pack_data.py`.
- Training hyperparameters: the suggested learning rate range is 2e-7 to 1e-6.
- Training resolution is critical for model performance, so set `--max_pixels` and `--min_pixels` appropriately.
- Training the Qwen2.5-VL-32B model requires 8x 80GB GPUs; refer to `scripts/sft_32b.sh`.
- `"_attn_implementation": "flash_attention_2"` can be added to the model's config.json to enable flash attention.
- The Qwen3VL MoE model does not support DeepSpeed ZeRO-3, and Hugging Face's official implementation does not currently support the load-balancing loss.