Commit 3d735feb authored by luopl

"Initial commit"

import os
import requests
import time
import random
import copy
import traceback
import pandas as pd
from PIL import Image
from typing import List, Dict, Tuple, Any
from common_utils import encode_image_to_base64
from collections import defaultdict
try:
from latex2sympy2 import latex2sympy
except ImportError:
print('Warning: latex2sympy2 not installed. Install with: pip install latex2sympy2')
latex2sympy = None
FAIL_MSG = 'Failed to obtain answer via API.'
def is_equal(asw: str, gt_asw: str) -> bool:
"""Check if two answers are equal."""
if not isinstance(asw, str) or not isinstance(gt_asw, str):
print('Warning: input is not string')
print(asw, gt_asw)
asw = str(asw).lower().strip()
gt_asw = str(gt_asw).lower().strip()
if gt_asw == asw:
return True
try:
a = eval(gt_asw)
b = eval(asw)
if abs(a - b) < 1e-6:
return True
except Exception:
pass
if latex2sympy is not None:
try:
a = latex2sympy(gt_asw)
b = latex2sympy(asw)
if abs(eval(str(a)) - eval(str(b))) < 1e-6:
return True
if abs(a - b) < 1e-6:
return True
except Exception:
pass
return False
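# Illustrative behaviour (the LaTeX case assumes latex2sympy2 is installed):
#   is_equal('0.5', '1/2')           -> True  (numeric comparison via eval)
#   is_equal(r'\frac{1}{2}', '0.5')  -> True  (parsed with latex2sympy)
#   is_equal('B', 'C')               -> False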
def get_gpt4_ICE():
"""Get in-context examples for GPT-4 answer extraction."""
example_1 = """
Hint: Please answer the question and provide the final answer at the end.\n
Question: Which number is missing?\n
Model response: The number missing in the sequence is 14.\n
Extracted answer: 14
"""
example_2 = """
Hint: Please answer the question and provide the final answer at the end.\n
Question: What is the fraction of females facing the camera?\n
Model response: The fraction of females facing the camera is 0.6,
which means that six out of ten females in the group are facing the camera.\n
Extracted answer: 0.6
"""
example_3 = """
Hint: Please answer the question and provide the final answer at the end.\n
Question: How much money does Luca need to buy a sour apple candy and a butter-scotch candy? (Unit: $)\n
Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy.\n
Extracted answer: 1.45
"""
example_4 = """
Hint: Please answer the question and provide the final answer at the end.\n
Question: Between which two years does the line graph saw its maximum peak?\n
Model response: The line graph saw its maximum peak between 2007 and 2008.\n
Extracted answer: [2007, 2008]
"""
example_5 = """
Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.\n
Question: What fraction of the shape is blue?\n
Choices: (A) 3/11 (B) 8/11 (C) 6/11 (D) 3/5\n
Model response: The correct answer is (B) 8/11.\n
Extracted answer: B
"""
return [example_1, example_2, example_3, example_4, example_5]
def build_mathv_gpt4_prompt(line):
"""Build the prompt for GPT-4 to extract answer from model response."""
task_description = """
Please read the following example.
Then extract the answer from the model response and type it at the end of the prompt.\n
"""
question = line['question']
prediction = str(line['prediction'])
prompt = task_description
examples = get_gpt4_ICE()
for example in examples:
prompt += example + '\n'
prompt += question + '\n'
prompt += 'Model response: ' + prediction + '\n'
prompt += 'Extracted answer: '
return prompt
def list_to_dict(lst):
"""Convert list to dictionary with uppercase letters as keys."""
return {chr(65 + i): val for i, val in enumerate(lst)}
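# e.g. list_to_dict(['3/11', '8/11', '6/11', '3/5']) -> {'A': '3/11', 'B': '8/11', 'C': '6/11', 'D': '3/5'}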
def can_infer_option(answer, choices):
"""Rule-based extraction of answer option."""
if FAIL_MSG in answer:
return False
reject_to_answer = [
"Sorry, I can't help with images of people yet.",
"I can't process this file.",
"I'm sorry, but without the image provided",
'Cannot determine the answer'
]
for err in reject_to_answer:
if err in answer:
return 'Z'
def count_choice(splits, choices, prefix='', suffix=''):
cnt = 0
for c in choices:
if prefix + c + suffix in splits:
cnt += 1
return cnt
answer_mod = copy.copy(answer)
chars = '.()[],:;!*#{}'
for c in chars:
answer_mod = answer_mod.replace(c, ' ')
splits = [x.strip() for x in answer_mod.split()]
count = count_choice(splits, choices)
if count == 1:
for ch in choices:
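# Note: a bare 'A' inside a long response is often just the article "a", not the option letter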
if 'A' in splits and len(splits) > 3:
return False
if ch in splits:
return ch
elif count == 0 and count_choice(splits, {'Z', ''}) == 1:
return 'Z'
return False
def can_infer_text(answer, choices):
"""Extract answer by matching text content."""
answer = answer.lower()
assert isinstance(choices, dict)
for k in choices:
choices[k] = str(choices[k]).lower()
cands = []
for k in choices:
if choices[k] in answer:
cands.append(k)
if len(cands) == 1:
return cands[0]
return False
def can_infer(answer, choices):
"""Combined approach to infer answer choice."""
answer = str(answer)
copt = can_infer_option(answer, choices)
return copt if copt else can_infer_text(answer, choices)
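# Illustrative:
#   can_infer('The correct answer is (B) 8/11.', {'A': '3/11', 'B': '8/11'}) -> 'B'
# Falls back to text matching when no option letter is found, e.g.
#   can_infer('the fraction is 8/11', {'A': '3/11', 'B': '8/11'}) -> 'B'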
def post_check(line, prefetch=False):
"""Check if the prediction matches the answer."""
res = None
ans = line['answer']
response = line['prediction'] if prefetch else line['res']
try:
if len(eval(line['choices'])) > 0:
ans = line['answer']
choices = list_to_dict(eval(line['choices']))
res = can_infer(response, choices)
if prefetch:
return res
else:
res = str(response)
ans = str(ans)
except ValueError:
pass
if is_equal(res, ans):
return res if prefetch else True
else:
return False
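# Illustrative: for a multiple-choice line (choices == "['3/11', '8/11', '6/11', '3/5']")
# with prediction "The answer is (B)", post_check(line, prefetch=True) returns 'B';
# for open-ended lines (choices == "[]") the prediction is compared to the answer via is_equal.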
class OpenAIWrapper:
"""Wrapper for OpenAI API."""
def __init__(self, model, api_base, api_key, timeout=60, retry=5, wait=5):
self.model = model
self.api_base = api_base
self.api_key = api_key
self.timeout = timeout
self.retry = retry
self.wait = wait
self.fail_msg = FAIL_MSG
def generate(self, prompt, temperature=0):
"""Generate a response from the API."""
headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}'}
payload = {
"model": self.model,
"messages": [{"role": "user", "content": prompt}],
"max_tokens": 4096,
"temperature": temperature
}
for i in range(self.retry):
try:
response = requests.post(
self.api_base,
headers=headers,
json=payload,
timeout=self.timeout
)
if response.status_code == 200:
resp_json = response.json()
return resp_json['choices'][0]['message']['content'].strip()
time.sleep(self.wait)
except Exception as e:
print(f"API error: {e}")
time.sleep(self.wait)
return self.fail_msg
class DashScopeWrapper:
"""Wrapper for DashScope API."""
def __init__(self, model, api_base, api_key, timeout=60, retry=5, wait=5):
self.model = model
self.api_base = api_base
self.api_key = api_key
self.timeout = timeout
self.retry = retry
self.wait = wait
self.fail_msg = FAIL_MSG
def generate(self, prompt, temperature=0):
"""Generate a response from the API."""
headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}'}
payload = {
"model": self.model,
"messages": [{"role": "user", "content": prompt}],
"max_completion_tokens": 4096,
"n": 1,
"temperature": temperature,
"stream": False
}
for i in range(self.retry):
try:
response = requests.post(
self.api_base,
headers=headers,
json=payload,
timeout=self.timeout
)
if response.status_code == 200:
resp_json = response.json()
# Retry the request if any choice finished abnormally (e.g. truncated)
finish_reasons = [output['finish_reason'] for output in resp_json['choices']]
if any(reason not in ['stop', 'function_call'] for reason in finish_reasons):
print(f"DashScope finished with error: {resp_json}")
time.sleep(self.wait)
continue
return resp_json['choices'][0]['message']['content']
else:
print(f"DashScope API error: HTTP {response.status_code}")
try:
error_content = response.json()
print(f"Error details: {error_content}")
except Exception:
print(f"Raw error content: {response.content.decode('utf-8', errors='replace')}")
time.sleep(self.wait)
except Exception as e:
print(f"DashScope error: {e}")
time.sleep(self.wait)
return self.fail_msg
def build_judge(model, api_type):
"""Build a judge model for evaluation."""
if api_type == 'mit':
api_key = os.environ.get('MIT_SPIDER_TOKEN', '')
api_base = os.environ.get('MIT_SPIDER_URL', '')
return OpenAIWrapper(model, api_base, api_key)
elif api_type == 'dash':
api_key = os.environ.get('CHATGPT_DASHSCOPE_API_KEY', '')
api_base = os.environ.get('DASHSCOPE_API_BASE', '')
return DashScopeWrapper(model, api_base, api_key)
else:
raise ValueError(f"Unsupported API type: {api_type}")
def MATH_V_auxeval(args):
"""Auxiliary evaluation for MathVision - extract answer from model response."""
model, line = args
prompt = build_mathv_gpt4_prompt(line)
log = ''
retry = 5
# Try rule-based extraction first
res = post_check(line, prefetch=True)
if res:
log += 'Prefetch succeed.\n'
extract_flag = True
if res == 'Z':
extract_flag = False
log += f'Rule extract failed with ans: {res}'
else:
log += f'Rule extract success with ans: {res}'
return dict(log=log, res=res, extract_model='rule', extract_flag=extract_flag)
# Use model-based extraction
for i in range(retry):
prediction = line['prediction']
res = model.generate(prompt, temperature=i * 0.5)
if FAIL_MSG in res:
log += f'Try {i}: output is {prediction}, failed to parse.\n'
else:
log += f'{model.model} extract Succeed.\n'
return dict(log=log, res=res, extract_model=model.model, extract_flag=True)
log += f'All {retry} retries failed.\n {model.model} response:{res}'
return dict(log=log, res='', extract_model=model.model, extract_flag=False)
def MATH_V_acc(result_file):
"""Calculate accuracy for MathVision results."""
data = pd.read_excel(result_file) if result_file.endswith('.xlsx') else pd.read_csv(result_file)
tot = defaultdict(lambda: 0)
fetch = defaultdict(lambda: 0)
hit = defaultdict(lambda: 0)
lt = len(data)
extract_counts = {}
for i in range(lt):
item = data.iloc[i]
cate = item['category']
tot['Overall'] += 1
tot[cate] += 1
if 'Prefetch succeed' in item['log']:
fetch['Overall'] += 1
fetch[cate] += 1
if post_check(item, prefetch=False):
hit['Overall'] += 1
hit[cate] += 1
# Statistics of answers extracted by rule and gpt
extract_model = item['extract_model']
extract_flag = item['extract_flag']
if extract_model in extract_counts:
extract_counts[extract_model][1] += 1
else:
extract_counts[extract_model] = [0, 1] # succeed, total
if extract_flag:
extract_counts[extract_model][0] += 1
res = defaultdict(list)
for k in tot.keys():
res['Subject'].append(k)
res['tot'].append(tot[k])
res['prefetch'].append(fetch[k])
res['hit'].append(hit[k])
res['prefetch_rate'].append(fetch[k] / tot[k] * 100)
res['acc'].append(hit[k] / tot[k] * 100)
if k == 'Overall':
for model_key in extract_counts:
res[model_key+'_success'].append(extract_counts[model_key][0])
res[model_key+'_all'].append(extract_counts[model_key][1])
else:
for model_key in extract_counts:
res[model_key+'_success'].append(0)
res[model_key+'_all'].append(0)
res = pd.DataFrame(res).sort_values('Subject', ignore_index=True)
return res
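# Illustrative usage:
#   score_df = MATH_V_acc('results/mathvision_predictions_eval.xlsx')
# returns one row per category plus an 'Overall' row with tot/prefetch/hit counts,
# prefetch_rate and acc percentages, and per-extractor success counts.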
def eval_single_sample(args):
"""Evaluate a single sample."""
return MATH_V_auxeval(args)
#!/bin/bash
# MathVision Inference Script (Instruct Model)
# This script runs inference on the MathVision dataset using vLLM
python run_mathv.py infer \
--model-path /path/to/Qwen3-VL-Instruct \
--data-dir /path/to/mathvision_data \
--dataset MathVision \
--output-file results/mathvision_predictions.jsonl \
--max-new-tokens 32768 \
--temperature 0.7 \
--top-p 0.8 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 1.5
# --num-samples 100
#!/bin/bash
# MathVision Inference Script (Thinking Model)
# This script runs inference on the MathVision dataset using vLLM with thinking mode parameters
python run_mathv.py infer \
--model-path /path/to/Qwen3-VL-Thinking \
--data-dir /path/to/mathvision_data \
--dataset MathVision \
--output-file results/mathvision_predictions_thinking.jsonl \
--max-new-tokens 40960 \
--temperature 1.0 \
--top-p 0.95 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 0.0
# --num-samples 100
vllm
transformers
qwen_vl_utils
pandas
numpy
Pillow
# Utilities
tqdm
requests
validators
torch
torchvision
accelerate
openpyxl
latex2sympy2
flash_attn
import os
import sys
import json
import argparse
import pandas as pd
import numpy as np
import time
import re
from tqdm import tqdm
from typing import List, Dict, Any
import torch
import warnings
import traceback
# vLLM imports
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor
# Local imports from refactored files
from dataset_utils import load_dataset, dump_image
from eval_utils import build_judge, eval_single_sample, MATH_V_acc
# Set vLLM multiprocessing method
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
def clean_for_excel(val):
"""
Remove characters that are illegal in Excel cells.
Excel doesn't support control characters (0x00-0x1F) except tab, newline, carriage return.
"""
if isinstance(val, str):
# Remove control characters (0x00-0x1F) except tab(0x09), newline(0x0A), carriage return(0x0D)
return re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F]', '', val)
return val
def clean_dataframe_for_excel(df):
"""Clean all string columns in a DataFrame for Excel compatibility."""
return df.map(clean_for_excel) if hasattr(df, 'map') else df.applymap(clean_for_excel)
def build_mathv_prompt(line, dump_image_func, dataset):
"""
Build MathVision dataset prompt.
"""
# Standard resolution (MathVision uses smaller min_pixels)
MIN_PIXELS = 768*28*28 # ~0.6M pixels
MAX_PIXELS = 5120*28*28 # ~4M pixels
tgt_path = dump_image_func(line)
question = line['question']
# Build messages in standard conversation format
content = []
# Add all images first
if isinstance(tgt_path, list):
for p in tgt_path:
content.append({
"type": "image",
"image": p,
"min_pixels": MIN_PIXELS,
"max_pixels": MAX_PIXELS
})
else:
content.append({
"type": "image",
"image": tgt_path,
"min_pixels": MIN_PIXELS,
"max_pixels": MAX_PIXELS
})
# Add question text last
content.append({"type": "text", "text": question})
# Return messages in standard conversation format
messages = [{
"role": "user",
"content": content
}]
return messages
def prepare_inputs_for_vllm(messages, processor):
"""
Prepare inputs for vLLM (following the examples in README.md).
Args:
messages: List of messages in standard conversation format
processor: AutoProcessor instance
Returns:
dict: Input format required by vLLM
"""
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# qwen_vl_utils 0.0.14+ required
image_inputs, video_inputs, video_kwargs = process_vision_info(
messages,
image_patch_size=processor.image_processor.patch_size,
return_video_kwargs=True,
return_video_metadata=True
)
mm_data = {}
if image_inputs is not None:
mm_data['image'] = image_inputs
if video_inputs is not None:
mm_data['video'] = video_inputs
return {
'prompt': text,
'multi_modal_data': mm_data,
'mm_processor_kwargs': video_kwargs
}
def run_inference(args):
"""Run inference on the MathVision dataset using vLLM."""
print("\n" + "="*80)
print("🚀 MathVision Inference with vLLM (High-Speed Mode)")
print("="*80 + "\n")
# Load dataset
data = load_dataset(args.dataset)
# Limit number of samples if specified
if args.num_samples is not None and args.num_samples > 0:
original_len = len(data)
data = data.iloc[:args.num_samples]
print(f"✓ Loaded {len(data)} samples from {args.dataset} (limited from {original_len} samples)")
else:
print(f"✓ Loaded {len(data)} samples from {args.dataset}")
# Set up image root directory
img_root = os.path.join(os.environ['LMUData'], 'images', args.dataset)
os.makedirs(img_root, exist_ok=True)
# Set up dump_image function
def dump_image_func(line):
return dump_image(line, img_root)
# Create output directory
os.makedirs(os.path.dirname(args.output_file), exist_ok=True)
# Set up CoT prompt if enabled
cot_prompt = ""
if args.use_cot:
cot_prompt = args.cot_prompt if args.cot_prompt else " Let's think step by step."
print(f"✓ Using CoT prompt: {cot_prompt[:50]}...")
# Set up generation parameters (vLLM SamplingParams format)
sampling_params = SamplingParams(
temperature=args.temperature,
top_p=args.top_p,
top_k=args.top_k,
max_tokens=args.max_new_tokens,
repetition_penalty=args.repetition_penalty,
presence_penalty=args.presence_penalty,
stop_token_ids=[],
)
print(f"\n⚙️ Generation parameters (vLLM SamplingParams):")
print(f" max_tokens={sampling_params.max_tokens}")
print(f" temperature={sampling_params.temperature}, top_p={sampling_params.top_p}, top_k={sampling_params.top_k}")
print(f" repetition_penalty={sampling_params.repetition_penalty}")
print(f" presence_penalty={sampling_params.presence_penalty}")
if sampling_params.presence_penalty > 0:
print(f" ✅ Anti-repetition enabled (presence_penalty={sampling_params.presence_penalty})")
if sampling_params.temperature <= 0.02 and sampling_params.top_k == 1:
print(f" ✅ Using FAST greedy-like decoding")
else:
print(f" ⚠️ Using sampling decoding (slower but more diverse)")
print()
# Load processor for input preparation
print(f"Loading processor from {args.model_path}")
processor = AutoProcessor.from_pretrained(args.model_path)
print("✓ Processor loaded\n")
# Initialize vLLM
print(f"Initializing vLLM with model: {args.model_path}")
print(f" GPU count: {torch.cuda.device_count()}")
print(f" Tensor parallel size: {args.tensor_parallel_size}")
llm = LLM(
model=args.model_path,
tensor_parallel_size=args.tensor_parallel_size,
gpu_memory_utilization=args.gpu_memory_utilization,
trust_remote_code=True,
max_model_len=args.max_model_len,
limit_mm_per_prompt={"image": args.max_images_per_prompt},
seed=42,
)
print("✓ vLLM initialized successfully\n")
# Prepare all inputs
print("Preparing inputs for vLLM...")
all_inputs = []
all_line_dicts = []
all_messages = []
for idx, (_, line) in enumerate(tqdm(data.iterrows(), total=len(data), desc="Building prompts")):
# Convert line to dict
line_dict = line.to_dict()
for k, v in line_dict.items():
if isinstance(v, np.integer):
line_dict[k] = int(v)
elif isinstance(v, np.floating):
line_dict[k] = float(v)
# Build prompt
messages = build_mathv_prompt(line, dump_image_func, args.dataset)
# Add CoT prompt
if args.use_cot and len(messages) > 0 and len(messages[0]['content']) > 0:
last_content = messages[0]['content'][-1]
if last_content['type'] == 'text':
last_content['text'] += cot_prompt
# Prepare input for vLLM
vllm_input = prepare_inputs_for_vllm(messages, processor)
all_inputs.append(vllm_input)
all_line_dicts.append(line_dict)
all_messages.append(messages)
print(f"✓ Prepared {len(all_inputs)} inputs\n")
# Batch inference (vLLM automatic optimization)
print("="*80)
print("🚀 Running vLLM batch inference (automatic optimization)")
print("="*80)
start_time = time.time()
outputs = llm.generate(all_inputs, sampling_params=sampling_params)
end_time = time.time()
total_time = end_time - start_time
print(f"\n✓ Inference completed in {total_time:.2f} seconds")
print(f" Average: {total_time/len(data):.2f} seconds/sample")
print(f" Throughput: {len(data)/total_time:.2f} samples/second\n")
# Save results
print("Saving results...")
results = []
for idx, (line_dict, messages, output) in enumerate(zip(all_line_dicts, all_messages, outputs)):
response = output.outputs[0].text
index = line_dict['index']
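# Thinking models wrap reasoning in <think>...</think>; keep only the text after the closing tag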
response_final = str(response).split("</think>")[-1].strip()
result = {
"question_id": int(index) if isinstance(index, np.integer) else index,
"annotation": line_dict,
"task": args.dataset,
"result": {"gen": response_final, "gen_raw": response},
"messages": messages
}
results.append(result)
# Write final results
with open(args.output_file, 'w') as f:
for res in results:
f.write(json.dumps(res) + '\n')
print(f"\n✓ Results saved to {args.output_file}")
print(f"✓ Total samples processed: {len(results)}")
def run_evaluation(args):
"""Run evaluation on inference results."""
# Load results
results = []
with open(args.input_file, 'r') as f:
for line in f:
job = json.loads(line)
annotation = job["annotation"]
annotation["prediction"] = job["result"]["gen"]
results.append(annotation)
data = pd.DataFrame.from_records(results)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
# Load dataset for validation
meta = load_dataset(args.dataset)
# Validation
print(f"len(data): {len(data)}")
print(f"len(meta): {len(meta)}")
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
data_map = {x: y for x, y in zip(data['index'], data['question'])}
for k in data_map:
assert k in meta_q_map, (
f'eval_file should be the same as or a subset of dataset {args.dataset}'
)
# Save intermediate results
output_xlsx = args.output_file.replace('.csv', '.xlsx') if args.output_file.endswith('.csv') else args.output_file
clean_dataframe_for_excel(data).to_excel(output_xlsx, index=False)
print(f"✓ Saved intermediate results to {output_xlsx}")
# Build judge model
model = build_judge(
model=getattr(args, 'eval_model', 'gpt-4o-2024-05-13'),
api_type=getattr(args, 'api_type', 'dash')
)
# Prepare evaluation tasks
eval_tasks = []
for i in range(len(data)):
item = data.iloc[i]
eval_tasks.append((model, item))
# Run evaluation
eval_results = []
# Debug mode: process single-threaded with first few samples
debug = os.environ.get('DEBUG', '').lower() == 'true'
if debug:
print("Running in debug mode with first 5 samples...")
for task in eval_tasks[:5]:
try:
result = eval_single_sample(task)
eval_results.append(result)
except Exception as e:
print(f"Error processing task: {e}")
print(f"Task details: {task}")
raise
else:
# Normal mode: process all samples with threading
from concurrent.futures import ThreadPoolExecutor
nproc = getattr(args, 'nproc', 4)
with ThreadPoolExecutor(max_workers=nproc) as executor:
for result in tqdm(executor.map(eval_single_sample, eval_tasks),
total=len(eval_tasks), desc="Evaluating"):
eval_results.append(result)
# Update data with evaluation results
data['res'] = [r['res'] for r in eval_results]
data['log'] = [r['log'] for r in eval_results]
data['extract_model'] = [r['extract_model'] for r in eval_results]
data['extract_flag'] = [r['extract_flag'] for r in eval_results]
# Save evaluation results
storage = args.output_file.replace('.csv', '_eval.xlsx')
clean_dataframe_for_excel(data).to_excel(storage, index=False)
print(f"✓ Saved evaluation results to {storage}")
# Calculate accuracy
score = MATH_V_acc(storage)
score_pth = storage.replace('.xlsx', '_score.csv')
score.to_csv(score_pth, index=False)
print(f"✓ Saved score to {score_pth}")
print(f"\n{'='*50}")
print(f"Evaluation Results:")
print(f"{'='*50}")
print(score)
print(f"{'='*50}\n")
return score
def main():
parser = argparse.ArgumentParser(description="MathVision Evaluation with vLLM")
subparsers = parser.add_subparsers(dest='command', help='Command to run')
# Inference parser
infer_parser = subparsers.add_parser("infer", help="Run inference with vLLM")
infer_parser.add_argument("--model-path", type=str, required=True, help="Path to the model")
infer_parser.add_argument("--dataset", type=str, default="MathVision",
choices=["MathVision", "MathVision_MINI"],
help="Dataset name")
infer_parser.add_argument("--data-dir", type=str, help="The absolute path of MathVision data directory")
infer_parser.add_argument("--output-file", type=str, required=True, help="Output file path")
infer_parser.add_argument("--num-samples", type=int, default=None,
help="Number of samples to process (default: None, process all samples)")
infer_parser.add_argument("--use-cot", action="store_true", help="Use Chain-of-Thought prompting")
infer_parser.add_argument("--cot-prompt", type=str, default="", help="Custom Chain-of-Thought prompt")
# vLLM specific parameters
infer_parser.add_argument("--tensor-parallel-size", type=int, default=None,
help="Tensor parallel size (default: number of GPUs)")
infer_parser.add_argument("--gpu-memory-utilization", type=float, default=0.9,
help="GPU memory utilization (0.0-1.0, default: 0.9)")
infer_parser.add_argument("--max-model-len", type=int, default=128000,
help="Maximum model context length (default: 128000)")
infer_parser.add_argument("--max-images-per-prompt", type=int, default=10,
help="Maximum images per prompt (default: 10)")
# Generation parameters
infer_parser.add_argument("--max-new-tokens", type=int, default=32768,
help="Maximum number of tokens to generate (default: 2048)")
infer_parser.add_argument("--temperature", type=float, default=0.7,
help="Temperature for sampling (default: 0.7 for greedy-like decoding)")
infer_parser.add_argument("--top-p", type=float, default=0.8,
help="Top-p for sampling (default: 0.8 for greedy-like decoding)")
infer_parser.add_argument("--top-k", type=int, default=20,
help="Top-k for sampling (default: 20 for greedy decoding)")
infer_parser.add_argument("--repetition-penalty", type=float, default=1.0,
help="Repetition penalty (default: 1.0, increase to 1.2-1.5 to reduce repetition)")
infer_parser.add_argument("--presence-penalty", type=float, default=1.5,
help="Presence penalty (default: 1.5, range: 0.0-2.0, penalize tokens that have already appeared)")
# Evaluation parser
eval_parser = subparsers.add_parser("eval", help="Run evaluation")
eval_parser.add_argument("--data-dir", type=str, help="The absolute path of MathVision data directory")
eval_parser.add_argument("--input-file", type=str, required=True, help="Input file with inference results")
eval_parser.add_argument("--output-file", type=str, required=True, help="Output file path")
eval_parser.add_argument("--dataset", type=str, default="MathVision",
choices=["MathVision", "MathVision_MINI"],
help="Dataset name")
eval_parser.add_argument("--eval-model", type=str, default="gpt-4o",
help="Model to use for evaluation (default: gpt-4o)")
eval_parser.add_argument("--api-type", type=str, default="dash", choices=["dash", "mit"],
help="API type for evaluation")
eval_parser.add_argument("--nproc", type=int, default=4, help="Number of processes to use")
args = parser.parse_args()
# Set data directory if provided
if hasattr(args, 'data_dir') and args.data_dir:
os.environ['LMUData'] = args.data_dir
# Automatically set tensor_parallel_size
if args.command == 'infer' and args.tensor_parallel_size is None:
args.tensor_parallel_size = torch.cuda.device_count()
print(f"Auto-set tensor_parallel_size to {args.tensor_parallel_size}")
if args.command == 'infer':
run_inference(args)
elif args.command == 'eval':
run_evaluation(args)
else:
parser.print_help()
if __name__ == "__main__":
main()
# ODinW Benchmark Evaluation
This directory contains the implementation for evaluating vision-language models on the ODinW (Object Detection in the Wild) 13 dataset using vLLM for high-speed inference.
## Overview
ODinW is a comprehensive object detection benchmark that consists of 13 diverse datasets spanning various domains. This implementation provides:
- **High-speed inference** using vLLM with automatic batch optimization
- **Unified evaluation** across 13 diverse object detection datasets
- **COCO-style metrics** including mAP, mAP_50, mAP_75, etc.
- **Modular code structure** for easy maintenance and extension
## Project Structure
```
ODinW-13/
├── run_odinw.py # Main script for inference and evaluation
├── dataset_utils.py # Dataset loading and preprocessing utilities
├── eval_utils.py # Evaluation logic and COCO metrics computation
├── infer_instruct.sh # Inference script for instruct models
├── infer_think.sh # Inference script for thinking models
├── eval_instruct.sh # Evaluation script for instruct model results
├── eval_think.sh # Evaluation script for thinking model results
├── requirements.txt # Python dependencies
└── README.md # This file
```
## Requirements
### Python Dependencies
```bash
pip install -r requirements.txt
```
Key dependencies:
- `vllm` - High-speed LLM inference engine
- `transformers` - HuggingFace transformers
- `qwen_vl_utils` - Qwen VL utilities for vision processing
- `pycocotools` - COCO evaluation API
- `pandas`, `numpy` - Data processing
- `Pillow` - Image processing
- `tabulate` - Table formatting (optional, for better output display)
### Data Preparation
The ODinW dataset requires a specific directory structure:
```
/path/to/odinw_data/
├── odinw13_config.py # Dataset configuration file (required)
├── AerialMaritimeDrone/ # Individual datasets
│ ├── large/
│ │ ├── train/
│ │ └── test/
│ └── tiled/
├── Aquarium/
├── Cottontail Rabbits/
├── EgoHands/
├── NorthAmerica Mushrooms/
├── Packages/
├── Pascal VOC/
├── Pistols/
├── Pothole/
├── Raccoon/
├── ShellfishOpenImages/
├── Thermal Dogs and People/
└── Vehicles OpenImages/
```
**Important**: The `odinw13_config.py` file must contain:
- `datasets`: List of dataset configurations
- `dataset_prefixes`: List of dataset names
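For reference, below is a minimal sketch of the structure `load_odinw_config()` expects; the dataset name, class names, and paths are illustrative placeholders, not the shipped config:
```python
# odinw13_config.py -- minimal sketch; all entries are illustrative placeholders
dataset_prefixes = ["AerialMaritimeDrone_large"]

datasets = [
    dict(
        metainfo=dict(classes=("boat", "car", "dock", "jetski", "lift")),
        data_root="data/odinw/AerialMaritimeDrone/large/",    # must contain 'data/odinw/'
        ann_file="test/annotations_without_background.json",  # relative to data_root
        data_prefix=dict(img="test/"),                        # image sub-directory
    ),
]
```
Each entry in `datasets` is paired positionally with the name at the same index in `dataset_prefixes`.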
## Quick Start
### 1. Inference
Run inference on the ODinW dataset using an instruct model:
```bash
bash infer_instruct.sh
```
Or customize the inference:
```bash
python run_odinw.py infer \
--model-path /path/to/Qwen3-VL-Instruct \
--data-dir /path/to/odinw_data \
--output-file results/odinw_predictions.jsonl \
--max-new-tokens 32768 \
--temperature 0.7 \
--top-p 0.8 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 1.5
```
For thinking models with extended reasoning:
```bash
bash infer_think.sh
```
### 2. Evaluation
Evaluate the inference results using COCO metrics:
```bash
bash eval_instruct.sh
```
Or customize the evaluation:
```bash
python run_odinw.py eval \
--data-dir /path/to/odinw_data \
--input-file results/odinw_predictions.jsonl \
--output-file results/odinw_eval_results.json
```
## Detailed Usage
### Inference Mode
**Basic Arguments:**
- `--model-path`: Path to the Qwen3-VL model directory (required)
- `--data-dir`: Path to ODinW data directory containing `odinw13_config.py` (required)
- `--output-file`: Path to save inference results in JSONL format (required)
**vLLM Arguments:**
- `--tensor-parallel-size`: Number of GPUs for tensor parallelism (default: auto-detect)
- `--gpu-memory-utilization`: GPU memory utilization ratio, 0.0-1.0 (default: 0.9)
- `--max-model-len`: Maximum model context length (default: 128000)
- `--max-images-per-prompt`: Maximum images per prompt (default: 10)
**Generation Parameters:**
- `--max-new-tokens`: Maximum tokens to generate (default: 32768)
- `--temperature`: Sampling temperature (default: 0.7)
- `--top-p`: Top-p sampling (default: 0.8)
- `--top-k`: Top-k sampling (default: 20)
- `--repetition-penalty`: Repetition penalty (default: 1.0)
- `--presence-penalty`: Presence penalty to reduce repetition (default: 1.5)
### Evaluation Mode
**Basic Arguments:**
- `--data-dir`: Path to ODinW data directory containing `odinw13_config.py` (required)
- `--input-file`: Inference results file in JSONL format (required)
- `--output-file`: Path to save evaluation results in JSON format (required)
## Output Files
### Inference Output
The inference script generates two files:
1. **Predictions file** (`odinw_predictions.jsonl`): JSONL file where each line contains:
```json
{
"question_id": 0,
"annotation": [...],
"extra_info": {
"dataset_name": "AerialMaritimeDrone_large",
"img_id": 1,
"anno_path": "/path/to/annotations.json",
"resized_h": 640,
"resized_w": 640,
"img_h": 1080,
"img_w": 1920,
"img_path": "/path/to/image.jpg"
},
"result": {
"gen": "[{\"bbox_2d\": [x1, y1, x2, y2], \"label\": \"boat\"}, ...]",
"gen_raw": "Raw model output including thinking process"
},
"messages": [...]
}
```
2. **Dataset config file** (`odinw_predictions_datasets.json`): Configuration for evaluation
### Evaluation Output
The evaluation script generates a JSON file with results for each dataset:
```json
{
"AerialMaritimeDrone_large": {
"mAP": 0.456,
"mAP_50": 0.678,
"mAP_75": 0.512,
"mAP_s": 0.234,
"mAP_m": 0.456,
"mAP_l": 0.567
},
"Aquarium_Aquarium Combined.v2-raw-1024.coco": {
...
},
...
"Average": 0.423
}
```
**Evaluation Metrics:**
- **mAP**: Mean Average Precision at IoU 0.5:0.95 (primary metric)
- **mAP_50**: mAP at IoU threshold 0.5
- **mAP_75**: mAP at IoU threshold 0.75
- **mAP_s**: mAP for small objects (area < 32²)
- **mAP_m**: mAP for medium objects (32² ≤ area < 96²)
- **mAP_l**: mAP for large objects (area ≥ 96²)
## Model-Specific Configurations
### Instruct Models (e.g., Qwen3-VL-2B-Instruct, Qwen3-VL-7B-Instruct)
Use standard parameters for balanced performance:
```bash
--max-new-tokens 32768
--temperature 0.7
--top-p 0.8
--top-k 20
--repetition-penalty 1.0
--presence-penalty 1.5
```
### Thinking Models (e.g., Qwen3-VL-2B-Thinking)
Use adjusted parameters for deeper reasoning:
```bash
--max-new-tokens 32768
--temperature 0.6
--top-p 0.95
--top-k 20
--repetition-penalty 1.0
--presence-penalty 0.0
```
**Note:** Thinking models output reasoning steps wrapped in `<think>...</think>` tags. The evaluation automatically extracts the final answer after `</think>`.
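For reference, the extraction is a plain string split on the closing tag (a one-line sketch of what the evaluation code does):
```python
final_answer = raw_output.split("</think>")[-1].strip()
```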
## Performance Tips
1. **GPU Memory**: Adjust `--gpu-memory-utilization` based on your GPU:
- 0.9: Recommended for most cases
- 0.95: For maximum throughput (may cause OOM)
- 0.7-0.8: If experiencing OOM errors
2. **Batch Size**: vLLM automatically optimizes batch size based on available memory
3. **Tensor Parallelism**: Use `--tensor-parallel-size` for large models:
- 2B model: 1 GPU recommended
- 7B model: 1-2 GPUs
- 14B+ model: 2-4 GPUs
4. **Context Length**: Reduce `--max-model-len` if memory is limited:
- 128000: Default, works well for most cases
- 64000: Reduces memory usage by ~40%
5. **Image Processing**: The implementation uses `smart_resize` to automatically adjust image dimensions:
- Dimensions are made divisible by 32
- Total pixels are constrained to [min_pixels, max_pixels]
- Aspect ratio is preserved
## Troubleshooting
### Common Issues
**1. Config file not found**
```
FileNotFoundError: Config file not found: /path/to/odinw13_config.py
```
**Solution**: Ensure `odinw13_config.py` exists in `--data-dir`
**2. CUDA Out of Memory**
```bash
# Reduce GPU memory utilization
--gpu-memory-utilization 0.7
# Or reduce context length
--max-model-len 64000
```
**3. vLLM Multiprocessing Issues**
The code automatically sets `VLLM_WORKER_MULTIPROC_METHOD=spawn`. If you still encounter issues:
```bash
export VLLM_WORKER_MULTIPROC_METHOD=spawn
```
**4. Empty or Invalid JSON Output**
- Check model output format
- Verify prompt clarity
- Try adjusting temperature/top_p
**5. Low mAP Scores**
- Verify category names match dataset classes
- Check coordinate format (xyxy vs xywh)
- Ensure model outputs JSON format correctly
**6. COCO API Errors**
```
IndexError: The testing results of the whole dataset is empty.
```
**Solution**: No valid predictions were generated. Check model outputs.
## Advanced Usage
### Custom Image Resolution
Edit `dataset_utils.py` to modify resolution parameters:
```python
# Calculate image resolution parameters
patch_size = 16
merge_base = 2
pixels_per_token = patch_size * patch_size * merge_base * merge_base
min_pixels = pixels_per_token * 768
max_pixels = pixels_per_token * 12800
```
### Filtering Datasets
To evaluate only specific datasets, edit `generate_odinw_jobs()` in `dataset_utils.py`:
```python
# Only process specific datasets
dataset_filter = ['AerialMaritimeDrone', 'Aquarium']
for data_name, data_config in datasets.items():
if data_name not in dataset_filter:
continue
# ... rest of the code
```
### Custom Prompt Format
Edit the prompt in `dataset_utils.py`:
```python
# Default prompt
prompt = f"Locate every instance that belongs to the following categories: '{obj_names}'. Report bbox coordinates in JSON format."
# Custom prompt example
prompt = f"Find all {obj_names} objects in the image and output their bounding boxes as JSON."
```
## Citation
If you use this code or the ODinW benchmark, please cite:
```bibtex
@inproceedings{li2022grounded,
title={Grounded language-image pre-training},
author={Li, Liunian Harold and Zhang, Pengchuan and Zhang, Haotian and Yang, Jianwei and Li, Chunyuan and Zhong, Yiwu and Wang, Lijuan and Yuan, Lu and Zhang, Lei and Hwang, Jenq-Neng and others},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={10965--10975},
year={2022}
}
```
## License
This code is released under the same license as the Qwen3-VL model.
## Support
For issues and questions:
- GitHub Issues: [Qwen3-VL Repository](https://github.com/QwenLM/Qwen3-VL)
- Documentation: See inline code comments and docstrings
"""
ODinW dataset loading and processing utilities.
"""
import os
import math
from typing import Dict, List, Tuple
from pycocotools.coco import COCO
def round_by_factor(number: int, factor: int) -> int:
"""Return the nearest integer divisible by factor."""
return round(number / factor) * factor
def ceil_by_factor(number: int, factor: int) -> int:
"""Return the ceiling integer divisible by factor."""
return math.ceil(number / factor) * factor
def floor_by_factor(number: int, factor: int) -> int:
"""Return the floor integer divisible by factor."""
return math.floor(number / factor) * factor
def smart_resize(height: int, width: int, factor: int = 28,
min_pixels: int = 56*56, max_pixels: int = 14*14*4*1280,
max_long_side: int = 8192) -> Tuple[int, int]:
"""Resize image to meet the following conditions:
1. Both height and width are divisible by factor
2. Total pixels are within [min_pixels, max_pixels]
3. Longest side is within max_long_side
4. Aspect ratio is preserved
Args:
height: Original image height
width: Original image width
factor: Size must be divisible by this factor
min_pixels: Minimum pixel count
max_pixels: Maximum pixel count
max_long_side: Maximum longest side
Returns:
(resized_height, resized_width): Resized dimensions
"""
if height < 2 or width < 2:
raise ValueError(f'height:{height} and width:{width} must both be at least 2')
elif max(height, width) / min(height, width) > 200:
raise ValueError(f'absolute aspect ratio must be smaller than 200, got {height} / {width}')
if max(height, width) > max_long_side:
beta = max(height, width) / max_long_side
height, width = int(height / beta), int(width / beta)
h_bar = round_by_factor(height, factor)
w_bar = round_by_factor(width, factor)
if h_bar * w_bar > max_pixels:
beta = math.sqrt((height * width) / max_pixels)
h_bar = floor_by_factor(height / beta, factor)
w_bar = floor_by_factor(width / beta, factor)
elif h_bar * w_bar < min_pixels:
beta = math.sqrt(min_pixels / (height * width))
h_bar = ceil_by_factor(height * beta, factor)
w_bar = ceil_by_factor(width * beta, factor)
return h_bar, w_bar
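# Illustrative: smart_resize(1080, 1920, factor=32, min_pixels=786432, max_pixels=13107200)
# returns (1088, 1920): both sides rounded to multiples of 32, pixel count already within range.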
def load_odinw_config(config_path: str) -> Dict:
"""Load odinw13_config.py configuration file.
Args:
config_path: Path to config file
Returns:
datasets: Dictionary mapping dataset names to configurations
"""
import runpy
config = runpy.run_path(config_path)
dataset_configs = config["datasets"]
dataset_names = config["dataset_prefixes"]
datasets = {}
for dataset_name, dataset_config in zip(dataset_names, dataset_configs):
datasets[dataset_name] = dataset_config
return datasets
def generate_odinw_jobs(data_dir: str, args) -> Tuple[List[Dict], Dict]:
"""Generate inference task list for ODinW dataset.
Args:
data_dir: Data directory path (containing odinw13_config.py)
args: Command line arguments
Returns:
(question_list, datasets): Task list and dataset configurations
"""
# Load config
config_path = os.path.join(data_dir, "odinw13_config.py")
if not os.path.exists(config_path):
raise FileNotFoundError(f"Config file not found: {config_path}")
datasets = load_odinw_config(config_path)
question_list = []
question_id = 0
num_questions_per_dataset = {}
# Calculate image resolution parameters
patch_size = 16
merge_base = 2
pixels_per_token = patch_size * patch_size * merge_base * merge_base
min_pixels = pixels_per_token * 768
max_pixels = pixels_per_token * 12800
# Iterate through all datasets
for data_name, data_config in datasets.items():
print(f'Parsing ODinW:{data_name}')
classes = list(data_config["metainfo"]["classes"])
# Build data paths
idx = data_config["data_root"].find('data/odinw/') + len('data/odinw/')
sub_root = os.path.join(data_dir, data_config["data_root"][idx:])
sub_anno = sub_root + data_config["ann_file"]
sub_img_root = sub_root + data_config["data_prefix"]["img"]
# Load COCO format annotations
dataset = COCO(sub_anno)
num_questions = 0
# Iterate through all images
for img_idx, img_meta in dataset.imgs.items():
img_name = img_meta["file_name"]
img_path = sub_img_root + img_name
img_h = img_meta["height"]
img_w = img_meta["width"]
# Calculate resized image dimensions
resized_h, resized_w = smart_resize(
img_h, img_w,
factor=32,
min_pixels=min_pixels,
max_pixels=max_pixels,
max_long_side=50000
)
# Get annotations
img_annos = dataset.imgToAnns[img_idx]
# Build class names list
obj_names = ", ".join(classes)
# Build prompt
prompt = f"Locate every instance that belongs to the following categories: '{obj_names}'. Report bbox coordinates in JSON format."
# Build messages
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": f"file://{img_path}",
"min_pixels": min_pixels,
"max_pixels": max_pixels
},
{"type": "text", "text": prompt}
]
}
]
# Build task item
item = {
"question_id": question_id,
"annotation": img_annos,
'messages': messages,
"extra_info": {
'dataset_name': data_name,
'dataset_config': data_config,
'img_id': img_meta["id"],
'anno_path': sub_anno,
'resized_h': resized_h,
'resized_w': resized_w,
'img_h': img_h,
'img_w': img_w,
'img_path': img_path
}
}
question_list.append(item)
question_id += 1
num_questions += 1
num_questions_per_dataset[data_name] = num_questions
# Print statistics
for data_name, num_questions in num_questions_per_dataset.items():
print(f'{data_name}: {num_questions}')
print(f"Total ODinW questions: {len(question_list)}")
return question_list, datasets
#!/bin/bash
# ODinW Evaluation Script (Instruct Model)
# This script evaluates the inference results using COCO metrics
python run_odinw.py eval \
--data-dir /path/to/odinw_data \
--input-file results/odinw_predictions.jsonl \
--output-file results/odinw_eval_results.json
#!/bin/bash
# ODinW Evaluation Script (Thinking Model)
# This script evaluates the inference results from thinking model using COCO metrics
python run_odinw.py eval \
--data-dir /path/to/odinw_data \
--input-file results/odinw_predictions_thinking.jsonl \
--output-file results/odinw_eval_results_thinking.json
"""
ODinW evaluation utilities.
"""
import os
import json
import tempfile
import numpy as np
from typing import List, Dict, Sequence
from collections import OrderedDict
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
def xyxy2xywh(bbox: np.ndarray) -> list:
"""Convert bbox format from xyxy to xywh.
Args:
bbox: Bounding box in [x1, y1, x2, y2] format
Returns:
Bounding box in [x, y, w, h] format
"""
_bbox = bbox.tolist()
return [
_bbox[0],
_bbox[1],
_bbox[2] - _bbox[0],
_bbox[3] - _bbox[1],
]
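# e.g. xyxy2xywh(np.array([10, 20, 110, 220])) -> [10, 20, 100, 200]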
def results2json(results: Sequence[dict], outfile_prefix: str, cat_ids: dict) -> dict:
"""Convert results to COCO JSON format.
Args:
results: List of prediction results
outfile_prefix: Output file prefix
cat_ids: Category ID mapping
Returns:
result_files: Dictionary of result file paths
"""
bbox_json_results = []
for idx, result in enumerate(results):
image_id = result.get('img_id', idx)
labels = result['labels']
bboxes = result['bboxes']
scores = result['scores']
for i, label in enumerate(labels):
data = dict()
data['image_id'] = image_id
data['bbox'] = xyxy2xywh(bboxes[i])
data['score'] = float(scores[i])
data['category_id'] = cat_ids[label]
bbox_json_results.append(data)
result_files = dict()
result_files['bbox'] = f'{outfile_prefix}.bbox.json'
with open(result_files['bbox'], 'w') as f:
json.dump(bbox_json_results, f)
return result_files
def compute_metrics(results: list, outfile_prefix: str = None, _coco_api: COCO = None) -> Dict[str, float]:
"""Compute mAP and other metrics using COCO API.
Args:
results: List of evaluation results, each element is a (gt, pred) tuple
outfile_prefix: Output file prefix (optional)
_coco_api: COCO API instance
Returns:
eval_results: Dictionary of evaluation metrics
"""
proposal_nums = (100, 300, 1000)
iou_thrs = np.linspace(
.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
# Separate ground truth and predictions
if len(results) == 0:
gts, preds = [], []
else:
gts, preds = zip(*results)
tmp_dir = None
if outfile_prefix is None:
tmp_dir = tempfile.TemporaryDirectory()
outfile_prefix = os.path.join(tmp_dir.name, 'results')
cat_ids = _coco_api.getCatIds()
img_ids = _coco_api.getImgIds()
# Convert to COCO format and save
result_files = results2json(preds, outfile_prefix, cat_ids)
eval_results = OrderedDict()
for metric in ["bbox"]:
iou_type = metric
if metric not in result_files:
raise KeyError(f'{metric} is not in results')
try:
with open(result_files[metric], 'r') as f:
predictions = json.load(f)
coco_dt = _coco_api.loadRes(predictions)
except IndexError:
print('The testing results of the whole dataset is empty.')
break
coco_eval = COCOeval(_coco_api, coco_dt, iou_type)
coco_eval.params.catIds = cat_ids
coco_eval.params.imgIds = img_ids
coco_eval.params.maxDets = list(proposal_nums)
coco_eval.params.iouThrs = iou_thrs
# mapping of cocoEval.stats
coco_metric_names = {
'mAP': 0,
'mAP_50': 1,
'mAP_75': 2,
'mAP_s': 3,
'mAP_m': 4,
'mAP_l': 5,
'AR@100': 6,
'AR@300': 7,
'AR@1000': 8,
'AR_s@1000': 9,
'AR_m@1000': 10,
'AR_l@1000': 11
}
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
metric_items = [
'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'
]
for metric_item in metric_items:
val = coco_eval.stats[coco_metric_names[metric_item]]
eval_results[metric_item] = float(f'{round(val, 3)}')
if tmp_dir is not None:
tmp_dir.cleanup()
return eval_results
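# Illustrative input: each element of `results` is a (gt_info, prediction) tuple, e.g.
#   gt = {'width': 640, 'height': 480, 'img_id': 1}
#   pred = {'img_id': 1, 'bboxes': np.array([[10, 20, 110, 220]]),
#           'scores': np.array([1.0]), 'labels': np.array([0])}
#   metrics = compute_metrics([(gt, pred)], _coco_api=coco_api)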
#!/bin/bash
# ODinW Inference Script (Instruct Model)
# This script runs inference on the ODinW dataset using vLLM
python run_odinw.py infer \
--model-path /path/to/Qwen3-VL-Instruct \
--data-dir /path/to/odinw_data \
--output-file results/odinw_predictions.jsonl \
--tensor-parallel-size 1 \
--gpu-memory-utilization 0.9 \
--max-model-len 128000 \
--max-images-per-prompt 10 \
--max-new-tokens 32768 \
--temperature 0.7 \
--top-p 0.8 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 1.5
#!/bin/bash
# ODinW Inference Script (Thinking Model)
# This script runs inference on the ODinW dataset using vLLM with thinking mode parameters
python run_odinw.py infer \
--model-path /path/to/Qwen3-VL-Thinking \
--data-dir /path/to/odinw_data \
--output-file results/odinw_predictions_thinking.jsonl \
--tensor-parallel-size 1 \
--gpu-memory-utilization 0.9 \
--max-model-len 128000 \
--max-images-per-prompt 10 \
--max-new-tokens 32768 \
--temperature 0.6 \
--top-p 0.95 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 0.0
vllm
transformers
qwen_vl_utils
pandas
numpy
Pillow
tqdm
requests
validators
torch
torchvision
accelerate
pycocotools
tabulate
flash_attn
import os
import sys
import json
import argparse
import numpy as np
import time
from tqdm import tqdm
from typing import List, Dict, Any
from collections import defaultdict, OrderedDict
import torch
# vLLM imports
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor
# pycocotools imports
from pycocotools.coco import COCO
# Local imports from refactored files
from dataset_utils import load_odinw_config, generate_odinw_jobs
from eval_utils import compute_metrics
# Set vLLM multiprocessing method
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
def prepare_inputs_for_vllm(messages, processor):
"""Prepare inputs for vLLM."""
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# qwen_vl_utils 0.0.14+ required
image_inputs, video_inputs, video_kwargs = process_vision_info(
messages,
image_patch_size=processor.image_processor.patch_size,
return_video_kwargs=True,
return_video_metadata=True
)
mm_data = {}
if image_inputs is not None:
mm_data['image'] = image_inputs
if video_inputs is not None:
mm_data['video'] = video_inputs
return {
'prompt': text,
'multi_modal_data': mm_data,
'mm_processor_kwargs': video_kwargs
}
def run_inference(args):
"""Run inference on the ODinW dataset using vLLM."""
print("\n" + "="*80)
print("🚀 ODinW Inference with vLLM (High-Speed Mode)")
print("="*80 + "\n")
# Generate task list
question_list, datasets = generate_odinw_jobs(args.data_dir, args)
print(f"✓ Generated {len(question_list)} inference jobs\n")
# Create output directory
os.makedirs(os.path.dirname(args.output_file), exist_ok=True)
# Set up generation parameters
sampling_params = SamplingParams(
temperature=args.temperature,
top_p=args.top_p,
top_k=args.top_k,
max_tokens=args.max_new_tokens,
repetition_penalty=args.repetition_penalty,
presence_penalty=args.presence_penalty,
stop_token_ids=[],
)
print(f"\n⚙️ Generation parameters (vLLM SamplingParams):")
print(f" max_tokens={sampling_params.max_tokens}")
print(f" temperature={sampling_params.temperature}, top_p={sampling_params.top_p}, top_k={sampling_params.top_k}")
print(f" repetition_penalty={sampling_params.repetition_penalty}")
print(f" presence_penalty={sampling_params.presence_penalty}")
print()
# Load processor
print(f"Loading processor from {args.model_path}")
processor = AutoProcessor.from_pretrained(args.model_path)
print("✓ Processor loaded\n")
# Initialize vLLM
print(f"Initializing vLLM with model: {args.model_path}")
print(f" GPU count: {torch.cuda.device_count()}")
print(f" Tensor parallel size: {args.tensor_parallel_size}")
llm = LLM(
model=args.model_path,
tensor_parallel_size=args.tensor_parallel_size,
gpu_memory_utilization=args.gpu_memory_utilization,
trust_remote_code=True,
max_model_len=args.max_model_len,
limit_mm_per_prompt={"image": args.max_images_per_prompt},
seed=42,
)
print("✓ vLLM initialized successfully\n")
# Prepare all inputs
print("Preparing inputs for vLLM...")
all_inputs = []
for item in tqdm(question_list, desc="Building prompts"):
vllm_input = prepare_inputs_for_vllm(item['messages'], processor)
all_inputs.append(vllm_input)
print(f"✓ Prepared {len(all_inputs)} inputs\n")
# Batch inference
print("="*80)
print("🚀 Running vLLM batch inference")
print("="*80)
start_time = time.time()
outputs = llm.generate(all_inputs, sampling_params=sampling_params)
end_time = time.time()
total_time = end_time - start_time
print(f"\n✓ Inference completed in {total_time:.2f} seconds")
print(f" Average: {total_time/len(question_list):.2f} seconds/sample")
print(f" Throughput: {len(question_list)/total_time:.2f} samples/second\n")
# Save results
print("Saving results...")
results = []
for idx, (item, output) in enumerate(zip(question_list, outputs)):
response = output.outputs[0].text
# Handle </think> tag
response_final = str(response).split("</think>")[-1].strip()
result = {
"question_id": item['question_id'],
"annotation": item['annotation'],
"extra_info": item['extra_info'],
"result": {"gen": response_final, "gen_raw": response},
"messages": item['messages']
}
results.append(result)
# Save results
with open(args.output_file, 'w') as f:
for res in results:
f.write(json.dumps(res) + '\n')
print(f"\n✓ Results saved to {args.output_file}")
print(f"✓ Total samples processed: {len(results)}")
# Save dataset config (for evaluation)
config_output = args.output_file.replace('.jsonl', '_datasets.json')
with open(config_output, 'w') as f:
# Convert config for JSON serialization
datasets_serializable = {}
for k, v in datasets.items():
datasets_serializable[k] = {
'metainfo': v['metainfo'],
'data_root': v['data_root'],
'ann_file': v['ann_file'],
'data_prefix': v['data_prefix']
}
json.dump(datasets_serializable, f, indent=2)
print(f"✓ Dataset config saved to {config_output}")
def run_evaluation(args):
"""Run evaluation on inference results."""
print("\n" + "="*80)
print("🎯 ODinW Evaluation")
print("="*80 + "\n")
# Load inference results
results = []
with open(args.input_file, 'r') as f:
for line in f:
results.append(json.loads(line))
print(f"✓ Loaded {len(results)} inference results\n")
# Load dataset config
config_path = os.path.join(args.data_dir, "odinw13_config.py")
datasets = load_odinw_config(config_path)
# Group by dataset
all_outputs = defaultdict(list)
for job in results:
all_outputs[job["extra_info"]["dataset_name"]].append(job)
all_results = {}
# Evaluate each dataset
for dataset_name, sub_jobs in all_outputs.items():
print(f"\n{'='*60}")
print(f"Evaluating dataset: {dataset_name}")
print(f"{'='*60}")
anno_path = sub_jobs[0]["extra_info"]["anno_path"]
coco_api = COCO(anno_path)
classes = datasets[dataset_name]['metainfo']['classes']
pred_bboxes_per_img = defaultdict(list)
for job in sub_jobs:
img_id = job["extra_info"]["img_id"]
resized_h = job["extra_info"]["resized_h"]
resized_w = job["extra_info"]["resized_w"]
img_h = job["extra_info"]["img_h"]
img_w = job["extra_info"]["img_w"]
answer = job['result']['gen']
answer = answer.replace("```json", "")
answer = answer.replace("```", "")
# Parse predictions (ast.literal_eval also tolerates single-quoted, Python-style output)
import ast
try:
json_data = ast.literal_eval(answer)
pred_bboxes = []
pred_labels = []
for data in json_data:
if len(data.get("bbox_2d", [])) != 4:
continue
pred_bboxes.append(data["bbox_2d"])
pred_labels.append(data["label"])
except Exception as e:
# If parsing fails, use empty results
pred_bboxes = []
pred_labels = []
# Coordinate conversion (from resized to original size)
if os.getenv("is_rel", "0") == "1":
pred_bboxes = np.array(pred_bboxes).reshape(-1, 4) / 1000 * np.array([img_w, img_h, img_w, img_h])
else:
if len(pred_bboxes) > 0:
pred_bboxes = np.array(pred_bboxes).reshape(-1, 4) / np.array([resized_w, resized_h, resized_w, resized_h]) * np.array([img_w, img_h, img_w, img_h])
else:
pred_bboxes = np.array(pred_bboxes).reshape(-1, 4)
pred_bboxes = pred_bboxes.tolist()
# Group by category
pred_objs = defaultdict(list)
for pred_bbox, pred_label in zip(pred_bboxes, pred_labels):
pred_objs[pred_label].append(pred_bbox)
for k, v in pred_objs.items():
class_names = [name.lower() for name in classes]
if k.lower() not in class_names:
continue
pred_bboxes_per_img[img_id].append({
'label': class_names.index(k.lower()),
'bbox': v
})
# Prepare evaluation results
pred_results = []
for k, v in pred_bboxes_per_img.items():
bboxes = []
labels = []
for tmp in v:
bboxes.extend(tmp['bbox'])
labels.extend([tmp['label']] * len(tmp['bbox']))
height = coco_api.imgs[k]["height"]
width = coco_api.imgs[k]["width"]
pred_tuple = (
{'width': width, 'height': height, 'img_id': k},
{
'img_id': k,
'bboxes': np.array(bboxes),
'scores': np.array([1.0] * len(bboxes)),
'labels': np.array(labels),
},
)
pred_results.append(pred_tuple)
# Compute metrics
eval_results = compute_metrics(pred_results, _coco_api=coco_api)
print(f"{dataset_name}: {eval_results}")
all_results[dataset_name] = eval_results
# Summarize results
results_ordered = OrderedDict(sorted(all_results.items(), key=lambda x: x[0]))
metric_items = ['mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l']
results_display = []
for prefix, result in results_ordered.items():
results_display.append([prefix] + [result[k] for k in metric_items])
# Calculate average
average_scores = []
for col_idx in range(len(metric_items)):
average_scores.append(np.mean([line[col_idx + 1] for line in results_display]))
results_display.append(['Average'] + average_scores)
# Print results table
try:
from tabulate import tabulate
print("\n" + "="*80)
print(
tabulate(
results_display,
headers=["ODinW13 Dataset"] + metric_items,
tablefmt="fancy_outline",
floatfmt=".3f",
)
)
print("="*80 + "\n")
except ImportError:
print("\n" + "="*80)
print("ODinW13 Results:")
print("="*80)
for row in results_display:
print(row)
print("="*80 + "\n")
# Save results
all_results.update({"Average": average_scores[0]})
os.makedirs(os.path.dirname(args.output_file), exist_ok=True)
with open(args.output_file, 'w') as f:
json.dump(all_results, f, ensure_ascii=False, indent=4)
print(f"✓ Evaluation results saved to {args.output_file}")
print(f"\n{'='*80}")
print(f"Final Average mAP: {average_scores[0]:.4f}")
print(f"{'='*80}\n")
def main():
parser = argparse.ArgumentParser(description="ODinW Evaluation with vLLM")
subparsers = parser.add_subparsers(dest='command', help='Command to run')
# Inference parser
infer_parser = subparsers.add_parser("infer", help="Run inference with vLLM")
infer_parser.add_argument("--model-path", type=str, required=True, help="Path to the model")
infer_parser.add_argument("--data-dir", type=str, required=True,
help="Path to ODinW data directory (containing odinw13_config.py)")
infer_parser.add_argument("--output-file", type=str, required=True, help="Output file path")
# vLLM specific parameters
infer_parser.add_argument("--tensor-parallel-size", type=int, default=None,
help="Tensor parallel size (default: number of GPUs)")
infer_parser.add_argument("--gpu-memory-utilization", type=float, default=0.9,
help="GPU memory utilization (0.0-1.0, default: 0.9)")
infer_parser.add_argument("--max-model-len", type=int, default=128000,
help="Maximum model context length (default: 128000)")
infer_parser.add_argument("--max-images-per-prompt", type=int, default=10,
help="Maximum images per prompt (default: 10)")
# Generation parameters
infer_parser.add_argument("--max-new-tokens", type=int, default=32768,
help="Maximum number of tokens to generate (default: 32768)")
infer_parser.add_argument("--temperature", type=float, default=0.7,
help="Temperature for sampling (default: 0.7)")
infer_parser.add_argument("--top-p", type=float, default=0.8,
help="Top-p for sampling (default: 0.8)")
infer_parser.add_argument("--top-k", type=int, default=20,
help="Top-k for sampling (default: 20)")
infer_parser.add_argument("--repetition-penalty", type=float, default=1.0,
help="Repetition penalty (default: 1.0)")
infer_parser.add_argument("--presence-penalty", type=float, default=1.5,
help="Presence penalty (default: 1.5)")
# Evaluation parser
eval_parser = subparsers.add_parser("eval", help="Run evaluation")
eval_parser.add_argument("--data-dir", type=str, required=True,
help="Path to ODinW data directory (containing odinw13_config.py)")
eval_parser.add_argument("--input-file", type=str, required=True,
help="Input file with inference results")
eval_parser.add_argument("--output-file", type=str, required=True,
help="Output file path")
args = parser.parse_args()
# Automatically set tensor_parallel_size
if args.command == 'infer' and args.tensor_parallel_size is None:
args.tensor_parallel_size = torch.cuda.device_count()
print(f"Auto-set tensor_parallel_size to {args.tensor_parallel_size}")
if args.command == 'infer':
run_inference(args)
elif args.command == 'eval':
run_evaluation(args)
else:
parser.print_help()
if __name__ == "__main__":
main()
# RealWorldQA Benchmark Evaluation
This directory contains the implementation for evaluating vision-language models on the RealWorldQA benchmark using vLLM for high-speed inference.
## Overview
RealWorldQA is a visual question answering benchmark of 700+ high-quality samples drawn from diverse real-world scenarios. This implementation provides:
- **High-speed inference** using vLLM with automatic batch optimization
- **Two-stage evaluation** using rule-based extraction with optional LLM-based fallback
- **Automatic dataset download** from OpenCompass
- **Modular code structure** for easy maintenance and extension
## Project Structure
```
RealWorldQA/
├── run_realworldqa.py # Main script for inference and evaluation
├── dataset_utils.py # Dataset loading and preprocessing utilities
├── eval_utils.py # Evaluation logic and answer extraction
├── common_utils.py # Common utilities for image processing, file I/O
├── infer_instruct.sh # Inference script for instruct models
├── infer_think.sh # Inference script for thinking models
├── eval_instruct.sh # Evaluation script for instruct model results
├── eval_think.sh # Evaluation script for thinking model results
├── requirements.txt # Python dependencies
└── README.md # This file
```
## Requirements
### Python Dependencies
```bash
pip install -r requirements.txt
```
Key dependencies:
- `vllm` - High-speed LLM inference engine
- `transformers` - HuggingFace transformers
- `qwen_vl_utils` - Qwen VL utilities for vision processing
- `pandas`, `numpy` - Data processing
- `Pillow` - Image processing
- `requests` - API calls for evaluation
### Environment Variables
For optional LLM-based evaluation, you need to set up API credentials:
**Option 1: DashScope API (Recommended)**
```bash
export CHATGPT_DASHSCOPE_API_KEY="your-api-key"
export DASHSCOPE_API_BASE="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions"
```
**Option 2: Custom OpenAI-compatible API**
```bash
export MIT_SPIDER_TOKEN="your-api-key"
export MIT_SPIDER_URL="your-api-endpoint"
```
### Data Preparation
The RealWorldQA dataset is stored in TSV format and will be **automatically downloaded** on first run from:
```
https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv
```
**Directory structure after download:**
```
${DATA_DIR}/
├── RealWorldQA.tsv # Main data file (auto-downloaded)
└── images/
└── RealWorldQA/ # Decoded image files
```
**Setting data path:**
- Option 1: Environment variable `export LMUData="/path/to/data"`
- Option 2: Use `--data-dir` argument in commands (see the sketch below for how the two interact)
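The two options point the scripts at the same location; a rough sketch of the resolution logic (the exact handling in `run_realworldqa.py` may differ):

```python
import os

def resolve_data_dir(data_dir_arg=None):
    """Sketch: prefer --data-dir, otherwise fall back to the LMUData environment variable."""
    if data_dir_arg:
        os.environ['LMUData'] = data_dir_arg
    if 'LMUData' not in os.environ:
        raise ValueError("Set LMUData or pass --data-dir")
    return os.environ['LMUData']
```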
## Quick Start
### 1. Inference
Run inference on the RealWorldQA dataset using an instruct model:
```bash
bash infer_instruct.sh
```
Or customize the inference:
```bash
python run_realworldqa.py infer \
--model-path /path/to/Qwen3-VL-Instruct \
--data-dir /path/to/data \
--dataset RealWorldQA \
--output-file results/predictions.jsonl \
--max-new-tokens 32768 \
--temperature 0.7 \
--top-p 0.8 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 1.5
```
For thinking models with extended reasoning:
```bash
bash infer_think.sh
```
### 2. Evaluation
Evaluate the inference results:
```bash
bash eval_instruct.sh
```
Or customize the evaluation:
```bash
python run_realworldqa.py eval \
--data-dir /path/to/data \
--input-file results/predictions.jsonl \
--output-file results/evaluation.csv \
--dataset RealWorldQA \
--eval-model gpt-3.5-turbo-0125 \
--api-type dash \
--nproc 4
```
## Detailed Usage
### Inference Mode
**Basic Arguments:**
- `--model-path`: Path to the Qwen3-VL model directory (required)
- `--data-dir`: Directory to store/load RealWorldQA dataset (required)
- `--dataset`: Dataset name (default: `RealWorldQA`)
- `--output-file`: Path to save inference results in JSONL format (required)
**vLLM Arguments:**
- `--tensor-parallel-size`: Number of GPUs for tensor parallelism (default: auto-detect)
- `--gpu-memory-utilization`: GPU memory utilization ratio, 0.0-1.0 (default: 0.9)
- `--max-model-len`: Maximum model context length (default: 128000)
- `--max-images-per-prompt`: Maximum images per prompt (default: 10)
**Generation Parameters:**
- `--max-new-tokens`: Maximum tokens to generate (default: 32768)
- `--temperature`: Sampling temperature (default: 0.7)
- `--top-p`: Top-p sampling (default: 0.8)
- `--top-k`: Top-k sampling (default: 20)
- `--repetition-penalty`: Repetition penalty (default: 1.0)
- `--presence-penalty`: Presence penalty to reduce repetition (default: 1.5)
**Advanced Options:**
- `--min-pixels`: Minimum pixels for image (default: 768×28×28 ≈ 600K pixels)
- `--max-pixels`: Maximum pixels for image (default: 5120×28×28 ≈ 4M pixels)
### Evaluation Mode
**Basic Arguments:**
- `--data-dir`: Directory containing RealWorldQA dataset (required)
- `--input-file`: Inference results file in JSONL format (required)
- `--output-file`: Path to save evaluation results in CSV format (required)
- `--dataset`: Dataset name, must match inference (default: `RealWorldQA`)
**Judge Model Arguments:**
- `--eval-model`: Judge model name (default: None, uses rule-based only)
- Options: `gpt-3.5-turbo-0125`, `gpt-4-0125-preview`, `gpt-4o`, etc.
- `--api-type`: API service type (default: `dash`)
- `dash`: DashScope API (Alibaba Cloud)
- `mit`: Custom OpenAI-compatible API
- `--nproc`: Number of parallel workers for evaluation (default: 4)
## Output Files
### Inference Output
The inference script generates a JSONL file where each line contains:
```json
{
"question_id": 0,
"annotation": {
"index": "0",
"question": "What is shown in the image?",
"A": "Cat",
"B": "Dog",
"C": "Bird",
"D": "Fish",
"answer": "A"
},
"task": "RealWorldQA",
"result": {
"gen": "The correct answer is A",
"gen_raw": "Raw model output including thinking process"
},
"messages": [...]
}
```
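Each line is a standalone JSON object, so the file can be inspected with the standard `json` module. The field names below follow the example above; the path is the one used in the Quick Start command.

```python
import json

# Load all predictions and peek at the first record
with open("results/predictions.jsonl") as f:
    records = [json.loads(line) for line in f]
print(len(records), "samples")
print(records[0]["result"]["gen"])         # generated answer (see "gen_raw" for the raw output)
print(records[0]["annotation"]["answer"])  # ground-truth option letter
```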
### Evaluation Output
The evaluation script generates two files:
1. **CSV file** (`*_evaluation.csv`): Detailed evaluation results
- Columns: `index`, `question`, `prediction`, `extracted_answer`, `extraction_method`, `extraction_success`, `gt`, `hit`
2. **JSON file** (`*_evaluation_acc.json`): Accuracy statistics
```json
{
"overall_accuracy": 0.7234,
"task_samples": 765,
"correct": 553,
"total": 765
}
```
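Because the CSV stores a per-sample `hit` flag, the accuracy statistics can be recomputed (or sliced further) with pandas; the snippet assumes the output path from the Quick Start command.

```python
import pandas as pd

df = pd.read_csv("results/evaluation.csv")
print("overall accuracy:", df["hit"].mean())
# Share of samples resolved by the rule-based extractor (vs. judge model / random fallback)
print("rule-based extraction share:", (df["extraction_method"] == "rule").mean())
```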
## Model-Specific Configurations
### Instruct Models (e.g., Qwen3-VL-2B-Instruct, Qwen3-VL-7B-Instruct)
Use standard parameters for balanced performance:
```bash
--max-new-tokens 32768
--temperature 0.7
--top-p 0.8
--top-k 20
--repetition-penalty 1.0
--presence-penalty 1.5
```
### Thinking Models (e.g., Qwen3-VL-2B-Thinking)
Use adjusted parameters for deeper reasoning:
```bash
--max-new-tokens 32768
--temperature 0.6
--top-p 0.95
--top-k 20
--repetition-penalty 1.0
--presence-penalty 0.0
```
**Note:** Thinking models output reasoning steps wrapped in `<think>...</think>` tags. The evaluation automatically extracts the final answer after `</think>`.
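Conceptually, the post-processing keeps only what follows the last `</think>` tag; a minimal sketch (hypothetical helper, not the exact function used in the code):

```python
def strip_think(raw: str) -> str:
    """Keep only the text after the last </think> tag; fall back to the raw output."""
    marker = "</think>"
    return raw.split(marker)[-1].strip() if marker in raw else raw.strip()

# strip_think("<think>reasoning ...</think>The answer is B") -> "The answer is B"
```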
## Performance Tips
1. **GPU Memory**: Adjust `--gpu-memory-utilization` based on your GPU:
- 0.9: Recommended for most cases
- 0.95: For maximum throughput (may cause OOM)
- 0.7-0.8: If experiencing OOM errors
2. **Batch Size**: vLLM automatically optimizes batch size based on available memory
3. **Tensor Parallelism**: Use `--tensor-parallel-size` for large models:
- 2B model: 1 GPU recommended
- 7B model: 1-2 GPUs
- 14B+ model: 2-4 GPUs
4. **Context Length**: Reduce `--max-model-len` if memory is limited:
- 128000: Default, works well for most cases
- 64000: Reduces memory usage by ~40%
5. **Evaluation Speed**: Omit `--eval-model` to use rule-based extraction only (faster, ~70-80% success rate)
## Troubleshooting
### Common Issues
**1. CUDA Out of Memory**
```bash
# Reduce GPU memory utilization
--gpu-memory-utilization 0.7
# Or reduce context length
--max-model-len 64000
# Or reduce image resolution
--max-pixels 1003520 # 1280×28×28
```
**2. vLLM Multiprocessing Issues**
The code automatically sets `VLLM_WORKER_MULTIPROC_METHOD=spawn`. If you still encounter issues:
```bash
export VLLM_WORKER_MULTIPROC_METHOD=spawn
```
**3. Evaluation API Errors**
- If you don't need LLM-based extraction, omit `--eval-model` (rule-based only)
- If using LLM extraction, verify API credentials are set correctly
- Check API endpoint connectivity
- Adjust `--nproc` (up to 32) to stay within your API rate limits
**4. Dataset Download Issues**
The dataset is automatically downloaded from:
```
https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv
```
If the download fails, download the file manually and place it at `${DATA_DIR}/RealWorldQA.tsv`.
**5. Import Errors**
Ensure all required files exist in the RealWorldQA directory:
```bash
ls common_utils.py dataset_utils.py eval_utils.py run_realworldqa.py
```
## Advanced Usage
### Custom Image Resolution
Modify resolution parameters in the inference command:
```bash
# min-pixels = 512×28×28, max-pixels = 1280×28×28
python run_realworldqa.py infer \
    --min-pixels 401408 \
    --max-pixels 1003520 \
    ...
```
### Evaluation Without LLM
Use rule-based extraction only (faster, no API calls):
```bash
python run_realworldqa.py eval \
--input-file results/predictions.jsonl \
--output-file results/evaluation.csv
# No --eval-model specified
```
### Debug Mode
Process only first N samples for testing:
```bash
DEBUG_SAMPLE_SIZE=10 python run_realworldqa.py infer ...
```
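Under the hood this only needs to truncate the loaded dataset; a rough sketch of such a gate (the actual check in `run_realworldqa.py` may look different):

```python
import os

# Cap the number of samples when DEBUG_SAMPLE_SIZE is set
debug_n = os.environ.get("DEBUG_SAMPLE_SIZE")
if debug_n:
    data = data.head(int(debug_n))  # `data` is the DataFrame returned by load_dataset()
```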
## Citation
If you use this code or the RealWorldQA benchmark, please cite:
```bibtex
@misc{realworldqa2024,
title = {RealWorldQA: A Benchmark for Real-World Spatial Understanding},
author = {{xAI}},
year = {2024},
howpublished = {\url{https://huggingface.co/datasets/xai-org/RealworldQA}},
note = {Accessed: 2025-04-26}
}
```
## License
This code is released under the same license as the Qwen3-VL model.
## Support
For issues and questions:
- GitHub Issues: [Qwen3-VL Repository](https://github.com/QwenLM/Qwen3-VL)
- Documentation: See inline code comments and docstrings
import os
import requests
import base64
import hashlib
import io
from PIL import Image
from typing import List, Union
def encode_image_to_base64(image, target_size=None):
"""Encode an image to base64 string."""
if target_size is not None:
width, height = image.size
# Resize the image while maintaining the aspect ratio
if width > height:
new_width = target_size
new_height = int(height * target_size / width)
else:
new_height = target_size
new_width = int(width * target_size / height)
image = image.resize((new_width, new_height))
    buffer = io.BytesIO()
    # JPEG has no alpha channel; convert other modes (e.g. RGBA, P) before saving
    if image.mode != 'RGB':
        image = image.convert('RGB')
    image.save(buffer, format="JPEG")
return base64.b64encode(buffer.getvalue()).decode('utf-8')
def decode_base64_to_image(base64_string):
"""Decode a base64 string to an image."""
image_data = base64.b64decode(base64_string)
return Image.open(io.BytesIO(image_data))
def decode_base64_to_image_file(base64_string, output_path):
"""Decode a base64 string and save it to a file."""
image = decode_base64_to_image(base64_string)
image.save(output_path)
def download_file(url, local_path):
"""Download a file from a URL to a local path."""
response = requests.get(url, stream=True)
response.raise_for_status()
with open(local_path, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
def md5(file_path):
"""Calculate the MD5 hash of a file."""
hash_md5 = hashlib.md5()
with open(file_path, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()
def toliststr(s):
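    """Normalize a value into a list of strings.

    A bracketed string (e.g. "['a', 'b']") is parsed as a Python list,
    a plain string becomes a one-element list, and list elements are stringified.
    """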
if isinstance(s, str) and (s[0] == '[') and (s[-1] == ']'):
return [str(x) for x in eval(s)]
elif isinstance(s, str):
return [s]
elif isinstance(s, list):
return [str(x) for x in s]
raise NotImplementedError
"""
RealWorldQA Dataset Utilities
Data loading and processing utilities, fully independent of VLMEvalKit.
"""
import os
import pandas as pd
import numpy as np
import string
from typing import Dict, Any, List
from PIL import Image
from common_utils import download_file, md5, toliststr, decode_base64_to_image_file
# RealWorldQA dataset URL and MD5
REALWORLDQA_DATASET_URL = 'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv'
REALWORLDQA_DATASET_MD5 = '92321028d2bc29040284b6674721e48f'
def load_dataset(dataset_name='RealWorldQA'):
"""
Load RealWorldQA dataset.
Args:
dataset_name: Dataset name (default: 'RealWorldQA')
Returns:
pd.DataFrame: Loaded dataset
"""
if 'LMUData' not in os.environ:
raise ValueError("Please set LMUData environment variable or use --data-dir argument")
data_root = os.path.join(os.environ['LMUData'])
os.makedirs(data_root, exist_ok=True)
file_name = f"{dataset_name}.tsv"
data_path = os.path.join(data_root, file_name)
# Download dataset if not exists or MD5 mismatch
if not os.path.exists(data_path) or md5(data_path) != REALWORLDQA_DATASET_MD5:
print(f"Downloading {dataset_name} dataset...")
download_file(REALWORLDQA_DATASET_URL, data_path)
# Load dataset
data = pd.read_csv(data_path, sep='\t')
# Process dataset
data['index'] = [str(x) for x in data['index']]
# Process image data (base64 encoded or referenced)
if 'image' in data:
data['image'] = [str(x) for x in data['image']]
image_map = {x: y for x, y in zip(data['index'], data['image'])}
# Process image references (some images may reference other indices)
for k in image_map:
if len(image_map[k]) <= 64:
idx = image_map[k]
assert idx in image_map and len(image_map[idx]) > 64
image_map[k] = image_map[idx]
images = [toliststr(image_map[k]) for k in data['index']]
data['image'] = [x[0] if len(x) == 1 else x for x in images]
# Process image paths
if 'image_path' in data:
paths = [toliststr(x) for x in data['image_path']]
data['image_path'] = [x[0] if len(x) == 1 else x for x in paths]
# Convert index to integer if possible
if np.all([isinstance(x, int) or (isinstance(x, str) and x.isdigit()) for x in data['index']]):
data['index'] = [int(x) for x in data['index']]
return data
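# Example usage (assumes the LMUData environment variable points at the data directory):
#   df = load_dataset('RealWorldQA')
#   print(len(df), list(df.columns))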
def dump_image(line, img_root):
"""
Save image data to disk and return path.
Args:
line: Data row containing image data
img_root: Image save root directory
Returns:
list: List of image paths
"""
os.makedirs(img_root, exist_ok=True)
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = os.path.join(img_root, im_name)
if not os.path.exists(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = os.path.join(img_root, f"{line['index']}.jpg")
if not os.path.exists(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
return tgt_path
def build_realworldqa_prompt(line, dump_image_func, min_pixels, max_pixels):
"""
Build RealWorldQA dataset prompt.
Args:
line: Data row
dump_image_func: Image save function
min_pixels: Minimum pixels
max_pixels: Maximum pixels
Returns:
list: List of messages in standard conversation format
"""
# Save and get image path
tgt_path = dump_image_func(line)
# Build question text
question = line['question']
# Build options
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
# Process hint if exists
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
# Build complete prompt
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'Question: {question}\n'
if len(options):
prompt += options_prompt
prompt += 'Please select the correct answer from the options above. \n'
# Build messages in standard conversation format
content = []
# Add images (using file:// prefix for consistency)
if isinstance(tgt_path, list):
for p in tgt_path:
content.append({
"type": "image",
"image": f"file://{p}",
"min_pixels": min_pixels,
"max_pixels": max_pixels
})
else:
content.append({
"type": "image",
"image": f"file://{tgt_path}",
"min_pixels": min_pixels,
"max_pixels": max_pixels
})
# Add text
content.append({"type": "text", "text": prompt})
# Return messages in standard conversation format
messages = [{
"role": "user",
"content": content
}]
return messages
#!/bin/bash
# RealWorldQA Evaluation Script (Instruct Model)
# This script evaluates the inference results using rule-based and optionally model-based extraction
python run_realworldqa.py eval \
--data-dir /path/to/data \
--input-file results/RealWorldQA_results.jsonl \
--output-file results/RealWorldQA_evaluation.csv \
--dataset RealWorldQA \
--eval-model gpt-3.5-turbo-0125 \
--api-type dash \
--nproc 4
#!/bin/bash
# RealWorldQA Evaluation Script (Thinking Model)
# This script evaluates the inference results from thinking model using rule-based and optionally model-based extraction
python run_realworldqa.py eval \
--data-dir /path/to/data \
--input-file results/RealWorldQA_results_thinking.jsonl \
--output-file results/RealWorldQA_evaluation_thinking.csv \
--dataset RealWorldQA \
--eval-model qwen-plus \
--api-type dash \
--nproc 4
"""
RealWorldQA Evaluation Utilities
Evaluation utilities, fully independent of VLMEvalKit.
"""
import os
import requests
import time
import random
import string
import copy
import traceback
import pandas as pd
from PIL import Image
from typing import List, Dict, Tuple, Any
from common_utils import encode_image_to_base64
class OpenAIWrapper:
"""Wrapper for OpenAI API."""
def __init__(self, model, api_base, api_key, timeout=60, retry=5, wait=5):
self.model = model
self.api_base = api_base
self.api_key = api_key
self.timeout = timeout
self.retry = retry
self.wait = wait
self.fail_msg = 'Failed to obtain answer via API.'
def generate(self, messages):
"""Generate a response from the API."""
headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}'}
# Format messages for API
formatted_messages = []
for msg in messages:
if msg['type'] == 'text':
formatted_messages.append({"role": "user", "content": [{"type": "text", "text": msg['value']}]})
elif msg['type'] == 'image':
# Load and encode the image
image = Image.open(msg['value'])
image_data = encode_image_to_base64(image)
formatted_messages.append({
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
]
})
payload = {
"model": self.model,
"messages": formatted_messages,
"max_tokens": 4096,
"temperature": 0
}
for i in range(self.retry):
try:
response = requests.post(
self.api_base,
headers=headers,
json=payload,
timeout=self.timeout
)
if response.status_code == 200:
resp_json = response.json()
return resp_json['choices'][0]['message']['content'].strip()
time.sleep(self.wait)
except Exception as e:
print(f"API error: {e}")
time.sleep(self.wait)
return self.fail_msg
class DashScopeWrapper:
"""Wrapper for DashScope API."""
def __init__(self, model, api_base, api_key, timeout=60, retry=5, wait=5):
self.model = model
self.api_base = api_base
self.api_key = api_key
self.timeout = timeout
self.retry = retry
self.wait = wait
self.fail_msg = 'Failed to obtain answer via API.'
def generate(self, messages):
"""Generate a response from the API."""
headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}'}
# Format messages for API
formatted_messages = []
for msg in messages:
if msg['type'] == 'text':
formatted_messages.append({"role": "user", "content": [{"type": "text", "text": msg['value']}]})
elif msg['type'] == 'image':
# Load and encode the image
image = Image.open(msg['value'])
image_data = encode_image_to_base64(image)
formatted_messages.append({
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
]
})
payload = {
"model": self.model,
"messages": formatted_messages,
"max_completion_tokens": 4096,
"n": 1,
"temperature": 0,
"stream": False
}
for i in range(self.retry):
try:
response = requests.post(
self.api_base,
headers=headers,
json=payload,
timeout=self.timeout
)
if response.status_code == 200:
resp_json = response.json()
                    # Retry the whole request if any choice finished abnormally (e.g. truncated output)
                    abnormal = any(
                        output['finish_reason'] not in ['stop', 'function_call']
                        for output in resp_json['choices']
                    )
                    if abnormal:
                        print(f"DashScope finished with error: {resp_json}")
                        time.sleep(self.wait)
                        continue
                    return resp_json['choices'][0]['message']['content']
else:
print(f"DashScope API error: HTTP {response.status_code}")
try:
error_content = response.json()
print(f"Error details: {error_content}")
except:
print(f"Raw error content: {response.content.decode('utf-8', errors='replace')}")
time.sleep(self.wait)
except requests.exceptions.ConnectionError as conn_err:
print(f"DashScope: Connection error occurred: {conn_err}")
time.sleep(self.wait)
except requests.exceptions.Timeout as timeout_err:
print(f"DashScope: Timeout error occurred: {timeout_err}")
time.sleep(self.wait)
except requests.exceptions.RequestException as req_err:
print(f"DashScope: Request exception occurred: {req_err}")
time.sleep(self.wait)
except Exception as e:
print(f"DashScope: An error occurred: {e}")
print(traceback.format_exc())
time.sleep(self.wait)
return self.fail_msg
def build_judge(model, api_type):
"""Build a judge model for evaluation."""
if api_type == 'mit':
api_key = os.environ.get('MIT_SPIDER_TOKEN', '')
api_base = os.environ.get('MIT_SPIDER_URL', '')
return OpenAIWrapper(model, api_base, api_key)
elif api_type == 'dash':
api_key = os.environ.get('CHATGPT_DASHSCOPE_API_KEY', '')
api_base = os.environ.get('DASHSCOPE_API_BASE', '')
return DashScopeWrapper(model, api_base, api_key)
else:
raise ValueError(f"Unsupported API type: {api_type}")
def can_infer_option(answer, choices):
"""Rule-based extraction of answer option."""
if 'Failed to obtain answer via API' in answer:
return False
reject_to_answer = [
"Sorry, I can't help with images of people yet.",
"I can't process this file.",
"I'm sorry, but without the image provided",
'Cannot determine the answer'
]
for err in reject_to_answer:
if err in answer:
return 'Z'
def count_choice(splits, choices, prefix='', suffix=''):
cnt = 0
for c in choices:
if prefix + c + suffix in splits:
cnt += 1
return cnt
answer_mod = copy.copy(answer)
chars = '.()[],:;!*#{}'
for c in chars:
answer_mod = answer_mod.replace(c, ' ')
splits = [x.strip() for x in answer_mod.split()]
count = count_choice(splits, choices)
if count == 1:
for ch in choices:
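            # Heuristic: in a long response a bare 'A' is often the article "a"
            # rather than the option letter, so treat it as ambiguous and give up.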
if 'A' in splits and len(splits) > 3:
return False
if ch in splits:
return ch
elif count == 0 and count_choice(splits, {'Z', ''}) == 1:
return 'Z'
return False
def can_infer_text(answer, choices):
"""Extract answer by matching text content."""
answer = answer.lower()
assert isinstance(choices, dict)
for k in choices:
assert k in string.ascii_uppercase
choices[k] = str(choices[k]).lower()
cands = []
for k in choices:
if choices[k] in answer:
cands.append(k)
if len(cands) == 1:
return cands[0]
return False
def can_infer(answer, choices):
"""Combined approach to infer answer choice."""
answer = str(answer)
copt = can_infer_option(answer, choices)
return copt if copt else can_infer_text(answer, choices)
def build_choices(item):
"""Build choices dictionary from item."""
ret = {}
for ch in string.ascii_uppercase:
if ch in item and (not pd.isna(item[ch])):
ret[ch] = item[ch]
return ret
def build_option_str(option_dict):
"""Build option string for prompt."""
s = 'There are several options: \n'
for c, content in option_dict.items():
if not pd.isna(content):
s += f'{c}. {content}\n'
return s
def build_prompt(question, options, prediction):
"""Build prompt for answer extraction."""
tmpl = (
'You are an AI assistant who will help me to match '
'an answer with several options of a single-choice question. '
'You are provided with a question, several options, and an answer, '
'and you need to find which option is most similar to the answer. '
'If the meaning of all options are significantly different from the answer, output Z. '
        'You should output a single uppercase character in A, B, C, D (if they are valid options), and Z. \n'
'Example 1: \n'
'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n'
'Answer: a cute teddy bear\nYour output: A\n'
'Example 2: \n'
'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n'
'Answer: Spider\nYour output: Z\n'
'Example 3: \n'
'Question: {}?\nOptions: {}\nAnswer: {}\nYour output: '
)
return tmpl.format(question, options, prediction)
def extract_answer_from_item(model, item, wait=5):
"""Extract answer from model prediction using rule-based and model-based approaches."""
# Build choices dictionary
choices = build_choices(item)
option_str = build_option_str(choices)
prompt = build_prompt(item['question'], option_str, item['prediction'])
# Try rule-based extraction first
prediction = item['prediction']
ret = can_infer(prediction, choices)
if ret:
if ret == 'Z':
extract_flag = False
log = f"Rule extract failed with rule result: {ret} prediction: {prediction}"
else:
extract_flag = True
log = f"Rule extract success with rule result: {ret} prediction: {prediction}"
return dict(opt=ret, log=log, extract_model='rule', extract_flag=extract_flag)
# If rule-based extraction fails, use model-based extraction
print(f"Rule extract failed. Use model-based extraction.")
if model is None:
# For RealWorldQA, if model is None, use random choice
options = list(choices) + ['Z'] if 'Z' not in choices else list(choices)
log = f'No judge model provided. Randomly generate one.\n'
return dict(opt=random.choice(options), log=log, extract_model='random', extract_flag=False)
# Try model-based extraction with retries
retry = 5
while retry:
messages_for_judge = [{'type': 'text', 'value': prompt}]
ans = model.generate(messages_for_judge)
if 'Failed to obtain answer via API' in ans:
print('API failed to answer.')
else:
ret = can_infer(ans, choices)
if ret and ret != 'Z':
                log = f'{model.model} extraction succeeded. {model.model}: {ans}\n'
return dict(opt=ret, log=log, extract_model=model.model, extract_flag=True)
else:
print(f'Output includes 0 / > 1 letter among candidates {set(choices)} and Z: {ans}')
retry -= 1
if retry <= 0:
            options = list(choices) + ['Z'] if 'Z' not in choices else list(choices)
            log = f'{model.model} extract failed. Randomly generate one. {model.model} response: {ans}\n'
return dict(opt=random.choice(options), log=log, extract_model=model.model, extract_flag=False)
def eval_single_sample(args):
"""Evaluate a single sample."""
model, item = args
# Extract answer using the combined approach
result = extract_answer_from_item(model, item)
# Get ground truth answer
gt_answer = item['answer']
# Determine if the answer is correct
hit = 1 if result['opt'] == gt_answer else 0
return {
"index": item['index'],
"question": item['question'],
"prediction": item['prediction'],
"extracted_answer": result['opt'],
"extraction_method": result['extract_model'],
"extraction_success": result['extract_flag'],
"extraction_log": result['log'],
"gt": gt_answer,
"hit": hit
}