Commit 3d735feb authored by luopl's avatar luopl
Browse files

"Initial commit"

parents
Pipeline #3074 canceled with stages
import os
import requests
import time
import random
import copy
import traceback
import pandas as pd
from PIL import Image
from typing import List, Dict, Tuple, Any
from common_utils import encode_image_to_base64
from collections import defaultdict
# latex2sympy2 is optional: it enables LaTeX-aware symbolic answer
# comparison in is_equal(); without it, only string/numeric checks run.
try:
    from latex2sympy2 import latex2sympy
except ImportError:
    print('Warning: latex2sympy2 not installed. Install with: pip install latex2sympy2')
    latex2sympy = None

# Sentinel message returned by the API wrappers when all retries fail.
FAIL_MSG = 'Failed to obtain answer via API.'
def is_equal(asw: str, gt_asw: str) -> bool:
    """Check whether a predicted answer matches the ground-truth answer.

    Comparison is attempted in three stages, returning True on the first
    success:
    1. case-insensitive string equality after stripping whitespace;
    2. numeric comparison via eval() with a 1e-6 absolute tolerance;
    3. symbolic comparison via latex2sympy (when the library is installed).

    Args:
        asw: Predicted answer.
        gt_asw: Ground-truth answer.

    Returns:
        True if the answers are considered equal, False otherwise.
    """
    if not isinstance(asw, str) or not isinstance(gt_asw, str):
        print('Warning: input is not string')
        print(asw, gt_asw)
    asw = str(asw).lower().strip()
    gt_asw = str(gt_asw).lower().strip()
    if gt_asw == asw:
        return True
    # NOTE(security): eval() on model output is unsafe for untrusted input;
    # kept for parity with the established evaluation protocol.
    # Narrowed from bare `except:` so KeyboardInterrupt/SystemExit propagate.
    try:
        a = eval(gt_asw)
        b = eval(asw)
        if abs(a - b) < 1e-6:
            return True
    except Exception:
        pass
    if latex2sympy is not None:
        try:
            a = latex2sympy(gt_asw)
            b = latex2sympy(asw)
            if abs(eval(str(a)) - eval(str(b))) < 1e-6:
                return True
            if abs(a - b) < 1e-6:
                return True
        except Exception:
            pass
    return False
def get_gpt4_ICE():
    """Return the five in-context examples used for GPT-4 answer extraction."""
    return [
        """
Hint: Please answer the question and provide the final answer at the end.\n
Question: Which number is missing?\n
Model response: The number missing in the sequence is 14.\n
Extracted answer: 14
""",
        """
Hint: Please answer the question and provide the final answer at the end.\n
Question: What is the fraction of females facing the camera?\n
Model response: The fraction of females facing the camera is 0.6,
which means that six out of ten females in the group are facing the camera.\n
Extracted answer: 0.6
""",
        """
Hint: Please answer the question and provide the final answer at the end.\n
Question: How much money does Luca need to buy a sour apple candy and a butter-scotch candy? (Unit: $)\n
Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy.\n
Extracted answer: 1.45
""",
        """
Hint: Please answer the question and provide the final answer at the end.\n
Question: Between which two years does the line graph saw its maximum peak?\n
Model response: The line graph saw its maximum peak between 2007 and 2008.\n
Extracted answer: [2007, 2008]
""",
        """
Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.\n
Question: What fraction of the shape is blue?\n
Choices: (A) 3/11 (B) 8/11 (C) 6/11 (D) 3/5\n
Model response: The correct answer is (B) 8/11.\n
Extracted answer: B
""",
    ]
def build_mathv_gpt4_prompt(line):
    """Assemble the GPT-4 answer-extraction prompt for one sample.

    The prompt is: task description, the five in-context examples, then
    the sample's question, model response, and an 'Extracted answer:' cue.
    """
    task_description = """
Please read the following example.
Then extract the answer from the model response and type it at the end of the prompt.\n
"""
    segments = [task_description]
    segments.extend(example + '\n' for example in get_gpt4_ICE())
    segments.append(line['question'] + '\n')
    segments.append('Model response: ' + str(line['prediction']) + '\n')
    segments.append('Extracted answer: ')
    return ''.join(segments)
def list_to_dict(lst):
    """Map list items to uppercase letter keys: index 0 -> 'A', 1 -> 'B', ..."""
    letters = (chr(ord('A') + idx) for idx in range(len(lst)))
    return dict(zip(letters, lst))
def can_infer_option(answer, choices):
    """Rule-based extraction of an answer option.

    Args:
        answer: Raw model response text.
        choices: Iterable of valid option letters (e.g. dict keyed 'A'..'D').

    Returns:
        The matched option letter, 'Z' for explicit refusals or an explicit
        'Z' answer, or False when no single option can be inferred.
    """
    # Responses carrying the API failure sentinel cannot be parsed.
    if FAIL_MSG in answer:
        return False
    # Canned refusal messages are mapped to the special option 'Z'.
    reject_to_answer = [
        "Sorry, I can't help with images of people yet.",
        "I can't process this file.",
        "I'm sorry, but without the image provided",
        'Cannot determine the answer'
    ]
    for err in reject_to_answer:
        if err in answer:
            return 'Z'

    def count_choice(splits, choices, prefix='', suffix=''):
        # Count how many candidate options appear as standalone tokens.
        cnt = 0
        for c in choices:
            if prefix + c + suffix in splits:
                cnt += 1
        return cnt

    # Replace punctuation with spaces so option letters become separate tokens.
    answer_mod = copy.copy(answer)
    chars = '.()[],:;!*#{}'
    for c in chars:
        answer_mod = answer_mod.replace(c, ' ')
    splits = [x.strip() for x in answer_mod.split()]
    count = count_choice(splits, choices)
    if count == 1:
        for ch in choices:
            # Reject long answers that contain a bare 'A' token, since 'A'
            # commonly occurs as the English article rather than an option.
            if 'A' in splits and len(splits) > 3:
                return False
            if ch in splits:
                return ch
    elif count == 0 and count_choice(splits, {'Z', ''}) == 1:
        # The model explicitly answered 'Z' (none of the above / refusal).
        return 'Z'
    return False
def can_infer_text(answer, choices):
    """Extract an answer option by matching choice text inside the answer.

    Args:
        answer: Raw model response text.
        choices: Mapping from option letter to choice text.

    Returns:
        The option letter if exactly one choice's text occurs in the answer
        (case-insensitive), otherwise False.
    """
    answer = answer.lower()
    assert isinstance(choices, dict)
    # Compare against lowercased copies instead of lowercasing the caller's
    # dict values in place (the original mutated `choices` destructively).
    cands = [k for k, v in choices.items() if str(v).lower() in answer]
    if len(cands) == 1:
        return cands[0]
    return False
def can_infer(answer, choices):
    """Infer the chosen option: try rule-based matching, then text matching."""
    answer = str(answer)
    option = can_infer_option(answer, choices)
    if option:
        return option
    return can_infer_text(answer, choices)
def post_check(line, prefetch=False):
    """Check whether the prediction matches the ground-truth answer.

    Args:
        line: Record with 'answer', 'choices' and 'prediction'/'res' fields.
        prefetch: When True, inspect the raw 'prediction' and return the
            extracted option (or False); when False, compare the previously
            extracted 'res' field and return True/False.
    """
    res = None
    ans = line['answer']
    # Prefetch mode works on the raw model prediction; otherwise use the
    # answer already extracted by the judge model.
    response = line['prediction'] if prefetch else line['res']
    try:
        # NOTE(security): eval() on the serialized choices assumes trusted
        # dataset content.
        if len(eval(line['choices'])) > 0:
            # Multiple-choice question: map choices to letters and try to
            # infer which option the response selected.
            ans = line['answer']
            choices = list_to_dict(eval(line['choices']))
            res = can_infer(response, choices)
            if prefetch:
                return res
        else:
            # Open-ended question: compare plain strings/numbers below.
            res = str(response)
            ans = str(ans)
    except ValueError:
        # Malformed 'choices' field; fall through (res may still be None,
        # which is_equal tolerates with a warning).
        pass
    if is_equal(res, ans):
        return res if prefetch else True
    else:
        return False
class OpenAIWrapper:
    """Minimal wrapper around an OpenAI-compatible chat-completions API."""

    def __init__(self, model, api_base, api_key, timeout=60, retry=5, wait=5):
        """
        Args:
            model: Model name sent in the request payload.
            api_base: Full URL of the chat-completions endpoint.
            api_key: Bearer token for authentication.
            timeout: Per-request timeout in seconds.
            retry: Number of attempts before giving up.
            wait: Seconds to sleep between attempts.
        """
        self.model = model
        self.api_base = api_base
        self.api_key = api_key
        self.timeout = timeout
        self.retry = retry
        self.wait = wait
        self.fail_msg = FAIL_MSG

    def generate(self, prompt, temperature=0):
        """Request a completion; return FAIL_MSG after exhausting retries."""
        headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}'}
        payload = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 4096,
            "temperature": temperature
        }
        for _ in range(self.retry):
            try:
                response = requests.post(
                    self.api_base,
                    headers=headers,
                    json=payload,
                    timeout=self.timeout
                )
                if response.status_code == 200:
                    resp_json = response.json()
                    return resp_json['choices'][0]['message']['content'].strip()
                # Surface HTTP errors instead of silently sleeping and retrying.
                print(f"API error: HTTP {response.status_code}")
                time.sleep(self.wait)
            except Exception as e:
                print(f"API error: {e}")
                time.sleep(self.wait)
        return self.fail_msg
class DashScopeWrapper:
    """Minimal wrapper around the DashScope chat-completions API."""

    def __init__(self, model, api_base, api_key, timeout=60, retry=5, wait=5):
        """
        Args:
            model: Model name sent in the request payload.
            api_base: Full URL of the chat-completions endpoint.
            api_key: Bearer token for authentication.
            timeout: Per-request timeout in seconds.
            retry: Number of attempts before giving up.
            wait: Seconds to sleep between attempts.
        """
        self.model = model
        self.api_base = api_base
        self.api_key = api_key
        self.timeout = timeout
        self.retry = retry
        self.wait = wait
        self.fail_msg = FAIL_MSG

    def generate(self, prompt, temperature=0):
        """Request a completion; return FAIL_MSG after exhausting retries.

        A response is accepted only when every returned choice finished with
        a normal reason ('stop' or 'function_call').
        """
        headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}'}
        payload = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            "max_completion_tokens": 4096,
            "n": 1,
            "temperature": temperature,
            "stream": False
        }
        for _ in range(self.retry):
            try:
                response = requests.post(
                    self.api_base,
                    headers=headers,
                    json=payload,
                    timeout=self.timeout
                )
                if response.status_code == 200:
                    resp_json = response.json()
                    # Bug fix: the original `continue` only advanced the inner
                    # per-choice loop, so responses with abnormal finish
                    # reasons still fell through to the return statement.
                    # Retry the whole request instead.
                    if any(output['finish_reason'] not in ['stop', 'function_call']
                           for output in resp_json['choices']):
                        print(f"DashScope finished with error: {resp_json}")
                        time.sleep(self.wait)
                        continue
                    return resp_json['choices'][0]['message']['content']
                else:
                    print(f"DashScope API error: HTTP {response.status_code}")
                    try:
                        error_content = response.json()
                        print(f"Error details: {error_content}")
                    except Exception:
                        print(f"Raw error content: {response.content.decode('utf-8', errors='replace')}")
                    time.sleep(self.wait)
            except Exception as e:
                print(f"DashScope error: {e}")
                time.sleep(self.wait)
        return self.fail_msg
def build_judge(model, api_type):
    """Construct a judge-model wrapper for the given API backend.

    Args:
        model: Judge model name.
        api_type: Backend selector, 'mit' or 'dash'.

    Raises:
        ValueError: If api_type is not one of the supported backends.
    """
    if api_type == 'mit':
        return OpenAIWrapper(
            model,
            os.environ.get('MIT_SPIDER_URL', ''),
            os.environ.get('MIT_SPIDER_TOKEN', ''),
        )
    if api_type == 'dash':
        return DashScopeWrapper(
            model,
            os.environ.get('DASHSCOPE_API_BASE', ''),
            os.environ.get('CHATGPT_DASHSCOPE_API_KEY', ''),
        )
    raise ValueError(f"Unsupported API type: {api_type}")
def MATH_V_auxeval(args):
    """Extract the final answer for one MathVision sample.

    Tries cheap rule-based extraction first; on failure, falls back to
    querying the judge model with increasing temperature for up to 5 tries.

    Args:
        args: Tuple of (judge model wrapper, sample record).

    Returns:
        dict with keys 'log', 'res', 'extract_model', 'extract_flag'.
    """
    model, line = args
    prompt = build_mathv_gpt4_prompt(line)
    log = ''
    retry = 5
    # Try rule-based extraction first. Call post_check exactly once; the
    # original invoked it twice for the same result.
    res = post_check(line, prefetch=True)
    if res:
        log += 'Prefetch succeed.\n'
        extract_flag = True
        if not res or res == 'Z':
            # 'Z' marks a refusal / none-of-the-above answer.
            extract_flag = False
            log += f'Rule extract failed with ans: {res}'
        else:
            log += f'Rule extract success with ans: {res}'
        return dict(log=log, res=res, extract_model='rule', extract_flag=extract_flag)
    # Use model-based extraction, raising temperature on each retry.
    for i in range(retry):
        prediction = line['prediction']
        res = model.generate(prompt, temperature=i * 0.5)
        if FAIL_MSG in res:
            log += f'Try {i}: output is {prediction}, failed to parse.\n'
        else:
            log += f'{model.model} extract Succeed.\n'
            return dict(log=log, res=res, extract_model=model.model, extract_flag=True)
    log += f'All {retry} retries failed.\n {model.model} response:{res}'
    return dict(log=log, res='', extract_model=model.model, extract_flag=False)
def MATH_V_acc(result_file):
    """Calculate per-category and overall accuracy for MathVision results.

    Args:
        result_file: Path to the evaluated .xlsx (or .csv) file written by
            run_evaluation; rows must carry 'category', 'log', 'res',
            'answer', 'choices', 'prediction', 'extract_model' and
            'extract_flag' columns.

    Returns:
        DataFrame with one row per subject plus an 'Overall' row, holding
        sample counts, prefetch counts/rate, hit counts and accuracy (%).
    """
    data = pd.read_excel(result_file) if result_file.endswith('.xlsx') else pd.read_csv(result_file)
    tot = defaultdict(lambda: 0)    # samples per category
    fetch = defaultdict(lambda: 0)  # rule-based (prefetch) extraction hits
    hit = defaultdict(lambda: 0)    # correct answers per category
    lt = len(data)
    extract_counts = {}
    for i in range(lt):
        item = data.iloc[i]
        cate = item['category']
        tot['Overall'] += 1
        tot[cate] += 1
        # 'Prefetch succeed' in the log marks rule-based extraction.
        if 'Prefetch succeed' in item['log']:
            fetch['Overall'] += 1
            fetch[cate] += 1
        if post_check(item, prefetch=False):
            hit['Overall'] += 1
            hit[cate] += 1
        # Statistics of answers extracted by rule and gpt
        extract_model = item['extract_model']
        extract_flag = item['extract_flag']
        if extract_model in extract_counts:
            extract_counts[extract_model][1] += 1
        else:
            extract_counts[extract_model] = [0, 1]  # succeed, total
        if extract_flag:
            extract_counts[extract_model][0] += 1
    res = defaultdict(list)
    for k in tot.keys():
        res['Subject'].append(k)
        res['tot'].append(tot[k])
        res['prefetch'].append(fetch[k])
        res['hit'].append(hit[k])
        res['prefetch_rate'].append(fetch[k] / tot[k] * 100)
        res['acc'].append(hit[k] / tot[k] * 100)
        # Extraction statistics are reported only on the 'Overall' row;
        # other rows get zero placeholders so all columns stay equal length.
        if k == 'Overall':
            for model_key in extract_counts:
                res[model_key+'_success'].append(extract_counts[model_key][0])
                res[model_key+'_all'].append(extract_counts[model_key][1])
        else:
            for model_key in extract_counts:
                res[model_key+'_success'].append(0)
                res[model_key+'_all'].append(0)
    res = pd.DataFrame(res).sort_values('Subject', ignore_index=True)
    return res
def eval_single_sample(args):
    """Evaluate a single (model, line) task; thin alias used by the thread pool."""
    return MATH_V_auxeval(args)
#!/bin/bash
# MathVision Inference Script (Instruct Model)
# This script runs inference on the MathVision dataset using vLLM
# Adjust --model-path and --data-dir to your local paths before running.
python run_mathv.py infer \
    --model-path /path/to/Qwen3-VL-Instruct \
    --data-dir /path/to/mathvision_data \
    --dataset MathVision \
    --output-file results/mathvision_predictions.jsonl \
    --max-new-tokens 32768 \
    --temperature 0.7 \
    --top-p 0.8 \
    --top-k 20 \
    --repetition-penalty 1.0 \
    --presence-penalty 1.5
# For a quick smoke test, append the flag below to the command above:
# --num-samples 100
\ No newline at end of file
#!/bin/bash
# MathVision Inference Script (Thinking Model)
# This script runs inference on the MathVision dataset using vLLM with thinking mode parameters
# The larger --max-new-tokens budget leaves room for long reasoning traces.
python run_mathv.py infer \
    --model-path /path/to/Qwen3-VL-Thinking \
    --data-dir /path/to/mathvision_data \
    --dataset MathVision \
    --output-file results/mathvision_predictions_thinking.jsonl \
    --max-new-tokens 40960 \
    --temperature 1.0 \
    --top-p 0.95 \
    --top-k 20 \
    --repetition-penalty 1.0 \
    --presence-penalty 0.0
# For a quick smoke test, append the flag below to the command above:
# --num-samples 100
\ No newline at end of file
vllm
transformers
qwen_vl_utils
pandas
numpy
Pillow
# Utilities
tqdm
requests
validators
torch
torchvision
accelerate
openpyxl
latex2sympy2
flash_attn
\ No newline at end of file
import os
import sys
import json
import argparse
import pandas as pd
import numpy as np
import time
import re
from tqdm import tqdm
from typing import List, Dict, Any
import torch
import warnings
import traceback
# vLLM imports
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor
# Local imports from refactored files
from dataset_utils import load_dataset, dump_image
from eval_utils import build_judge, eval_single_sample, MATH_V_acc
# Set vLLM multiprocessing method
# NOTE(review): 'spawn' is the start method commonly required for vLLM
# workers that initialize CUDA; see vLLM docs to confirm for your version.
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
def clean_for_excel(val):
    """
    Remove characters that are illegal in Excel cells.

    Excel rejects control characters (0x00-0x1F) except tab (0x09),
    newline (0x0A) and carriage return (0x0D). Non-string values pass
    through unchanged.
    """
    if not isinstance(val, str):
        return val
    return re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F]', '', val)
def clean_dataframe_for_excel(df):
    """Apply clean_for_excel element-wise to a DataFrame.

    Prefers DataFrame.map (pandas >= 2.1) and only falls back to the
    deprecated applymap on older pandas. The original preferred applymap,
    which emits FutureWarning on modern pandas versions.
    """
    return df.map(clean_for_excel) if hasattr(df, 'map') else df.applymap(clean_for_excel)
def build_mathv_prompt(line, dump_image_func, dataset):
    """
    Build a MathVision conversation prompt: all images first, question last.

    Args:
        line: Sample record providing the 'question' field.
        dump_image_func: Callable mapping a record to one image path or a
            list of image paths.
        dataset: Dataset name (kept for signature compatibility; unused).

    Returns:
        Messages in the standard single-turn conversation format.
    """
    # Standard resolution (MathVision uses smaller min_pixels)
    MIN_PIXELS = 768 * 28 * 28    # ~0.6M pixels
    MAX_PIXELS = 5120 * 28 * 28   # ~4M pixels
    tgt_path = dump_image_func(line)
    # Normalize to a list so single- and multi-image samples share one path.
    paths = tgt_path if isinstance(tgt_path, list) else [tgt_path]
    content = [
        {
            "type": "image",
            "image": path,
            "min_pixels": MIN_PIXELS,
            "max_pixels": MAX_PIXELS
        }
        for path in paths
    ]
    # Question text goes after all images.
    content.append({"type": "text", "text": line['question']})
    return [{"role": "user", "content": content}]
def prepare_inputs_for_vllm(messages, processor):
    """
    Convert a conversation into the input dict expected by vLLM.

    Args:
        messages: List of messages in standard conversation format.
        processor: AutoProcessor instance used to render the chat template.

    Returns:
        dict with 'prompt', 'multi_modal_data' and 'mm_processor_kwargs'.
    """
    rendered = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # qwen_vl_utils 0.0.14+ required
    image_inputs, video_inputs, video_kwargs = process_vision_info(
        messages,
        image_patch_size=processor.image_processor.patch_size,
        return_video_kwargs=True,
        return_video_metadata=True
    )
    # Include only the modalities that are actually present.
    multimodal = {
        key: value
        for key, value in (('image', image_inputs), ('video', video_inputs))
        if value is not None
    }
    return {
        'prompt': rendered,
        'multi_modal_data': multimodal,
        'mm_processor_kwargs': video_kwargs
    }
def run_inference(args):
    """Run inference on the MathVision dataset using vLLM.

    Loads the dataset, renders one multimodal prompt per sample, runs a
    single batched llm.generate() call, and writes one JSON record per
    sample to args.output_file (JSONL).

    Args:
        args: Parsed CLI namespace from the 'infer' subcommand.
    """
    print("\n" + "="*80)
    print("🚀 MathVision Inference with vLLM (High-Speed Mode)")
    print("="*80 + "\n")
    # Load dataset
    data = load_dataset(args.dataset)
    # Limit number of samples if specified
    if args.num_samples is not None and args.num_samples > 0:
        original_len = len(data)
        data = data.iloc[:args.num_samples]
        print(f"✓ Loaded {len(data)} samples from {args.dataset} (limited from {original_len} samples)")
    else:
        print(f"✓ Loaded {len(data)} samples from {args.dataset}")
    # Set up image root directory (LMUData env var is set from --data-dir in main())
    img_root = os.path.join(os.environ['LMUData'], 'images', args.dataset)
    os.makedirs(img_root, exist_ok=True)
    # Set up dump_image function (closure binding the image root directory)
    def dump_image_func(line):
        return dump_image(line, img_root)
    # Create output directory
    os.makedirs(os.path.dirname(args.output_file), exist_ok=True)
    # Set up CoT prompt if enabled
    cot_prompt = ""
    if args.use_cot:
        cot_prompt = args.cot_prompt if args.cot_prompt else " Let's think step by step."
        print(f"✓ Using CoT prompt: {cot_prompt[:50]}...")
    # Set up generation parameters (vLLM SamplingParams format)
    sampling_params = SamplingParams(
        temperature=args.temperature,
        top_p=args.top_p,
        top_k=args.top_k,
        max_tokens=args.max_new_tokens,
        repetition_penalty=args.repetition_penalty,
        presence_penalty=args.presence_penalty,
        stop_token_ids=[],
    )
    print(f"\n⚙️ Generation parameters (vLLM SamplingParams):")
    print(f" max_tokens={sampling_params.max_tokens}")
    print(f" temperature={sampling_params.temperature}, top_p={sampling_params.top_p}, top_k={sampling_params.top_k}")
    print(f" repetition_penalty={sampling_params.repetition_penalty}")
    print(f" presence_penalty={sampling_params.presence_penalty}")
    if sampling_params.presence_penalty > 0:
        print(f" ✅ Anti-repetition enabled (presence_penalty={sampling_params.presence_penalty})")
    if sampling_params.temperature <= 0.02 and sampling_params.top_k == 1:
        print(f" ✅ Using FAST greedy-like decoding")
    else:
        print(f" ⚠️ Using sampling decoding (slower but more diverse)")
    print()
    # Load processor for input preparation
    print(f"Loading processor from {args.model_path}")
    processor = AutoProcessor.from_pretrained(args.model_path)
    print("✓ Processor loaded\n")
    # Initialize vLLM
    print(f"Initializing vLLM with model: {args.model_path}")
    print(f" GPU count: {torch.cuda.device_count()}")
    print(f" Tensor parallel size: {args.tensor_parallel_size}")
    llm = LLM(
        model=args.model_path,
        tensor_parallel_size=args.tensor_parallel_size,
        gpu_memory_utilization=args.gpu_memory_utilization,
        trust_remote_code=True,
        max_model_len=args.max_model_len,
        limit_mm_per_prompt={"image": args.max_images_per_prompt},
        seed=42,  # fixed seed for reproducible sampling
    )
    print("✓ vLLM initialized successfully\n")
    # Prepare all inputs
    print("Preparing inputs for vLLM...")
    all_inputs = []
    all_line_dicts = []
    all_messages = []
    for idx, (_, line) in enumerate(tqdm(data.iterrows(), total=len(data), desc="Building prompts")):
        # Convert line to dict (numpy scalars -> plain Python types so the
        # record can be JSON-serialized later)
        line_dict = line.to_dict()
        for k, v in line_dict.items():
            if isinstance(v, np.integer):
                line_dict[k] = int(v)
            elif isinstance(v, np.floating):
                line_dict[k] = float(v)
        # Build prompt
        messages = build_mathv_prompt(line, dump_image_func, args.dataset)
        # Add CoT prompt (appended to the trailing text segment)
        if args.use_cot and len(messages) > 0 and len(messages[0]['content']) > 0:
            last_content = messages[0]['content'][-1]
            if last_content['type'] == 'text':
                last_content['text'] += cot_prompt
        # Prepare input for vLLM
        vllm_input = prepare_inputs_for_vllm(messages, processor)
        all_inputs.append(vllm_input)
        all_line_dicts.append(line_dict)
        all_messages.append(messages)
    print(f"✓ Prepared {len(all_inputs)} inputs\n")
    # Batch inference (vLLM automatic optimization)
    print("="*80)
    print("🚀 Running vLLM batch inference (automatic optimization)")
    print("="*80)
    start_time = time.time()
    outputs = llm.generate(all_inputs, sampling_params=sampling_params)
    end_time = time.time()
    total_time = end_time - start_time
    print(f"\n✓ Inference completed in {total_time:.2f} seconds")
    print(f" Average: {total_time/len(data):.2f} seconds/sample")
    print(f" Throughput: {len(data)/total_time:.2f} samples/second\n")
    # Save results
    print("Saving results...")
    results = []
    for idx, (line_dict, messages, output) in enumerate(zip(all_line_dicts, all_messages, outputs)):
        response = output.outputs[0].text
        index = line_dict['index']
        # Strip any <think>...</think> reasoning prefix emitted by thinking
        # models; the raw output is preserved separately under 'gen_raw'.
        response_final = str(response).split("</think>")[-1].strip()
        result = {
            "question_id": int(index) if isinstance(index, np.integer) else index,
            "annotation": line_dict,
            "task": args.dataset,
            "result": {"gen": response_final, "gen_raw": response},
            "messages": messages
        }
        results.append(result)
    # Write final results
    with open(args.output_file, 'w') as f:
        for res in results:
            f.write(json.dumps(res) + '\n')
    print(f"\n✓ Results saved to {args.output_file}")
    print(f"✓ Total samples processed: {len(results)}")
def run_evaluation(args):
    """Run evaluation on inference results.

    Reads JSONL predictions, validates them against the dataset, extracts
    answers with a judge model (rule-based extraction first), then writes
    per-sample results (.xlsx) and a score summary (.csv).

    Args:
        args: Parsed CLI namespace from the 'eval' subcommand.

    Returns:
        DataFrame of per-subject scores from MATH_V_acc.
    """
    # Load results
    results = []
    with open(args.input_file, 'r') as f:
        for line in f:
            job = json.loads(line)
            annotation = job["annotation"]
            annotation["prediction"] = job["result"]["gen"]
            results.append(annotation)
    data = pd.DataFrame.from_records(results)
    data = data.sort_values(by='index')
    data['prediction'] = [str(x) for x in data['prediction']]
    # Load dataset for validation
    meta = load_dataset(args.dataset)
    # Validation: every evaluated index must exist in the dataset
    print(f"len(data): {len(data)}")
    print(f"len(meta): {len(meta)}")
    meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
    data_map = {x: y for x, y in zip(data['index'], data['question'])}
    for k in data_map:
        assert k in meta_q_map, (
            f'eval_file should be the same as or a subset of dataset {args.dataset}'
        )
    # Save intermediate results
    output_xlsx = args.output_file.replace('.csv', '.xlsx') if args.output_file.endswith('.csv') else args.output_file
    clean_dataframe_for_excel(data).to_excel(output_xlsx, index=False)
    print(f"✓ Saved intermediate results to {output_xlsx}")
    # Build judge model
    model = build_judge(
        model=getattr(args, 'eval_model', 'gpt-4o-2024-05-13'),
        api_type=getattr(args, 'api_type', 'dash')
    )
    # Prepare evaluation tasks
    eval_tasks = []
    for i in range(len(data)):
        item = data.iloc[i]
        eval_tasks.append((model, item))
    # Run evaluation
    eval_results = []
    # Debug mode (env DEBUG=true): process single-threaded with first few samples
    debug = os.environ.get('DEBUG', '').lower() == 'true'
    if debug:
        # NOTE(review): in debug mode only 5 results are produced, so the
        # column assignments below will fail unless len(data) is also 5.
        print("Running in debug mode with first 5 samples...")
        for task in eval_tasks[:5]:
            try:
                result = eval_single_sample(task)
                eval_results.append(result)
            except Exception as e:
                print(f"Error processing task: {e}")
                print(f"Task details: {task}")
                raise
    else:
        # Normal mode: process all samples with threading
        from concurrent.futures import ThreadPoolExecutor
        nproc = getattr(args, 'nproc', 4)
        with ThreadPoolExecutor(max_workers=nproc) as executor:
            # executor.map preserves task order, so results align with data rows
            for result in tqdm(executor.map(eval_single_sample, eval_tasks),
                               total=len(eval_tasks), desc="Evaluating"):
                eval_results.append(result)
    # Update data with evaluation results
    data['res'] = [r['res'] for r in eval_results]
    data['log'] = [r['log'] for r in eval_results]
    data['extract_model'] = [r['extract_model'] for r in eval_results]
    data['extract_flag'] = [r['extract_flag'] for r in eval_results]
    # Save evaluation results
    storage = args.output_file.replace('.csv', '_eval.xlsx')
    clean_dataframe_for_excel(data).to_excel(storage, index=False)
    print(f"✓ Saved evaluation results to {storage}")
    # Calculate accuracy
    score = MATH_V_acc(storage)
    score_pth = storage.replace('.xlsx', '_score.csv')
    score.to_csv(score_pth, index=False)
    print(f"✓ Saved score to {score_pth}")
    print(f"\n{'='*50}")
    print(f"Evaluation Results:")
    print(f"{'='*50}")
    print(score)
    print(f"{'='*50}\n")
    return score
def main():
    """CLI entry point: parse arguments and dispatch to inference or evaluation."""
    parser = argparse.ArgumentParser(description="MathVision Evaluation with vLLM")
    subparsers = parser.add_subparsers(dest='command', help='Command to run')
    # Inference parser
    infer_parser = subparsers.add_parser("infer", help="Run inference with vLLM")
    infer_parser.add_argument("--model-path", type=str, required=True, help="Path to the model")
    infer_parser.add_argument("--dataset", type=str, default="MathVision",
                              choices=["MathVision", "MathVision_MINI"],
                              help="Dataset name")
    infer_parser.add_argument("--data-dir", type=str, help="The absolute path of MathVision data directory")
    infer_parser.add_argument("--output-file", type=str, required=True, help="Output file path")
    infer_parser.add_argument("--num-samples", type=int, default=None,
                              help="Number of samples to process (default: None, process all samples)")
    infer_parser.add_argument("--use-cot", action="store_true", help="Use Chain-of-Thought prompting")
    infer_parser.add_argument("--cot-prompt", type=str, default="", help="Custom Chain-of-Thought prompt")
    # vLLM specific parameters
    infer_parser.add_argument("--tensor-parallel-size", type=int, default=None,
                              help="Tensor parallel size (default: number of GPUs)")
    infer_parser.add_argument("--gpu-memory-utilization", type=float, default=0.9,
                              help="GPU memory utilization (0.0-1.0, default: 0.9)")
    infer_parser.add_argument("--max-model-len", type=int, default=128000,
                              help="Maximum model context length (default: 128000)")
    infer_parser.add_argument("--max-images-per-prompt", type=int, default=10,
                              help="Maximum images per prompt (default: 10)")
    # Generation parameters.
    # Help texts fixed: the originals claimed "default: 2048" for
    # --max-new-tokens (actual default 32768) and described the sampling
    # defaults as "greedy(-like) decoding", which they are not.
    infer_parser.add_argument("--max-new-tokens", type=int, default=32768,
                              help="Maximum number of tokens to generate (default: 32768)")
    infer_parser.add_argument("--temperature", type=float, default=0.7,
                              help="Temperature for sampling (default: 0.7)")
    infer_parser.add_argument("--top-p", type=float, default=0.8,
                              help="Top-p for sampling (default: 0.8)")
    infer_parser.add_argument("--top-k", type=int, default=20,
                              help="Top-k for sampling (default: 20)")
    infer_parser.add_argument("--repetition-penalty", type=float, default=1.0,
                              help="Repetition penalty (default: 1.0, increase to 1.2-1.5 to reduce repetition)")
    infer_parser.add_argument("--presence-penalty", type=float, default=1.5,
                              help="Presence penalty (default: 1.5, range: 0.0-2.0, penalize tokens that have already appeared)")
    # Evaluation parser
    eval_parser = subparsers.add_parser("eval", help="Run evaluation")
    eval_parser.add_argument("--data-dir", type=str, help="The absolute path of MathVision data directory")
    eval_parser.add_argument("--input-file", type=str, required=True, help="Input file with inference results")
    eval_parser.add_argument("--output-file", type=str, required=True, help="Output file path")
    eval_parser.add_argument("--dataset", type=str, default="MathVision",
                             choices=["MathVision", "MathVision_MINI"],
                             help="Dataset name")
    eval_parser.add_argument("--eval-model", type=str, default="gpt-4o",
                             help="Model to use for evaluation (default: gpt-4o)")
    eval_parser.add_argument("--api-type", type=str, default="dash", choices=["dash", "mit"],
                             help="API type for evaluation")
    eval_parser.add_argument("--nproc", type=int, default=4, help="Number of processes to use")
    args = parser.parse_args()
    # Set data directory if provided (consumed via the LMUData env var)
    if hasattr(args, 'data_dir') and args.data_dir:
        os.environ['LMUData'] = args.data_dir
    # Automatically set tensor_parallel_size to the visible GPU count
    if args.command == 'infer' and args.tensor_parallel_size is None:
        args.tensor_parallel_size = torch.cuda.device_count()
        print(f"Auto-set tensor_parallel_size to {args.tensor_parallel_size}")
    if args.command == 'infer':
        run_inference(args)
    elif args.command == 'eval':
        run_evaluation(args)
    else:
        parser.print_help()


if __name__ == "__main__":
    main()
# ODinW Benchmark Evaluation
This directory contains the implementation for evaluating vision-language models on the ODinW-13 (Object Detection in the Wild) benchmark, a suite of 13 datasets, using vLLM for high-speed inference.
## Overview
ODinW is a comprehensive object detection benchmark that consists of 13 diverse datasets spanning various domains. This implementation provides:
- **High-speed inference** using vLLM with automatic batch optimization
- **Unified evaluation** across 13 diverse object detection datasets
- **COCO-style metrics** including mAP, mAP_50, mAP_75, etc.
- **Modular code structure** for easy maintenance and extension
## Project Structure
```
ODinW-13/
├── run_odinw.py # Main script for inference and evaluation
├── dataset_utils.py # Dataset loading and preprocessing utilities
├── eval_utils.py # Evaluation logic and COCO metrics computation
├── infer_instruct.sh # Inference script for instruct models
├── infer_think.sh # Inference script for thinking models
├── eval_instruct.sh # Evaluation script for instruct model results
├── eval_think.sh # Evaluation script for thinking model results
├── requirements.txt # Python dependencies
└── README.md # This file
```
## Requirements
### Python Dependencies
```bash
pip install -r requirements.txt
```
Key dependencies:
- `vllm` - High-speed LLM inference engine
- `transformers` - HuggingFace transformers
- `qwen_vl_utils` - Qwen VL utilities for vision processing
- `pycocotools` - COCO evaluation API
- `pandas`, `numpy` - Data processing
- `Pillow` - Image processing
- `tabulate` - Table formatting (optional, for better output display)
### Data Preparation
The ODinW dataset requires a specific directory structure:
```
/path/to/odinw_data/
├── odinw13_config.py # Dataset configuration file (required)
├── AerialMaritimeDrone/ # Individual datasets
│ ├── large/
│ │ ├── train/
│ │ └── test/
│ └── tiled/
├── Aquarium/
├── Cottontail Rabbits/
├── EgoHands/
├── NorthAmerica Mushrooms/
├── Packages/
├── Pascal VOC/
├── Pistols/
├── Pothole/
├── Raccoon/
├── ShellfishOpenImages/
├── Thermal Dogs and People/
└── Vehicles OpenImages/
```
**Important**: The `odinw13_config.py` file must contain:
- `datasets`: List of dataset configurations
- `dataset_prefixes`: List of dataset names
## Quick Start
### 1. Inference
Run inference on the ODinW dataset using an instruct model:
```bash
bash infer_instruct.sh
```
Or customize the inference:
```bash
python run_odinw.py infer \
--model-path /path/to/Qwen3-VL-Instruct \
--data-dir /path/to/odinw_data \
--output-file results/odinw_predictions.jsonl \
--max-new-tokens 32768 \
--temperature 0.7 \
--top-p 0.8 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 1.5
```
For thinking models with extended reasoning:
```bash
bash infer_think.sh
```
### 2. Evaluation
Evaluate the inference results using COCO metrics:
```bash
bash eval_instruct.sh
```
Or customize the evaluation:
```bash
python run_odinw.py eval \
--data-dir /path/to/odinw_data \
--input-file results/odinw_predictions.jsonl \
--output-file results/odinw_eval_results.json
```
## Detailed Usage
### Inference Mode
**Basic Arguments:**
- `--model-path`: Path to the Qwen3-VL model directory (required)
- `--data-dir`: Path to ODinW data directory containing `odinw13_config.py` (required)
- `--output-file`: Path to save inference results in JSONL format (required)
**vLLM Arguments:**
- `--tensor-parallel-size`: Number of GPUs for tensor parallelism (default: auto-detect)
- `--gpu-memory-utilization`: GPU memory utilization ratio, 0.0-1.0 (default: 0.9)
- `--max-model-len`: Maximum model context length (default: 128000)
- `--max-images-per-prompt`: Maximum images per prompt (default: 10)
**Generation Parameters:**
- `--max-new-tokens`: Maximum tokens to generate (default: 32768)
- `--temperature`: Sampling temperature (default: 0.7)
- `--top-p`: Top-p sampling (default: 0.8)
- `--top-k`: Top-k sampling (default: 20)
- `--repetition-penalty`: Repetition penalty (default: 1.0)
- `--presence-penalty`: Presence penalty to reduce repetition (default: 1.5)
### Evaluation Mode
**Basic Arguments:**
- `--data-dir`: Path to ODinW data directory containing `odinw13_config.py` (required)
- `--input-file`: Inference results file in JSONL format (required)
- `--output-file`: Path to save evaluation results in JSON format (required)
## Output Files
### Inference Output
The inference script generates two files:
1. **Predictions file** (`odinw_predictions.jsonl`): JSONL file where each line contains:
```json
{
"question_id": 0,
"annotation": [...],
"extra_info": {
"dataset_name": "AerialMaritimeDrone_large",
"img_id": 1,
"anno_path": "/path/to/annotations.json",
"resized_h": 640,
"resized_w": 640,
"img_h": 1080,
"img_w": 1920,
"img_path": "/path/to/image.jpg"
},
"result": {
"gen": "[{\"bbox_2d\": [x1, y1, x2, y2], \"label\": \"boat\"}, ...]",
"gen_raw": "Raw model output including thinking process"
},
"messages": [...]
}
```
2. **Dataset config file** (`odinw_predictions_datasets.json`): Configuration for evaluation
### Evaluation Output
The evaluation script generates a JSON file with results for each dataset:
```json
{
"AerialMaritimeDrone_large": {
"mAP": 0.456,
"mAP_50": 0.678,
"mAP_75": 0.512,
"mAP_s": 0.234,
"mAP_m": 0.456,
"mAP_l": 0.567
},
"Aquarium_Aquarium Combined.v2-raw-1024.coco": {
...
},
...
"Average": 0.423
}
```
**Evaluation Metrics:**
- **mAP**: Mean Average Precision at IoU 0.5:0.95 (primary metric)
- **mAP_50**: mAP at IoU threshold 0.5
- **mAP_75**: mAP at IoU threshold 0.75
- **mAP_s**: mAP for small objects (area < 32²)
- **mAP_m**: mAP for medium objects (32² < area < 96²)
- **mAP_l**: mAP for large objects (area > 96²)
## Model-Specific Configurations
### Instruct Models (e.g., Qwen3-VL-2B-Instruct, Qwen3-VL-7B-Instruct)
Use standard parameters for balanced performance:
```bash
--max-new-tokens 32768
--temperature 0.7
--top-p 0.8
--top-k 20
--repetition-penalty 1.0
--presence-penalty 1.5
```
### Thinking Models (e.g., Qwen3-VL-2B-Thinking)
Use adjusted parameters for deeper reasoning:
```bash
--max-new-tokens 32768
--temperature 0.6
--top-p 0.95
--top-k 20
--repetition-penalty 1.0
--presence-penalty 0.0
```
**Note:** Thinking models output reasoning steps wrapped in `<think>...</think>` tags. The evaluation automatically extracts the final answer after `</think>`.
## Performance Tips
1. **GPU Memory**: Adjust `--gpu-memory-utilization` based on your GPU:
- 0.9: Recommended for most cases
- 0.95: For maximum throughput (may cause OOM)
- 0.7-0.8: If experiencing OOM errors
2. **Batch Size**: vLLM automatically optimizes batch size based on available memory
3. **Tensor Parallelism**: Use `--tensor-parallel-size` for large models:
- 2B model: 1 GPU recommended
- 7B model: 1-2 GPUs
- 14B+ model: 2-4 GPUs
4. **Context Length**: Reduce `--max-model-len` if memory is limited:
- 128000: Default, works well for most cases
- 64000: Reduces memory usage by ~40%
5. **Image Processing**: The implementation uses `smart_resize` to automatically adjust image dimensions:
- Dimensions are made divisible by 32
- Total pixels are constrained to [min_pixels, max_pixels]
- Aspect ratio is preserved
## Troubleshooting
### Common Issues
**1. Config file not found**
```
FileNotFoundError: Config file not found: /path/to/odinw13_config.py
```
**Solution**: Ensure `odinw13_config.py` exists in `--data-dir`
**2. CUDA Out of Memory**
```bash
# Reduce GPU memory utilization
--gpu-memory-utilization 0.7
# Or reduce context length
--max-model-len 64000
```
**3. vLLM Multiprocessing Issues**
The code automatically sets `VLLM_WORKER_MULTIPROC_METHOD=spawn`. If you still encounter issues:
```bash
export VLLM_WORKER_MULTIPROC_METHOD=spawn
```
**4. Empty or Invalid JSON Output**
- Check model output format
- Verify prompt clarity
- Try adjusting temperature/top_p
**5. Low mAP Scores**
- Verify category names match dataset classes
- Check coordinate format (xyxy vs xywh)
- Ensure model outputs JSON format correctly
**6. COCO API Errors**
```
IndexError: The testing results of the whole dataset is empty.
```
**Solution**: No valid predictions were generated. Check model outputs.
## Advanced Usage
### Custom Image Resolution
Edit `dataset_utils.py` to modify resolution parameters:
```python
# Calculate image resolution parameters
patch_size = 16
merge_base = 2
pixels_per_token = patch_size * patch_size * merge_base * merge_base
min_pixels = pixels_per_token * 768
max_pixels = pixels_per_token * 12800
```
### Filtering Datasets
To evaluate only specific datasets, edit `generate_odinw_jobs()` in `dataset_utils.py`:
```python
# Only process specific datasets
dataset_filter = ['AerialMaritimeDrone', 'Aquarium']
for data_name, data_config in datasets.items():
if data_name not in dataset_filter:
continue
# ... rest of the code
```
### Custom Prompt Format
Edit the prompt in `dataset_utils.py`:
```python
# Default prompt
prompt = f"Locate every instance that belongs to the following categories: '{obj_names}'. Report bbox coordinates in JSON format."
# Custom prompt example
prompt = f"Find all {obj_names} objects in the image and output their bounding boxes as JSON."
```
## Citation
If you use this code or the ODinW benchmark, please cite:
```bibtex
@inproceedings{li2022grounded,
title={Grounded language-image pre-training},
author={Li, Liunian Harold and Zhang, Pengchuan and Zhang, Haotian and Yang, Jianwei and Li, Chunyuan and Zhong, Yiwu and Wang, Lijuan and Yuan, Lu and Zhang, Lei and Hwang, Jenq-Neng and others},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={10965--10975},
year={2022}
}
```
## License
This code is released under the same license as the Qwen3-VL model.
## Support
For issues and questions:
- GitHub Issues: [Qwen3-VL Repository](https://github.com/QwenLM/Qwen3-VL)
- Documentation: See inline code comments and docstrings
"""
ODinW dataset loading and processing utilities.
"""
import os
import math
from typing import Dict, List, Tuple
from pycocotools.coco import COCO
def round_by_factor(number: int, factor: int) -> int:
    """Return the multiple of *factor* closest to *number*."""
    n_units = round(number / factor)
    return n_units * factor
def ceil_by_factor(number: int, factor: int) -> int:
    """Return the smallest multiple of *factor* that is >= *number*."""
    n_units = math.ceil(number / factor)
    return n_units * factor
def floor_by_factor(number: int, factor: int) -> int:
    """Return the largest multiple of *factor* that is <= *number*."""
    n_units = math.floor(number / factor)
    return n_units * factor
def smart_resize(height: int, width: int, factor: int = 28,
                 min_pixels: int = 56*56, max_pixels: int = 14*14*4*1280,
                 max_long_side: int = 8192) -> Tuple[int, int]:
    """Resize image dimensions to meet the following conditions:

    1. Both height and width are divisible by `factor`
    2. Total pixels are within [min_pixels, max_pixels]
    3. Longest side is capped at `max_long_side` (applied before rounding,
       so the rounded result may exceed it by less than `factor`)
    4. Aspect ratio is approximately preserved

    Args:
        height: Original image height
        width: Original image width
        factor: Output sizes must be divisible by this factor
        min_pixels: Minimum pixel count
        max_pixels: Maximum pixel count
        max_long_side: Maximum longest side

    Returns:
        (resized_height, resized_width): Resized dimensions

    Raises:
        ValueError: If either dimension is smaller than 2, or the aspect
            ratio exceeds 200.
    """
    if height < 2 or width < 2:
        # Bug fix: the old message claimed the limit was `factor`, but the
        # guard has always checked against 2; the message now matches.
        raise ValueError(f'height:{height} and width:{width} must both be at least 2')
    elif max(height, width) / min(height, width) > 200:
        raise ValueError(f'absolute aspect ratio must be smaller than 200, got {height} / {width}')
    # Cap the longest side first, scaling both dimensions to keep the ratio.
    if max(height, width) > max_long_side:
        beta = max(height, width) / max_long_side
        height, width = int(height / beta), int(width / beta)
    # Snap both dimensions to the nearest multiple of `factor`.
    h_bar = round(height / factor) * factor
    w_bar = round(width / factor) * factor
    if h_bar * w_bar > max_pixels:
        # Too many pixels: scale down and round *down* so we stay <= max_pixels.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = math.floor(height / beta / factor) * factor
        w_bar = math.floor(width / beta / factor) * factor
    elif h_bar * w_bar < min_pixels:
        # Too few pixels: scale up and round *up* so we reach >= min_pixels.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor
    return h_bar, w_bar
def load_odinw_config(config_path: str) -> Dict:
    """Execute odinw13_config.py and pair dataset names with their configs.

    Args:
        config_path: Path to the config file

    Returns:
        datasets: Dictionary mapping each dataset name (from
            ``dataset_prefixes``) to its configuration (from ``datasets``)
    """
    import runpy
    namespace = runpy.run_path(config_path)
    names = namespace["dataset_prefixes"]
    configs = namespace["datasets"]
    return dict(zip(names, configs))
def generate_odinw_jobs(data_dir: str, args) -> Tuple[List[Dict], Dict]:
    """Generate inference task list for ODinW dataset.

    Builds one task per image across every dataset in odinw13_config.py.
    Each task bundles the chat-format messages for the model with the COCO
    ground-truth annotations and bookkeeping the evaluation step needs.

    Args:
        data_dir: Data directory path (must contain odinw13_config.py)
        args: Command line arguments
            NOTE(review): `args` is currently unused in this function.

    Returns:
        (question_list, datasets): Task list and dataset configurations

    Raises:
        FileNotFoundError: If odinw13_config.py is missing from data_dir.
    """
    # Load config
    config_path = os.path.join(data_dir, "odinw13_config.py")
    if not os.path.exists(config_path):
        raise FileNotFoundError(f"Config file not found: {config_path}")
    datasets = load_odinw_config(config_path)
    question_list = []
    question_id = 0  # globally unique id across all datasets
    num_questions_per_dataset = {}
    # Calculate image resolution parameters.
    # pixels_per_token = pixels covered by one vision token after patch
    # embedding (patch_size x patch_size) and 2x2 token merging.
    patch_size = 16
    merge_base = 2
    pixels_per_token = patch_size * patch_size * merge_base * merge_base
    min_pixels = pixels_per_token * 768
    max_pixels = pixels_per_token * 12800
    # Iterate through all datasets
    for data_name, data_config in datasets.items():
        print(f'Parsing ODinW:{data_name}')
        classes = list(data_config["metainfo"]["classes"])
        # Build data paths: remap the configured data_root (which may point
        # at another machine) onto the local data_dir by keeping only the
        # path suffix after 'data/odinw/'.
        idx = data_config["data_root"].find('data/odinw/') + len('data/odinw/')
        sub_root = os.path.join(data_dir, data_config["data_root"][idx:])
        # NOTE(review): assumes data_root/data_prefix carry their own
        # trailing/leading separators — plain string concat, not path join.
        sub_anno = sub_root + data_config["ann_file"]
        sub_img_root = sub_root + data_config["data_prefix"]["img"]
        # Load COCO format annotations
        dataset = COCO(sub_anno)
        num_questions = 0
        # Iterate through all images
        for img_idx, img_meta in dataset.imgs.items():
            img_name = img_meta["file_name"]
            img_path = sub_img_root + img_name
            img_h = img_meta["height"]
            img_w = img_meta["width"]
            # Calculate resized image dimensions (divisible by 32, pixel
            # count clamped to [min_pixels, max_pixels], ratio preserved)
            resized_h, resized_w = smart_resize(
                img_h, img_w,
                factor=32,
                min_pixels=min_pixels,
                max_pixels=max_pixels,
                max_long_side=50000
            )
            # Get ground-truth annotations for this image
            img_annos = dataset.imgToAnns[img_idx]
            # Build class names list (every prompt lists all classes)
            obj_names = ", ".join(classes)
            # Build prompt
            prompt = f"Locate every instance that belongs to the following categories: '{obj_names}'. Report bbox coordinates in JSON format."
            # Build messages in the chat format consumed by the vLLM runner
            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "image": f"file://{img_path}",
                            "min_pixels": min_pixels,
                            "max_pixels": max_pixels
                        },
                        {"type": "text", "text": prompt}
                    ]
                }
            ]
            # Build task item; extra_info carries everything evaluation
            # needs to map predictions back to the COCO ground truth.
            item = {
                "question_id": question_id,
                "annotation": img_annos,
                'messages': messages,
                "extra_info": {
                    'dataset_name': data_name,
                    'dataset_config': data_config,
                    'img_id': img_meta["id"],
                    'anno_path': sub_anno,
                    'resized_h': resized_h,
                    'resized_w': resized_w,
                    'img_h': img_h,
                    'img_w': img_w,
                    'img_path': img_path
                }
            }
            question_list.append(item)
            question_id += 1
            num_questions += 1
        num_questions_per_dataset[data_name] = num_questions
    # Print statistics
    for data_name, num_questions in num_questions_per_dataset.items():
        print(f'{data_name}: {num_questions}')
    print(f"Total ODinW questions: {len(question_list)}")
    return question_list, datasets
#!/bin/bash
# ODinW Evaluation Script (Instruct Model)
# This script evaluates the inference results using COCO metrics
#
# --data-dir:    directory containing odinw13_config.py and the dataset files
# --input-file:  JSONL predictions produced by the inference step
# --output-file: where the aggregated COCO metrics (JSON) are written
python run_odinw.py eval \
    --data-dir /path/to/odinw_data \
    --input-file results/odinw_predictions.jsonl \
    --output-file results/odinw_eval_results.json
#!/bin/bash
# ODinW Evaluation Script (Thinking Model)
# This script evaluates the inference results from thinking model using COCO metrics
#
# --data-dir:    directory containing odinw13_config.py and the dataset files
# --input-file:  JSONL predictions produced by the thinking-model inference step
# --output-file: where the aggregated COCO metrics (JSON) are written
python run_odinw.py eval \
    --data-dir /path/to/odinw_data \
    --input-file results/odinw_predictions_thinking.jsonl \
    --output-file results/odinw_eval_results_thinking.json
"""
ODinW evaluation utilities.
"""
import os
import json
import tempfile
import numpy as np
from typing import List, Dict, Sequence
from collections import OrderedDict
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
def xyxy2xywh(bbox: np.ndarray) -> list:
    """Convert a corner-format box to COCO xywh format.

    Args:
        bbox: Bounding box in [x1, y1, x2, y2] format

    Returns:
        Bounding box in [x, y, w, h] format
    """
    x1, y1, x2, y2 = bbox.tolist()
    return [x1, y1, x2 - x1, y2 - y1]
def results2json(results: Sequence[dict], outfile_prefix: str, cat_ids: dict) -> dict:
    """Serialize detection results into a COCO-style bbox JSON file.

    Args:
        results: Per-image prediction dicts carrying 'labels', 'bboxes',
            'scores' and optionally 'img_id' (index used as fallback)
        outfile_prefix: Output file prefix
        cat_ids: Mapping from label index to COCO category ID

    Returns:
        result_files: Dictionary mapping metric name to result file path
    """
    detections = []
    for idx, result in enumerate(results):
        img_id = result.get('img_id', idx)
        for label, bbox, score in zip(result['labels'], result['bboxes'],
                                      result['scores']):
            # Inline xyxy -> xywh conversion for the COCO format.
            x1, y1, x2, y2 = bbox.tolist()
            detections.append({
                'image_id': img_id,
                'bbox': [x1, y1, x2 - x1, y2 - y1],
                'score': float(score),
                'category_id': cat_ids[label],
            })
    out_path = f'{outfile_prefix}.bbox.json'
    with open(out_path, 'w') as f:
        json.dump(detections, f)
    return {'bbox': out_path}
def compute_metrics(results: list, outfile_prefix: str = None, _coco_api: COCO = None) -> Dict[str, float]:
    """Compute mAP and other metrics using COCO API.

    Args:
        results: List of evaluation results, each element is a (gt, pred)
            tuple. Only the predictions are consumed here; the ground truth
            is read from `_coco_api`.
        outfile_prefix: Output file prefix for the intermediate JSON file
            (optional; a temporary directory is used when omitted)
        _coco_api: COCO API instance holding the ground-truth annotations

    Returns:
        eval_results: Dictionary of evaluation metrics (mAP, mAP_50, ...)
    """
    proposal_nums = (100, 300, 1000)
    # Standard COCO IoU thresholds: 0.50:0.05:0.95 (10 values)
    iou_thrs = np.linspace(
        .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
    # Separate ground truth and predictions
    if len(results) == 0:
        gts, preds = [], []
    else:
        gts, preds = zip(*results)
    # NOTE(review): `gts` is unpacked but never used below — ground truth
    # comes from `_coco_api` instead.
    tmp_dir = None
    if outfile_prefix is None:
        tmp_dir = tempfile.TemporaryDirectory()
        outfile_prefix = os.path.join(tmp_dir.name, 'results')
    cat_ids = _coco_api.getCatIds()
    img_ids = _coco_api.getImgIds()
    # Convert to COCO format and save
    result_files = results2json(preds, outfile_prefix, cat_ids)
    eval_results = OrderedDict()
    for metric in ["bbox"]:
        iou_type = metric
        if metric not in result_files:
            raise KeyError(f'{metric} is not in results')
        try:
            with open(result_files[metric], 'r') as f:
                predictions = json.load(f)
            coco_dt = _coco_api.loadRes(predictions)
        except IndexError:
            # pycocotools raises IndexError when the prediction list is empty
            print('The testing results of the whole dataset is empty.')
            break
        coco_eval = COCOeval(_coco_api, coco_dt, iou_type)
        coco_eval.params.catIds = cat_ids
        coco_eval.params.imgIds = img_ids
        coco_eval.params.maxDets = list(proposal_nums)
        coco_eval.params.iouThrs = iou_thrs
        # mapping of cocoEval.stats indices to metric names
        coco_metric_names = {
            'mAP': 0,
            'mAP_50': 1,
            'mAP_75': 2,
            'mAP_s': 3,
            'mAP_m': 4,
            'mAP_l': 5,
            'AR@100': 6,
            'AR@300': 7,
            'AR@1000': 8,
            'AR_s@1000': 9,
            'AR_m@1000': 10,
            'AR_l@1000': 11
        }
        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()
        metric_items = [
            'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'
        ]
        for metric_item in metric_items:
            val = coco_eval.stats[coco_metric_names[metric_item]]
            # Round to three decimals for reporting
            eval_results[metric_item] = float(f'{round(val, 3)}')
    if tmp_dir is not None:
        tmp_dir.cleanup()
    return eval_results
#!/bin/bash
# ODinW Inference Script (Instruct Model)
# This script runs inference on the ODinW dataset using vLLM
#
# Sampling parameters follow the recommended Instruct-model settings
# (temperature 0.7, top-p 0.8, presence-penalty 1.5).
# Reduce --gpu-memory-utilization or --max-model-len if you hit OOM errors.
python run_odinw.py infer \
    --model-path /path/to/Qwen3-VL-Instruct \
    --data-dir /path/to/odinw_data \
    --output-file results/odinw_predictions.jsonl \
    --tensor-parallel-size 1 \
    --gpu-memory-utilization 0.9 \
    --max-model-len 128000 \
    --max-images-per-prompt 10 \
    --max-new-tokens 32768 \
    --temperature 0.7 \
    --top-p 0.8 \
    --top-k 20 \
    --repetition-penalty 1.0 \
    --presence-penalty 1.5
#!/bin/bash
# ODinW Inference Script (Thinking Model)
# This script runs inference on the ODinW dataset using vLLM with thinking mode parameters
#
# Sampling parameters follow the recommended Thinking-model settings
# (temperature 0.6, top-p 0.95, presence-penalty 0.0).
# Reduce --gpu-memory-utilization or --max-model-len if you hit OOM errors.
python run_odinw.py infer \
    --model-path /path/to/Qwen3-VL-Thinking \
    --data-dir /path/to/odinw_data \
    --output-file results/odinw_predictions_thinking.jsonl \
    --tensor-parallel-size 1 \
    --gpu-memory-utilization 0.9 \
    --max-model-len 128000 \
    --max-images-per-prompt 10 \
    --max-new-tokens 32768 \
    --temperature 0.6 \
    --top-p 0.95 \
    --top-k 20 \
    --repetition-penalty 1.0 \
    --presence-penalty 0.0
vllm
transformers
qwen_vl_utils
pandas
numpy
Pillow
tqdm
requests
validators
torch
torchvision
accelerate
pycocotools
tabulate
flash_attn
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#!/bin/bash
# RealWorldQA Evaluation Script (Instruct Model)
# This script evaluates the inference results using rule-based and optionally model-based extraction
python run_realworldqa.py eval \
--data-dir /path/to/data \
--input-file results/RealWorldQA_results.jsonl \
--output-file results/RealWorldQA_evaluation.csv \
--dataset RealWorldQA \
--eval-model gpt-3.5-turbo-0125 \
--api-type dash \
--nproc 4
#!/bin/bash
# RealWorldQA Evaluation Script (Thinking Model)
# This script evaluates the inference results from thinking model using rule-based and optionally model-based extraction
python run_realworldqa.py eval \
--data-dir /path/to/data \
--input-file results/RealWorldQA_results_thinking.jsonl \
--output-file results/RealWorldQA_evaluation_thinking.csv \
--dataset RealWorldQA \
--eval-model qwen-plus \
--api-type dash \
--nproc 4
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment