Commit 3d735feb authored by luopl

"Initial commit"

import os
import requests
import time
import random
import copy
import traceback
import pandas as pd
from PIL import Image
from typing import List, Dict, Tuple, Any
from common_utils import encode_image_to_base64
from collections import defaultdict
try:
from latex2sympy2 import latex2sympy
except ImportError:
print('Warning: latex2sympy2 not installed. Install with: pip install latex2sympy2')
latex2sympy = None
FAIL_MSG = 'Failed to obtain answer via API.'
def is_equal(asw: str, gt_asw: str) -> bool:
"""Check if two answers are equal."""
if not isinstance(asw, str) or not isinstance(gt_asw, str):
print('Warning: input is not string')
print(asw, gt_asw)
asw = str(asw).lower().strip()
gt_asw = str(gt_asw).lower().strip()
if gt_asw == asw:
return True
try:
a = eval(gt_asw)
b = eval(asw)
if abs(a - b) < 1e-6:
return True
except Exception:
pass
if latex2sympy is not None:
try:
a = latex2sympy(gt_asw)
b = latex2sympy(asw)
if abs(eval(str(a)) - eval(str(b))) < 1e-6:
return True
if abs(a - b) < 1e-6:
return True
except Exception:
pass
return False
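# Illustrative behaviour (the LaTeX case assumes latex2sympy2 is installed):
#   is_equal('0.5', '1/2')           -> True  (numeric comparison via eval)
#   is_equal(r'\frac{1}{2}', '0.5')  -> True  (parsed with latex2sympy)
#   is_equal('B', 'C')               -> False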
def get_gpt4_ICE():
"""Get in-context examples for GPT-4 answer extraction."""
example_1 = """
Hint: Please answer the question and provide the final answer at the end.\n
Question: Which number is missing?\n
Model response: The number missing in the sequence is 14.\n
Extracted answer: 14
"""
example_2 = """
Hint: Please answer the question and provide the final answer at the end.\n
Question: What is the fraction of females facing the camera?\n
Model response: The fraction of females facing the camera is 0.6,
which means that six out of ten females in the group are facing the camera.\n
Extracted answer: 0.6
"""
example_3 = """
Hint: Please answer the question and provide the final answer at the end.\n
Question: How much money does Luca need to buy a sour apple candy and a butter-scotch candy? (Unit: $)\n
Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy.\n
Extracted answer: 1.45
"""
example_4 = """
Hint: Please answer the question and provide the final answer at the end.\n
Question: Between which two years does the line graph saw its maximum peak?\n
Model response: The line graph saw its maximum peak between 2007 and 2008.\n
Extracted answer: [2007, 2008]
"""
example_5 = """
Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.\n
Question: What fraction of the shape is blue?\n
Choices: (A) 3/11 (B) 8/11 (C) 6/11 (D) 3/5\n
Model response: The correct answer is (B) 8/11.\n
Extracted answer: B
"""
return [example_1, example_2, example_3, example_4, example_5]
def build_mathv_gpt4_prompt(line):
"""Build the prompt for GPT-4 to extract answer from model response."""
task_description = """
Please read the following example.
Then extract the answer from the model response and type it at the end of the prompt.\n
"""
question = line['question']
prediction = str(line['prediction'])
prompt = task_description
examples = get_gpt4_ICE()
for example in examples:
prompt += example + '\n'
prompt += question + '\n'
prompt += 'Model response: ' + prediction + '\n'
prompt += 'Extracted answer: '
return prompt
def list_to_dict(lst):
"""Convert list to dictionary with uppercase letters as keys."""
return {chr(65 + i): val for i, val in enumerate(lst)}
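# e.g. list_to_dict(['3/11', '8/11', '6/11', '3/5']) -> {'A': '3/11', 'B': '8/11', 'C': '6/11', 'D': '3/5'}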
def can_infer_option(answer, choices):
"""Rule-based extraction of answer option."""
if FAIL_MSG in answer:
return False
reject_to_answer = [
"Sorry, I can't help with images of people yet.",
"I can't process this file.",
"I'm sorry, but without the image provided",
'Cannot determine the answer'
]
for err in reject_to_answer:
if err in answer:
return 'Z'
def count_choice(splits, choices, prefix='', suffix=''):
cnt = 0
for c in choices:
if prefix + c + suffix in splits:
cnt += 1
return cnt
answer_mod = copy.copy(answer)
chars = '.()[],:;!*#{}'
for c in chars:
answer_mod = answer_mod.replace(c, ' ')
splits = [x.strip() for x in answer_mod.split()]
count = count_choice(splits, choices)
if count == 1:
for ch in choices:
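# Note: a bare 'A' inside a long response is often just the article "a", not the option letter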
if 'A' in splits and len(splits) > 3:
return False
if ch in splits:
return ch
elif count == 0 and count_choice(splits, {'Z', ''}) == 1:
return 'Z'
return False
def can_infer_text(answer, choices):
"""Extract answer by matching text content."""
answer = answer.lower()
assert isinstance(choices, dict)
for k in choices:
choices[k] = str(choices[k]).lower()
cands = []
for k in choices:
if choices[k] in answer:
cands.append(k)
if len(cands) == 1:
return cands[0]
return False
def can_infer(answer, choices):
"""Combined approach to infer answer choice."""
answer = str(answer)
copt = can_infer_option(answer, choices)
return copt if copt else can_infer_text(answer, choices)
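# Illustrative:
#   can_infer('The correct answer is (B) 8/11.', {'A': '3/11', 'B': '8/11'}) -> 'B'
# Falls back to text matching when no option letter is found, e.g.
#   can_infer('the fraction is 8/11', {'A': '3/11', 'B': '8/11'}) -> 'B'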
def post_check(line, prefetch=False):
"""Check if the prediction matches the answer."""
res = None
ans = line['answer']
response = line['prediction'] if prefetch else line['res']
try:
if len(eval(line['choices'])) > 0:
ans = line['answer']
choices = list_to_dict(eval(line['choices']))
res = can_infer(response, choices)
if prefetch:
return res
else:
res = str(response)
ans = str(ans)
except ValueError:
pass
if is_equal(res, ans):
return res if prefetch else True
else:
return False
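# Illustrative: for a multiple-choice line (choices == "['3/11', '8/11', '6/11', '3/5']")
# with prediction "The answer is (B)", post_check(line, prefetch=True) returns 'B';
# for open-ended lines (choices == "[]") the prediction is compared to the answer via is_equal.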
class OpenAIWrapper:
"""Wrapper for OpenAI API."""
def __init__(self, model, api_base, api_key, timeout=60, retry=5, wait=5):
self.model = model
self.api_base = api_base
self.api_key = api_key
self.timeout = timeout
self.retry = retry
self.wait = wait
self.fail_msg = FAIL_MSG
def generate(self, prompt, temperature=0):
"""Generate a response from the API."""
headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}'}
payload = {
"model": self.model,
"messages": [{"role": "user", "content": prompt}],
"max_tokens": 4096,
"temperature": temperature
}
for i in range(self.retry):
try:
response = requests.post(
self.api_base,
headers=headers,
json=payload,
timeout=self.timeout
)
if response.status_code == 200:
resp_json = response.json()
return resp_json['choices'][0]['message']['content'].strip()
time.sleep(self.wait)
except Exception as e:
print(f"API error: {e}")
time.sleep(self.wait)
return self.fail_msg
class DashScopeWrapper:
"""Wrapper for DashScope API."""
def __init__(self, model, api_base, api_key, timeout=60, retry=5, wait=5):
self.model = model
self.api_base = api_base
self.api_key = api_key
self.timeout = timeout
self.retry = retry
self.wait = wait
self.fail_msg = FAIL_MSG
def generate(self, prompt, temperature=0):
"""Generate a response from the API."""
headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}'}
payload = {
"model": self.model,
"messages": [{"role": "user", "content": prompt}],
"max_completion_tokens": 4096,
"n": 1,
"temperature": temperature,
"stream": False
}
for i in range(self.retry):
try:
response = requests.post(
self.api_base,
headers=headers,
json=payload,
timeout=self.timeout
)
if response.status_code == 200:
resp_json = response.json()
# Retry the request if any choice finished abnormally (e.g. truncated)
finish_reasons = [output['finish_reason'] for output in resp_json['choices']]
if any(reason not in ['stop', 'function_call'] for reason in finish_reasons):
print(f"DashScope finished with error: {resp_json}")
time.sleep(self.wait)
continue
return resp_json['choices'][0]['message']['content']
else:
print(f"DashScope API error: HTTP {response.status_code}")
try:
error_content = response.json()
print(f"Error details: {error_content}")
except Exception:
print(f"Raw error content: {response.content.decode('utf-8', errors='replace')}")
time.sleep(self.wait)
except Exception as e:
print(f"DashScope error: {e}")
time.sleep(self.wait)
return self.fail_msg
def build_judge(model, api_type):
"""Build a judge model for evaluation."""
if api_type == 'mit':
api_key = os.environ.get('MIT_SPIDER_TOKEN', '')
api_base = os.environ.get('MIT_SPIDER_URL', '')
return OpenAIWrapper(model, api_base, api_key)
elif api_type == 'dash':
api_key = os.environ.get('CHATGPT_DASHSCOPE_API_KEY', '')
api_base = os.environ.get('DASHSCOPE_API_BASE', '')
return DashScopeWrapper(model, api_base, api_key)
else:
raise ValueError(f"Unsupported API type: {api_type}")
def MATH_V_auxeval(args):
"""Auxiliary evaluation for MathVision - extract answer from model response."""
model, line = args
prompt = build_mathv_gpt4_prompt(line)
log = ''
retry = 5
# Try rule-based extraction first
res = post_check(line, prefetch=True)
if res:
log += 'Prefetch succeed.\n'
extract_flag = True
if res == 'Z':
extract_flag = False
log += f'Rule extract failed with ans: {res}'
else:
log += f'Rule extract success with ans: {res}'
return dict(log=log, res=res, extract_model='rule', extract_flag=extract_flag)
# Use model-based extraction
for i in range(retry):
prediction = line['prediction']
res = model.generate(prompt, temperature=i * 0.5)
if FAIL_MSG in res:
log += f'Try {i}: output is {prediction}, failed to parse.\n'
else:
log += f'{model.model} extract Succeed.\n'
return dict(log=log, res=res, extract_model=model.model, extract_flag=True)
log += f'All {retry} retries failed.\n {model.model} response:{res}'
return dict(log=log, res='', extract_model=model.model, extract_flag=False)
def MATH_V_acc(result_file):
"""Calculate accuracy for MathVision results."""
data = pd.read_excel(result_file) if result_file.endswith('.xlsx') else pd.read_csv(result_file)
tot = defaultdict(lambda: 0)
fetch = defaultdict(lambda: 0)
hit = defaultdict(lambda: 0)
lt = len(data)
extract_counts = {}
for i in range(lt):
item = data.iloc[i]
cate = item['category']
tot['Overall'] += 1
tot[cate] += 1
if 'Prefetch succeed' in item['log']:
fetch['Overall'] += 1
fetch[cate] += 1
if post_check(item, prefetch=False):
hit['Overall'] += 1
hit[cate] += 1
# Statistics of answers extracted by rule and gpt
extract_model = item['extract_model']
extract_flag = item['extract_flag']
if extract_model in extract_counts:
extract_counts[extract_model][1] += 1
else:
extract_counts[extract_model] = [0, 1] # succeed, total
if extract_flag:
extract_counts[extract_model][0] += 1
res = defaultdict(list)
for k in tot.keys():
res['Subject'].append(k)
res['tot'].append(tot[k])
res['prefetch'].append(fetch[k])
res['hit'].append(hit[k])
res['prefetch_rate'].append(fetch[k] / tot[k] * 100)
res['acc'].append(hit[k] / tot[k] * 100)
if k == 'Overall':
for model_key in extract_counts:
res[model_key+'_success'].append(extract_counts[model_key][0])
res[model_key+'_all'].append(extract_counts[model_key][1])
else:
for model_key in extract_counts:
res[model_key+'_success'].append(0)
res[model_key+'_all'].append(0)
res = pd.DataFrame(res).sort_values('Subject', ignore_index=True)
return res
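# Illustrative usage:
#   score_df = MATH_V_acc('results/mathvision_predictions_eval.xlsx')
# returns one row per category plus an 'Overall' row with tot/prefetch/hit counts,
# prefetch_rate and acc percentages, and per-extractor success counts.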
def eval_single_sample(args):
"""Evaluate a single sample."""
return MATH_V_auxeval(args)
#!/bin/bash
# MathVision Inference Script (Instruct Model)
# This script runs inference on the MathVision dataset using vLLM
python run_mathv.py infer \
--model-path /path/to/Qwen3-VL-Instruct \
--data-dir /path/to/mathvision_data \
--dataset MathVision \
--output-file results/mathvision_predictions.jsonl \
--max-new-tokens 32768 \
--temperature 0.7 \
--top-p 0.8 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 1.5
# --num-samples 100
#!/bin/bash
# MathVision Inference Script (Thinking Model)
# This script runs inference on the MathVision dataset using vLLM with thinking mode parameters
python run_mathv.py infer \
--model-path /path/to/Qwen3-VL-Thinking \
--data-dir /path/to/mathvision_data \
--dataset MathVision \
--output-file results/mathvision_predictions_thinking.jsonl \
--max-new-tokens 40960 \
--temperature 1.0 \
--top-p 0.95 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 0.0
# --num-samples 100
vllm
transformers
qwen_vl_utils
pandas
numpy
Pillow
# Utilities
tqdm
requests
validators
torch
torchvision
accelerate
openpyxl
latex2sympy2
flash_attn
import os
import sys
import json
import argparse
import pandas as pd
import numpy as np
import time
import re
from tqdm import tqdm
from typing import List, Dict, Any
import torch
import warnings
import traceback
# vLLM imports
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor
# Local imports from refactored files
from dataset_utils import load_dataset, dump_image
from eval_utils import build_judge, eval_single_sample, MATH_V_acc
# Set vLLM multiprocessing method
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
def clean_for_excel(val):
"""
Remove characters that are illegal in Excel cells.
Excel doesn't support control characters (0x00-0x1F) except tab, newline, carriage return.
"""
if isinstance(val, str):
# Remove control characters (0x00-0x1F) except tab(0x09), newline(0x0A), carriage return(0x0D)
return re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F]', '', val)
return val
def clean_dataframe_for_excel(df):
"""Clean all string columns in a DataFrame for Excel compatibility."""
return df.map(clean_for_excel) if hasattr(df, 'map') else df.applymap(clean_for_excel)
def build_mathv_prompt(line, dump_image_func, dataset):
"""
Build MathVision dataset prompt.
"""
# Standard resolution (MathVision uses smaller min_pixels)
MIN_PIXELS = 768*28*28 # ~0.6M pixels
MAX_PIXELS = 5120*28*28 # ~4M pixels
tgt_path = dump_image_func(line)
question = line['question']
# Build messages in standard conversation format
content = []
# Add all images first
if isinstance(tgt_path, list):
for p in tgt_path:
content.append({
"type": "image",
"image": p,
"min_pixels": MIN_PIXELS,
"max_pixels": MAX_PIXELS
})
else:
content.append({
"type": "image",
"image": tgt_path,
"min_pixels": MIN_PIXELS,
"max_pixels": MAX_PIXELS
})
# Add question text last
content.append({"type": "text", "text": question})
# Return messages in standard conversation format
messages = [{
"role": "user",
"content": content
}]
return messages
def prepare_inputs_for_vllm(messages, processor):
"""
Prepare inputs for vLLM (following the examples in README.md).
Args:
messages: List of messages in standard conversation format
processor: AutoProcessor instance
Returns:
dict: Input format required by vLLM
"""
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# qwen_vl_utils 0.0.14+ required
image_inputs, video_inputs, video_kwargs = process_vision_info(
messages,
image_patch_size=processor.image_processor.patch_size,
return_video_kwargs=True,
return_video_metadata=True
)
mm_data = {}
if image_inputs is not None:
mm_data['image'] = image_inputs
if video_inputs is not None:
mm_data['video'] = video_inputs
return {
'prompt': text,
'multi_modal_data': mm_data,
'mm_processor_kwargs': video_kwargs
}
def run_inference(args):
"""Run inference on the MathVision dataset using vLLM."""
print("\n" + "="*80)
print("🚀 MathVision Inference with vLLM (High-Speed Mode)")
print("="*80 + "\n")
# Load dataset
data = load_dataset(args.dataset)
# Limit number of samples if specified
if args.num_samples is not None and args.num_samples > 0:
original_len = len(data)
data = data.iloc[:args.num_samples]
print(f"✓ Loaded {len(data)} samples from {args.dataset} (limited from {original_len} samples)")
else:
print(f"✓ Loaded {len(data)} samples from {args.dataset}")
# Set up image root directory
img_root = os.path.join(os.environ['LMUData'], 'images', args.dataset)
os.makedirs(img_root, exist_ok=True)
# Set up dump_image function
def dump_image_func(line):
return dump_image(line, img_root)
# Create output directory
os.makedirs(os.path.dirname(args.output_file), exist_ok=True)
# Set up CoT prompt if enabled
cot_prompt = ""
if args.use_cot:
cot_prompt = args.cot_prompt if args.cot_prompt else " Let's think step by step."
print(f"✓ Using CoT prompt: {cot_prompt[:50]}...")
# Set up generation parameters (vLLM SamplingParams format)
sampling_params = SamplingParams(
temperature=args.temperature,
top_p=args.top_p,
top_k=args.top_k,
max_tokens=args.max_new_tokens,
repetition_penalty=args.repetition_penalty,
presence_penalty=args.presence_penalty,
stop_token_ids=[],
)
print(f"\n⚙️ Generation parameters (vLLM SamplingParams):")
print(f" max_tokens={sampling_params.max_tokens}")
print(f" temperature={sampling_params.temperature}, top_p={sampling_params.top_p}, top_k={sampling_params.top_k}")
print(f" repetition_penalty={sampling_params.repetition_penalty}")
print(f" presence_penalty={sampling_params.presence_penalty}")
if sampling_params.presence_penalty > 0:
print(f" ✅ Anti-repetition enabled (presence_penalty={sampling_params.presence_penalty})")
if sampling_params.temperature <= 0.02 and sampling_params.top_k == 1:
print(f" ✅ Using FAST greedy-like decoding")
else:
print(f" ⚠️ Using sampling decoding (slower but more diverse)")
print()
# Load processor for input preparation
print(f"Loading processor from {args.model_path}")
processor = AutoProcessor.from_pretrained(args.model_path)
print("✓ Processor loaded\n")
# Initialize vLLM
print(f"Initializing vLLM with model: {args.model_path}")
print(f" GPU count: {torch.cuda.device_count()}")
print(f" Tensor parallel size: {args.tensor_parallel_size}")
llm = LLM(
model=args.model_path,
tensor_parallel_size=args.tensor_parallel_size,
gpu_memory_utilization=args.gpu_memory_utilization,
trust_remote_code=True,
max_model_len=args.max_model_len,
limit_mm_per_prompt={"image": args.max_images_per_prompt},
seed=42,
)
print("✓ vLLM initialized successfully\n")
# Prepare all inputs
print("Preparing inputs for vLLM...")
all_inputs = []
all_line_dicts = []
all_messages = []
for idx, (_, line) in enumerate(tqdm(data.iterrows(), total=len(data), desc="Building prompts")):
# Convert line to dict
line_dict = line.to_dict()
for k, v in line_dict.items():
if isinstance(v, np.integer):
line_dict[k] = int(v)
elif isinstance(v, np.floating):
line_dict[k] = float(v)
# Build prompt
messages = build_mathv_prompt(line, dump_image_func, args.dataset)
# Add CoT prompt
if args.use_cot and len(messages) > 0 and len(messages[0]['content']) > 0:
last_content = messages[0]['content'][-1]
if last_content['type'] == 'text':
last_content['text'] += cot_prompt
# Prepare input for vLLM
vllm_input = prepare_inputs_for_vllm(messages, processor)
all_inputs.append(vllm_input)
all_line_dicts.append(line_dict)
all_messages.append(messages)
print(f"✓ Prepared {len(all_inputs)} inputs\n")
# Batch inference (vLLM automatic optimization)
print("="*80)
print("🚀 Running vLLM batch inference (automatic optimization)")
print("="*80)
start_time = time.time()
outputs = llm.generate(all_inputs, sampling_params=sampling_params)
end_time = time.time()
total_time = end_time - start_time
print(f"\n✓ Inference completed in {total_time:.2f} seconds")
print(f" Average: {total_time/len(data):.2f} seconds/sample")
print(f" Throughput: {len(data)/total_time:.2f} samples/second\n")
# Save results
print("Saving results...")
results = []
for idx, (line_dict, messages, output) in enumerate(zip(all_line_dicts, all_messages, outputs)):
response = output.outputs[0].text
index = line_dict['index']
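# Thinking models wrap reasoning in <think>...</think>; keep only the text after the closing tag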
response_final = str(response).split("</think>")[-1].strip()
result = {
"question_id": int(index) if isinstance(index, np.integer) else index,
"annotation": line_dict,
"task": args.dataset,
"result": {"gen": response_final, "gen_raw": response},
"messages": messages
}
results.append(result)
# Write final results
with open(args.output_file, 'w') as f:
for res in results:
f.write(json.dumps(res) + '\n')
print(f"\n✓ Results saved to {args.output_file}")
print(f"✓ Total samples processed: {len(results)}")
def run_evaluation(args):
"""Run evaluation on inference results."""
# Load results
results = []
with open(args.input_file, 'r') as f:
for line in f:
job = json.loads(line)
annotation = job["annotation"]
annotation["prediction"] = job["result"]["gen"]
results.append(annotation)
data = pd.DataFrame.from_records(results)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
# Load dataset for validation
meta = load_dataset(args.dataset)
# Validation
print(f"len(data): {len(data)}")
print(f"len(meta): {len(meta)}")
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
data_map = {x: y for x, y in zip(data['index'], data['question'])}
for k in data_map:
assert k in meta_q_map, (
f'eval_file should be the same as or a subset of dataset {args.dataset}'
)
# Save intermediate results
output_xlsx = args.output_file.replace('.csv', '.xlsx') if args.output_file.endswith('.csv') else args.output_file
clean_dataframe_for_excel(data).to_excel(output_xlsx, index=False)
print(f"✓ Saved intermediate results to {output_xlsx}")
# Build judge model
model = build_judge(
model=getattr(args, 'eval_model', 'gpt-4o-2024-05-13'),
api_type=getattr(args, 'api_type', 'dash')
)
# Prepare evaluation tasks
eval_tasks = []
for i in range(len(data)):
item = data.iloc[i]
eval_tasks.append((model, item))
# Run evaluation
eval_results = []
# Debug mode: process single-threaded with first few samples
debug = os.environ.get('DEBUG', '').lower() == 'true'
if debug:
print("Running in debug mode with first 5 samples...")
for task in eval_tasks[:5]:
try:
result = eval_single_sample(task)
eval_results.append(result)
except Exception as e:
print(f"Error processing task: {e}")
print(f"Task details: {task}")
raise
else:
# Normal mode: process all samples with threading
from concurrent.futures import ThreadPoolExecutor
nproc = getattr(args, 'nproc', 4)
with ThreadPoolExecutor(max_workers=nproc) as executor:
for result in tqdm(executor.map(eval_single_sample, eval_tasks),
total=len(eval_tasks), desc="Evaluating"):
eval_results.append(result)
# Update data with evaluation results
data['res'] = [r['res'] for r in eval_results]
data['log'] = [r['log'] for r in eval_results]
data['extract_model'] = [r['extract_model'] for r in eval_results]
data['extract_flag'] = [r['extract_flag'] for r in eval_results]
# Save evaluation results
storage = args.output_file.replace('.csv', '_eval.xlsx')
clean_dataframe_for_excel(data).to_excel(storage, index=False)
print(f"✓ Saved evaluation results to {storage}")
# Calculate accuracy
score = MATH_V_acc(storage)
score_pth = storage.replace('.xlsx', '_score.csv')
score.to_csv(score_pth, index=False)
print(f"✓ Saved score to {score_pth}")
print(f"\n{'='*50}")
print(f"Evaluation Results:")
print(f"{'='*50}")
print(score)
print(f"{'='*50}\n")
return score
def main():
parser = argparse.ArgumentParser(description="MathVision Evaluation with vLLM")
subparsers = parser.add_subparsers(dest='command', help='Command to run')
# Inference parser
infer_parser = subparsers.add_parser("infer", help="Run inference with vLLM")
infer_parser.add_argument("--model-path", type=str, required=True, help="Path to the model")
infer_parser.add_argument("--dataset", type=str, default="MathVision",
choices=["MathVision", "MathVision_MINI"],
help="Dataset name")
infer_parser.add_argument("--data-dir", type=str, help="The absolute path of MathVision data directory")
infer_parser.add_argument("--output-file", type=str, required=True, help="Output file path")
infer_parser.add_argument("--num-samples", type=int, default=None,
help="Number of samples to process (default: None, process all samples)")
infer_parser.add_argument("--use-cot", action="store_true", help="Use Chain-of-Thought prompting")
infer_parser.add_argument("--cot-prompt", type=str, default="", help="Custom Chain-of-Thought prompt")
# vLLM specific parameters
infer_parser.add_argument("--tensor-parallel-size", type=int, default=None,
help="Tensor parallel size (default: number of GPUs)")
infer_parser.add_argument("--gpu-memory-utilization", type=float, default=0.9,
help="GPU memory utilization (0.0-1.0, default: 0.9)")
infer_parser.add_argument("--max-model-len", type=int, default=128000,
help="Maximum model context length (default: 128000)")
infer_parser.add_argument("--max-images-per-prompt", type=int, default=10,
help="Maximum images per prompt (default: 10)")
# Generation parameters
infer_parser.add_argument("--max-new-tokens", type=int, default=32768,
help="Maximum number of tokens to generate (default: 2048)")
infer_parser.add_argument("--temperature", type=float, default=0.7,
help="Temperature for sampling (default: 0.7 for greedy-like decoding)")
infer_parser.add_argument("--top-p", type=float, default=0.8,
help="Top-p for sampling (default: 0.8 for greedy-like decoding)")
infer_parser.add_argument("--top-k", type=int, default=20,
help="Top-k for sampling (default: 20 for greedy decoding)")
infer_parser.add_argument("--repetition-penalty", type=float, default=1.0,
help="Repetition penalty (default: 1.0, increase to 1.2-1.5 to reduce repetition)")
infer_parser.add_argument("--presence-penalty", type=float, default=1.5,
help="Presence penalty (default: 1.5, range: 0.0-2.0, penalize tokens that have already appeared)")
# Evaluation parser
eval_parser = subparsers.add_parser("eval", help="Run evaluation")
eval_parser.add_argument("--data-dir", type=str, help="The absolute path of MathVision data directory")
eval_parser.add_argument("--input-file", type=str, required=True, help="Input file with inference results")
eval_parser.add_argument("--output-file", type=str, required=True, help="Output file path")
eval_parser.add_argument("--dataset", type=str, default="MathVision",
choices=["MathVision", "MathVision_MINI"],
help="Dataset name")
eval_parser.add_argument("--eval-model", type=str, default="gpt-4o",
help="Model to use for evaluation (default: gpt-4o)")
eval_parser.add_argument("--api-type", type=str, default="dash", choices=["dash", "mit"],
help="API type for evaluation")
eval_parser.add_argument("--nproc", type=int, default=4, help="Number of processes to use")
args = parser.parse_args()
# Set data directory if provided
if hasattr(args, 'data_dir') and args.data_dir:
os.environ['LMUData'] = args.data_dir
# Automatically set tensor_parallel_size
if args.command == 'infer' and args.tensor_parallel_size is None:
args.tensor_parallel_size = torch.cuda.device_count()
print(f"Auto-set tensor_parallel_size to {args.tensor_parallel_size}")
if args.command == 'infer':
run_inference(args)
elif args.command == 'eval':
run_evaluation(args)
else:
parser.print_help()
if __name__ == "__main__":
main()
# ODinW Benchmark Evaluation
This directory contains the implementation for evaluating vision-language models on the ODinW (Object Detection in the Wild) 13 dataset using vLLM for high-speed inference.
## Overview
ODinW is a comprehensive object detection benchmark that consists of 13 diverse datasets spanning various domains. This implementation provides:
- **High-speed inference** using vLLM with automatic batch optimization
- **Unified evaluation** across 13 diverse object detection datasets
- **COCO-style metrics** including mAP, mAP_50, mAP_75, etc.
- **Modular code structure** for easy maintenance and extension
## Project Structure
```
ODinW-13/
├── run_odinw.py # Main script for inference and evaluation
├── dataset_utils.py # Dataset loading and preprocessing utilities
├── eval_utils.py # Evaluation logic and COCO metrics computation
├── infer_instruct.sh # Inference script for instruct models
├── infer_think.sh # Inference script for thinking models
├── eval_instruct.sh # Evaluation script for instruct model results
├── eval_think.sh # Evaluation script for thinking model results
├── requirements.txt # Python dependencies
└── README.md # This file
```
## Requirements
### Python Dependencies
```bash
pip install -r requirements.txt
```
Key dependencies:
- `vllm` - High-speed LLM inference engine
- `transformers` - HuggingFace transformers
- `qwen_vl_utils` - Qwen VL utilities for vision processing
- `pycocotools` - COCO evaluation API
- `pandas`, `numpy` - Data processing
- `Pillow` - Image processing
- `tabulate` - Table formatting (optional, for better output display)
### Data Preparation
The ODinW dataset requires a specific directory structure:
```
/path/to/odinw_data/
├── odinw13_config.py # Dataset configuration file (required)
├── AerialMaritimeDrone/ # Individual datasets
│ ├── large/
│ │ ├── train/
│ │ └── test/
│ └── tiled/
├── Aquarium/
├── Cottontail Rabbits/
├── EgoHands/
├── NorthAmerica Mushrooms/
├── Packages/
├── Pascal VOC/
├── Pistols/
├── Pothole/
├── Raccoon/
├── ShellfishOpenImages/
├── Thermal Dogs and People/
└── Vehicles OpenImages/
```
**Important**: The `odinw13_config.py` file must contain:
- `datasets`: List of dataset configurations
- `dataset_prefixes`: List of dataset names
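For reference, below is a minimal sketch of the structure `load_odinw_config()` expects; the dataset name, class names, and paths are illustrative placeholders, not the shipped config:
```python
# odinw13_config.py -- minimal sketch; all entries are illustrative placeholders
dataset_prefixes = ["AerialMaritimeDrone_large"]

datasets = [
    dict(
        metainfo=dict(classes=("boat", "car", "dock", "jetski", "lift")),
        data_root="data/odinw/AerialMaritimeDrone/large/",    # must contain 'data/odinw/'
        ann_file="test/annotations_without_background.json",  # relative to data_root
        data_prefix=dict(img="test/"),                        # image sub-directory
    ),
]
```
Each entry in `datasets` is paired positionally with the name at the same index in `dataset_prefixes`.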
## Quick Start
### 1. Inference
Run inference on the ODinW dataset using an instruct model:
```bash
bash infer_instruct.sh
```
Or customize the inference:
```bash
python run_odinw.py infer \
--model-path /path/to/Qwen3-VL-Instruct \
--data-dir /path/to/odinw_data \
--output-file results/odinw_predictions.jsonl \
--max-new-tokens 32768 \
--temperature 0.7 \
--top-p 0.8 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 1.5
```
For thinking models with extended reasoning:
```bash
bash infer_think.sh
```
### 2. Evaluation
Evaluate the inference results using COCO metrics:
```bash
bash eval_instruct.sh
```
Or customize the evaluation:
```bash
python run_odinw.py eval \
--data-dir /path/to/odinw_data \
--input-file results/odinw_predictions.jsonl \
--output-file results/odinw_eval_results.json
```
## Detailed Usage
### Inference Mode
**Basic Arguments:**
- `--model-path`: Path to the Qwen3-VL model directory (required)
- `--data-dir`: Path to ODinW data directory containing `odinw13_config.py` (required)
- `--output-file`: Path to save inference results in JSONL format (required)
**vLLM Arguments:**
- `--tensor-parallel-size`: Number of GPUs for tensor parallelism (default: auto-detect)
- `--gpu-memory-utilization`: GPU memory utilization ratio, 0.0-1.0 (default: 0.9)
- `--max-model-len`: Maximum model context length (default: 128000)
- `--max-images-per-prompt`: Maximum images per prompt (default: 10)
**Generation Parameters:**
- `--max-new-tokens`: Maximum tokens to generate (default: 32768)
- `--temperature`: Sampling temperature (default: 0.7)
- `--top-p`: Top-p sampling (default: 0.8)
- `--top-k`: Top-k sampling (default: 20)
- `--repetition-penalty`: Repetition penalty (default: 1.0)
- `--presence-penalty`: Presence penalty to reduce repetition (default: 1.5)
### Evaluation Mode
**Basic Arguments:**
- `--data-dir`: Path to ODinW data directory containing `odinw13_config.py` (required)
- `--input-file`: Inference results file in JSONL format (required)
- `--output-file`: Path to save evaluation results in JSON format (required)
## Output Files
### Inference Output
The inference script generates two files:
1. **Predictions file** (`odinw_predictions.jsonl`): JSONL file where each line contains:
```json
{
"question_id": 0,
"annotation": [...],
"extra_info": {
"dataset_name": "AerialMaritimeDrone_large",
"img_id": 1,
"anno_path": "/path/to/annotations.json",
"resized_h": 640,
"resized_w": 640,
"img_h": 1080,
"img_w": 1920,
"img_path": "/path/to/image.jpg"
},
"result": {
"gen": "[{\"bbox_2d\": [x1, y1, x2, y2], \"label\": \"boat\"}, ...]",
"gen_raw": "Raw model output including thinking process"
},
"messages": [...]
}
```
2. **Dataset config file** (`odinw_predictions_datasets.json`): Configuration for evaluation
### Evaluation Output
The evaluation script generates a JSON file with results for each dataset:
```json
{
"AerialMaritimeDrone_large": {
"mAP": 0.456,
"mAP_50": 0.678,
"mAP_75": 0.512,
"mAP_s": 0.234,
"mAP_m": 0.456,
"mAP_l": 0.567
},
"Aquarium_Aquarium Combined.v2-raw-1024.coco": {
...
},
...
"Average": 0.423
}
```
**Evaluation Metrics:**
- **mAP**: Mean Average Precision at IoU 0.5:0.95 (primary metric)
- **mAP_50**: mAP at IoU threshold 0.5
- **mAP_75**: mAP at IoU threshold 0.75
- **mAP_s**: mAP for small objects (area < 32²)
- **mAP_m**: mAP for medium objects (32² ≤ area < 96²)
- **mAP_l**: mAP for large objects (area ≥ 96²)
## Model-Specific Configurations
### Instruct Models (e.g., Qwen3-VL-2B-Instruct, Qwen3-VL-7B-Instruct)
Use standard parameters for balanced performance:
```bash
--max-new-tokens 32768
--temperature 0.7
--top-p 0.8
--top-k 20
--repetition-penalty 1.0
--presence-penalty 1.5
```
### Thinking Models (e.g., Qwen3-VL-2B-Thinking)
Use adjusted parameters for deeper reasoning:
```bash
--max-new-tokens 32768
--temperature 0.6
--top-p 0.95
--top-k 20
--repetition-penalty 1.0
--presence-penalty 0.0
```
**Note:** Thinking models output reasoning steps wrapped in `<think>...</think>` tags. The evaluation automatically extracts the final answer after `</think>`.
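For reference, the extraction is a plain string split on the closing tag (a one-line sketch of what the evaluation code does):
```python
final_answer = raw_output.split("</think>")[-1].strip()
```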
## Performance Tips
1. **GPU Memory**: Adjust `--gpu-memory-utilization` based on your GPU:
- 0.9: Recommended for most cases
- 0.95: For maximum throughput (may cause OOM)
- 0.7-0.8: If experiencing OOM errors
2. **Batch Size**: vLLM automatically optimizes batch size based on available memory
3. **Tensor Parallelism**: Use `--tensor-parallel-size` for large models:
- 2B model: 1 GPU recommended
- 7B model: 1-2 GPUs
- 14B+ model: 2-4 GPUs
4. **Context Length**: Reduce `--max-model-len` if memory is limited:
- 128000: Default, works well for most cases
- 64000: Reduces memory usage by ~40%
5. **Image Processing**: The implementation uses `smart_resize` to automatically adjust image dimensions:
- Dimensions are made divisible by 32
- Total pixels are constrained to [min_pixels, max_pixels]
- Aspect ratio is preserved
## Troubleshooting
### Common Issues
**1. Config file not found**
```
FileNotFoundError: Config file not found: /path/to/odinw13_config.py
```
**Solution**: Ensure `odinw13_config.py` exists in `--data-dir`
**2. CUDA Out of Memory**
```bash
# Reduce GPU memory utilization
--gpu-memory-utilization 0.7
# Or reduce context length
--max-model-len 64000
```
**3. vLLM Multiprocessing Issues**
The code automatically sets `VLLM_WORKER_MULTIPROC_METHOD=spawn`. If you still encounter issues:
```bash
export VLLM_WORKER_MULTIPROC_METHOD=spawn
```
**4. Empty or Invalid JSON Output**
- Check model output format
- Verify prompt clarity
- Try adjusting temperature/top_p
**5. Low mAP Scores**
- Verify category names match dataset classes
- Check coordinate format (xyxy vs xywh)
- Ensure model outputs JSON format correctly
**6. COCO API Errors**
```
IndexError: The testing results of the whole dataset is empty.
```
**Solution**: No valid predictions were generated. Check model outputs.
## Advanced Usage
### Custom Image Resolution
Edit `dataset_utils.py` to modify resolution parameters:
```python
# Calculate image resolution parameters
patch_size = 16
merge_base = 2
pixels_per_token = patch_size * patch_size * merge_base * merge_base
min_pixels = pixels_per_token * 768
max_pixels = pixels_per_token * 12800
```
### Filtering Datasets
To evaluate only specific datasets, edit `generate_odinw_jobs()` in `dataset_utils.py`:
```python
# Only process specific datasets
dataset_filter = ['AerialMaritimeDrone', 'Aquarium']
for data_name, data_config in datasets.items():
if data_name not in dataset_filter:
continue
# ... rest of the code
```
### Custom Prompt Format
Edit the prompt in `dataset_utils.py`:
```python
# Default prompt
prompt = f"Locate every instance that belongs to the following categories: '{obj_names}'. Report bbox coordinates in JSON format."
# Custom prompt example
prompt = f"Find all {obj_names} objects in the image and output their bounding boxes as JSON."
```
## Citation
If you use this code or the ODinW benchmark, please cite:
```bibtex
@inproceedings{li2022grounded,
title={Grounded language-image pre-training},
author={Li, Liunian Harold and Zhang, Pengchuan and Zhang, Haotian and Yang, Jianwei and Li, Chunyuan and Zhong, Yiwu and Wang, Lijuan and Yuan, Lu and Zhang, Lei and Hwang, Jenq-Neng and others},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={10965--10975},
year={2022}
}
```
## License
This code is released under the same license as the Qwen3-VL model.
## Support
For issues and questions:
- GitHub Issues: [Qwen3-VL Repository](https://github.com/QwenLM/Qwen3-VL)
- Documentation: See inline code comments and docstrings
"""
ODinW dataset loading and processing utilities.
"""
import os
import math
from typing import Dict, List, Tuple
from pycocotools.coco import COCO
def round_by_factor(number: int, factor: int) -> int:
"""Return the nearest integer divisible by factor."""
return round(number / factor) * factor
def ceil_by_factor(number: int, factor: int) -> int:
"""Return the ceiling integer divisible by factor."""
return math.ceil(number / factor) * factor
def floor_by_factor(number: int, factor: int) -> int:
"""Return the floor integer divisible by factor."""
return math.floor(number / factor) * factor
def smart_resize(height: int, width: int, factor: int = 28,
min_pixels: int = 56*56, max_pixels: int = 14*14*4*1280,
max_long_side: int = 8192) -> Tuple[int, int]:
"""Resize image to meet the following conditions:
1. Both height and width are divisible by factor
2. Total pixels are within [min_pixels, max_pixels]
3. Longest side is within max_long_side
4. Aspect ratio is preserved
Args:
height: Original image height
width: Original image width
factor: Size must be divisible by this factor
min_pixels: Minimum pixel count
max_pixels: Maximum pixel count
max_long_side: Maximum longest side
Returns:
(resized_height, resized_width): Resized dimensions
"""
if height < 2 or width < 2:
raise ValueError(f'height:{height} and width:{width} must both be at least 2')
elif max(height, width) / min(height, width) > 200:
raise ValueError(f'absolute aspect ratio must be smaller than 200, got {height} / {width}')
if max(height, width) > max_long_side:
beta = max(height, width) / max_long_side
height, width = int(height / beta), int(width / beta)
h_bar = round_by_factor(height, factor)
w_bar = round_by_factor(width, factor)
if h_bar * w_bar > max_pixels:
beta = math.sqrt((height * width) / max_pixels)
h_bar = floor_by_factor(height / beta, factor)
w_bar = floor_by_factor(width / beta, factor)
elif h_bar * w_bar < min_pixels:
beta = math.sqrt(min_pixels / (height * width))
h_bar = ceil_by_factor(height * beta, factor)
w_bar = ceil_by_factor(width * beta, factor)
return h_bar, w_bar
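# Illustrative: smart_resize(1080, 1920, factor=32, min_pixels=786432, max_pixels=13107200)
# returns (1088, 1920): both sides rounded to multiples of 32, pixel count already within range.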
def load_odinw_config(config_path: str) -> Dict:
"""Load odinw13_config.py configuration file.
Args:
config_path: Path to config file
Returns:
datasets: Dictionary mapping dataset names to configurations
"""
import runpy
config = runpy.run_path(config_path)
dataset_configs = config["datasets"]
dataset_names = config["dataset_prefixes"]
datasets = {}
for dataset_name, dataset_config in zip(dataset_names, dataset_configs):
datasets[dataset_name] = dataset_config
return datasets
def generate_odinw_jobs(data_dir: str, args) -> Tuple[List[Dict], Dict]:
"""Generate inference task list for ODinW dataset.
Args:
data_dir: Data directory path (containing odinw13_config.py)
args: Command line arguments
Returns:
(question_list, datasets): Task list and dataset configurations
"""
# Load config
config_path = os.path.join(data_dir, "odinw13_config.py")
if not os.path.exists(config_path):
raise FileNotFoundError(f"Config file not found: {config_path}")
datasets = load_odinw_config(config_path)
question_list = []
question_id = 0
num_questions_per_dataset = {}
# Calculate image resolution parameters
patch_size = 16
merge_base = 2
pixels_per_token = patch_size * patch_size * merge_base * merge_base
min_pixels = pixels_per_token * 768
max_pixels = pixels_per_token * 12800
# Iterate through all datasets
for data_name, data_config in datasets.items():
print(f'Parsing ODinW:{data_name}')
classes = list(data_config["metainfo"]["classes"])
# Build data paths
idx = data_config["data_root"].find('data/odinw/') + len('data/odinw/')
sub_root = os.path.join(data_dir, data_config["data_root"][idx:])
sub_anno = sub_root + data_config["ann_file"]
sub_img_root = sub_root + data_config["data_prefix"]["img"]
# Load COCO format annotations
dataset = COCO(sub_anno)
num_questions = 0
# Iterate through all images
for img_idx, img_meta in dataset.imgs.items():
img_name = img_meta["file_name"]
img_path = sub_img_root + img_name
img_h = img_meta["height"]
img_w = img_meta["width"]
# Calculate resized image dimensions
resized_h, resized_w = smart_resize(
img_h, img_w,
factor=32,
min_pixels=min_pixels,
max_pixels=max_pixels,
max_long_side=50000
)
# Get annotations
img_annos = dataset.imgToAnns[img_idx]
# Build class names list
obj_names = ", ".join(classes)
# Build prompt
prompt = f"Locate every instance that belongs to the following categories: '{obj_names}'. Report bbox coordinates in JSON format."
# Build messages
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": f"file://{img_path}",
"min_pixels": min_pixels,
"max_pixels": max_pixels
},
{"type": "text", "text": prompt}
]
}
]
# Build task item
item = {
"question_id": question_id,
"annotation": img_annos,
'messages': messages,
"extra_info": {
'dataset_name': data_name,
'dataset_config': data_config,
'img_id': img_meta["id"],
'anno_path': sub_anno,
'resized_h': resized_h,
'resized_w': resized_w,
'img_h': img_h,
'img_w': img_w,
'img_path': img_path
}
}
question_list.append(item)
question_id += 1
num_questions += 1
num_questions_per_dataset[data_name] = num_questions
# Print statistics
for data_name, num_questions in num_questions_per_dataset.items():
print(f'{data_name}: {num_questions}')
print(f"Total ODinW questions: {len(question_list)}")
return question_list, datasets
#!/bin/bash
# ODinW Evaluation Script (Instruct Model)
# This script evaluates the inference results using COCO metrics
python run_odinw.py eval \
--data-dir /path/to/odinw_data \
--input-file results/odinw_predictions.jsonl \
--output-file results/odinw_eval_results.json
#!/bin/bash
# ODinW Evaluation Script (Thinking Model)
# This script evaluates the inference results from thinking model using COCO metrics
python run_odinw.py eval \
--data-dir /path/to/odinw_data \
--input-file results/odinw_predictions_thinking.jsonl \
--output-file results/odinw_eval_results_thinking.json
"""
ODinW evaluation utilities.
"""
import os
import json
import tempfile
import numpy as np
from typing import List, Dict, Sequence
from collections import OrderedDict
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
def xyxy2xywh(bbox: np.ndarray) -> list:
"""Convert bbox format from xyxy to xywh.
Args:
bbox: Bounding box in [x1, y1, x2, y2] format
Returns:
Bounding box in [x, y, w, h] format
"""
_bbox = bbox.tolist()
return [
_bbox[0],
_bbox[1],
_bbox[2] - _bbox[0],
_bbox[3] - _bbox[1],
]
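# e.g. xyxy2xywh(np.array([10, 20, 110, 220])) -> [10, 20, 100, 200]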
def results2json(results: Sequence[dict], outfile_prefix: str, cat_ids: dict) -> dict:
"""Convert results to COCO JSON format.
Args:
results: List of prediction results
outfile_prefix: Output file prefix
cat_ids: Category ID mapping
Returns:
result_files: Dictionary of result file paths
"""
bbox_json_results = []
for idx, result in enumerate(results):
image_id = result.get('img_id', idx)
labels = result['labels']
bboxes = result['bboxes']
scores = result['scores']
for i, label in enumerate(labels):
data = dict()
data['image_id'] = image_id
data['bbox'] = xyxy2xywh(bboxes[i])
data['score'] = float(scores[i])
data['category_id'] = cat_ids[label]
bbox_json_results.append(data)
result_files = dict()
result_files['bbox'] = f'{outfile_prefix}.bbox.json'
with open(result_files['bbox'], 'w') as f:
json.dump(bbox_json_results, f)
return result_files
def compute_metrics(results: list, outfile_prefix: str = None, _coco_api: COCO = None) -> Dict[str, float]:
"""Compute mAP and other metrics using COCO API.
Args:
results: List of evaluation results, each element is a (gt, pred) tuple
outfile_prefix: Output file prefix (optional)
_coco_api: COCO API instance
Returns:
eval_results: Dictionary of evaluation metrics
"""
proposal_nums = (100, 300, 1000)
iou_thrs = np.linspace(
.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
# Separate ground truth and predictions
if len(results) == 0:
gts, preds = [], []
else:
gts, preds = zip(*results)
tmp_dir = None
if outfile_prefix is None:
tmp_dir = tempfile.TemporaryDirectory()
outfile_prefix = os.path.join(tmp_dir.name, 'results')
cat_ids = _coco_api.getCatIds()
img_ids = _coco_api.getImgIds()
# Convert to COCO format and save
result_files = results2json(preds, outfile_prefix, cat_ids)
eval_results = OrderedDict()
for metric in ["bbox"]:
iou_type = metric
if metric not in result_files:
raise KeyError(f'{metric} is not in results')
try:
with open(result_files[metric], 'r') as f:
predictions = json.load(f)
coco_dt = _coco_api.loadRes(predictions)
except IndexError:
print('The testing results of the whole dataset is empty.')
break
coco_eval = COCOeval(_coco_api, coco_dt, iou_type)
coco_eval.params.catIds = cat_ids
coco_eval.params.imgIds = img_ids
coco_eval.params.maxDets = list(proposal_nums)
coco_eval.params.iouThrs = iou_thrs
# mapping of cocoEval.stats
coco_metric_names = {
'mAP': 0,
'mAP_50': 1,
'mAP_75': 2,
'mAP_s': 3,
'mAP_m': 4,
'mAP_l': 5,
'AR@100': 6,
'AR@300': 7,
'AR@1000': 8,
'AR_s@1000': 9,
'AR_m@1000': 10,
'AR_l@1000': 11
}
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
metric_items = [
'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'
]
for metric_item in metric_items:
val = coco_eval.stats[coco_metric_names[metric_item]]
eval_results[metric_item] = float(f'{round(val, 3)}')
if tmp_dir is not None:
tmp_dir.cleanup()
return eval_results
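# Illustrative input: each element of `results` is a (gt_info, prediction) tuple, e.g.
#   gt = {'width': 640, 'height': 480, 'img_id': 1}
#   pred = {'img_id': 1, 'bboxes': np.array([[10, 20, 110, 220]]),
#           'scores': np.array([1.0]), 'labels': np.array([0])}
#   metrics = compute_metrics([(gt, pred)], _coco_api=coco_api)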
#!/bin/bash
# ODinW Inference Script (Instruct Model)
# This script runs inference on the ODinW dataset using vLLM
python run_odinw.py infer \
--model-path /path/to/Qwen3-VL-Instruct \
--data-dir /path/to/odinw_data \
--output-file results/odinw_predictions.jsonl \
--tensor-parallel-size 1 \
--gpu-memory-utilization 0.9 \
--max-model-len 128000 \
--max-images-per-prompt 10 \
--max-new-tokens 32768 \
--temperature 0.7 \
--top-p 0.8 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 1.5
#!/bin/bash
# ODinW Inference Script (Thinking Model)
# This script runs inference on the ODinW dataset using vLLM with thinking mode parameters
python run_odinw.py infer \
--model-path /path/to/Qwen3-VL-Thinking \
--data-dir /path/to/odinw_data \
--output-file results/odinw_predictions_thinking.jsonl \
--tensor-parallel-size 1 \
--gpu-memory-utilization 0.9 \
--max-model-len 128000 \
--max-images-per-prompt 10 \
--max-new-tokens 32768 \
--temperature 0.6 \
--top-p 0.95 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 0.0
vllm
transformers
qwen_vl_utils
pandas
numpy
Pillow
tqdm
requests
validators
torch
torchvision
accelerate
pycocotools
tabulate
flash_attn
import os
import sys
import json
import argparse
import numpy as np
import time
from tqdm import tqdm
from typing import List, Dict, Any
from collections import defaultdict, OrderedDict
import torch
# vLLM imports
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor
# pycocotools imports
from pycocotools.coco import COCO
# Local imports from refactored files
from dataset_utils import load_odinw_config, generate_odinw_jobs
from eval_utils import compute_metrics
# Set vLLM multiprocessing method
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
def prepare_inputs_for_vllm(messages, processor):
"""Prepare inputs for vLLM."""
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# qwen_vl_utils 0.0.14+ required
image_inputs, video_inputs, video_kwargs = process_vision_info(
messages,
image_patch_size=processor.image_processor.patch_size,
return_video_kwargs=True,
return_video_metadata=True
)
mm_data = {}
if image_inputs is not None:
mm_data['image'] = image_inputs
if video_inputs is not None:
mm_data['video'] = video_inputs
return {
'prompt': text,
'multi_modal_data': mm_data,
'mm_processor_kwargs': video_kwargs
}
def run_inference(args):
"""Run inference on the ODinW dataset using vLLM."""
print("\n" + "="*80)
print("🚀 ODinW Inference with vLLM (High-Speed Mode)")
print("="*80 + "\n")
# Generate task list
question_list, datasets = generate_odinw_jobs(args.data_dir, args)
print(f"✓ Generated {len(question_list)} inference jobs\n")
# Create output directory
os.makedirs(os.path.dirname(args.output_file), exist_ok=True)
# Set up generation parameters
sampling_params = SamplingParams(
temperature=args.temperature,
top_p=args.top_p,
top_k=args.top_k,
max_tokens=args.max_new_tokens,
repetition_penalty=args.repetition_penalty,
presence_penalty=args.presence_penalty,
stop_token_ids=[],
)
print(f"\n⚙️ Generation parameters (vLLM SamplingParams):")
print(f" max_tokens={sampling_params.max_tokens}")
print(f" temperature={sampling_params.temperature}, top_p={sampling_params.top_p}, top_k={sampling_params.top_k}")
print(f" repetition_penalty={sampling_params.repetition_penalty}")
print(f" presence_penalty={sampling_params.presence_penalty}")
print()
# Load processor
print(f"Loading processor from {args.model_path}")
processor = AutoProcessor.from_pretrained(args.model_path)
print("✓ Processor loaded\n")
# Initialize vLLM
print(f"Initializing vLLM with model: {args.model_path}")
print(f" GPU count: {torch.cuda.device_count()}")
print(f" Tensor parallel size: {args.tensor_parallel_size}")
llm = LLM(
model=args.model_path,
tensor_parallel_size=args.tensor_parallel_size,
gpu_memory_utilization=args.gpu_memory_utilization,
trust_remote_code=True,
max_model_len=args.max_model_len,
limit_mm_per_prompt={"image": args.max_images_per_prompt},
seed=42,
)
print("✓ vLLM initialized successfully\n")
# Prepare all inputs
print("Preparing inputs for vLLM...")
all_inputs = []
for item in tqdm(question_list, desc="Building prompts"):
vllm_input = prepare_inputs_for_vllm(item['messages'], processor)
all_inputs.append(vllm_input)
print(f"✓ Prepared {len(all_inputs)} inputs\n")
# Batch inference
print("="*80)
print("🚀 Running vLLM batch inference")
print("="*80)
start_time = time.time()
outputs = llm.generate(all_inputs, sampling_params=sampling_params)
end_time = time.time()
total_time = end_time - start_time
print(f"\n✓ Inference completed in {total_time:.2f} seconds")
print(f" Average: {total_time/len(question_list):.2f} seconds/sample")
print(f" Throughput: {len(question_list)/total_time:.2f} samples/second\n")
# Save results
print("Saving results...")
results = []
for idx, (item, output) in enumerate(zip(question_list, outputs)):
response = output.outputs[0].text
# Handle </think> tag
response_final = str(response).split("</think>")[-1].strip()
result = {
"question_id": item['question_id'],
"annotation": item['annotation'],
"extra_info": item['extra_info'],
"result": {"gen": response_final, "gen_raw": response},
"messages": item['messages']
}
results.append(result)
# Save results
with open(args.output_file, 'w') as f:
for res in results:
f.write(json.dumps(res) + '\n')
print(f"\n✓ Results saved to {args.output_file}")
print(f"✓ Total samples processed: {len(results)}")
# Save dataset config (for evaluation)
config_output = args.output_file.replace('.jsonl', '_datasets.json')
with open(config_output, 'w') as f:
# Convert config for JSON serialization
datasets_serializable = {}
for k, v in datasets.items():
datasets_serializable[k] = {
'metainfo': v['metainfo'],
'data_root': v['data_root'],
'ann_file': v['ann_file'],
'data_prefix': v['data_prefix']
}
json.dump(datasets_serializable, f, indent=2)
print(f"✓ Dataset config saved to {config_output}")
def run_evaluation(args):
"""Run evaluation on inference results."""
print("\n" + "="*80)
print("🎯 ODinW Evaluation")
print("="*80 + "\n")
# Load inference results
results = []
with open(args.input_file, 'r') as f:
for line in f:
results.append(json.loads(line))
print(f"✓ Loaded {len(results)} inference results\n")
# Load dataset config
config_path = os.path.join(args.data_dir, "odinw13_config.py")
datasets = load_odinw_config(config_path)
# Group by dataset
all_outputs = defaultdict(list)
for job in results:
all_outputs[job["extra_info"]["dataset_name"]].append(job)
all_results = {}
# Evaluate each dataset
for dataset_name, sub_jobs in all_outputs.items():
print(f"\n{'='*60}")
print(f"Evaluating dataset: {dataset_name}")
print(f"{'='*60}")
anno_path = sub_jobs[0]["extra_info"]["anno_path"]
coco_api = COCO(anno_path)
classes = datasets[dataset_name]['metainfo']['classes']
pred_bboxes_per_img = defaultdict(list)
for job in sub_jobs:
img_id = job["extra_info"]["img_id"]
resized_h = job["extra_info"]["resized_h"]
resized_w = job["extra_info"]["resized_w"]
img_h = job["extra_info"]["img_h"]
img_w = job["extra_info"]["img_w"]
answer = job['result']['gen']
answer = answer.replace("```json", "")
answer = answer.replace("```", "")
# Parse predictions (ast.literal_eval also tolerates single-quoted, Python-style output)
import ast
try:
json_data = ast.literal_eval(answer)
pred_bboxes = []
pred_labels = []
for data in json_data:
if len(data.get("bbox_2d", [])) != 4:
continue
pred_bboxes.append(data["bbox_2d"])
pred_labels.append(data["label"])
except Exception as e:
# If parsing fails, use empty results
pred_bboxes = []
pred_labels = []
# Coordinate conversion (from resized to original size)
if os.getenv("is_rel", "0") == "1":
pred_bboxes = np.array(pred_bboxes).reshape(-1, 4) / 1000 * np.array([img_w, img_h, img_w, img_h])
else:
if len(pred_bboxes) > 0:
pred_bboxes = np.array(pred_bboxes).reshape(-1, 4) / np.array([resized_w, resized_h, resized_w, resized_h]) * np.array([img_w, img_h, img_w, img_h])
else:
pred_bboxes = np.array(pred_bboxes).reshape(-1, 4)
pred_bboxes = pred_bboxes.tolist()
# Group by category
pred_objs = defaultdict(list)
for pred_bbox, pred_label in zip(pred_bboxes, pred_labels):
pred_objs[pred_label].append(pred_bbox)
for k, v in pred_objs.items():
class_names = [name.lower() for name in classes]
if k.lower() not in class_names:
continue
pred_bboxes_per_img[img_id].append({
'label': class_names.index(k.lower()),
'bbox': v
})
# Prepare evaluation results
pred_results = []
for k, v in pred_bboxes_per_img.items():
bboxes = []
labels = []
for tmp in v:
bboxes.extend(tmp['bbox'])
labels.extend([tmp['label']] * len(tmp['bbox']))
height = coco_api.imgs[k]["height"]
width = coco_api.imgs[k]["width"]
pred_tuple = (
{'width': width, 'height': height, 'img_id': k},
{
'img_id': k,
'bboxes': np.array(bboxes),
'scores': np.array([1.0] * len(bboxes)),
'labels': np.array(labels),
},
)
pred_results.append(pred_tuple)
# Compute metrics
eval_results = compute_metrics(pred_results, _coco_api=coco_api)
print(f"{dataset_name}: {eval_results}")
all_results[dataset_name] = eval_results
# Summarize results
results_ordered = OrderedDict(sorted(all_results.items(), key=lambda x: x[0]))
metric_items = ['mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l']
results_display = []
for prefix, result in results_ordered.items():
results_display.append([prefix] + [result[k] for k in metric_items])
# Calculate average
average_scores = []
for col_idx in range(len(metric_items)):
average_scores.append(np.mean([line[col_idx + 1] for line in results_display]))
results_display.append(['Average'] + average_scores)
# Print results table
try:
from tabulate import tabulate
print("\n" + "="*80)
print(
tabulate(
results_display,
headers=["ODinW13 Dataset"] + metric_items,
tablefmt="fancy_outline",
floatfmt=".3f",
)
)
print("="*80 + "\n")
except ImportError:
print("\n" + "="*80)
print("ODinW13 Results:")
print("="*80)
for row in results_display:
print(row)
print("="*80 + "\n")
# Save results
all_results.update({"Average": average_scores[0]})
os.makedirs(os.path.dirname(args.output_file), exist_ok=True)
with open(args.output_file, 'w') as f:
json.dump(all_results, f, ensure_ascii=False, indent=4)
print(f"✓ Evaluation results saved to {args.output_file}")
print(f"\n{'='*80}")
print(f"Final Average mAP: {average_scores[0]:.4f}")
print(f"{'='*80}\n")
def main():
parser = argparse.ArgumentParser(description="ODinW Evaluation with vLLM")
subparsers = parser.add_subparsers(dest='command', help='Command to run')
# Inference parser
infer_parser = subparsers.add_parser("infer", help="Run inference with vLLM")
infer_parser.add_argument("--model-path", type=str, required=True, help="Path to the model")
infer_parser.add_argument("--data-dir", type=str, required=True,
help="Path to ODinW data directory (containing odinw13_config.py)")
infer_parser.add_argument("--output-file", type=str, required=True, help="Output file path")
# vLLM specific parameters
infer_parser.add_argument("--tensor-parallel-size", type=int, default=None,
help="Tensor parallel size (default: number of GPUs)")
infer_parser.add_argument("--gpu-memory-utilization", type=float, default=0.9,
help="GPU memory utilization (0.0-1.0, default: 0.9)")
infer_parser.add_argument("--max-model-len", type=int, default=128000,
help="Maximum model context length (default: 128000)")
infer_parser.add_argument("--max-images-per-prompt", type=int, default=10,
help="Maximum images per prompt (default: 10)")
# Generation parameters
infer_parser.add_argument("--max-new-tokens", type=int, default=32768,
help="Maximum number of tokens to generate (default: 32768)")
infer_parser.add_argument("--temperature", type=float, default=0.7,
help="Temperature for sampling (default: 0.7)")
infer_parser.add_argument("--top-p", type=float, default=0.8,
help="Top-p for sampling (default: 0.8)")
infer_parser.add_argument("--top-k", type=int, default=20,
help="Top-k for sampling (default: 20)")
infer_parser.add_argument("--repetition-penalty", type=float, default=1.0,
help="Repetition penalty (default: 1.0)")
infer_parser.add_argument("--presence-penalty", type=float, default=1.5,
help="Presence penalty (default: 1.5)")
# Evaluation parser
eval_parser = subparsers.add_parser("eval", help="Run evaluation")
eval_parser.add_argument("--data-dir", type=str, required=True,
help="Path to ODinW data directory (containing odinw13_config.py)")
eval_parser.add_argument("--input-file", type=str, required=True,
help="Input file with inference results")
eval_parser.add_argument("--output-file", type=str, required=True,
help="Output file path")
args = parser.parse_args()
# Automatically set tensor_parallel_size
if args.command == 'infer' and args.tensor_parallel_size is None:
args.tensor_parallel_size = torch.cuda.device_count()
print(f"Auto-set tensor_parallel_size to {args.tensor_parallel_size}")
if args.command == 'infer':
run_inference(args)
elif args.command == 'eval':
run_evaluation(args)
else:
parser.print_help()
if __name__ == "__main__":
main()
# RealWorldQA Benchmark Evaluation
This directory contains the implementation for evaluating vision-language models on the RealWorldQA benchmark using vLLM for high-speed inference.
## Overview
RealWorldQA is a visual question answering benchmark of 700+ high-quality samples drawn from diverse real-world scenarios. This implementation provides:
- **High-speed inference** using vLLM with automatic batch optimization
- **Two-stage evaluation** using rule-based extraction with optional LLM-based fallback
- **Automatic dataset download** from OpenCompass
- **Modular code structure** for easy maintenance and extension
## Project Structure
```
RealWorldQA/
├── run_realworldqa.py # Main script for inference and evaluation
├── dataset_utils.py # Dataset loading and preprocessing utilities
├── eval_utils.py # Evaluation logic and answer extraction
├── common_utils.py # Common utilities for image processing, file I/O
├── infer_instruct.sh # Inference script for instruct models
├── infer_think.sh # Inference script for thinking models
├── eval_instruct.sh # Evaluation script for instruct model results
├── eval_think.sh # Evaluation script for thinking model results
├── requirements.txt # Python dependencies
└── README.md # This file
```
## Requirements
### Python Dependencies
```bash
pip install -r requirements.txt
```
Key dependencies:
- `vllm` - High-speed LLM inference engine
- `transformers` - HuggingFace transformers
- `qwen_vl_utils` - Qwen VL utilities for vision processing
- `pandas`, `numpy` - Data processing
- `Pillow` - Image processing
- `requests` - API calls for evaluation
### Environment Variables
For optional LLM-based evaluation, you need to set up API credentials:
**Option 1: DashScope API (Recommended)**
```bash
export CHATGPT_DASHSCOPE_API_KEY="your-api-key"
export DASHSCOPE_API_BASE="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions"
```
**Option 2: Custom OpenAI-compatible API**
```bash
export MIT_SPIDER_TOKEN="your-api-key"
export MIT_SPIDER_URL="your-api-endpoint"
```
### Data Preparation
The RealWorldQA dataset is stored in TSV format and will be **automatically downloaded** on first run from:
```
https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv
```
**Directory structure after download:**
```
${DATA_DIR}/
├── RealWorldQA.tsv # Main data file (auto-downloaded)
└── images/
└── RealWorldQA/ # Decoded image files
```
**Setting data path:**
- Option 1: Environment variable `export LMUData="/path/to/data"`
- Option 2: Use `--data-dir` argument in commands (see the sketch below for how the two interact)
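The two options point the scripts at the same location; a rough sketch of the resolution logic (the exact handling in `run_realworldqa.py` may differ):

```python
import os

def resolve_data_dir(data_dir_arg=None):
    """Sketch: prefer --data-dir, otherwise fall back to the LMUData environment variable."""
    if data_dir_arg:
        os.environ['LMUData'] = data_dir_arg
    if 'LMUData' not in os.environ:
        raise ValueError("Set LMUData or pass --data-dir")
    return os.environ['LMUData']
```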
## Quick Start
### 1. Inference
Run inference on the RealWorldQA dataset using an instruct model:
```bash
bash infer_instruct.sh
```
Or customize the inference:
```bash
python run_realworldqa.py infer \
--model-path /path/to/Qwen3-VL-Instruct \
--data-dir /path/to/data \
--dataset RealWorldQA \
--output-file results/predictions.jsonl \
--max-new-tokens 32768 \
--temperature 0.7 \
--top-p 0.8 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 1.5
```
For thinking models with extended reasoning:
```bash
bash infer_think.sh
```
### 2. Evaluation
Evaluate the inference results:
```bash
bash eval_instruct.sh
```
Or customize the evaluation:
```bash
python run_realworldqa.py eval \
--data-dir /path/to/data \
--input-file results/predictions.jsonl \
--output-file results/evaluation.csv \
--dataset RealWorldQA \
--eval-model gpt-3.5-turbo-0125 \
--api-type dash \
--nproc 4
```
## Detailed Usage
### Inference Mode
**Basic Arguments:**
- `--model-path`: Path to the Qwen3-VL model directory (required)
- `--data-dir`: Directory to store/load RealWorldQA dataset (required)
- `--dataset`: Dataset name (default: `RealWorldQA`)
- `--output-file`: Path to save inference results in JSONL format (required)
**vLLM Arguments:**
- `--tensor-parallel-size`: Number of GPUs for tensor parallelism (default: auto-detect)
- `--gpu-memory-utilization`: GPU memory utilization ratio, 0.0-1.0 (default: 0.9)
- `--max-model-len`: Maximum model context length (default: 128000)
- `--max-images-per-prompt`: Maximum images per prompt (default: 10)
**Generation Parameters:**
- `--max-new-tokens`: Maximum tokens to generate (default: 32768)
- `--temperature`: Sampling temperature (default: 0.7)
- `--top-p`: Top-p sampling (default: 0.8)
- `--top-k`: Top-k sampling (default: 20)
- `--repetition-penalty`: Repetition penalty (default: 1.0)
- `--presence-penalty`: Presence penalty to reduce repetition (default: 1.5)
**Advanced Options:**
- `--min-pixels`: Minimum pixels for image (default: 768×28×28 ≈ 600K pixels)
- `--max-pixels`: Maximum pixels for image (default: 5120×28×28 ≈ 4M pixels)
### Evaluation Mode
**Basic Arguments:**
- `--data-dir`: Directory containing RealWorldQA dataset (required)
- `--input-file`: Inference results file in JSONL format (required)
- `--output-file`: Path to save evaluation results in CSV format (required)
- `--dataset`: Dataset name, must match inference (default: `RealWorldQA`)
**Judge Model Arguments:**
- `--eval-model`: Judge model name (default: None, uses rule-based only)
- Options: `gpt-3.5-turbo-0125`, `gpt-4-0125-preview`, `gpt-4o`, etc.
- `--api-type`: API service type (default: `dash`)
- `dash`: DashScope API (Alibaba Cloud)
- `mit`: Custom OpenAI-compatible API
- `--nproc`: Number of parallel workers for evaluation (default: 4)
## Output Files
### Inference Output
The inference script generates a JSONL file where each line contains:
```json
{
"question_id": 0,
"annotation": {
"index": "0",
"question": "What is shown in the image?",
"A": "Cat",
"B": "Dog",
"C": "Bird",
"D": "Fish",
"answer": "A"
},
"task": "RealWorldQA",
"result": {
"gen": "The correct answer is A",
"gen_raw": "Raw model output including thinking process"
},
"messages": [...]
}
```
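Each line is a standalone JSON object, so the file can be inspected with the standard `json` module. The field names below follow the example above; the path is the one used in the Quick Start command.

```python
import json

# Load all predictions and peek at the first record
with open("results/predictions.jsonl") as f:
    records = [json.loads(line) for line in f]
print(len(records), "samples")
print(records[0]["result"]["gen"])         # generated answer (see "gen_raw" for the raw output)
print(records[0]["annotation"]["answer"])  # ground-truth option letter
```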
### Evaluation Output
The evaluation script generates two files:
1. **CSV file** (`*_evaluation.csv`): Detailed evaluation results
- Columns: `index`, `question`, `prediction`, `extracted_answer`, `extraction_method`, `extraction_success`, `gt`, `hit`
2. **JSON file** (`*_evaluation_acc.json`): Accuracy statistics
```json
{
"overall_accuracy": 0.7234,
"task_samples": 765,
"correct": 553,
"total": 765
}
```
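Because the CSV stores a per-sample `hit` flag, the accuracy statistics can be recomputed (or sliced further) with pandas; the snippet assumes the output path from the Quick Start command.

```python
import pandas as pd

df = pd.read_csv("results/evaluation.csv")
print("overall accuracy:", df["hit"].mean())
# Share of samples resolved by the rule-based extractor (vs. judge model / random fallback)
print("rule-based extraction share:", (df["extraction_method"] == "rule").mean())
```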
## Model-Specific Configurations
### Instruct Models (e.g., Qwen3-VL-2B-Instruct, Qwen3-VL-7B-Instruct)
Use standard parameters for balanced performance:
```bash
--max-new-tokens 32768
--temperature 0.7
--top-p 0.8
--top-k 20
--repetition-penalty 1.0
--presence-penalty 1.5
```
### Thinking Models (e.g., Qwen3-VL-2B-Thinking)
Use adjusted parameters for deeper reasoning:
```bash
--max-new-tokens 32768
--temperature 0.6
--top-p 0.95
--top-k 20
--repetition-penalty 1.0
--presence-penalty 0.0
```
**Note:** Thinking models output reasoning steps wrapped in `<think>...</think>` tags. The evaluation automatically extracts the final answer after `</think>`.
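Conceptually, the post-processing keeps only what follows the last `</think>` tag; a minimal sketch (hypothetical helper, not the exact function used in the code):

```python
def strip_think(raw: str) -> str:
    """Keep only the text after the last </think> tag; fall back to the raw output."""
    marker = "</think>"
    return raw.split(marker)[-1].strip() if marker in raw else raw.strip()

# strip_think("<think>reasoning ...</think>The answer is B") -> "The answer is B"
```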
## Performance Tips
1. **GPU Memory**: Adjust `--gpu-memory-utilization` based on your GPU:
- 0.9: Recommended for most cases
- 0.95: For maximum throughput (may cause OOM)
- 0.7-0.8: If experiencing OOM errors
2. **Batch Size**: vLLM automatically optimizes batch size based on available memory
3. **Tensor Parallelism**: Use `--tensor-parallel-size` for large models:
- 2B model: 1 GPU recommended
- 7B model: 1-2 GPUs
- 14B+ model: 2-4 GPUs
4. **Context Length**: Reduce `--max-model-len` if memory is limited:
- 128000: Default, works well for most cases
- 64000: Reduces memory usage by ~40%
5. **Evaluation Speed**: Omit `--eval-model` to use rule-based extraction only (faster, ~70-80% success rate)
## Troubleshooting
### Common Issues
**1. CUDA Out of Memory**
```bash
# Reduce GPU memory utilization
--gpu-memory-utilization 0.7
# Or reduce context length
--max-model-len 64000
# Or reduce image resolution
--max-pixels 1003520 # 1280×28×28
```
**2. vLLM Multiprocessing Issues**
The code automatically sets `VLLM_WORKER_MULTIPROC_METHOD=spawn`. If you still encounter issues:
```bash
export VLLM_WORKER_MULTIPROC_METHOD=spawn
```
**3. Evaluation API Errors**
- If you don't need LLM-based extraction, omit `--eval-model` (rule-based only)
- If using LLM extraction, verify API credentials are set correctly
- Check API endpoint connectivity
- Adjust `--nproc` (up to 32) to stay within your API rate limits
**4. Dataset Download Issues**
The dataset is automatically downloaded from:
```
https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv
```
If the download fails, download the file manually and place it at `${DATA_DIR}/RealWorldQA.tsv`.
**5. Import Errors**
Ensure all required files exist in the RealWorldQA directory:
```bash
ls common_utils.py dataset_utils.py eval_utils.py run_realworldqa.py
```
## Advanced Usage
### Custom Image Resolution
Modify resolution parameters in the inference command:
```bash
# min-pixels = 512×28×28, max-pixels = 1280×28×28
python run_realworldqa.py infer \
    --min-pixels 401408 \
    --max-pixels 1003520 \
    ...
```
### Evaluation Without LLM
Use rule-based extraction only (faster, no API calls):
```bash
python run_realworldqa.py eval \
--input-file results/predictions.jsonl \
--output-file results/evaluation.csv
# No --eval-model specified
```
### Debug Mode
Process only first N samples for testing:
```bash
DEBUG_SAMPLE_SIZE=10 python run_realworldqa.py infer ...
```
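Under the hood this only needs to truncate the loaded dataset; a rough sketch of such a gate (the actual check in `run_realworldqa.py` may look different):

```python
import os

# Cap the number of samples when DEBUG_SAMPLE_SIZE is set
debug_n = os.environ.get("DEBUG_SAMPLE_SIZE")
if debug_n:
    data = data.head(int(debug_n))  # `data` is the DataFrame returned by load_dataset()
```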
## Citation
If you use this code or the RealWorldQA benchmark, please cite:
```bibtex
@misc{realworldqa2024,
title = {RealWorldQA: A Benchmark for Real-World Spatial Understanding},
author = {{xAI}},
year = {2024},
howpublished = {\url{https://huggingface.co/datasets/xai-org/RealworldQA}},
note = {Accessed: 2025-04-26}
}
```
## License
This code is released under the same license as the Qwen3-VL model.
## Support
For issues and questions:
- GitHub Issues: [Qwen3-VL Repository](https://github.com/QwenLM/Qwen3-VL)
- Documentation: See inline code comments and docstrings
import os
import requests
import base64
import hashlib
import io
from PIL import Image
from typing import List, Union
def encode_image_to_base64(image, target_size=None):
"""Encode an image to base64 string."""
if target_size is not None:
width, height = image.size
# Resize the image while maintaining the aspect ratio
if width > height:
new_width = target_size
new_height = int(height * target_size / width)
else:
new_height = target_size
new_width = int(width * target_size / height)
image = image.resize((new_width, new_height))
    buffer = io.BytesIO()
    # JPEG has no alpha channel; convert other modes (e.g. RGBA, P) before saving
    if image.mode != 'RGB':
        image = image.convert('RGB')
    image.save(buffer, format="JPEG")
return base64.b64encode(buffer.getvalue()).decode('utf-8')
def decode_base64_to_image(base64_string):
"""Decode a base64 string to an image."""
image_data = base64.b64decode(base64_string)
return Image.open(io.BytesIO(image_data))
def decode_base64_to_image_file(base64_string, output_path):
"""Decode a base64 string and save it to a file."""
image = decode_base64_to_image(base64_string)
image.save(output_path)
def download_file(url, local_path):
"""Download a file from a URL to a local path."""
response = requests.get(url, stream=True)
response.raise_for_status()
with open(local_path, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
def md5(file_path):
"""Calculate the MD5 hash of a file."""
hash_md5 = hashlib.md5()
with open(file_path, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()
def toliststr(s):
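    """Normalize a value into a list of strings.

    A bracketed string (e.g. "['a', 'b']") is parsed as a Python list,
    a plain string becomes a one-element list, and list elements are stringified.
    """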
if isinstance(s, str) and (s[0] == '[') and (s[-1] == ']'):
return [str(x) for x in eval(s)]
elif isinstance(s, str):
return [s]
elif isinstance(s, list):
return [str(x) for x in s]
raise NotImplementedError
"""
RealWorldQA Dataset Utilities
Data loading and processing utilities, fully independent of VLMEvalKit.
"""
import os
import pandas as pd
import numpy as np
import string
from typing import Dict, Any, List
from PIL import Image
from common_utils import download_file, md5, toliststr, decode_base64_to_image_file
# RealWorldQA dataset URL and MD5
REALWORLDQA_DATASET_URL = 'https://opencompass.openxlab.space/utils/VLMEval/RealWorldQA.tsv'
REALWORLDQA_DATASET_MD5 = '92321028d2bc29040284b6674721e48f'
def load_dataset(dataset_name='RealWorldQA'):
"""
Load RealWorldQA dataset.
Args:
dataset_name: Dataset name (default: 'RealWorldQA')
Returns:
pd.DataFrame: Loaded dataset
"""
if 'LMUData' not in os.environ:
raise ValueError("Please set LMUData environment variable or use --data-dir argument")
data_root = os.path.join(os.environ['LMUData'])
os.makedirs(data_root, exist_ok=True)
file_name = f"{dataset_name}.tsv"
data_path = os.path.join(data_root, file_name)
# Download dataset if not exists or MD5 mismatch
if not os.path.exists(data_path) or md5(data_path) != REALWORLDQA_DATASET_MD5:
print(f"Downloading {dataset_name} dataset...")
download_file(REALWORLDQA_DATASET_URL, data_path)
# Load dataset
data = pd.read_csv(data_path, sep='\t')
# Process dataset
data['index'] = [str(x) for x in data['index']]
# Process image data (base64 encoded or referenced)
if 'image' in data:
data['image'] = [str(x) for x in data['image']]
image_map = {x: y for x, y in zip(data['index'], data['image'])}
# Process image references (some images may reference other indices)
for k in image_map:
if len(image_map[k]) <= 64:
idx = image_map[k]
assert idx in image_map and len(image_map[idx]) > 64
image_map[k] = image_map[idx]
images = [toliststr(image_map[k]) for k in data['index']]
data['image'] = [x[0] if len(x) == 1 else x for x in images]
# Process image paths
if 'image_path' in data:
paths = [toliststr(x) for x in data['image_path']]
data['image_path'] = [x[0] if len(x) == 1 else x for x in paths]
# Convert index to integer if possible
if np.all([isinstance(x, int) or (isinstance(x, str) and x.isdigit()) for x in data['index']]):
data['index'] = [int(x) for x in data['index']]
return data
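# Example usage (assumes the LMUData environment variable points at the data directory):
#   df = load_dataset('RealWorldQA')
#   print(len(df), list(df.columns))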
def dump_image(line, img_root):
"""
Save image data to disk and return path.
Args:
line: Data row containing image data
img_root: Image save root directory
Returns:
list: List of image paths
"""
os.makedirs(img_root, exist_ok=True)
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = os.path.join(img_root, im_name)
if not os.path.exists(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = os.path.join(img_root, f"{line['index']}.jpg")
if not os.path.exists(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
return tgt_path
def build_realworldqa_prompt(line, dump_image_func, min_pixels, max_pixels):
"""
Build RealWorldQA dataset prompt.
Args:
line: Data row
dump_image_func: Image save function
min_pixels: Minimum pixels
max_pixels: Maximum pixels
Returns:
list: List of messages in standard conversation format
"""
# Save and get image path
tgt_path = dump_image_func(line)
# Build question text
question = line['question']
# Build options
options = {
cand: line[cand]
for cand in string.ascii_uppercase
if cand in line and not pd.isna(line[cand])
}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
# Process hint if exists
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
# Build complete prompt
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'Question: {question}\n'
if len(options):
prompt += options_prompt
prompt += 'Please select the correct answer from the options above. \n'
# Build messages in standard conversation format
content = []
# Add images (using file:// prefix for consistency)
if isinstance(tgt_path, list):
for p in tgt_path:
content.append({
"type": "image",
"image": f"file://{p}",
"min_pixels": min_pixels,
"max_pixels": max_pixels
})
else:
content.append({
"type": "image",
"image": f"file://{tgt_path}",
"min_pixels": min_pixels,
"max_pixels": max_pixels
})
# Add text
content.append({"type": "text", "text": prompt})
# Return messages in standard conversation format
messages = [{
"role": "user",
"content": content
}]
return messages
#!/bin/bash
# RealWorldQA Evaluation Script (Instruct Model)
# This script evaluates the inference results using rule-based and optionally model-based extraction
python run_realworldqa.py eval \
--data-dir /path/to/data \
--input-file results/RealWorldQA_results.jsonl \
--output-file results/RealWorldQA_evaluation.csv \
--dataset RealWorldQA \
--eval-model gpt-3.5-turbo-0125 \
--api-type dash \
--nproc 4
#!/bin/bash
# RealWorldQA Evaluation Script (Thinking Model)
# This script evaluates the inference results from thinking model using rule-based and optionally model-based extraction
python run_realworldqa.py eval \
--data-dir /path/to/data \
--input-file results/RealWorldQA_results_thinking.jsonl \
--output-file results/RealWorldQA_evaluation_thinking.csv \
--dataset RealWorldQA \
--eval-model qwen-plus \
--api-type dash \
--nproc 4
"""
RealWorldQA Evaluation Utilities
Evaluation utilities, fully independent of VLMEvalKit.
"""
import os
import requests
import time
import random
import string
import copy
import traceback
import pandas as pd
from PIL import Image
from typing import List, Dict, Tuple, Any
from common_utils import encode_image_to_base64
class OpenAIWrapper:
"""Wrapper for OpenAI API."""
def __init__(self, model, api_base, api_key, timeout=60, retry=5, wait=5):
self.model = model
self.api_base = api_base
self.api_key = api_key
self.timeout = timeout
self.retry = retry
self.wait = wait
self.fail_msg = 'Failed to obtain answer via API.'
def generate(self, messages):
"""Generate a response from the API."""
headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}'}
# Format messages for API
formatted_messages = []
for msg in messages:
if msg['type'] == 'text':
formatted_messages.append({"role": "user", "content": [{"type": "text", "text": msg['value']}]})
elif msg['type'] == 'image':
# Load and encode the image
image = Image.open(msg['value'])
image_data = encode_image_to_base64(image)
formatted_messages.append({
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
]
})
payload = {
"model": self.model,
"messages": formatted_messages,
"max_tokens": 4096,
"temperature": 0
}
for i in range(self.retry):
try:
response = requests.post(
self.api_base,
headers=headers,
json=payload,
timeout=self.timeout
)
if response.status_code == 200:
resp_json = response.json()
return resp_json['choices'][0]['message']['content'].strip()
time.sleep(self.wait)
except Exception as e:
print(f"API error: {e}")
time.sleep(self.wait)
return self.fail_msg
class DashScopeWrapper:
"""Wrapper for DashScope API."""
def __init__(self, model, api_base, api_key, timeout=60, retry=5, wait=5):
self.model = model
self.api_base = api_base
self.api_key = api_key
self.timeout = timeout
self.retry = retry
self.wait = wait
self.fail_msg = 'Failed to obtain answer via API.'
def generate(self, messages):
"""Generate a response from the API."""
headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}'}
# Format messages for API
formatted_messages = []
for msg in messages:
if msg['type'] == 'text':
formatted_messages.append({"role": "user", "content": [{"type": "text", "text": msg['value']}]})
elif msg['type'] == 'image':
# Load and encode the image
image = Image.open(msg['value'])
image_data = encode_image_to_base64(image)
formatted_messages.append({
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
]
})
payload = {
"model": self.model,
"messages": formatted_messages,
"max_completion_tokens": 4096,
"n": 1,
"temperature": 0,
"stream": False
}
for i in range(self.retry):
try:
response = requests.post(
self.api_base,
headers=headers,
json=payload,
timeout=self.timeout
)
if response.status_code == 200:
resp_json = response.json()
                    # Retry the whole request if any choice finished abnormally (e.g. truncated output)
                    abnormal = any(
                        output['finish_reason'] not in ['stop', 'function_call']
                        for output in resp_json['choices']
                    )
                    if abnormal:
                        print(f"DashScope finished with error: {resp_json}")
                        time.sleep(self.wait)
                        continue
                    return resp_json['choices'][0]['message']['content']
else:
print(f"DashScope API error: HTTP {response.status_code}")
try:
error_content = response.json()
print(f"Error details: {error_content}")
except:
print(f"Raw error content: {response.content.decode('utf-8', errors='replace')}")
time.sleep(self.wait)
except requests.exceptions.ConnectionError as conn_err:
print(f"DashScope: Connection error occurred: {conn_err}")
time.sleep(self.wait)
except requests.exceptions.Timeout as timeout_err:
print(f"DashScope: Timeout error occurred: {timeout_err}")
time.sleep(self.wait)
except requests.exceptions.RequestException as req_err:
print(f"DashScope: Request exception occurred: {req_err}")
time.sleep(self.wait)
except Exception as e:
print(f"DashScope: An error occurred: {e}")
print(traceback.format_exc())
time.sleep(self.wait)
return self.fail_msg
def build_judge(model, api_type):
"""Build a judge model for evaluation."""
if api_type == 'mit':
api_key = os.environ.get('MIT_SPIDER_TOKEN', '')
api_base = os.environ.get('MIT_SPIDER_URL', '')
return OpenAIWrapper(model, api_base, api_key)
elif api_type == 'dash':
api_key = os.environ.get('CHATGPT_DASHSCOPE_API_KEY', '')
api_base = os.environ.get('DASHSCOPE_API_BASE', '')
return DashScopeWrapper(model, api_base, api_key)
else:
raise ValueError(f"Unsupported API type: {api_type}")
def can_infer_option(answer, choices):
"""Rule-based extraction of answer option."""
if 'Failed to obtain answer via API' in answer:
return False
reject_to_answer = [
"Sorry, I can't help with images of people yet.",
"I can't process this file.",
"I'm sorry, but without the image provided",
'Cannot determine the answer'
]
for err in reject_to_answer:
if err in answer:
return 'Z'
def count_choice(splits, choices, prefix='', suffix=''):
cnt = 0
for c in choices:
if prefix + c + suffix in splits:
cnt += 1
return cnt
answer_mod = copy.copy(answer)
chars = '.()[],:;!*#{}'
for c in chars:
answer_mod = answer_mod.replace(c, ' ')
splits = [x.strip() for x in answer_mod.split()]
count = count_choice(splits, choices)
if count == 1:
for ch in choices:
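            # Heuristic: in a long response a bare 'A' is often the article "a"
            # rather than the option letter, so treat it as ambiguous and give up.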
if 'A' in splits and len(splits) > 3:
return False
if ch in splits:
return ch
elif count == 0 and count_choice(splits, {'Z', ''}) == 1:
return 'Z'
return False
def can_infer_text(answer, choices):
"""Extract answer by matching text content."""
answer = answer.lower()
assert isinstance(choices, dict)
for k in choices:
assert k in string.ascii_uppercase
choices[k] = str(choices[k]).lower()
cands = []
for k in choices:
if choices[k] in answer:
cands.append(k)
if len(cands) == 1:
return cands[0]
return False
def can_infer(answer, choices):
"""Combined approach to infer answer choice."""
answer = str(answer)
copt = can_infer_option(answer, choices)
return copt if copt else can_infer_text(answer, choices)
def build_choices(item):
"""Build choices dictionary from item."""
ret = {}
for ch in string.ascii_uppercase:
if ch in item and (not pd.isna(item[ch])):
ret[ch] = item[ch]
return ret
def build_option_str(option_dict):
"""Build option string for prompt."""
s = 'There are several options: \n'
for c, content in option_dict.items():
if not pd.isna(content):
s += f'{c}. {content}\n'
return s
def build_prompt(question, options, prediction):
"""Build prompt for answer extraction."""
tmpl = (
'You are an AI assistant who will help me to match '
'an answer with several options of a single-choice question. '
'You are provided with a question, several options, and an answer, '
'and you need to find which option is most similar to the answer. '
'If the meaning of all options are significantly different from the answer, output Z. '
        'You should output a single uppercase character in A, B, C, D (if they are valid options), and Z. \n'
'Example 1: \n'
'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n'
'Answer: a cute teddy bear\nYour output: A\n'
'Example 2: \n'
'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n'
'Answer: Spider\nYour output: Z\n'
'Example 3: \n'
'Question: {}?\nOptions: {}\nAnswer: {}\nYour output: '
)
return tmpl.format(question, options, prediction)
def extract_answer_from_item(model, item, wait=5):
"""Extract answer from model prediction using rule-based and model-based approaches."""
# Build choices dictionary
choices = build_choices(item)
option_str = build_option_str(choices)
prompt = build_prompt(item['question'], option_str, item['prediction'])
# Try rule-based extraction first
prediction = item['prediction']
ret = can_infer(prediction, choices)
if ret:
if ret == 'Z':
extract_flag = False
log = f"Rule extract failed with rule result: {ret} prediction: {prediction}"
else:
extract_flag = True
log = f"Rule extract success with rule result: {ret} prediction: {prediction}"
return dict(opt=ret, log=log, extract_model='rule', extract_flag=extract_flag)
# If rule-based extraction fails, use model-based extraction
print(f"Rule extract failed. Use model-based extraction.")
if model is None:
# For RealWorldQA, if model is None, use random choice
options = list(choices) + ['Z'] if 'Z' not in choices else list(choices)
log = f'No judge model provided. Randomly generate one.\n'
return dict(opt=random.choice(options), log=log, extract_model='random', extract_flag=False)
# Try model-based extraction with retries
retry = 5
while retry:
messages_for_judge = [{'type': 'text', 'value': prompt}]
ans = model.generate(messages_for_judge)
if 'Failed to obtain answer via API' in ans:
print('API failed to answer.')
else:
ret = can_infer(ans, choices)
if ret and ret != 'Z':
                log = f'{model.model} extraction succeeded. {model.model}: {ans}\n'
return dict(opt=ret, log=log, extract_model=model.model, extract_flag=True)
else:
print(f'Output includes 0 / > 1 letter among candidates {set(choices)} and Z: {ans}')
retry -= 1
if retry <= 0:
            options = list(choices) + ['Z'] if 'Z' not in choices else list(choices)
            log = f'{model.model} extract failed. Randomly generate one. {model.model} response: {ans}\n'
return dict(opt=random.choice(options), log=log, extract_model=model.model, extract_flag=False)
def eval_single_sample(args):
"""Evaluate a single sample."""
model, item = args
# Extract answer using the combined approach
result = extract_answer_from_item(model, item)
# Get ground truth answer
gt_answer = item['answer']
# Determine if the answer is correct
hit = 1 if result['opt'] == gt_answer else 0
return {
"index": item['index'],
"question": item['question'],
"prediction": item['prediction'],
"extracted_answer": result['opt'],
"extraction_method": result['extract_model'],
"extraction_success": result['extract_flag'],
"extraction_log": result['log'],
"gt": gt_answer,
"hit": hit
}