Commit 3d735feb authored by luopl's avatar luopl
Browse files

"Initial commit"

parents
Pipeline #3074 canceled with stages
import os
import requests
import time
import random
import copy
import traceback
import pandas as pd
from PIL import Image
from typing import List, Dict, Tuple, Any
from common_utils import encode_image_to_base64
from collections import defaultdict
# latex2sympy2 is optional: it enables LaTeX-aware symbolic answer
# comparison in is_equal(); without it, only string/numeric checks run.
try:
    from latex2sympy2 import latex2sympy
except ImportError:
    print('Warning: latex2sympy2 not installed. Install with: pip install latex2sympy2')
    latex2sympy = None

# Sentinel message returned by the API wrappers when all retries fail.
FAIL_MSG = 'Failed to obtain answer via API.'
def is_equal(asw: str, gt_asw: str) -> bool:
    """Check whether a predicted answer matches the ground-truth answer.

    Comparison is attempted in three stages, returning True on the first
    success:
    1. case-insensitive string equality after stripping whitespace;
    2. numeric comparison via eval() with a 1e-6 absolute tolerance;
    3. symbolic comparison via latex2sympy (when the library is installed).

    Args:
        asw: Predicted answer.
        gt_asw: Ground-truth answer.

    Returns:
        True if the answers are considered equal, False otherwise.
    """
    if not isinstance(asw, str) or not isinstance(gt_asw, str):
        print('Warning: input is not string')
        print(asw, gt_asw)
    asw = str(asw).lower().strip()
    gt_asw = str(gt_asw).lower().strip()
    if gt_asw == asw:
        return True
    # NOTE(security): eval() on model output is unsafe for untrusted input;
    # kept for parity with the established evaluation protocol.
    # Narrowed from bare `except:` so KeyboardInterrupt/SystemExit propagate.
    try:
        a = eval(gt_asw)
        b = eval(asw)
        if abs(a - b) < 1e-6:
            return True
    except Exception:
        pass
    if latex2sympy is not None:
        try:
            a = latex2sympy(gt_asw)
            b = latex2sympy(asw)
            if abs(eval(str(a)) - eval(str(b))) < 1e-6:
                return True
            if abs(a - b) < 1e-6:
                return True
        except Exception:
            pass
    return False
def get_gpt4_ICE():
    """Return the five in-context examples used for GPT-4 answer extraction."""
    return [
        """
Hint: Please answer the question and provide the final answer at the end.\n
Question: Which number is missing?\n
Model response: The number missing in the sequence is 14.\n
Extracted answer: 14
""",
        """
Hint: Please answer the question and provide the final answer at the end.\n
Question: What is the fraction of females facing the camera?\n
Model response: The fraction of females facing the camera is 0.6,
which means that six out of ten females in the group are facing the camera.\n
Extracted answer: 0.6
""",
        """
Hint: Please answer the question and provide the final answer at the end.\n
Question: How much money does Luca need to buy a sour apple candy and a butter-scotch candy? (Unit: $)\n
Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy.\n
Extracted answer: 1.45
""",
        """
Hint: Please answer the question and provide the final answer at the end.\n
Question: Between which two years does the line graph saw its maximum peak?\n
Model response: The line graph saw its maximum peak between 2007 and 2008.\n
Extracted answer: [2007, 2008]
""",
        """
Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.\n
Question: What fraction of the shape is blue?\n
Choices: (A) 3/11 (B) 8/11 (C) 6/11 (D) 3/5\n
Model response: The correct answer is (B) 8/11.\n
Extracted answer: B
""",
    ]
def build_mathv_gpt4_prompt(line):
    """Assemble the GPT-4 answer-extraction prompt for one sample.

    The prompt is: task description, the five in-context examples, then
    the sample's question, model response, and an 'Extracted answer:' cue.
    """
    task_description = """
Please read the following example.
Then extract the answer from the model response and type it at the end of the prompt.\n
"""
    segments = [task_description]
    segments.extend(example + '\n' for example in get_gpt4_ICE())
    segments.append(line['question'] + '\n')
    segments.append('Model response: ' + str(line['prediction']) + '\n')
    segments.append('Extracted answer: ')
    return ''.join(segments)
def list_to_dict(lst):
    """Map list items to uppercase letter keys: index 0 -> 'A', 1 -> 'B', ..."""
    letters = (chr(ord('A') + idx) for idx in range(len(lst)))
    return dict(zip(letters, lst))
def can_infer_option(answer, choices):
    """Rule-based extraction of an answer option.

    Args:
        answer: Raw model response text.
        choices: Iterable of valid option letters (e.g. dict keyed 'A'..'D').

    Returns:
        The matched option letter, 'Z' for explicit refusals or an explicit
        'Z' answer, or False when no single option can be inferred.
    """
    # Responses carrying the API failure sentinel cannot be parsed.
    if FAIL_MSG in answer:
        return False
    # Canned refusal messages are mapped to the special option 'Z'.
    reject_to_answer = [
        "Sorry, I can't help with images of people yet.",
        "I can't process this file.",
        "I'm sorry, but without the image provided",
        'Cannot determine the answer'
    ]
    for err in reject_to_answer:
        if err in answer:
            return 'Z'

    def count_choice(splits, choices, prefix='', suffix=''):
        # Count how many candidate options appear as standalone tokens.
        cnt = 0
        for c in choices:
            if prefix + c + suffix in splits:
                cnt += 1
        return cnt

    # Replace punctuation with spaces so option letters become separate tokens.
    answer_mod = copy.copy(answer)
    chars = '.()[],:;!*#{}'
    for c in chars:
        answer_mod = answer_mod.replace(c, ' ')
    splits = [x.strip() for x in answer_mod.split()]
    count = count_choice(splits, choices)
    if count == 1:
        for ch in choices:
            # Reject long answers that contain a bare 'A' token, since 'A'
            # commonly occurs as the English article rather than an option.
            if 'A' in splits and len(splits) > 3:
                return False
            if ch in splits:
                return ch
    elif count == 0 and count_choice(splits, {'Z', ''}) == 1:
        # The model explicitly answered 'Z' (none of the above / refusal).
        return 'Z'
    return False
def can_infer_text(answer, choices):
    """Extract an answer option by matching choice text inside the answer.

    Args:
        answer: Raw model response text.
        choices: Mapping from option letter to choice text.

    Returns:
        The option letter if exactly one choice's text occurs in the answer
        (case-insensitive), otherwise False.
    """
    answer = answer.lower()
    assert isinstance(choices, dict)
    # Compare against lowercased copies instead of lowercasing the caller's
    # dict values in place (the original mutated `choices` destructively).
    cands = [k for k, v in choices.items() if str(v).lower() in answer]
    if len(cands) == 1:
        return cands[0]
    return False
def can_infer(answer, choices):
    """Infer the chosen option: try rule-based matching, then text matching."""
    answer = str(answer)
    option = can_infer_option(answer, choices)
    if option:
        return option
    return can_infer_text(answer, choices)
def post_check(line, prefetch=False):
    """Check whether the prediction matches the ground-truth answer.

    Args:
        line: Record with 'answer', 'choices' and 'prediction'/'res' fields.
        prefetch: When True, inspect the raw 'prediction' and return the
            extracted option (or False); when False, compare the previously
            extracted 'res' field and return True/False.
    """
    res = None
    ans = line['answer']
    # Prefetch mode works on the raw model prediction; otherwise use the
    # answer already extracted by the judge model.
    response = line['prediction'] if prefetch else line['res']
    try:
        # NOTE(security): eval() on the serialized choices assumes trusted
        # dataset content.
        if len(eval(line['choices'])) > 0:
            # Multiple-choice question: map choices to letters and try to
            # infer which option the response selected.
            ans = line['answer']
            choices = list_to_dict(eval(line['choices']))
            res = can_infer(response, choices)
            if prefetch:
                return res
        else:
            # Open-ended question: compare plain strings/numbers below.
            res = str(response)
            ans = str(ans)
    except ValueError:
        # Malformed 'choices' field; fall through (res may still be None,
        # which is_equal tolerates with a warning).
        pass
    if is_equal(res, ans):
        return res if prefetch else True
    else:
        return False
class OpenAIWrapper:
    """Minimal wrapper around an OpenAI-compatible chat-completions API."""

    def __init__(self, model, api_base, api_key, timeout=60, retry=5, wait=5):
        """
        Args:
            model: Model name sent in the request payload.
            api_base: Full URL of the chat-completions endpoint.
            api_key: Bearer token for authentication.
            timeout: Per-request timeout in seconds.
            retry: Number of attempts before giving up.
            wait: Seconds to sleep between attempts.
        """
        self.model = model
        self.api_base = api_base
        self.api_key = api_key
        self.timeout = timeout
        self.retry = retry
        self.wait = wait
        self.fail_msg = FAIL_MSG

    def generate(self, prompt, temperature=0):
        """Request a completion; return FAIL_MSG after exhausting retries."""
        headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}'}
        payload = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            "max_tokens": 4096,
            "temperature": temperature
        }
        for _ in range(self.retry):
            try:
                response = requests.post(
                    self.api_base,
                    headers=headers,
                    json=payload,
                    timeout=self.timeout
                )
                if response.status_code == 200:
                    resp_json = response.json()
                    return resp_json['choices'][0]['message']['content'].strip()
                # Surface HTTP errors instead of silently sleeping and retrying.
                print(f"API error: HTTP {response.status_code}")
                time.sleep(self.wait)
            except Exception as e:
                print(f"API error: {e}")
                time.sleep(self.wait)
        return self.fail_msg
class DashScopeWrapper:
    """Minimal wrapper around the DashScope chat-completions API."""

    def __init__(self, model, api_base, api_key, timeout=60, retry=5, wait=5):
        """
        Args:
            model: Model name sent in the request payload.
            api_base: Full URL of the chat-completions endpoint.
            api_key: Bearer token for authentication.
            timeout: Per-request timeout in seconds.
            retry: Number of attempts before giving up.
            wait: Seconds to sleep between attempts.
        """
        self.model = model
        self.api_base = api_base
        self.api_key = api_key
        self.timeout = timeout
        self.retry = retry
        self.wait = wait
        self.fail_msg = FAIL_MSG

    def generate(self, prompt, temperature=0):
        """Request a completion; return FAIL_MSG after exhausting retries.

        A response is accepted only when every returned choice finished with
        a normal reason ('stop' or 'function_call').
        """
        headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}'}
        payload = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            "max_completion_tokens": 4096,
            "n": 1,
            "temperature": temperature,
            "stream": False
        }
        for _ in range(self.retry):
            try:
                response = requests.post(
                    self.api_base,
                    headers=headers,
                    json=payload,
                    timeout=self.timeout
                )
                if response.status_code == 200:
                    resp_json = response.json()
                    # Bug fix: the original `continue` only advanced the inner
                    # per-choice loop, so responses with abnormal finish
                    # reasons still fell through to the return statement.
                    # Retry the whole request instead.
                    if any(output['finish_reason'] not in ['stop', 'function_call']
                           for output in resp_json['choices']):
                        print(f"DashScope finished with error: {resp_json}")
                        time.sleep(self.wait)
                        continue
                    return resp_json['choices'][0]['message']['content']
                else:
                    print(f"DashScope API error: HTTP {response.status_code}")
                    try:
                        error_content = response.json()
                        print(f"Error details: {error_content}")
                    except Exception:
                        print(f"Raw error content: {response.content.decode('utf-8', errors='replace')}")
                    time.sleep(self.wait)
            except Exception as e:
                print(f"DashScope error: {e}")
                time.sleep(self.wait)
        return self.fail_msg
def build_judge(model, api_type):
    """Construct a judge-model wrapper for the given API backend.

    Args:
        model: Judge model name.
        api_type: Backend selector, 'mit' or 'dash'.

    Raises:
        ValueError: If api_type is not one of the supported backends.
    """
    if api_type == 'mit':
        return OpenAIWrapper(
            model,
            os.environ.get('MIT_SPIDER_URL', ''),
            os.environ.get('MIT_SPIDER_TOKEN', ''),
        )
    if api_type == 'dash':
        return DashScopeWrapper(
            model,
            os.environ.get('DASHSCOPE_API_BASE', ''),
            os.environ.get('CHATGPT_DASHSCOPE_API_KEY', ''),
        )
    raise ValueError(f"Unsupported API type: {api_type}")
def MATH_V_auxeval(args):
    """Extract the final answer for one MathVision sample.

    Tries cheap rule-based extraction first; on failure, falls back to
    querying the judge model with increasing temperature for up to 5 tries.

    Args:
        args: Tuple of (judge model wrapper, sample record).

    Returns:
        dict with keys 'log', 'res', 'extract_model', 'extract_flag'.
    """
    model, line = args
    prompt = build_mathv_gpt4_prompt(line)
    log = ''
    retry = 5
    # Try rule-based extraction first. Call post_check exactly once; the
    # original invoked it twice for the same result.
    res = post_check(line, prefetch=True)
    if res:
        log += 'Prefetch succeed.\n'
        extract_flag = True
        if not res or res == 'Z':
            # 'Z' marks a refusal / none-of-the-above answer.
            extract_flag = False
            log += f'Rule extract failed with ans: {res}'
        else:
            log += f'Rule extract success with ans: {res}'
        return dict(log=log, res=res, extract_model='rule', extract_flag=extract_flag)
    # Use model-based extraction, raising temperature on each retry.
    for i in range(retry):
        prediction = line['prediction']
        res = model.generate(prompt, temperature=i * 0.5)
        if FAIL_MSG in res:
            log += f'Try {i}: output is {prediction}, failed to parse.\n'
        else:
            log += f'{model.model} extract Succeed.\n'
            return dict(log=log, res=res, extract_model=model.model, extract_flag=True)
    log += f'All {retry} retries failed.\n {model.model} response:{res}'
    return dict(log=log, res='', extract_model=model.model, extract_flag=False)
def MATH_V_acc(result_file):
    """Calculate per-category and overall accuracy for MathVision results.

    Args:
        result_file: Path to the evaluated .xlsx (or .csv) file written by
            run_evaluation; rows must carry 'category', 'log', 'res',
            'answer', 'choices', 'prediction', 'extract_model' and
            'extract_flag' columns.

    Returns:
        DataFrame with one row per subject plus an 'Overall' row, holding
        sample counts, prefetch counts/rate, hit counts and accuracy (%).
    """
    data = pd.read_excel(result_file) if result_file.endswith('.xlsx') else pd.read_csv(result_file)
    tot = defaultdict(lambda: 0)    # samples per category
    fetch = defaultdict(lambda: 0)  # rule-based (prefetch) extraction hits
    hit = defaultdict(lambda: 0)    # correct answers per category
    lt = len(data)
    extract_counts = {}
    for i in range(lt):
        item = data.iloc[i]
        cate = item['category']
        tot['Overall'] += 1
        tot[cate] += 1
        # 'Prefetch succeed' in the log marks rule-based extraction.
        if 'Prefetch succeed' in item['log']:
            fetch['Overall'] += 1
            fetch[cate] += 1
        if post_check(item, prefetch=False):
            hit['Overall'] += 1
            hit[cate] += 1
        # Statistics of answers extracted by rule and gpt
        extract_model = item['extract_model']
        extract_flag = item['extract_flag']
        if extract_model in extract_counts:
            extract_counts[extract_model][1] += 1
        else:
            extract_counts[extract_model] = [0, 1]  # succeed, total
        if extract_flag:
            extract_counts[extract_model][0] += 1
    res = defaultdict(list)
    for k in tot.keys():
        res['Subject'].append(k)
        res['tot'].append(tot[k])
        res['prefetch'].append(fetch[k])
        res['hit'].append(hit[k])
        res['prefetch_rate'].append(fetch[k] / tot[k] * 100)
        res['acc'].append(hit[k] / tot[k] * 100)
        # Extraction statistics are reported only on the 'Overall' row;
        # other rows get zero placeholders so all columns stay equal length.
        if k == 'Overall':
            for model_key in extract_counts:
                res[model_key+'_success'].append(extract_counts[model_key][0])
                res[model_key+'_all'].append(extract_counts[model_key][1])
        else:
            for model_key in extract_counts:
                res[model_key+'_success'].append(0)
                res[model_key+'_all'].append(0)
    res = pd.DataFrame(res).sort_values('Subject', ignore_index=True)
    return res
def eval_single_sample(args):
    """Evaluate a single (model, line) task; thin alias used by the thread pool."""
    return MATH_V_auxeval(args)
#!/bin/bash
# MathVision Inference Script (Instruct Model)
# This script runs inference on the MathVision dataset using vLLM
# Adjust --model-path and --data-dir to your local paths before running.
python run_mathv.py infer \
    --model-path /path/to/Qwen3-VL-Instruct \
    --data-dir /path/to/mathvision_data \
    --dataset MathVision \
    --output-file results/mathvision_predictions.jsonl \
    --max-new-tokens 32768 \
    --temperature 0.7 \
    --top-p 0.8 \
    --top-k 20 \
    --repetition-penalty 1.0 \
    --presence-penalty 1.5
# For a quick smoke test, append the flag below to the command above:
# --num-samples 100
\ No newline at end of file
#!/bin/bash
# MathVision Inference Script (Thinking Model)
# This script runs inference on the MathVision dataset using vLLM with thinking mode parameters
# The larger --max-new-tokens budget leaves room for long reasoning traces.
python run_mathv.py infer \
    --model-path /path/to/Qwen3-VL-Thinking \
    --data-dir /path/to/mathvision_data \
    --dataset MathVision \
    --output-file results/mathvision_predictions_thinking.jsonl \
    --max-new-tokens 40960 \
    --temperature 1.0 \
    --top-p 0.95 \
    --top-k 20 \
    --repetition-penalty 1.0 \
    --presence-penalty 0.0
# For a quick smoke test, append the flag below to the command above:
# --num-samples 100
\ No newline at end of file
vllm
transformers
qwen_vl_utils
pandas
numpy
Pillow
# Utilities
tqdm
requests
validators
torch
torchvision
accelerate
openpyxl
latex2sympy2
flash_attn
\ No newline at end of file
import os
import sys
import json
import argparse
import pandas as pd
import numpy as np
import time
import re
from tqdm import tqdm
from typing import List, Dict, Any
import torch
import warnings
import traceback
# vLLM imports
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor
# Local imports from refactored files
from dataset_utils import load_dataset, dump_image
from eval_utils import build_judge, eval_single_sample, MATH_V_acc
# Set vLLM multiprocessing method
# NOTE(review): 'spawn' is the start method commonly required for vLLM
# workers that initialize CUDA; see vLLM docs to confirm for your version.
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
def clean_for_excel(val):
    """
    Remove characters that are illegal in Excel cells.

    Excel rejects control characters (0x00-0x1F) except tab (0x09),
    newline (0x0A) and carriage return (0x0D). Non-string values pass
    through unchanged.
    """
    if not isinstance(val, str):
        return val
    return re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F]', '', val)
def clean_dataframe_for_excel(df):
    """Apply clean_for_excel element-wise to a DataFrame.

    Prefers DataFrame.map (pandas >= 2.1) and only falls back to the
    deprecated applymap on older pandas. The original preferred applymap,
    which emits FutureWarning on modern pandas versions.
    """
    return df.map(clean_for_excel) if hasattr(df, 'map') else df.applymap(clean_for_excel)
def build_mathv_prompt(line, dump_image_func, dataset):
    """
    Build a MathVision conversation prompt: all images first, question last.

    Args:
        line: Sample record providing the 'question' field.
        dump_image_func: Callable mapping a record to one image path or a
            list of image paths.
        dataset: Dataset name (kept for signature compatibility; unused).

    Returns:
        Messages in the standard single-turn conversation format.
    """
    # Standard resolution (MathVision uses smaller min_pixels)
    MIN_PIXELS = 768 * 28 * 28    # ~0.6M pixels
    MAX_PIXELS = 5120 * 28 * 28   # ~4M pixels
    tgt_path = dump_image_func(line)
    # Normalize to a list so single- and multi-image samples share one path.
    paths = tgt_path if isinstance(tgt_path, list) else [tgt_path]
    content = [
        {
            "type": "image",
            "image": path,
            "min_pixels": MIN_PIXELS,
            "max_pixels": MAX_PIXELS
        }
        for path in paths
    ]
    # Question text goes after all images.
    content.append({"type": "text", "text": line['question']})
    return [{"role": "user", "content": content}]
def prepare_inputs_for_vllm(messages, processor):
    """
    Convert a conversation into the input dict expected by vLLM.

    Args:
        messages: List of messages in standard conversation format.
        processor: AutoProcessor instance used to render the chat template.

    Returns:
        dict with 'prompt', 'multi_modal_data' and 'mm_processor_kwargs'.
    """
    rendered = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # qwen_vl_utils 0.0.14+ required
    image_inputs, video_inputs, video_kwargs = process_vision_info(
        messages,
        image_patch_size=processor.image_processor.patch_size,
        return_video_kwargs=True,
        return_video_metadata=True
    )
    # Include only the modalities that are actually present.
    multimodal = {
        key: value
        for key, value in (('image', image_inputs), ('video', video_inputs))
        if value is not None
    }
    return {
        'prompt': rendered,
        'multi_modal_data': multimodal,
        'mm_processor_kwargs': video_kwargs
    }
def run_inference(args):
    """Run inference on the MathVision dataset using vLLM.

    Loads the dataset, renders one multimodal prompt per sample, runs a
    single batched llm.generate() call, and writes one JSON record per
    sample to args.output_file (JSONL).

    Args:
        args: Parsed CLI namespace from the 'infer' subcommand.
    """
    print("\n" + "="*80)
    print("🚀 MathVision Inference with vLLM (High-Speed Mode)")
    print("="*80 + "\n")
    # Load dataset
    data = load_dataset(args.dataset)
    # Limit number of samples if specified
    if args.num_samples is not None and args.num_samples > 0:
        original_len = len(data)
        data = data.iloc[:args.num_samples]
        print(f"✓ Loaded {len(data)} samples from {args.dataset} (limited from {original_len} samples)")
    else:
        print(f"✓ Loaded {len(data)} samples from {args.dataset}")
    # Set up image root directory (LMUData env var is set from --data-dir in main())
    img_root = os.path.join(os.environ['LMUData'], 'images', args.dataset)
    os.makedirs(img_root, exist_ok=True)
    # Set up dump_image function (closure binding the image root directory)
    def dump_image_func(line):
        return dump_image(line, img_root)
    # Create output directory
    os.makedirs(os.path.dirname(args.output_file), exist_ok=True)
    # Set up CoT prompt if enabled
    cot_prompt = ""
    if args.use_cot:
        cot_prompt = args.cot_prompt if args.cot_prompt else " Let's think step by step."
        print(f"✓ Using CoT prompt: {cot_prompt[:50]}...")
    # Set up generation parameters (vLLM SamplingParams format)
    sampling_params = SamplingParams(
        temperature=args.temperature,
        top_p=args.top_p,
        top_k=args.top_k,
        max_tokens=args.max_new_tokens,
        repetition_penalty=args.repetition_penalty,
        presence_penalty=args.presence_penalty,
        stop_token_ids=[],
    )
    print(f"\n⚙️ Generation parameters (vLLM SamplingParams):")
    print(f" max_tokens={sampling_params.max_tokens}")
    print(f" temperature={sampling_params.temperature}, top_p={sampling_params.top_p}, top_k={sampling_params.top_k}")
    print(f" repetition_penalty={sampling_params.repetition_penalty}")
    print(f" presence_penalty={sampling_params.presence_penalty}")
    if sampling_params.presence_penalty > 0:
        print(f" ✅ Anti-repetition enabled (presence_penalty={sampling_params.presence_penalty})")
    if sampling_params.temperature <= 0.02 and sampling_params.top_k == 1:
        print(f" ✅ Using FAST greedy-like decoding")
    else:
        print(f" ⚠️ Using sampling decoding (slower but more diverse)")
    print()
    # Load processor for input preparation
    print(f"Loading processor from {args.model_path}")
    processor = AutoProcessor.from_pretrained(args.model_path)
    print("✓ Processor loaded\n")
    # Initialize vLLM
    print(f"Initializing vLLM with model: {args.model_path}")
    print(f" GPU count: {torch.cuda.device_count()}")
    print(f" Tensor parallel size: {args.tensor_parallel_size}")
    llm = LLM(
        model=args.model_path,
        tensor_parallel_size=args.tensor_parallel_size,
        gpu_memory_utilization=args.gpu_memory_utilization,
        trust_remote_code=True,
        max_model_len=args.max_model_len,
        limit_mm_per_prompt={"image": args.max_images_per_prompt},
        seed=42,  # fixed seed for reproducible sampling
    )
    print("✓ vLLM initialized successfully\n")
    # Prepare all inputs
    print("Preparing inputs for vLLM...")
    all_inputs = []
    all_line_dicts = []
    all_messages = []
    for idx, (_, line) in enumerate(tqdm(data.iterrows(), total=len(data), desc="Building prompts")):
        # Convert line to dict (numpy scalars -> plain Python types so the
        # record can be JSON-serialized later)
        line_dict = line.to_dict()
        for k, v in line_dict.items():
            if isinstance(v, np.integer):
                line_dict[k] = int(v)
            elif isinstance(v, np.floating):
                line_dict[k] = float(v)
        # Build prompt
        messages = build_mathv_prompt(line, dump_image_func, args.dataset)
        # Add CoT prompt (appended to the trailing text segment)
        if args.use_cot and len(messages) > 0 and len(messages[0]['content']) > 0:
            last_content = messages[0]['content'][-1]
            if last_content['type'] == 'text':
                last_content['text'] += cot_prompt
        # Prepare input for vLLM
        vllm_input = prepare_inputs_for_vllm(messages, processor)
        all_inputs.append(vllm_input)
        all_line_dicts.append(line_dict)
        all_messages.append(messages)
    print(f"✓ Prepared {len(all_inputs)} inputs\n")
    # Batch inference (vLLM automatic optimization)
    print("="*80)
    print("🚀 Running vLLM batch inference (automatic optimization)")
    print("="*80)
    start_time = time.time()
    outputs = llm.generate(all_inputs, sampling_params=sampling_params)
    end_time = time.time()
    total_time = end_time - start_time
    print(f"\n✓ Inference completed in {total_time:.2f} seconds")
    print(f" Average: {total_time/len(data):.2f} seconds/sample")
    print(f" Throughput: {len(data)/total_time:.2f} samples/second\n")
    # Save results
    print("Saving results...")
    results = []
    for idx, (line_dict, messages, output) in enumerate(zip(all_line_dicts, all_messages, outputs)):
        response = output.outputs[0].text
        index = line_dict['index']
        # Strip any <think>...</think> reasoning prefix emitted by thinking
        # models; the raw output is preserved separately under 'gen_raw'.
        response_final = str(response).split("</think>")[-1].strip()
        result = {
            "question_id": int(index) if isinstance(index, np.integer) else index,
            "annotation": line_dict,
            "task": args.dataset,
            "result": {"gen": response_final, "gen_raw": response},
            "messages": messages
        }
        results.append(result)
    # Write final results
    with open(args.output_file, 'w') as f:
        for res in results:
            f.write(json.dumps(res) + '\n')
    print(f"\n✓ Results saved to {args.output_file}")
    print(f"✓ Total samples processed: {len(results)}")
def run_evaluation(args):
    """Run evaluation on inference results.

    Reads JSONL predictions, validates them against the dataset, extracts
    answers with a judge model (rule-based extraction first), then writes
    per-sample results (.xlsx) and a score summary (.csv).

    Args:
        args: Parsed CLI namespace from the 'eval' subcommand.

    Returns:
        DataFrame of per-subject scores from MATH_V_acc.
    """
    # Load results
    results = []
    with open(args.input_file, 'r') as f:
        for line in f:
            job = json.loads(line)
            annotation = job["annotation"]
            annotation["prediction"] = job["result"]["gen"]
            results.append(annotation)
    data = pd.DataFrame.from_records(results)
    data = data.sort_values(by='index')
    data['prediction'] = [str(x) for x in data['prediction']]
    # Load dataset for validation
    meta = load_dataset(args.dataset)
    # Validation: every evaluated index must exist in the dataset
    print(f"len(data): {len(data)}")
    print(f"len(meta): {len(meta)}")
    meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
    data_map = {x: y for x, y in zip(data['index'], data['question'])}
    for k in data_map:
        assert k in meta_q_map, (
            f'eval_file should be the same as or a subset of dataset {args.dataset}'
        )
    # Save intermediate results
    output_xlsx = args.output_file.replace('.csv', '.xlsx') if args.output_file.endswith('.csv') else args.output_file
    clean_dataframe_for_excel(data).to_excel(output_xlsx, index=False)
    print(f"✓ Saved intermediate results to {output_xlsx}")
    # Build judge model
    model = build_judge(
        model=getattr(args, 'eval_model', 'gpt-4o-2024-05-13'),
        api_type=getattr(args, 'api_type', 'dash')
    )
    # Prepare evaluation tasks
    eval_tasks = []
    for i in range(len(data)):
        item = data.iloc[i]
        eval_tasks.append((model, item))
    # Run evaluation
    eval_results = []
    # Debug mode (env DEBUG=true): process single-threaded with first few samples
    debug = os.environ.get('DEBUG', '').lower() == 'true'
    if debug:
        # NOTE(review): in debug mode only 5 results are produced, so the
        # column assignments below will fail unless len(data) is also 5.
        print("Running in debug mode with first 5 samples...")
        for task in eval_tasks[:5]:
            try:
                result = eval_single_sample(task)
                eval_results.append(result)
            except Exception as e:
                print(f"Error processing task: {e}")
                print(f"Task details: {task}")
                raise
    else:
        # Normal mode: process all samples with threading
        from concurrent.futures import ThreadPoolExecutor
        nproc = getattr(args, 'nproc', 4)
        with ThreadPoolExecutor(max_workers=nproc) as executor:
            # executor.map preserves task order, so results align with data rows
            for result in tqdm(executor.map(eval_single_sample, eval_tasks),
                               total=len(eval_tasks), desc="Evaluating"):
                eval_results.append(result)
    # Update data with evaluation results
    data['res'] = [r['res'] for r in eval_results]
    data['log'] = [r['log'] for r in eval_results]
    data['extract_model'] = [r['extract_model'] for r in eval_results]
    data['extract_flag'] = [r['extract_flag'] for r in eval_results]
    # Save evaluation results
    storage = args.output_file.replace('.csv', '_eval.xlsx')
    clean_dataframe_for_excel(data).to_excel(storage, index=False)
    print(f"✓ Saved evaluation results to {storage}")
    # Calculate accuracy
    score = MATH_V_acc(storage)
    score_pth = storage.replace('.xlsx', '_score.csv')
    score.to_csv(score_pth, index=False)
    print(f"✓ Saved score to {score_pth}")
    print(f"\n{'='*50}")
    print(f"Evaluation Results:")
    print(f"{'='*50}")
    print(score)
    print(f"{'='*50}\n")
    return score
def main():
    """CLI entry point: parse arguments and dispatch to inference or evaluation."""
    parser = argparse.ArgumentParser(description="MathVision Evaluation with vLLM")
    subparsers = parser.add_subparsers(dest='command', help='Command to run')
    # Inference parser
    infer_parser = subparsers.add_parser("infer", help="Run inference with vLLM")
    infer_parser.add_argument("--model-path", type=str, required=True, help="Path to the model")
    infer_parser.add_argument("--dataset", type=str, default="MathVision",
                              choices=["MathVision", "MathVision_MINI"],
                              help="Dataset name")
    infer_parser.add_argument("--data-dir", type=str, help="The absolute path of MathVision data directory")
    infer_parser.add_argument("--output-file", type=str, required=True, help="Output file path")
    infer_parser.add_argument("--num-samples", type=int, default=None,
                              help="Number of samples to process (default: None, process all samples)")
    infer_parser.add_argument("--use-cot", action="store_true", help="Use Chain-of-Thought prompting")
    infer_parser.add_argument("--cot-prompt", type=str, default="", help="Custom Chain-of-Thought prompt")
    # vLLM specific parameters
    infer_parser.add_argument("--tensor-parallel-size", type=int, default=None,
                              help="Tensor parallel size (default: number of GPUs)")
    infer_parser.add_argument("--gpu-memory-utilization", type=float, default=0.9,
                              help="GPU memory utilization (0.0-1.0, default: 0.9)")
    infer_parser.add_argument("--max-model-len", type=int, default=128000,
                              help="Maximum model context length (default: 128000)")
    infer_parser.add_argument("--max-images-per-prompt", type=int, default=10,
                              help="Maximum images per prompt (default: 10)")
    # Generation parameters.
    # Help texts fixed: the originals claimed "default: 2048" for
    # --max-new-tokens (actual default 32768) and described the sampling
    # defaults as "greedy(-like) decoding", which they are not.
    infer_parser.add_argument("--max-new-tokens", type=int, default=32768,
                              help="Maximum number of tokens to generate (default: 32768)")
    infer_parser.add_argument("--temperature", type=float, default=0.7,
                              help="Temperature for sampling (default: 0.7)")
    infer_parser.add_argument("--top-p", type=float, default=0.8,
                              help="Top-p for sampling (default: 0.8)")
    infer_parser.add_argument("--top-k", type=int, default=20,
                              help="Top-k for sampling (default: 20)")
    infer_parser.add_argument("--repetition-penalty", type=float, default=1.0,
                              help="Repetition penalty (default: 1.0, increase to 1.2-1.5 to reduce repetition)")
    infer_parser.add_argument("--presence-penalty", type=float, default=1.5,
                              help="Presence penalty (default: 1.5, range: 0.0-2.0, penalize tokens that have already appeared)")
    # Evaluation parser
    eval_parser = subparsers.add_parser("eval", help="Run evaluation")
    eval_parser.add_argument("--data-dir", type=str, help="The absolute path of MathVision data directory")
    eval_parser.add_argument("--input-file", type=str, required=True, help="Input file with inference results")
    eval_parser.add_argument("--output-file", type=str, required=True, help="Output file path")
    eval_parser.add_argument("--dataset", type=str, default="MathVision",
                             choices=["MathVision", "MathVision_MINI"],
                             help="Dataset name")
    eval_parser.add_argument("--eval-model", type=str, default="gpt-4o",
                             help="Model to use for evaluation (default: gpt-4o)")
    eval_parser.add_argument("--api-type", type=str, default="dash", choices=["dash", "mit"],
                             help="API type for evaluation")
    eval_parser.add_argument("--nproc", type=int, default=4, help="Number of processes to use")
    args = parser.parse_args()
    # Set data directory if provided (consumed via the LMUData env var)
    if hasattr(args, 'data_dir') and args.data_dir:
        os.environ['LMUData'] = args.data_dir
    # Automatically set tensor_parallel_size to the visible GPU count
    if args.command == 'infer' and args.tensor_parallel_size is None:
        args.tensor_parallel_size = torch.cuda.device_count()
        print(f"Auto-set tensor_parallel_size to {args.tensor_parallel_size}")
    if args.command == 'infer':
        run_inference(args)
    elif args.command == 'eval':
        run_evaluation(args)
    else:
        parser.print_help()


if __name__ == "__main__":
    main()
# ODinW Benchmark Evaluation
This directory contains the implementation for evaluating vision-language models on the ODinW-13 (Object Detection in the Wild) benchmark, a suite of 13 datasets, using vLLM for high-speed inference.
## Overview
ODinW is a comprehensive object detection benchmark that consists of 13 diverse datasets spanning various domains. This implementation provides:
- **High-speed inference** using vLLM with automatic batch optimization
- **Unified evaluation** across 13 diverse object detection datasets
- **COCO-style metrics** including mAP, mAP_50, mAP_75, etc.
- **Modular code structure** for easy maintenance and extension
## Project Structure
```
ODinW-13/
├── run_odinw.py # Main script for inference and evaluation
├── dataset_utils.py # Dataset loading and preprocessing utilities
├── eval_utils.py # Evaluation logic and COCO metrics computation
├── infer_instruct.sh # Inference script for instruct models
├── infer_think.sh # Inference script for thinking models
├── eval_instruct.sh # Evaluation script for instruct model results
├── eval_think.sh # Evaluation script for thinking model results
├── requirements.txt # Python dependencies
└── README.md # This file
```
## Requirements
### Python Dependencies
```bash
pip install -r requirements.txt
```
Key dependencies:
- `vllm` - High-speed LLM inference engine
- `transformers` - HuggingFace transformers
- `qwen_vl_utils` - Qwen VL utilities for vision processing
- `pycocotools` - COCO evaluation API
- `pandas`, `numpy` - Data processing
- `Pillow` - Image processing
- `tabulate` - Table formatting (optional, for better output display)
### Data Preparation
The ODinW dataset requires a specific directory structure:
```
/path/to/odinw_data/
├── odinw13_config.py # Dataset configuration file (required)
├── AerialMaritimeDrone/ # Individual datasets
│ ├── large/
│ │ ├── train/
│ │ └── test/
│ └── tiled/
├── Aquarium/
├── Cottontail Rabbits/
├── EgoHands/
├── NorthAmerica Mushrooms/
├── Packages/
├── Pascal VOC/
├── Pistols/
├── Pothole/
├── Raccoon/
├── ShellfishOpenImages/
├── Thermal Dogs and People/
└── Vehicles OpenImages/
```
**Important**: The `odinw13_config.py` file must contain:
- `datasets`: List of dataset configurations
- `dataset_prefixes`: List of dataset names
## Quick Start
### 1. Inference
Run inference on the ODinW dataset using an instruct model:
```bash
bash infer_instruct.sh
```
Or customize the inference:
```bash
python run_odinw.py infer \
--model-path /path/to/Qwen3-VL-Instruct \
--data-dir /path/to/odinw_data \
--output-file results/odinw_predictions.jsonl \
--max-new-tokens 32768 \
--temperature 0.7 \
--top-p 0.8 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 1.5
```
For thinking models with extended reasoning:
```bash
bash infer_think.sh
```
### 2. Evaluation
Evaluate the inference results using COCO metrics:
```bash
bash eval_instruct.sh
```
Or customize the evaluation:
```bash
python run_odinw.py eval \
--data-dir /path/to/odinw_data \
--input-file results/odinw_predictions.jsonl \
--output-file results/odinw_eval_results.json
```
## Detailed Usage
### Inference Mode
**Basic Arguments:**
- `--model-path`: Path to the Qwen3-VL model directory (required)
- `--data-dir`: Path to ODinW data directory containing `odinw13_config.py` (required)
- `--output-file`: Path to save inference results in JSONL format (required)
**vLLM Arguments:**
- `--tensor-parallel-size`: Number of GPUs for tensor parallelism (default: auto-detect)
- `--gpu-memory-utilization`: GPU memory utilization ratio, 0.0-1.0 (default: 0.9)
- `--max-model-len`: Maximum model context length (default: 128000)
- `--max-images-per-prompt`: Maximum images per prompt (default: 10)
**Generation Parameters:**
- `--max-new-tokens`: Maximum tokens to generate (default: 32768)
- `--temperature`: Sampling temperature (default: 0.7)
- `--top-p`: Top-p sampling (default: 0.8)
- `--top-k`: Top-k sampling (default: 20)
- `--repetition-penalty`: Repetition penalty (default: 1.0)
- `--presence-penalty`: Presence penalty to reduce repetition (default: 1.5)
### Evaluation Mode
**Basic Arguments:**
- `--data-dir`: Path to ODinW data directory containing `odinw13_config.py` (required)
- `--input-file`: Inference results file in JSONL format (required)
- `--output-file`: Path to save evaluation results in JSON format (required)
## Output Files
### Inference Output
The inference script generates two files:
1. **Predictions file** (`odinw_predictions.jsonl`): JSONL file where each line contains:
```json
{
"question_id": 0,
"annotation": [...],
"extra_info": {
"dataset_name": "AerialMaritimeDrone_large",
"img_id": 1,
"anno_path": "/path/to/annotations.json",
"resized_h": 640,
"resized_w": 640,
"img_h": 1080,
"img_w": 1920,
"img_path": "/path/to/image.jpg"
},
"result": {
"gen": "[{\"bbox_2d\": [x1, y1, x2, y2], \"label\": \"boat\"}, ...]",
"gen_raw": "Raw model output including thinking process"
},
"messages": [...]
}
```
2. **Dataset config file** (`odinw_predictions_datasets.json`): Configuration for evaluation
### Evaluation Output
The evaluation script generates a JSON file with results for each dataset:
```json
{
"AerialMaritimeDrone_large": {
"mAP": 0.456,
"mAP_50": 0.678,
"mAP_75": 0.512,
"mAP_s": 0.234,
"mAP_m": 0.456,
"mAP_l": 0.567
},
"Aquarium_Aquarium Combined.v2-raw-1024.coco": {
...
},
...
"Average": 0.423
}
```
**Evaluation Metrics:**
- **mAP**: Mean Average Precision at IoU 0.5:0.95 (primary metric)
- **mAP_50**: mAP at IoU threshold 0.5
- **mAP_75**: mAP at IoU threshold 0.75
- **mAP_s**: mAP for small objects (area < 32²)
- **mAP_m**: mAP for medium objects (32² < area < 96²)
- **mAP_l**: mAP for large objects (area > 96²)
## Model-Specific Configurations
### Instruct Models (e.g., Qwen3-VL-2B-Instruct, Qwen3-VL-7B-Instruct)
Use standard parameters for balanced performance:
```bash
--max-new-tokens 32768
--temperature 0.7
--top-p 0.8
--top-k 20
--repetition-penalty 1.0
--presence-penalty 1.5
```
### Thinking Models (e.g., Qwen3-VL-2B-Thinking)
Use adjusted parameters for deeper reasoning:
```bash
--max-new-tokens 32768
--temperature 0.6
--top-p 0.95
--top-k 20
--repetition-penalty 1.0
--presence-penalty 0.0
```
**Note:** Thinking models output reasoning steps wrapped in `<think>...</think>` tags. The evaluation automatically extracts the final answer after `</think>`.
## Performance Tips
1. **GPU Memory**: Adjust `--gpu-memory-utilization` based on your GPU:
- 0.9: Recommended for most cases
- 0.95: For maximum throughput (may cause OOM)
- 0.7-0.8: If experiencing OOM errors
2. **Batch Size**: vLLM automatically optimizes batch size based on available memory
3. **Tensor Parallelism**: Use `--tensor-parallel-size` for large models:
- 2B model: 1 GPU recommended
- 7B model: 1-2 GPUs
- 14B+ model: 2-4 GPUs
4. **Context Length**: Reduce `--max-model-len` if memory is limited:
- 128000: Default, works well for most cases
- 64000: Reduces memory usage by ~40%
5. **Image Processing**: The implementation uses `smart_resize` to automatically adjust image dimensions:
- Dimensions are made divisible by 32
- Total pixels are constrained to [min_pixels, max_pixels]
- Aspect ratio is preserved
## Troubleshooting
### Common Issues
**1. Config file not found**
```
FileNotFoundError: Config file not found: /path/to/odinw13_config.py
```
**Solution**: Ensure `odinw13_config.py` exists in `--data-dir`
**2. CUDA Out of Memory**
```bash
# Reduce GPU memory utilization
--gpu-memory-utilization 0.7
# Or reduce context length
--max-model-len 64000
```
**3. vLLM Multiprocessing Issues**
The code automatically sets `VLLM_WORKER_MULTIPROC_METHOD=spawn`. If you still encounter issues:
```bash
export VLLM_WORKER_MULTIPROC_METHOD=spawn
```
**4. Empty or Invalid JSON Output**
- Check model output format
- Verify prompt clarity
- Try adjusting temperature/top_p
**5. Low mAP Scores**
- Verify category names match dataset classes
- Check coordinate format (xyxy vs xywh)
- Ensure model outputs JSON format correctly
**6. COCO API Errors**
```
IndexError: The testing results of the whole dataset is empty.
```
**Solution**: No valid predictions were generated. Check model outputs.
## Advanced Usage
### Custom Image Resolution
Edit `dataset_utils.py` to modify resolution parameters:
```python
# Calculate image resolution parameters
patch_size = 16
merge_base = 2
pixels_per_token = patch_size * patch_size * merge_base * merge_base
min_pixels = pixels_per_token * 768
max_pixels = pixels_per_token * 12800
```
### Filtering Datasets
To evaluate only specific datasets, edit `generate_odinw_jobs()` in `dataset_utils.py`:
```python
# Only process specific datasets
dataset_filter = ['AerialMaritimeDrone', 'Aquarium']
for data_name, data_config in datasets.items():
if data_name not in dataset_filter:
continue
# ... rest of the code
```
### Custom Prompt Format
Edit the prompt in `dataset_utils.py`:
```python
# Default prompt
prompt = f"Locate every instance that belongs to the following categories: '{obj_names}'. Report bbox coordinates in JSON format."
# Custom prompt example
prompt = f"Find all {obj_names} objects in the image and output their bounding boxes as JSON."
```
## Citation
If you use this code or the ODinW benchmark, please cite:
```bibtex
@inproceedings{li2022grounded,
title={Grounded language-image pre-training},
author={Li, Liunian Harold and Zhang, Pengchuan and Zhang, Haotian and Yang, Jianwei and Li, Chunyuan and Zhong, Yiwu and Wang, Lijuan and Yuan, Lu and Zhang, Lei and Hwang, Jenq-Neng and others},
booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
pages={10965--10975},
year={2022}
}
```
## License
This code is released under the same license as the Qwen3-VL model.
## Support
For issues and questions:
- GitHub Issues: [Qwen3-VL Repository](https://github.com/QwenLM/Qwen3-VL)
- Documentation: See inline code comments and docstrings
"""
ODinW dataset loading and processing utilities.
"""
import os
import math
from typing import Dict, List, Tuple
from pycocotools.coco import COCO
def round_by_factor(number: int, factor: int) -> int:
    """Return the multiple of *factor* closest to *number*."""
    n_units = round(number / factor)
    return n_units * factor
def ceil_by_factor(number: int, factor: int) -> int:
    """Return the smallest multiple of *factor* that is >= *number*."""
    n_units = math.ceil(number / factor)
    return n_units * factor
def floor_by_factor(number: int, factor: int) -> int:
    """Return the largest multiple of *factor* that is <= *number*."""
    n_units = math.floor(number / factor)
    return n_units * factor
def smart_resize(height: int, width: int, factor: int = 28,
                 min_pixels: int = 56*56, max_pixels: int = 14*14*4*1280,
                 max_long_side: int = 8192) -> Tuple[int, int]:
    """Resize image dimensions to meet the following conditions:

    1. Both height and width are divisible by `factor`
    2. Total pixels are within [min_pixels, max_pixels]
    3. Longest side is capped at `max_long_side` (applied before rounding,
       so the rounded result may exceed it by less than `factor`)
    4. Aspect ratio is approximately preserved

    Args:
        height: Original image height
        width: Original image width
        factor: Output sizes must be divisible by this factor
        min_pixels: Minimum pixel count
        max_pixels: Maximum pixel count
        max_long_side: Maximum longest side

    Returns:
        (resized_height, resized_width): Resized dimensions

    Raises:
        ValueError: If either dimension is smaller than 2, or the aspect
            ratio exceeds 200.
    """
    if height < 2 or width < 2:
        # Bug fix: the old message claimed the limit was `factor`, but the
        # guard has always checked against 2; the message now matches.
        raise ValueError(f'height:{height} and width:{width} must both be at least 2')
    elif max(height, width) / min(height, width) > 200:
        raise ValueError(f'absolute aspect ratio must be smaller than 200, got {height} / {width}')
    # Cap the longest side first, scaling both dimensions to keep the ratio.
    if max(height, width) > max_long_side:
        beta = max(height, width) / max_long_side
        height, width = int(height / beta), int(width / beta)
    # Snap both dimensions to the nearest multiple of `factor`.
    h_bar = round(height / factor) * factor
    w_bar = round(width / factor) * factor
    if h_bar * w_bar > max_pixels:
        # Too many pixels: scale down and round *down* so we stay <= max_pixels.
        beta = math.sqrt((height * width) / max_pixels)
        h_bar = math.floor(height / beta / factor) * factor
        w_bar = math.floor(width / beta / factor) * factor
    elif h_bar * w_bar < min_pixels:
        # Too few pixels: scale up and round *up* so we reach >= min_pixels.
        beta = math.sqrt(min_pixels / (height * width))
        h_bar = math.ceil(height * beta / factor) * factor
        w_bar = math.ceil(width * beta / factor) * factor
    return h_bar, w_bar
def load_odinw_config(config_path: str) -> Dict:
    """Execute odinw13_config.py and pair dataset names with their configs.

    Args:
        config_path: Path to the config file

    Returns:
        datasets: Dictionary mapping each dataset name (from
            ``dataset_prefixes``) to its configuration (from ``datasets``)
    """
    import runpy
    namespace = runpy.run_path(config_path)
    names = namespace["dataset_prefixes"]
    configs = namespace["datasets"]
    return dict(zip(names, configs))
def generate_odinw_jobs(data_dir: str, args) -> Tuple[List[Dict], Dict]:
    """Generate inference task list for ODinW dataset.

    Builds one task per image across every dataset in odinw13_config.py.
    Each task bundles the chat-format messages for the model with the COCO
    ground-truth annotations and bookkeeping the evaluation step needs.

    Args:
        data_dir: Data directory path (must contain odinw13_config.py)
        args: Command line arguments
            NOTE(review): `args` is currently unused in this function.

    Returns:
        (question_list, datasets): Task list and dataset configurations

    Raises:
        FileNotFoundError: If odinw13_config.py is missing from data_dir.
    """
    # Load config
    config_path = os.path.join(data_dir, "odinw13_config.py")
    if not os.path.exists(config_path):
        raise FileNotFoundError(f"Config file not found: {config_path}")
    datasets = load_odinw_config(config_path)
    question_list = []
    question_id = 0  # globally unique id across all datasets
    num_questions_per_dataset = {}
    # Calculate image resolution parameters.
    # pixels_per_token = pixels covered by one vision token after patch
    # embedding (patch_size x patch_size) and 2x2 token merging.
    patch_size = 16
    merge_base = 2
    pixels_per_token = patch_size * patch_size * merge_base * merge_base
    min_pixels = pixels_per_token * 768
    max_pixels = pixels_per_token * 12800
    # Iterate through all datasets
    for data_name, data_config in datasets.items():
        print(f'Parsing ODinW:{data_name}')
        classes = list(data_config["metainfo"]["classes"])
        # Build data paths: remap the configured data_root (which may point
        # at another machine) onto the local data_dir by keeping only the
        # path suffix after 'data/odinw/'.
        idx = data_config["data_root"].find('data/odinw/') + len('data/odinw/')
        sub_root = os.path.join(data_dir, data_config["data_root"][idx:])
        # NOTE(review): assumes data_root/data_prefix carry their own
        # trailing/leading separators — plain string concat, not path join.
        sub_anno = sub_root + data_config["ann_file"]
        sub_img_root = sub_root + data_config["data_prefix"]["img"]
        # Load COCO format annotations
        dataset = COCO(sub_anno)
        num_questions = 0
        # Iterate through all images
        for img_idx, img_meta in dataset.imgs.items():
            img_name = img_meta["file_name"]
            img_path = sub_img_root + img_name
            img_h = img_meta["height"]
            img_w = img_meta["width"]
            # Calculate resized image dimensions (divisible by 32, pixel
            # count clamped to [min_pixels, max_pixels], ratio preserved)
            resized_h, resized_w = smart_resize(
                img_h, img_w,
                factor=32,
                min_pixels=min_pixels,
                max_pixels=max_pixels,
                max_long_side=50000
            )
            # Get ground-truth annotations for this image
            img_annos = dataset.imgToAnns[img_idx]
            # Build class names list (every prompt lists all classes)
            obj_names = ", ".join(classes)
            # Build prompt
            prompt = f"Locate every instance that belongs to the following categories: '{obj_names}'. Report bbox coordinates in JSON format."
            # Build messages in the chat format consumed by the vLLM runner
            messages = [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image",
                            "image": f"file://{img_path}",
                            "min_pixels": min_pixels,
                            "max_pixels": max_pixels
                        },
                        {"type": "text", "text": prompt}
                    ]
                }
            ]
            # Build task item; extra_info carries everything evaluation
            # needs to map predictions back to the COCO ground truth.
            item = {
                "question_id": question_id,
                "annotation": img_annos,
                'messages': messages,
                "extra_info": {
                    'dataset_name': data_name,
                    'dataset_config': data_config,
                    'img_id': img_meta["id"],
                    'anno_path': sub_anno,
                    'resized_h': resized_h,
                    'resized_w': resized_w,
                    'img_h': img_h,
                    'img_w': img_w,
                    'img_path': img_path
                }
            }
            question_list.append(item)
            question_id += 1
            num_questions += 1
        num_questions_per_dataset[data_name] = num_questions
    # Print statistics
    for data_name, num_questions in num_questions_per_dataset.items():
        print(f'{data_name}: {num_questions}')
    print(f"Total ODinW questions: {len(question_list)}")
    return question_list, datasets
#!/bin/bash
# ODinW Evaluation Script (Instruct Model)
# This script evaluates the inference results using COCO metrics
#
# --data-dir:    directory containing odinw13_config.py and the dataset files
# --input-file:  JSONL predictions produced by the inference step
# --output-file: where the aggregated COCO metrics (JSON) are written
python run_odinw.py eval \
    --data-dir /path/to/odinw_data \
    --input-file results/odinw_predictions.jsonl \
    --output-file results/odinw_eval_results.json
#!/bin/bash
# ODinW Evaluation Script (Thinking Model)
# This script evaluates the inference results from thinking model using COCO metrics
#
# --data-dir:    directory containing odinw13_config.py and the dataset files
# --input-file:  JSONL predictions produced by the thinking-model inference step
# --output-file: where the aggregated COCO metrics (JSON) are written
python run_odinw.py eval \
    --data-dir /path/to/odinw_data \
    --input-file results/odinw_predictions_thinking.jsonl \
    --output-file results/odinw_eval_results_thinking.json
"""
ODinW evaluation utilities.
"""
import os
import json
import tempfile
import numpy as np
from typing import List, Dict, Sequence
from collections import OrderedDict
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
def xyxy2xywh(bbox: np.ndarray) -> list:
    """Convert a corner-format box to COCO xywh format.

    Args:
        bbox: Bounding box in [x1, y1, x2, y2] format

    Returns:
        Bounding box in [x, y, w, h] format
    """
    x1, y1, x2, y2 = bbox.tolist()
    return [x1, y1, x2 - x1, y2 - y1]
def results2json(results: Sequence[dict], outfile_prefix: str, cat_ids: dict) -> dict:
    """Serialize detection results into a COCO-style bbox JSON file.

    Args:
        results: Per-image prediction dicts carrying 'labels', 'bboxes',
            'scores' and optionally 'img_id' (index used as fallback)
        outfile_prefix: Output file prefix
        cat_ids: Mapping from label index to COCO category ID

    Returns:
        result_files: Dictionary mapping metric name to result file path
    """
    detections = []
    for idx, result in enumerate(results):
        img_id = result.get('img_id', idx)
        for label, bbox, score in zip(result['labels'], result['bboxes'],
                                      result['scores']):
            # Inline xyxy -> xywh conversion for the COCO format.
            x1, y1, x2, y2 = bbox.tolist()
            detections.append({
                'image_id': img_id,
                'bbox': [x1, y1, x2 - x1, y2 - y1],
                'score': float(score),
                'category_id': cat_ids[label],
            })
    out_path = f'{outfile_prefix}.bbox.json'
    with open(out_path, 'w') as f:
        json.dump(detections, f)
    return {'bbox': out_path}
def compute_metrics(results: list, outfile_prefix: str = None, _coco_api: COCO = None) -> Dict[str, float]:
    """Compute mAP and other metrics using COCO API.

    Args:
        results: List of evaluation results, each element is a (gt, pred)
            tuple. Only the predictions are consumed here; the ground truth
            is read from `_coco_api`.
        outfile_prefix: Output file prefix for the intermediate JSON file
            (optional; a temporary directory is used when omitted)
        _coco_api: COCO API instance holding the ground-truth annotations

    Returns:
        eval_results: Dictionary of evaluation metrics (mAP, mAP_50, ...)
    """
    proposal_nums = (100, 300, 1000)
    # Standard COCO IoU thresholds: 0.50:0.05:0.95 (10 values)
    iou_thrs = np.linspace(
        .5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True)
    # Separate ground truth and predictions
    if len(results) == 0:
        gts, preds = [], []
    else:
        gts, preds = zip(*results)
    # NOTE(review): `gts` is unpacked but never used below — ground truth
    # comes from `_coco_api` instead.
    tmp_dir = None
    if outfile_prefix is None:
        tmp_dir = tempfile.TemporaryDirectory()
        outfile_prefix = os.path.join(tmp_dir.name, 'results')
    cat_ids = _coco_api.getCatIds()
    img_ids = _coco_api.getImgIds()
    # Convert to COCO format and save
    result_files = results2json(preds, outfile_prefix, cat_ids)
    eval_results = OrderedDict()
    for metric in ["bbox"]:
        iou_type = metric
        if metric not in result_files:
            raise KeyError(f'{metric} is not in results')
        try:
            with open(result_files[metric], 'r') as f:
                predictions = json.load(f)
            coco_dt = _coco_api.loadRes(predictions)
        except IndexError:
            # pycocotools raises IndexError when the prediction list is empty
            print('The testing results of the whole dataset is empty.')
            break
        coco_eval = COCOeval(_coco_api, coco_dt, iou_type)
        coco_eval.params.catIds = cat_ids
        coco_eval.params.imgIds = img_ids
        coco_eval.params.maxDets = list(proposal_nums)
        coco_eval.params.iouThrs = iou_thrs
        # mapping of cocoEval.stats indices to metric names
        coco_metric_names = {
            'mAP': 0,
            'mAP_50': 1,
            'mAP_75': 2,
            'mAP_s': 3,
            'mAP_m': 4,
            'mAP_l': 5,
            'AR@100': 6,
            'AR@300': 7,
            'AR@1000': 8,
            'AR_s@1000': 9,
            'AR_m@1000': 10,
            'AR_l@1000': 11
        }
        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()
        metric_items = [
            'mAP', 'mAP_50', 'mAP_75', 'mAP_s', 'mAP_m', 'mAP_l'
        ]
        for metric_item in metric_items:
            val = coco_eval.stats[coco_metric_names[metric_item]]
            # Round to three decimals for reporting
            eval_results[metric_item] = float(f'{round(val, 3)}')
    if tmp_dir is not None:
        tmp_dir.cleanup()
    return eval_results
#!/bin/bash
# ODinW Inference Script (Instruct Model)
# This script runs inference on the ODinW dataset using vLLM
#
# Sampling parameters follow the recommended Instruct-model settings
# (temperature 0.7, top-p 0.8, presence-penalty 1.5).
# Reduce --gpu-memory-utilization or --max-model-len if you hit OOM errors.
python run_odinw.py infer \
    --model-path /path/to/Qwen3-VL-Instruct \
    --data-dir /path/to/odinw_data \
    --output-file results/odinw_predictions.jsonl \
    --tensor-parallel-size 1 \
    --gpu-memory-utilization 0.9 \
    --max-model-len 128000 \
    --max-images-per-prompt 10 \
    --max-new-tokens 32768 \
    --temperature 0.7 \
    --top-p 0.8 \
    --top-k 20 \
    --repetition-penalty 1.0 \
    --presence-penalty 1.5
#!/bin/bash
# ODinW Inference Script (Thinking Model)
# This script runs inference on the ODinW dataset using vLLM with thinking mode parameters
#
# Sampling parameters follow the recommended Thinking-model settings
# (temperature 0.6, top-p 0.95, presence-penalty 0.0).
# Reduce --gpu-memory-utilization or --max-model-len if you hit OOM errors.
python run_odinw.py infer \
    --model-path /path/to/Qwen3-VL-Thinking \
    --data-dir /path/to/odinw_data \
    --output-file results/odinw_predictions_thinking.jsonl \
    --tensor-parallel-size 1 \
    --gpu-memory-utilization 0.9 \
    --max-model-len 128000 \
    --max-images-per-prompt 10 \
    --max-new-tokens 32768 \
    --temperature 0.6 \
    --top-p 0.95 \
    --top-k 20 \
    --repetition-penalty 1.0 \
    --presence-penalty 0.0
vllm
transformers
qwen_vl_utils
pandas
numpy
Pillow
tqdm
requests
validators
torch
torchvision
accelerate
pycocotools
tabulate
flash_attn
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#!/bin/bash
# RealWorldQA Evaluation Script (Instruct Model)
# This script evaluates the inference results using rule-based and optionally model-based extraction
python run_realworldqa.py eval \
--data-dir /path/to/data \
--input-file results/RealWorldQA_results.jsonl \
--output-file results/RealWorldQA_evaluation.csv \
--dataset RealWorldQA \
--eval-model gpt-3.5-turbo-0125 \
--api-type dash \
--nproc 4
#!/bin/bash
# RealWorldQA Evaluation Script (Thinking Model)
# This script evaluates the inference results from thinking model using rule-based and optionally model-based extraction
python run_realworldqa.py eval \
--data-dir /path/to/data \
--input-file results/RealWorldQA_results_thinking.jsonl \
--output-file results/RealWorldQA_evaluation_thinking.csv \
--dataset RealWorldQA \
--eval-model qwen-plus \
--api-type dash \
--nproc 4
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment