#!/bin/bash
# RealWorldQA Inference Script (Instruct Model)
# This script runs inference on the RealWorldQA dataset using vLLM
python run_realworldqa.py infer \
--model-path /path/to/Qwen3-VL-Instruct \
--dataset RealWorldQA \
--data-dir /path/to/data \
--output-file results/RealWorldQA_results.jsonl \
--tensor-parallel-size 1 \
--gpu-memory-utilization 0.9 \
--max-model-len 128000 \
--max-images-per-prompt 10 \
--max-new-tokens 32768 \
--temperature 0.7 \
--top-p 0.8 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 1.5
#!/bin/bash
# RealWorldQA Inference Script (Thinking Model)
# This script runs inference on the RealWorldQA dataset using vLLM with thinking mode parameters
python run_realworldqa.py infer \
--model-path /path/to/Qwen3-VL-Thinking \
--dataset RealWorldQA \
--data-dir /path/to/data \
--output-file results/RealWorldQA_results_thinking.jsonl \
--tensor-parallel-size 1 \
--gpu-memory-utilization 0.9 \
--max-model-len 128000 \
--max-images-per-prompt 10 \
--max-new-tokens 32768 \
--temperature 0.6 \
--top-p 0.95 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 0.0
vllm
transformers
qwen_vl_utils
pandas
numpy
Pillow
tqdm
requests
validators
torch
torchvision
accelerate
sentencepiece
flash_attn
import os
import sys
import json
import argparse
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
from typing import List, Dict, Any
import torch
import warnings
import string
# vLLM imports
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor
# Local imports from refactored files
from dataset_utils import load_dataset, dump_image, build_realworldqa_prompt
from eval_utils import build_judge, eval_single_sample
# Set vLLM multiprocessing method
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
def prepare_inputs_for_vllm(messages, processor):
"""
Prepare inputs for vLLM.
Args:
messages: List of messages in standard conversation format
processor: AutoProcessor instance
Returns:
dict: Input format required by vLLM
"""
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# qwen_vl_utils 0.0.14+ required
image_inputs, video_inputs, video_kwargs = process_vision_info(
messages,
image_patch_size=processor.image_processor.patch_size,
return_video_kwargs=True,
return_video_metadata=True
)
mm_data = {}
if image_inputs is not None:
mm_data['image'] = image_inputs
if video_inputs is not None:
mm_data['video'] = video_inputs
return {
'prompt': text,
'multi_modal_data': mm_data,
'mm_processor_kwargs': video_kwargs
}
def run_inference(args):
"""Run inference on the RealWorldQA dataset using vLLM."""
print("\n" + "="*80)
print("🚀 RealWorldQA Inference with vLLM (High-Speed Mode)")
print("="*80 + "\n")
# Set up data directory
if args.data_dir:
os.environ['LMUData'] = args.data_dir
elif 'LMUData' not in os.environ:
raise ValueError("Please specify --data-dir or set LMUData environment variable")
print(f"✓ Data directory: {os.environ['LMUData']}")
# Load dataset
print(f"Loading dataset: {args.dataset}")
data = load_dataset(args.dataset)
print(f"✓ Loaded {len(data)} samples from {args.dataset}")
# DEBUG: Process only first N samples if specified
if os.getenv('DEBUG_SAMPLE_SIZE'):
debug_size = int(os.getenv('DEBUG_SAMPLE_SIZE'))
data = data.iloc[:debug_size]
print(f"⚠️ DEBUG MODE: Only processing {len(data)} samples")
# Set up image root directory
img_root = os.path.join(os.environ['LMUData'], 'images', args.dataset)
os.makedirs(img_root, exist_ok=True)
# Set up dump_image function
def dump_image_func(line):
return dump_image(line, img_root)
# Create output directory
os.makedirs(os.path.dirname(args.output_file), exist_ok=True)
# Set resolution parameters
min_pixels = args.min_pixels if args.min_pixels is not None else 768*28*28
max_pixels = args.max_pixels if args.max_pixels is not None else 5120*28*28
print(f"✓ Image resolution: min_pixels={min_pixels}, max_pixels={max_pixels}")
# Set up generation parameters (vLLM SamplingParams format)
sampling_params = SamplingParams(
temperature=args.temperature,
top_p=args.top_p,
top_k=args.top_k,
max_tokens=args.max_new_tokens,
repetition_penalty=args.repetition_penalty,
presence_penalty=args.presence_penalty,
stop_token_ids=[],
)
print(f"\n⚙️ Generation parameters (vLLM SamplingParams):")
print(f" max_tokens={sampling_params.max_tokens}")
print(f" temperature={sampling_params.temperature}, top_p={sampling_params.top_p}, top_k={sampling_params.top_k}")
print(f" repetition_penalty={sampling_params.repetition_penalty}")
print(f" presence_penalty={sampling_params.presence_penalty}")
if sampling_params.presence_penalty > 0:
print(f" ✅ Anti-repetition enabled (presence_penalty={sampling_params.presence_penalty})")
if sampling_params.temperature <= 0.02 and sampling_params.top_k == 1:
print(f" ✅ Using FAST greedy-like decoding")
else:
print(f" ⚠️ Using sampling decoding (slower but more diverse)")
print()
# Load processor for input preparation
print(f"Loading processor from {args.model_path}")
processor = AutoProcessor.from_pretrained(args.model_path)
print("✓ Processor loaded\n")
# Initialize vLLM
print(f"Initializing vLLM with model: {args.model_path}")
print(f" GPU count: {torch.cuda.device_count()}")
print(f" Tensor parallel size: {args.tensor_parallel_size}")
llm = LLM(
model=args.model_path,
tensor_parallel_size=args.tensor_parallel_size,
gpu_memory_utilization=args.gpu_memory_utilization,
trust_remote_code=True,
max_model_len=args.max_model_len,
limit_mm_per_prompt={"image": args.max_images_per_prompt},
seed=42,
)
print("✓ vLLM initialized successfully\n")
# Prepare all inputs
print("Preparing inputs for vLLM...")
all_inputs = []
all_line_dicts = []
all_messages = []
for idx, (_, line) in enumerate(tqdm(data.iterrows(), total=len(data), desc="Building prompts")):
# Convert line to dict
line_dict = line.to_dict()
for k, v in line_dict.items():
if isinstance(v, np.integer):
line_dict[k] = int(v)
elif isinstance(v, np.floating):
line_dict[k] = float(v)
# Build prompt
messages = build_realworldqa_prompt(line, dump_image_func, min_pixels, max_pixels)
# Prepare input for vLLM
vllm_input = prepare_inputs_for_vllm(messages, processor)
all_inputs.append(vllm_input)
all_line_dicts.append(line_dict)
all_messages.append(messages)
print(f"✓ Prepared {len(all_inputs)} inputs\n")
# Batch inference (vLLM automatic optimization)
print("="*80)
print("🚀 Running vLLM batch inference (automatic optimization)")
print("="*80)
start_time = time.time()
outputs = llm.generate(all_inputs, sampling_params=sampling_params)
end_time = time.time()
total_time = end_time - start_time
print(f"\n✓ Inference completed in {total_time:.2f} seconds")
print(f" Average: {total_time/len(data):.2f} seconds/sample")
print(f" Throughput: {len(data)/total_time:.2f} samples/second\n")
# Save results
print("Saving results...")
results = []
for idx, (line_dict, messages, output) in enumerate(zip(all_line_dicts, all_messages, outputs)):
response = output.outputs[0].text
index = line_dict['index']
# Handle </think> tag
response_final = str(response).split("</think>")[-1].strip()
result = {
"question_id": int(index) if isinstance(index, (int, np.integer)) else index,
"annotation": line_dict,
"task": args.dataset,
"result": {"gen": response_final, "gen_raw": response},
"messages": messages
}
results.append(result)
# Write final results
with open(args.output_file, 'w') as f:
for res in results:
f.write(json.dumps(res) + '\n')
print(f"\n✓ Results saved to {args.output_file}")
print(f"✓ Total samples processed: {len(results)}")
def run_evaluation(args):
"""Run evaluation on inference results."""
print("\n" + "="*80)
print("📊 RealWorldQA Evaluation")
print("="*80 + "\n")
# Set up data directory
if args.data_dir:
os.environ['LMUData'] = args.data_dir
elif 'LMUData' not in os.environ:
raise ValueError("Please specify --data-dir or set LMUData environment variable")
# Load results
results = []
with open(args.input_file, 'r') as f:
for line in f:
job = json.loads(line)
annotation = job["annotation"]
annotation["prediction"] = job["result"]["gen"]
results.append(annotation)
data = pd.DataFrame.from_records(results)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
# Convert column names to lowercase
for k in list(data.keys()):
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
print(f"✓ Loaded {len(data)} results from {args.input_file}")
# Create output directory
output_dir = os.path.dirname(args.output_file)
os.makedirs(output_dir, exist_ok=True)
# Build judge model (if specified)
model = None
if args.eval_model:
model = build_judge(
model=args.eval_model,
api_type=getattr(args, 'api_type', 'dash')
)
print(f"✓ Evaluation model: {args.eval_model}")
else:
print("⚠️ No evaluation model specified, using rule-based extraction only")
# Prepare evaluation tasks
items = []
for i in range(len(data)):
item = data.iloc[i].to_dict()
items.append(item)
eval_tasks = []
for item in items:
eval_tasks.append((model, item))
# Run evaluation
eval_results = []
# Debug mode: process single-threaded with first few samples
debug = os.environ.get('DEBUG', '').lower() == 'true'
if debug:
print("Running in debug mode with first 5 samples...")
for task in eval_tasks[:5]:
try:
result = eval_single_sample(task)
eval_results.append(result)
except Exception as e:
print(f"Error processing task: {e}")
raise
else:
# Normal mode: process all samples with threading
from concurrent.futures import ThreadPoolExecutor
nproc = getattr(args, 'nproc', 4)
print(f"✓ Using {nproc} parallel processes")
with ThreadPoolExecutor(max_workers=nproc) as executor:
for result in tqdm(executor.map(eval_single_sample, eval_tasks),
total=len(eval_tasks), desc="Evaluating"):
eval_results.append(result)
# Calculate overall accuracy
accuracy = sum(r['hit'] for r in eval_results) / len(eval_results)
# Save results
output_df = pd.DataFrame(eval_results)
output_df.to_csv(args.output_file, index=False)
# Save accuracy to JSON
acc_file = args.output_file.replace('.csv', '_acc.json')
with open(acc_file, 'w') as f:
json.dump({
"overall_accuracy": accuracy,
"task_samples": len(results),
"correct": sum(r['hit'] for r in eval_results),
"total": len(eval_results)
}, f, indent=2)
print(f"\n{'='*50}")
print(f"Evaluation Results:")
print(f"{'='*50}")
print(f"Overall accuracy: {accuracy:.4f} ({sum(r['hit'] for r in eval_results)}/{len(eval_results)})")
print(f"{'='*50}\n")
print(f"✓ Detailed results saved to {args.output_file}")
print(f"✓ Accuracy saved to {acc_file}")
def main():
parser = argparse.ArgumentParser(description="RealWorldQA Evaluation with vLLM")
subparsers = parser.add_subparsers(dest='command', help='Command to run')
# Inference parser
infer_parser = subparsers.add_parser("infer", help="Run inference with vLLM")
infer_parser.add_argument("--model-path", type=str, required=True, help="Path to the model")
infer_parser.add_argument("--dataset", type=str, default="RealWorldQA", help="Dataset name")
infer_parser.add_argument("--data-dir", type=str, help="Data directory (LMUData)")
infer_parser.add_argument("--output-file", type=str, required=True, help="Output file path")
# Image resolution parameters
infer_parser.add_argument("--min-pixels", type=int, default=None,
help="Minimum pixels for image (default: 768*28*28)")
infer_parser.add_argument("--max-pixels", type=int, default=None,
help="Maximum pixels for image (default: 5120*28*28)")
# vLLM specific parameters
infer_parser.add_argument("--tensor-parallel-size", type=int, default=None,
help="Tensor parallel size (default: number of GPUs)")
infer_parser.add_argument("--gpu-memory-utilization", type=float, default=0.9,
help="GPU memory utilization (0.0-1.0, default: 0.9)")
infer_parser.add_argument("--max-model-len", type=int, default=128000,
help="Maximum model context length (default: 128000)")
infer_parser.add_argument("--max-images-per-prompt", type=int, default=10,
help="Maximum images per prompt (default: 10)")
# Generation parameters
infer_parser.add_argument("--max-new-tokens", type=int, default=32768,
help="Maximum number of tokens to generate (default: 32768)")
infer_parser.add_argument("--temperature", type=float, default=0.7,
help="Temperature for sampling (default: 0.7)")
infer_parser.add_argument("--top-p", type=float, default=0.8,
help="Top-p for sampling (default: 0.8)")
infer_parser.add_argument("--top-k", type=int, default=20,
help="Top-k for sampling (default: 20)")
infer_parser.add_argument("--repetition-penalty", type=float, default=1.0,
help="Repetition penalty (default: 1.0)")
infer_parser.add_argument("--presence-penalty", type=float, default=1.5,
help="Presence penalty (default: 1.5)")
# Evaluation parser
eval_parser = subparsers.add_parser("eval", help="Run evaluation")
eval_parser.add_argument("--data-dir", type=str, help="Data directory (LMUData)")
eval_parser.add_argument("--input-file", type=str, required=True, help="Input file with inference results")
eval_parser.add_argument("--output-file", type=str, required=True, help="Output file path")
eval_parser.add_argument("--dataset", type=str, default="RealWorldQA", help="Dataset name")
eval_parser.add_argument("--eval-model", type=str, default=None,
help="Model to use for evaluation (default: None, use rule-based only)")
eval_parser.add_argument("--api-type", type=str, default="dash", choices=["dash", "mit"],
help="API type for evaluation")
eval_parser.add_argument("--nproc", type=int, default=4, help="Number of processes to use")
args = parser.parse_args()
# Automatically set tensor_parallel_size
if args.command == 'infer' and args.tensor_parallel_size is None:
args.tensor_parallel_size = torch.cuda.device_count()
print(f"Auto-set tensor_parallel_size to {args.tensor_parallel_size}")
if args.command == 'infer':
run_inference(args)
elif args.command == 'eval':
run_evaluation(args)
else:
parser.print_help()
if __name__ == "__main__":
main()
# MMMU Benchmark Evaluation
This directory contains the implementation for evaluating vision-language models on the MMMU (Massive Multi-discipline Multimodal Understanding) benchmark using vLLM for high-speed inference.
## Overview
The MMMU benchmark evaluates models across diverse disciplines with multi-modal questions. This implementation provides:
- **High-speed inference** using vLLM with automatic batch optimization
- **Flexible evaluation** using GPT-based judge models
- **Support for thinking models** with extended reasoning
- **Modular code structure** for easy maintenance and extension
## Project Structure
```
mmmu/
├── run_mmmu.py # Main script for inference and evaluation
├── dataset_utils.py # Dataset loading and preprocessing utilities
├── eval_utils.py # Evaluation logic and judge model wrappers
├── common_utils.py # Common utilities for image processing, file I/O
├── infer_instruct.sh # Inference script for instruct models
├── infer_think.sh # Inference script for thinking models
├── eval_instruct.sh # Evaluation script for instruct model results
├── eval_think.sh # Evaluation script for thinking model results
├── requirements.txt # Python dependencies
└── README.md # This file
```
## Requirements
### Python Dependencies
```bash
pip install -r requirements.txt
```
Key dependencies:
- `vllm` - High-speed LLM inference engine
- `transformers` - HuggingFace transformers
- `qwen_vl_utils` - Qwen VL utilities for vision processing
- `pandas`, `numpy` - Data processing
- `Pillow` - Image processing
- `requests` - API calls for evaluation
### Environment Variables
For evaluation, you need to set up API credentials for the judge model:
**Option 1: DashScope API (Recommended)**
```bash
export CHATGPT_DASHSCOPE_API_KEY="your-api-key"
export DASHSCOPE_API_BASE="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions"
```
**Option 2: Custom OpenAI-compatible API**
```bash
export MIT_SPIDER_TOKEN="your-api-key"
export MIT_SPIDER_URL="your-api-endpoint"
```
## Quick Start
### 1. Inference
Run inference on MMMU dataset using an instruct model:
```bash
bash infer_instruct.sh
```
Or customize the inference:
```bash
python run_mmmu.py infer \
--model-path /path/to/Qwen3-VL-Instruct \
--data-dir /path/to/data \
--dataset MMMU_DEV_VAL \
--output-file results/predictions.jsonl \
--max-new-tokens 32768 \
--temperature 0.7 \
--top-p 0.8 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 1.5
```
For thinking models with extended reasoning:
```bash
bash infer_think.sh
```
### 2. Evaluation
Evaluate the inference results using a judge model:
```bash
bash eval_instruct.sh
```
Or customize the evaluation:
```bash
python run_mmmu.py eval \
--data-dir /path/to/data \
--input-file results/predictions.jsonl \
--output-file results/evaluation.csv \
--dataset MMMU_DEV_VAL \
--eval-model gpt-3.5-turbo-0125 \
--api-type dash \
--nproc 16
```
## Detailed Usage
### Inference Mode
**Basic Arguments:**
- `--model-path`: Path to the Qwen3-VL model directory (required)
- `--data-dir`: Directory to store/load MMMU dataset (required)
- `--dataset`: Dataset name, default: `MMMU_DEV_VAL`
- `--output-file`: Path to save inference results in JSONL format (required)
**vLLM Arguments:**
- `--tensor-parallel-size`: Number of GPUs for tensor parallelism (default: auto-detect)
- `--gpu-memory-utilization`: GPU memory utilization ratio, 0.0-1.0 (default: 0.9)
- `--max-model-len`: Maximum model context length (default: 128000)
- `--max-images-per-prompt`: Maximum images per prompt (default: 10)
**Generation Parameters:**
- `--max-new-tokens`: Maximum tokens to generate (default: 32768)
- `--temperature`: Sampling temperature (default: 0.7)
- `--top-p`: Top-p sampling (default: 0.8)
- `--top-k`: Top-k sampling (default: 20)
- `--repetition-penalty`: Repetition penalty (default: 1.0)
- `--presence-penalty`: Presence penalty to reduce repetition (default: 1.5)
**Advanced Options:**
- `--use-cot`: Enable Chain-of-Thought prompting for better reasoning
- `--cot-prompt`: Custom CoT prompt (optional)
### Evaluation Mode
**Basic Arguments:**
- `--data-dir`: Directory containing MMMU dataset (required)
- `--input-file`: Inference results file in JSONL format (required)
- `--output-file`: Path to save evaluation results in CSV format (required)
- `--dataset`: Dataset name, must match inference (default: `MMMU_DEV_VAL`)
**Judge Model Arguments:**
- `--eval-model`: Judge model name (default: `gpt-3.5-turbo-0125`)
- Options: `gpt-3.5-turbo-0125`, `gpt-4-0125-preview`, `gpt-4o`, etc.
- `--api-type`: API service type (default: `dash`)
- `dash`: DashScope API (Alibaba Cloud)
- `mit`: Custom OpenAI-compatible API
- `--nproc`: Number of parallel workers for evaluation (default: 4)
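If you prefer to drive the judge pipeline programmatically rather than through `run_mmmu.py eval`, here is a minimal sketch using the helpers from `eval_utils.py` in this directory; the `item` fields mirror one row of the merged prediction DataFrame, and the values are made up for illustration:
```python
from eval_utils import build_judge, eval_single_sample

# Judge wrapper; the API is only queried when rule-based extraction fails,
# so the credentials above are needed only for ambiguous predictions.
judge = build_judge(model="gpt-3.5-turbo-0125", api_type="dash")

# One evaluation item, shaped like a row of the merged prediction DataFrame.
item = {
    "index": 0,
    "split": "validation",
    "question": "What is the main object in the image?",
    "A": "teddy bear", "B": "rabbit", "C": "cat", "D": "dog",
    "prediction": "The main object is a teddy bear.",
    "GT": "A",
}
print(eval_single_sample((judge, item)))  # dict with extracted_answer, hit, ...
```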
## Output Files
### Inference Output
The inference script generates a JSONL file where each line contains:
```json
{
"question_id": 123,
"annotation": {
"index": 123,
"question": "What is shown in the image?",
"A": "Option A",
"B": "Option B",
"answer": "A",
...
},
"task": "MMMU_DEV_VAL",
"result": {
"gen": "The final answer",
"gen_raw": "Raw model output including thinking process"
},
"messages": [...]
}
```
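To inspect the predictions before running the full evaluation, a minimal sketch that loads the JSONL the same way the `eval` command does (the file name is taken from the example commands above):
```python
import json
import pandas as pd

# Keep the annotation fields and attach the generated answer as `prediction`,
# mirroring the loading step in run_mmmu.py's eval mode.
records = []
with open("results/mmmu_dev_val_predictions.jsonl") as f:
    for line in f:
        job = json.loads(line)
        annotation = job["annotation"]
        annotation["prediction"] = job["result"]["gen"]
        records.append(annotation)

df = pd.DataFrame.from_records(records).sort_values(by="index")
print(df[["index", "question", "prediction"]].head())
```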
### Evaluation Output
The evaluation script generates two files:
1. **CSV file** (`*_eval_results.csv`): Detailed results for each sample
- Columns: `index`, `question`, `prediction`, `extracted_answer`, `extraction_method`, `gt`, `hit`, `split`
2. **JSON file** (`*_eval_results_acc.json`): Accuracy summary
```json
{
"overall_accuracy": 0.7234,
"accuracy_by_split": {
"validation": 0.7234
}
}
```
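As a quick sanity check, the accuracy summary can be recomputed from the per-sample CSV with a few lines of pandas (column names as listed above):
```python
import pandas as pd

# Recompute overall and per-split accuracy from the detailed results CSV.
df = pd.read_csv("results/mmmu_dev_val_eval_results.csv")
print("overall_accuracy:", round(df["hit"].mean(), 4))
print(df.groupby("split")["hit"].mean().round(4))
```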
## Model-Specific Configurations
### Instruct Models (e.g., Qwen3-VL-2B-Instruct, Qwen3-VL-30B-Instruct)
Use standard parameters for balanced performance:
```bash
--max-new-tokens 32768
--temperature 0.7
--top-p 0.8
--top-k 20
--repetition-penalty 1.0
--presence-penalty 1.5
```
### Thinking Models (e.g., Qwen3-VL-4B-Thinking)
Use extended parameters for deeper reasoning:
```bash
--max-new-tokens 40960
--temperature 1.0
--top-p 0.95
--top-k 20
--repetition-penalty 1.0
--presence-penalty 0.0
```
**Note:** Thinking models output reasoning steps wrapped in `<think>...</think>` tags. The evaluation automatically extracts the final answer after `</think>`.
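A minimal sketch of that extraction step, using the same `split` on `</think>` that `run_mmmu.py` applies to each output:
```python
# The final answer is everything after the last </think> tag; instruct models
# emit no tag, so their raw output passes through unchanged.
def strip_thinking(raw_output: str) -> str:
    return str(raw_output).split("</think>")[-1].strip()

print(strip_thinking("<think>step-by-step reasoning...</think>The answer is B."))
# -> "The answer is B."
```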
## Performance Tips
1. **GPU Memory**: Adjust `--gpu-memory-utilization` based on your GPU:
- 0.9: Recommended for most cases
- 0.95: For maximum throughput (may cause OOM)
- 0.7-0.8: If experiencing OOM errors
2. **Batch Size**: vLLM automatically optimizes batch size based on available memory
3. **Tensor Parallelism**: Use `--tensor-parallel-size` for large models:
- 2B/4B models: 1-2 GPUs
- 7B/14B models: 2-4 GPUs
- 30B+ models: 4-8 GPUs
4. **Context Length**: Reduce `--max-model-len` if memory is limited:
- 128000: Default, works well for most cases
- 64000: Reduces memory usage by ~40%
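These tips map directly onto the vLLM engine arguments that `run_mmmu.py` passes when constructing the engine; a minimal sketch of the same initialization (the model path and GPU count are placeholders):
```python
from vllm import LLM

# Engine arguments used by run_mmmu.py; lower gpu_memory_utilization and
# max_model_len first when running into out-of-memory errors.
llm = LLM(
    model="/path/to/Qwen3-VL-Instruct",   # placeholder path
    tensor_parallel_size=4,               # number of GPUs to shard across
    gpu_memory_utilization=0.9,           # drop to 0.7-0.8 on OOM
    max_model_len=128000,                 # reduce to 64000 to save memory
    limit_mm_per_prompt={"image": 10},    # images allowed per prompt
    trust_remote_code=True,
    seed=42,
)
```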
## Troubleshooting
### Common Issues
**1. CUDA Out of Memory**
```bash
# Reduce GPU memory utilization
--gpu-memory-utilization 0.7
# Or reduce context length
--max-model-len 64000
```
**2. vLLM Multiprocessing Issues**
The code automatically sets `VLLM_WORKER_MULTIPROC_METHOD=spawn`. If you still encounter issues:
```bash
export VLLM_WORKER_MULTIPROC_METHOD=spawn
```
**3. Evaluation API Errors**
- Verify API credentials are set correctly
- Check API endpoint connectivity
- Reduce `--nproc` if you are being rate-limited; increase it (up to ~32) only when the API can handle the extra load
**4. Dataset Download Issues**
The dataset is automatically downloaded from:
```
https://opencompass.openxlab.space/utils/VLMEval/MMMU_DEV_VAL.tsv
```
If the automatic download fails, download the file manually and place it in your `--data-dir`.
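If the automatic download keeps failing, a minimal sketch of fetching and verifying the file yourself with the helpers in `common_utils.py` and the constants in `dataset_utils.py` (assuming `LMUData` points at your `--data-dir`):
```python
import os
from common_utils import download_file, md5
from dataset_utils import MMMU_DATASET_URL, MMMU_DATASET_MD5

# Fetch the TSV into the LMUData directory and verify its checksum,
# mirroring what load_dataset() does automatically.
data_root = os.environ.get("LMUData", "/path/to/data")
os.makedirs(data_root, exist_ok=True)
tsv_path = os.path.join(data_root, "MMMU_DEV_VAL.tsv")

download_file(MMMU_DATASET_URL, tsv_path)
assert md5(tsv_path) == MMMU_DATASET_MD5, "checksum mismatch, re-download the file"
```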
## Advanced Usage
### Custom Image Resolution
Edit `run_mmmu.py` to modify image resolution:
```python
MIN_PIXELS = 1280*28*28 # ~1M pixels
MAX_PIXELS = 5120*28*28 # ~4M pixels
```
### Custom Evaluation Logic
The evaluation uses a two-stage approach:
1. **Rule-based extraction**: Fast pattern matching for clear answers
2. **Model-based extraction**: GPT judge for ambiguous answers
To customize, edit `eval_utils.py`:
- `can_infer_option()`: Modify option extraction rules
- `can_infer_text()`: Modify text matching logic
- `build_prompt()`: Customize judge prompt
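A minimal sketch of exercising the rule-based stage on a toy item (the fields are made up for illustration); `can_infer` returns the matched option letter, `'Z'` for an explicit refusal, or `False` when nothing can be extracted:
```python
from eval_utils import build_choices, can_infer

# Toy sample shaped like one evaluation row.
item = {"A": "teddy bear", "B": "rabbit", "C": "cat", "D": "dog",
        "prediction": "The picture shows a cute teddy bear."}

choices = build_choices(item)                  # {'A': 'teddy bear', ...}
print(can_infer(item["prediction"], choices))  # expected: 'A' (via text match)
```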
### Debugging
Enable debug mode to process only 5 samples:
```bash
DEBUG=true python run_mmmu.py eval ...
```
## Citation
If you use this code or the MMMU benchmark, please cite:
```bibtex
@article{yue2023mmmu,
title={Mmmu: A massive multi-discipline multimodal understanding and reasoning benchmark for expert agi},
author={Yue, Xiang and Ni, Yuansheng and Zhang, Kai and Zheng, Tianyu and Liu, Ruoqi and Zhang, Ge and Stevens, Samuel and Jiang, Dongfu and Ren, Weiming and Sun, Yuxuan and others},
journal={arXiv:2311.16502},
year={2023}
}
```
## License
This code is released under the same license as the Qwen3-VL model.
## Support
For issues and questions:
- GitHub Issues: [Qwen3-VL Repository](https://github.com/QwenLM/Qwen3-VL)
- Documentation: See inline code comments and docstrings
import os
import requests
import base64
import hashlib
import io
from PIL import Image
from typing import List, Union
def encode_image_to_base64(image, target_size=None):
"""Encode an image to base64 string."""
if target_size is not None:
width, height = image.size
# Resize the image while maintaining the aspect ratio
if width > height:
new_width = target_size
new_height = int(height * target_size / width)
else:
new_height = target_size
new_width = int(width * target_size / height)
image = image.resize((new_width, new_height))
buffer = io.BytesIO()
image.save(buffer, format="JPEG")
return base64.b64encode(buffer.getvalue()).decode('utf-8')
def decode_base64_to_image(base64_string):
"""Decode a base64 string to an image."""
image_data = base64.b64decode(base64_string)
return Image.open(io.BytesIO(image_data))
def decode_base64_to_image_file(base64_string, output_path):
"""Decode a base64 string and save it to a file."""
image = decode_base64_to_image(base64_string)
image.save(output_path)
def download_file(url, local_path):
"""Download a file from a URL to a local path."""
response = requests.get(url, stream=True)
response.raise_for_status()
with open(local_path, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
f.write(chunk)
def md5(file_path):
"""Calculate the MD5 hash of a file."""
hash_md5 = hashlib.md5()
with open(file_path, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()
def toliststr(s):
if isinstance(s, str) and (s[0] == '[') and (s[-1] == ']'):
return [str(x) for x in eval(s)]
elif isinstance(s, str):
return [s]
elif isinstance(s, list):
return [str(x) for x in s]
raise NotImplementedError
import os
import pandas as pd
import numpy as np
from typing import Dict, Any
from common_utils import download_file, md5, toliststr, decode_base64_to_image_file
MMMU_DATASET_URL = 'https://opencompass.openxlab.space/utils/VLMEval/MMMU_DEV_VAL.tsv'
MMMU_DATASET_MD5 = '521afc0f3bf341e6654327792781644d'
def load_dataset(dataset_name='MMMU_DEV_VAL'):
"""Load the MMMU dataset."""
data_root = os.path.join(os.environ['LMUData'])
os.makedirs(data_root, exist_ok=True)
file_name = f"{dataset_name}.tsv"
data_path = os.path.join(data_root, file_name)
# Download if not exists or MD5 doesn't match
if not os.path.exists(data_path) or md5(data_path) != MMMU_DATASET_MD5:
print(f"Downloading {dataset_name} dataset...")
download_file(MMMU_DATASET_URL, data_path)
# Load the dataset
data = pd.read_csv(data_path, sep='\t')
# Process the dataset
data['index'] = [str(x) for x in data['index']]
# Handle image data
if 'image' in data:
data['image'] = [str(x) for x in data['image']]
image_map = {x: y for x, y in zip(data['index'], data['image'])}
for k in image_map:
if len(image_map[k]) <= 64:
idx = image_map[k]
assert idx in image_map and len(image_map[idx]) > 64
image_map[k] = image_map[idx]
images = [toliststr(image_map[k]) for k in data['index']]
data['image'] = [x[0] if len(x) == 1 else x for x in images]
# Handle image paths
if 'image_path' in data:
paths = [toliststr(x) for x in data['image_path']]
data['image_path'] = [x[0] if len(x) == 1 else x for x in paths]
# Convert index to int if possible
if np.all([isinstance(x, int) or x.isdigit() for x in data['index']]):
data['index'] = [int(x) for x in data['index']]
return data
def dump_image(line, img_root):
"""Save image data to disk and return the path."""
os.makedirs(img_root, exist_ok=True)
if 'image' in line:
if isinstance(line['image'], list):
tgt_path = []
assert 'image_path' in line
for img, im_name in zip(line['image'], line['image_path']):
path = os.path.join(img_root, im_name)
if not os.path.exists(path):
decode_base64_to_image_file(img, path)
tgt_path.append(path)
else:
tgt_path = os.path.join(img_root, f"{line['index']}.jpg")
if not os.path.exists(tgt_path):
decode_base64_to_image_file(line['image'], tgt_path)
tgt_path = [tgt_path]
else:
assert 'image_path' in line
tgt_path = toliststr(line['image_path'])
return tgt_path
def MMMU_preproc(data):
"""
Preprocess MMMU dataset to reformulate open questions to multi-choice ones.
This aligns with the implementation in multiple_choice.py
"""
print("Preprocessing MMMU dataset...")
cnt = 0
As, Bs, Ans = list(data['A']), list(data['B']), list(data['answer'])
lt = len(data)
for i in range(lt):
if pd.isna(As[i]):
As[i] = Ans[i]
Bs[i] = 'Other Answers'
cnt += 1
print(f'During MMMU_preproc in Evaluation, {cnt} open questions are re-formulated to multi-choice ones.')
data['A'] = As
data['B'] = Bs
return data
#!/bin/bash
# MMMU Evaluation Script (Instruct Model)
# This script evaluates the inference results using a judge model
python run_mmmu.py eval \
--data-dir /path/to/mmmu_data \
--input-file results/mmmu_dev_val_predictions.jsonl \
--output-file results/mmmu_dev_val_eval_results.csv \
--dataset MMMU_DEV_VAL \
--eval-model gpt-3.5-turbo-0125 \
--api-type dash \
--nproc 16
#!/bin/bash
# MMMU Evaluation Script (Thinking Model)
# This script evaluates the inference results from thinking model using a judge model
python run_mmmu.py eval \
--data-dir /path/to/mmmu_data \
--input-file results/mmmu_dev_val_predictions_thinking.jsonl \
--output-file results/mmmu_dev_val_eval_results_thinking.csv \
--dataset MMMU_DEV_VAL \
--eval-model gpt-3.5-turbo-0125 \
--api-type dash \
--nproc 16
import os
import requests
import time
import random
import string
import copy
import traceback
import pandas as pd
from PIL import Image
from typing import List, Dict, Tuple, Any
from common_utils import encode_image_to_base64
class OpenAIWrapper:
"""Wrapper for OpenAI API."""
def __init__(self, model, api_base, api_key, timeout=60, retry=5, wait=5):
self.model = model
self.api_base = api_base
self.api_key = api_key
self.timeout = timeout
self.retry = retry
self.wait = wait
self.fail_msg = 'Failed to obtain answer via API.'
def generate(self, messages):
"""Generate a response from the API."""
headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}'}
# Format messages for API
formatted_messages = []
for msg in messages:
if msg['type'] == 'text':
formatted_messages.append({"role": "user", "content": [{"type": "text", "text": msg['value']}]})
elif msg['type'] == 'image':
# Load and encode the image
image = Image.open(msg['value'])
image_data = encode_image_to_base64(image)
formatted_messages.append({
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
]
})
payload = {
"model": self.model,
"messages": formatted_messages,
"max_tokens": 4096,
"temperature": 0
}
for i in range(self.retry):
try:
response = requests.post(
self.api_base,
headers=headers,
json=payload,
timeout=self.timeout
)
if response.status_code == 200:
resp_json = response.json()
return resp_json['choices'][0]['message']['content'].strip()
time.sleep(self.wait)
except Exception as e:
print(f"API error: {e}")
time.sleep(self.wait)
return self.fail_msg
class DashScopeWrapper:
"""Wrapper for DashScope API."""
def __init__(self, model, api_base, api_key, timeout=60, retry=5, wait=5):
self.model = model
self.api_base = api_base
self.api_key = api_key
self.timeout = timeout
self.retry = retry
self.wait = wait
self.fail_msg = 'Failed to obtain answer via API.'
def generate(self, messages):
"""Generate a response from the API."""
headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.api_key}'}
# Format messages for API
formatted_messages = []
for msg in messages:
if msg['type'] == 'text':
formatted_messages.append({"role": "user", "content": [{"type": "text", "text": msg['value']}]})
elif msg['type'] == 'image':
# Load and encode the image
image = Image.open(msg['value'])
image_data = encode_image_to_base64(image)
formatted_messages.append({
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image_data}"}}
]
})
payload = {
"model": self.model,
"messages": formatted_messages,
"max_completion_tokens": 4096,
"n": 1,
"temperature": 0,
"stream": False
}
for i in range(self.retry):
try:
response = requests.post(
self.api_base,
headers=headers,
json=payload,
timeout=self.timeout
)
if response.status_code == 200:
resp_json = response.json()
# Check finish reason
for output in resp_json['choices']:
if output['finish_reason'] not in ['stop', 'function_call']:
print(f"DashScope finished with error: {resp_json}")
time.sleep(self.wait)
continue
return resp_json['choices'][0]['message']['content']
else:
print(f"DashScope API error: HTTP {response.status_code}")
try:
error_content = response.json()
print(f"Error details: {error_content}")
except:
print(f"Raw error content: {response.content.decode('utf-8', errors='replace')}")
time.sleep(self.wait)
except requests.exceptions.ConnectionError as conn_err:
print(f"DashScope: Connection error occurred: {conn_err}")
time.sleep(self.wait)
except requests.exceptions.Timeout as timeout_err:
print(f"DashScope: Timeout error occurred: {timeout_err}")
time.sleep(self.wait)
except requests.exceptions.RequestException as req_err:
print(f"DashScope: Request exception occurred: {req_err}")
time.sleep(self.wait)
except Exception as e:
print(f"DashScope: An error occurred: {e}")
print(traceback.format_exc())
time.sleep(self.wait)
return self.fail_msg
def build_judge(model, api_type):
"""Build a judge model for evaluation."""
if api_type == 'mit':
api_key = os.environ.get('MIT_SPIDER_TOKEN', '')
api_base = os.environ.get('MIT_SPIDER_URL', '')
return OpenAIWrapper(model, api_base, api_key)
elif api_type == 'dash':
api_key = os.environ.get('CHATGPT_DASHSCOPE_API_KEY', '')
api_base = os.environ.get('DASHSCOPE_API_BASE', '')
return DashScopeWrapper(model, api_base, api_key)
else:
raise ValueError(f"Unsupported API type: {api_type}")
def can_infer_option(answer, choices):
"""Rule-based extraction of answer option."""
if 'Failed to obtain answer via API' in answer:
return False
reject_to_answer = [
"Sorry, I can't help with images of people yet.",
"I can't process this file.",
"I'm sorry, but without the image provided",
'Cannot determine the answer'
]
for err in reject_to_answer:
if err in answer:
return 'Z'
def count_choice(splits, choices, prefix='', suffix=''):
cnt = 0
for c in choices:
if prefix + c + suffix in splits:
cnt += 1
return cnt
answer_mod = copy.copy(answer)
chars = '.()[],:;!*#{}'
for c in chars:
answer_mod = answer_mod.replace(c, ' ')
splits = [x.strip() for x in answer_mod.split()]
count = count_choice(splits, choices)
if count == 1:
for ch in choices:
if 'A' in splits and len(splits) > 3:
# print(f'A might be a quantifier in the string: {answer}.')
return False
if ch in splits:
return ch
elif count == 0 and count_choice(splits, {'Z', ''}) == 1:
return 'Z'
return False
def can_infer_text(answer, choices):
"""Extract answer by matching text content."""
answer = answer.lower()
assert isinstance(choices, dict)
for k in choices:
assert k in string.ascii_uppercase
choices[k] = str(choices[k]).lower()
cands = []
for k in choices:
if choices[k] in answer:
cands.append(k)
if len(cands) == 1:
return cands[0]
return False
def can_infer(answer, choices):
"""Combined approach to infer answer choice."""
answer = str(answer)
copt = can_infer_option(answer, choices)
return copt if copt else can_infer_text(answer, choices)
def build_choices(item):
ret = {}
for ch in string.ascii_uppercase:
if ch in item and (not pd.isna(item[ch])):
ret[ch] = item[ch]
return ret
def build_option_str(option_dict):
s = 'There are several options: \n'
for c, content in option_dict.items():
if not pd.isna(content):
s += f'{c}. {content}\n'
return s
def build_prompt(question, options, prediction):
tmpl = (
'You are an AI assistant who will help me to match '
'an answer with several options of a single-choice question. '
'You are provided with a question, several options, and an answer, '
'and you need to find which option is most similar to the answer. '
'If the meaning of all options are significantly different from the answer, output Z. '
'You should output a single uppercase character in A, B, C, D (if they are valid options), and Z. \n'
'Example 1: \n'
'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n'
'Answer: a cute teddy bear\nYour output: A\n'
'Example 2: \n'
'Question: What is the main object in image?\nOptions: A. teddy bear B. rabbit C. cat D. dog\n'
'Answer: Spider\nYour output: Z\n'
'Example 3: \n'
'Question: {}?\nOptions: {}\nAnswer: {}\nYour output: '
)
return tmpl.format(question, options, prediction)
def extract_answer_from_item(model, item, wait=5):
"""Extract answer from model prediction using rule-based and model-based approaches."""
# Build choices dictionary
choices = build_choices(item)
option_str = build_option_str(choices)
prompt = build_prompt(item['question'], option_str, item['prediction'])
# Try rule-based extraction first
prediction = item['prediction']
ret = can_infer(prediction, choices)
if ret:
if ret == 'Z':
extract_flag = False
log = f"Rule extract failed with rule result: {ret} prediction: {prediction}"
else:
extract_flag = True
log = f"Rule extract success with rule result: {ret} prediction: {prediction}"
return dict(opt=ret, log=log, extract_model='rule', extract_flag=extract_flag)
# If rule-based extraction fails, use model-based extraction
print(f"Rule extract failed. Use model-based extraction.")
assert model is not None, 'Judge model is None for MMMU_DEV_VAL !!!'
# Try model-based extraction with retries
retry = 25
while retry:
ans = model.generate([{"type": "text", "value": prompt}])
if 'Failed to obtain answer via API' in ans:
print('API failed to answer.')
else:
ret = can_infer(ans, choices)
if ret and ret != 'Z':
log = f'{model.model} extract Succeed. {model.model}:{ans}\n'
return dict(opt=ret, log=log, extract_model=model.model, extract_flag=True)
else:
print(f'Output includes 0 / > 1 letter among candidates {set(choices)} and Z: {ans}')
retry -= 1
T = random.random() * wait * 2
time.sleep(T)
if retry == 0:
options = list(choices) + ['Z'] if 'Z' not in choices else list(choices)
log = f'{model.model} extract failed. randomly generate one. {model.model} response:{ans}\n'
return dict(opt=random.choice(options), log=log, extract_model=model.model, extract_flag=False)
def eval_single_sample(args):
"""Evaluate a single sample."""
model, item = args
# Extract answer using the combined approach
result = extract_answer_from_item(model, item)
# Determine if the answer is correct
hit = 1 if result['opt'] == item['GT'] else 0
return {
"index": item['index'],
"split": item['split'],
"question": item['question'],
"prediction": item['prediction'],
"extracted_answer": result['opt'],
"extraction_method": result['extract_model'],
"extraction_success": result['extract_flag'],
"extraction_log": result['log'],
"gt": item['GT'],
"hit": hit
}
#!/bin/bash
# MMMU Inference Script (Instruct Model)
# This script runs inference on the MMMU dataset using vLLM
python run_mmmu.py infer \
--model-path /path/to/Qwen3-VL-Instruct \
--data-dir /path/to/mmmu_data \
--dataset MMMU_DEV_VAL \
--output-file results/mmmu_dev_val_predictions.jsonl \
--max-new-tokens 32768 \
--temperature 0.7 \
--top-p 0.8 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 1.5
#!/bin/bash
# MMMU Inference Script (Thinking Model)
# This script runs inference on the MMMU dataset using vLLM with thinking mode parameters
python run_mmmu.py infer \
--model-path /path/to/Qwen3-VL-Thinking \
--data-dir /path/to/mmmu_data \
--dataset MMMU_DEV_VAL \
--output-file results/mmmu_dev_val_predictions_thinking.jsonl \
--max-new-tokens 40960 \
--temperature 1.0 \
--top-p 0.95 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 0.0 \
--tensor-parallel-size 4
vllm
transformers
qwen_vl_utils
pandas
numpy
Pillow
tqdm
requests
validators
torch
torchvision
accelerate
flash_attn
import os
import sys
import json
import argparse
import pandas as pd
import numpy as np
import time
from tqdm import tqdm
from typing import List, Dict, Any
import torch
import warnings
import string
import traceback
# vLLM imports
from vllm import LLM, SamplingParams
from qwen_vl_utils import process_vision_info
from transformers import AutoProcessor
# Local imports from refactored files
from dataset_utils import load_dataset, dump_image, MMMU_preproc
from eval_utils import build_judge, eval_single_sample
# Set vLLM multiprocessing method
os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn'
def build_mmmu_prompt(line, dump_image_func, dataset):
"""Build MMMU dataset prompt with standard resolution settings."""
# Standard resolution settings
MIN_PIXELS = 1280*28*28 # ~1M pixels
MAX_PIXELS = 5120*28*28 # ~4M pixels
tgt_path = dump_image_func(line)
question = line['question']
options = {cand: line[cand] for cand in string.ascii_uppercase if cand in line and not pd.isna(line[cand])}
options_prompt = 'Options:\n'
for key, item in options.items():
options_prompt += f'{key}. {item}\n'
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
prompt = ''
if hint is not None:
prompt += f'Hint: {hint}\n'
prompt += f'Question: {question}\n'
if len(options):
prompt += options_prompt
prompt += 'Please select the correct answer from the options above. \n'
prompt = prompt.rstrip()
# Build messages in standard conversation format
content = []
if isinstance(tgt_path, list):
for p in tgt_path:
content.append({
"type": "image",
"image": p,
"min_pixels": MIN_PIXELS,
"max_pixels": MAX_PIXELS
})
else:
content.append({
"type": "image",
"image": tgt_path,
"min_pixels": MIN_PIXELS,
"max_pixels": MAX_PIXELS
})
content.append({"type": "text", "text": prompt})
# Return messages in standard conversation format
messages = [{
"role": "user",
"content": content
}]
return messages
def prepare_inputs_for_vllm(messages, processor):
"""
Prepare inputs for vLLM (following the examples in README.md).
Args:
messages: List of messages in standard conversation format
processor: AutoProcessor instance
Returns:
dict: Input format required by vLLM
"""
text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# qwen_vl_utils 0.0.14+ required
image_inputs, video_inputs, video_kwargs = process_vision_info(
messages,
image_patch_size=processor.image_processor.patch_size,
return_video_kwargs=True,
return_video_metadata=True
)
mm_data = {}
if image_inputs is not None:
mm_data['image'] = image_inputs
if video_inputs is not None:
mm_data['video'] = video_inputs
return {
'prompt': text,
'multi_modal_data': mm_data,
'mm_processor_kwargs': video_kwargs
}
def run_inference(args):
"""Run inference on the MMMU dataset using vLLM."""
print("\n" + "="*80)
print("🚀 MMMU Inference with vLLM (High-Speed Mode)")
print("="*80 + "\n")
# Load dataset
data = load_dataset(args.dataset)
print(f"✓ Loaded {len(data)} samples from {args.dataset}")
# Set up image root directory
img_root = os.path.join(os.environ['LMUData'], 'images', 'MMMU')
os.makedirs(img_root, exist_ok=True)
# Set up dump_image function
def dump_image_func(line):
return dump_image(line, img_root)
# Create output directory
os.makedirs(os.path.dirname(args.output_file), exist_ok=True)
# Set up CoT prompt if enabled
cot_prompt = ""
if args.use_cot:
cot_prompt = args.cot_prompt if args.cot_prompt else " If you are uncertain or the problem is too complex, make a reasoned guess based on the information provided. Avoid repeating steps indefinitely—provide your best guess even if unsure. Determine whether to think step by step based on the difficulty of the question, considering all relevant information before answering."
print(f"✓ Using CoT prompt: {cot_prompt[:50]}...")
# Set up generation parameters (vLLM SamplingParams format)
sampling_params = SamplingParams(
temperature=args.temperature,
top_p=args.top_p,
top_k=args.top_k,
max_tokens=args.max_new_tokens,
repetition_penalty=args.repetition_penalty,
presence_penalty=args.presence_penalty,
stop_token_ids=[],
)
print(f"\n⚙️ Generation parameters (vLLM SamplingParams):")
print(f" max_tokens={sampling_params.max_tokens}")
print(f" temperature={sampling_params.temperature}, top_p={sampling_params.top_p}, top_k={sampling_params.top_k}")
print(f" repetition_penalty={sampling_params.repetition_penalty}")
print(f" presence_penalty={sampling_params.presence_penalty}")
if sampling_params.presence_penalty > 0:
print(f" ✅ Anti-repetition enabled (presence_penalty={sampling_params.presence_penalty})")
if sampling_params.temperature <= 0.02 and sampling_params.top_k == 1:
print(f" ✅ Using FAST greedy-like decoding")
else:
print(f" ⚠️ Using sampling decoding (slower but more diverse)")
print()
# Load processor for input preparation
print(f"Loading processor from {args.model_path}")
processor = AutoProcessor.from_pretrained(args.model_path)
print("✓ Processor loaded\n")
# Initialize vLLM
print(f"Initializing vLLM with model: {args.model_path}")
print(f" GPU count: {torch.cuda.device_count()}")
print(f" Tensor parallel size: {args.tensor_parallel_size}")
llm = LLM(
model=args.model_path,
tensor_parallel_size=args.tensor_parallel_size,
gpu_memory_utilization=args.gpu_memory_utilization,
trust_remote_code=True,
max_model_len=args.max_model_len,
limit_mm_per_prompt={"image": args.max_images_per_prompt},
seed=42,
)
print("✓ vLLM initialized successfully\n")
# Prepare all inputs
print("Preparing inputs for vLLM...")
all_inputs = []
all_line_dicts = []
all_messages = []
for idx, (_, line) in enumerate(tqdm(data.iterrows(), total=len(data), desc="Building prompts")):
# Convert line to dict
line_dict = line.to_dict()
for k, v in line_dict.items():
if isinstance(v, np.integer):
line_dict[k] = int(v)
elif isinstance(v, np.floating):
line_dict[k] = float(v)
# Build prompt
messages = build_mmmu_prompt(line, dump_image_func, args.dataset)
# Add CoT prompt
if args.use_cot and len(messages) > 0 and len(messages[0]['content']) > 0:
last_content = messages[0]['content'][-1]
if last_content['type'] == 'text':
last_content['text'] += cot_prompt
# Prepare input for vLLM
vllm_input = prepare_inputs_for_vllm(messages, processor)
all_inputs.append(vllm_input)
all_line_dicts.append(line_dict)
all_messages.append(messages)
print(f"✓ Prepared {len(all_inputs)} inputs\n")
# Batch inference (vLLM automatic optimization)
print("="*80)
print("🚀 Running vLLM batch inference (automatic optimization)")
print("="*80)
start_time = time.time()
outputs = llm.generate(all_inputs, sampling_params=sampling_params)
end_time = time.time()
total_time = end_time - start_time
print(f"\n✓ Inference completed in {total_time:.2f} seconds")
print(f" Average: {total_time/len(data):.2f} seconds/sample")
print(f" Throughput: {len(data)/total_time:.2f} samples/second\n")
# Save results
print("Saving results...")
results = []
for idx, (line_dict, messages, output) in enumerate(zip(all_line_dicts, all_messages, outputs)):
response = output.outputs[0].text
index = line_dict['index']
response_final = str(response).split("</think>")[-1].strip()
result = {
"question_id": int(index) if isinstance(index, np.integer) else index,
"annotation": line_dict,
"task": args.dataset,
"result": {"gen": response_final, "gen_raw": response},
"messages": messages
}
results.append(result)
# Write final results
with open(args.output_file, 'w') as f:
for res in results:
f.write(json.dumps(res) + '\n')
print(f"\n✓ Results saved to {args.output_file}")
print(f"✓ Total samples processed: {len(results)}")
def run_evaluation(args):
"""Run evaluation on inference results."""
# Load results
results = []
with open(args.input_file, 'r') as f:
for line in f:
job = json.loads(line)
annotation = job["annotation"]
annotation["prediction"] = job["result"]["gen"]
results.append(annotation)
data = pd.DataFrame.from_records(results)
data = data.sort_values(by='index')
data['prediction'] = [str(x) for x in data['prediction']]
# Lowercase column names, except single-letter choice labels (A, B, ...)
for k in list(data.keys()):
data[k.lower() if k not in list(string.ascii_uppercase) else k] = data.pop(k)
# Load dataset
meta = load_dataset(args.dataset)
# Validation
print(f"len(data): {len(data)}")
print(f"len(meta): {len(meta)}")
meta_q_map = {x: y for x, y in zip(meta['index'], meta['question'])}
data_map = {x: y for x, y in zip(data['index'], data['question'])}
for k in data_map:
assert k in meta_q_map, (
f'eval_file should be the same as or a subset of dataset MMMU_DEV_VAL'
)
answer_map = {i: c for i, c in zip(meta['index'], meta['answer'])}
data = MMMU_preproc(data)
answer_map = {k: (v if v in list(string.ascii_uppercase) else 'A') for k, v in answer_map.items()}
data = data[data['index'].isin(answer_map)]
data['GT'] = [answer_map[idx] for idx in data['index']]
items = []
for i in range(len(data)):
item = data.iloc[i]
items.append(item)
# Build judge model
model = build_judge(
model=getattr(args, 'eval_model', 'gpt-3.5-turbo-0125'),
api_type=getattr(args, 'api_type', 'dash')
)
# Prepare evaluation tasks
eval_tasks = []
for item in items:
eval_tasks.append((model, item))
# Run evaluation
eval_results = []
# Debug mode: process single-threaded with first few samples
debug = os.environ.get('DEBUG', '').lower() == 'true'
if debug:
print("Running in debug mode with first 5 samples...")
for task in eval_tasks[:5]:
try:
result = eval_single_sample(task)
eval_results.append(result)
except Exception as e:
print(f"Error processing task: {e}")
print(f"Task details: {task}")
raise
else:
# Normal mode: process all samples with threading
from concurrent.futures import ThreadPoolExecutor
nproc = getattr(args, 'nproc', 4)
with ThreadPoolExecutor(max_workers=nproc) as executor:
for result in tqdm(executor.map(eval_single_sample, eval_tasks),
total=len(eval_tasks), desc="Evaluating"):
eval_results.append(result)
# Calculate overall accuracy
accuracy = sum(r['hit'] for r in eval_results) / len(eval_results)
# Calculate accuracy by split
results_by_split = {}
for result in eval_results:
split = result.get('split', 'unknown')
if split not in results_by_split:
results_by_split[split] = []
results_by_split[split].append(result)
accuracy_by_split = {}
for split, split_results in results_by_split.items():
split_accuracy = sum(r['hit'] for r in split_results) / len(split_results)
accuracy_by_split[split] = split_accuracy
print(f"Accuracy for {split} split: {split_accuracy:.4f} ({sum(r['hit'] for r in split_results)}/{len(split_results)})")
# Save results
output_df = pd.DataFrame(eval_results)
output_df.to_csv(args.output_file, index=False)
# Save accuracy
with open(args.output_file.replace('.csv', '_acc.json'), 'w') as f:
json.dump({
"overall_accuracy": accuracy,
"accuracy_by_split": accuracy_by_split
}, f, indent=2)
print(f"\n{'='*50}")
print(f"Evaluation Results:")
print(f"{'='*50}")
print(f"Overall accuracy: {accuracy:.4f}")
print(f"{'='*50}\n")
def main():
parser = argparse.ArgumentParser(description="MMMU Evaluation with vLLM")
subparsers = parser.add_subparsers(dest='command', help='Command to run')
# Inference parser
infer_parser = subparsers.add_parser("infer", help="Run inference with vLLM")
infer_parser.add_argument("--model-path", type=str, required=True, help="Path to the model")
infer_parser.add_argument("--dataset", type=str, default="MMMU_DEV_VAL", help="Dataset name")
infer_parser.add_argument("--data-dir", type=str, help="The absolute path of MMMU_DEV_VAL.tsv")
infer_parser.add_argument("--output-file", type=str, required=True, help="Output file path")
infer_parser.add_argument("--use-cot", action="store_true", help="Use Chain-of-Thought prompting")
infer_parser.add_argument("--cot-prompt", type=str, default="", help="Custom Chain-of-Thought prompt")
# vLLM specific parameters
infer_parser.add_argument("--tensor-parallel-size", type=int, default=None,
help="Tensor parallel size (default: number of GPUs)")
infer_parser.add_argument("--gpu-memory-utilization", type=float, default=0.9,
help="GPU memory utilization (0.0-1.0, default: 0.9)")
infer_parser.add_argument("--max-model-len", type=int, default=128000,
help="Maximum model context length (default: 128000, balance between performance and memory)")
infer_parser.add_argument("--max-images-per-prompt", type=int, default=10,
help="Maximum images per prompt (default: 10)")
# Generation parameters
infer_parser.add_argument("--max-new-tokens", type=int, default=32768,
help="Maximum number of tokens to generate (default: 2048)")
infer_parser.add_argument("--temperature", type=float, default=0.7,
help="Temperature for sampling (default: 0.7 for greedy-like decoding)")
infer_parser.add_argument("--top-p", type=float, default=0.8,
help="Top-p for sampling (default: 0.8 for greedy-like decoding)")
infer_parser.add_argument("--top-k", type=int, default=20,
help="Top-k for sampling (default: 20 for greedy decoding)")
infer_parser.add_argument("--repetition-penalty", type=float, default=1.0,
help="Repetition penalty (default: 1.0, increase to 1.2-1.5 to reduce repetition)")
infer_parser.add_argument("--presence-penalty", type=float, default=1.5,
help="Presence penalty (default: 1.5, range: 0.0-2.0, penalize tokens that have already appeared)")
# Evaluation parser
eval_parser = subparsers.add_parser("eval", help="Run evaluation")
eval_parser.add_argument("--data-dir", type=str, help="The absolute path of MMMU_DEV_VAL.tsv")
eval_parser.add_argument("--input-file", type=str, required=True, help="Input file with inference results")
eval_parser.add_argument("--output-file", type=str, required=True, help="Output file path")
eval_parser.add_argument("--dataset", type=str, default="MMMU_DEV_VAL", help="Dataset name")
eval_parser.add_argument("--eval-model", type=str, default="gpt-3.5-turbo-0125",
help="Model to use for evaluation (default: gpt-3.5-turbo-0125)")
eval_parser.add_argument("--api-type", type=str, default="dash", choices=["dash", "mit"],
help="API type for evaluation")
eval_parser.add_argument("--nproc", type=int, default=4, help="Number of processes to use")
args = parser.parse_args()
# Set data directory if provided
if hasattr(args, 'data_dir') and args.data_dir:
os.environ['LMUData'] = args.data_dir
# Automatically set tensor_parallel_size
if args.command == 'infer' and args.tensor_parallel_size is None:
args.tensor_parallel_size = torch.cuda.device_count()
print(f"Auto-set tensor_parallel_size to {args.tensor_parallel_size}")
if args.command == 'infer':
run_inference(args)
elif args.command == 'eval':
run_evaluation(args)
else:
parser.print_help()
if __name__ == "__main__":
main()
# Unique model identifier
modelCode=1858
# Model name
modelName=Qwen3-VL_pytorch
# Model description
modelDescription=Qwen3-VL is a comprehensive upgrade across the board and is, to date, the most capable vision-language model in the Qwen series.
# Run mode
processType=Inference
# Algorithm category
appCategory=Multimodal
# Framework type
frameType=pytorch
# Accelerator type
accelerateType=BW1000
# QwenVL Training Framework
This repository provides a training framework for Qwen VL models. There are two steps to using the repo:
1. Customize your dataset: download the data and implement the dataset config.
2. Modify the training scripts.
## Repository Structure
The `qwenvl` directory contains the following components:
### `train/`
- `trainer.py`: Main trainer updated from Huggingface Trainer
- `train_qwen.py`: Main file for training
- `argument.py`: Dataclasses for model, data and training arguments
### `data/`
- `__init__.py`: Contains datasets configs
- `data_processor.py`: Data processing module for QwenVL models
- `rope2d.py`: Provide RoPE implementation
### `tools`
- `process_bbox.ipynb`: Converts bounding boxes into the QwenVL format. If you have grounding data, refer to this notebook to transform it.
- `pack_data.py`: Pack data into even length buckets.
## Requirements
The following package versions are known to work:
- `torch==2.6.0`
- `torchvision==0.21.0`
- `transformers==4.57.0.dev0`
- `deepspeed==0.17.1`
- `flash_attn==2.7.4.post1`
- `triton==3.2.0`
- `accelerate==1.7.0`
- `torchcodec==0.2`
- `peft==0.17.1`
## Custom Dataset Configuration
Customized data should follow this format:
### JSON Data Structure
**Media Specification**:
- `image/video`: Contains path to the media file (required)
- Media tags in prompts:
- `<image>` for image understanding tasks
- `<video>` for video understanding tasks
- `conversations`: contains the questions and answers
### Example Instances:
1. **Single Image Example**:
```json
{
"image": "images/001.jpg",
"conversations": [
{
"from": "human",
"value": "<image>\nWhat's the main object in this picture?"
},
{
"from": "gpt",
"value": "A red apple on a wooden table"
}
]
}
```
2. **Multi-Image Example**:
```json
{
"image": ["cats/001.jpg", "cats/002.jpg"],
"conversations": [
{
"from": "human",
"value": "<image>\n<image>\nWhat are the differences between these two cats?"
},
{
"from": "gpt",
"value": "The first cat is an orange tabby with short fur and green eyes, while the second is a gray Siamese with blue eyes and pointed coloration. They also appear to be in different environments - the first is indoors on a couch, the second is outdoors in a garden."
}
]
}
```
3. **Video Example**:
```json
{
"video": "videos/005.mp4",
"conversations": [
{
"from": "human",
"value": "<video>\nWhat caused the blue object to move?\nOptions:\n(A) Gravity\n(B) Collision\n(C) Magnetic force"
},
{
"from": "gpt",
"value": "Answer: (B) Collision"
}
]
}
```
4. **Grounding Example**:
```json
{
"image": "demo/COCO_train2014_000000580957.jpg",
"conversations": [
{
"from": "human",
"value": "<image>\nLocate house in this image and output the bbox coordinates in JSON format."
},
{
"from": "gpt",
"value": "{\n"bbox_2d": [135, 114, 1016, 672]\n}"
}
]
}
```
5. **Packed Data Example**:
```json
[
{
"image": "images/001.jpg",
"conversations": [
{
"from": "human",
"value": "<image>\nWhat's the main object in this picture?"
},
{
"from": "gpt",
"value": "A red apple on a wooden table"
}
]
},
{
"image": "images/002.jpg",
"conversations": [
{
"from": "human",
"value": "<image>\nWhat's the main object in this picture?"
},
{
"from": "gpt",
"value": "A green orange on a plastic table"
}
]
}
]
```
Example annotation files are provided in `demo/single_images.json` and `demo/video.json`; these JSON files can be used directly for training.
### Dataset config for training
To add or modify datasets for training, follow these steps:
### Dataset Definition Structure
1. **Create a dataset dictionary** in `data/__init__.py`, following this format:
```python
DATASET_NAME = {
"annotation_path": "/path/to/annotations.json",
"data_path": "/path/to/image/data", # Can be empty if paths are in annotations
}
```
2. **Register your dataset** by adding it to the `data_dict`:
```python
data_dict = {
"your_dataset_name": DATASET_NAME,
# ... other datasets
}
```
### Sampling Rate Control
You can optionally specify sampling rates by appending `%X` to the dataset name:
- `"dataset_name%50"` will sample 50% of the data
- `"dataset_name%20"` will sample 20% of the data
### Usage Example
1. Define your dataset:
```python
MY_DATASET = {
"annotation_path": "/data/my_dataset/annotations.json",
"data_path": "/data/my_dataset/images/",
}
data_dict = {
"my_dataset": MY_DATASET,
"cambrian_737k": CAMBRIAN_737K, # existing dataset
}
```
2. Use it in training:
```python
dataset_names = ["my_dataset%50"] # Will use 50% of your dataset
configs = data_list(dataset_names)
```
### Notes
- The `annotation_path` should point to a JSON or JSONL file containing your dataset annotations.
- The `data_path` can be left empty if the image paths in the annotations are absolute.
- Sampling rates are applied per-dataset when multiple datasets are specified.
- Some datasets you can use directly: `nyu-visionx/Cambrian-10M`, `lmms-lab/LLaVA-NeXT-Data`, `FreedomIntelligence/ALLaVA-4V`, `TIGER-Lab/VisualWebInstruct`.
- The training data should strictly follow this format:
- One `<image>` tag in the question must correspond to exactly one image file
- Similarly, `<video>` tags must correspond to video files
- These special tokens should not appear in the answer text
- For open source data that might have missing images or other issues, you can verify data completeness using `tools/check_image.py`.
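As a rough illustration of the last two points, here is a hedged sketch (not part of the repo; `tools/check_image.py` is the real checker, and the annotation file name is taken from the demo examples above) that verifies the `<image>` tag count matches the number of image files for each sample:
```python
import json

def image_tags_match(sample: dict) -> bool:
    """Check that <image> tags in human turns match the number of image files."""
    images = sample.get("image", [])
    if isinstance(images, str):
        images = [images]
    n_tags = sum(turn["value"].count("<image>")
                 for turn in sample["conversations"] if turn["from"] == "human")
    return n_tags == len(images)

# Assumes the annotation file is a JSON list of samples, as in the examples above.
with open("demo/single_images.json") as f:
    samples = json.load(f)
print(all(image_tags_match(s) for s in samples))
```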
## Usage
To train a model:
```bash
#!/bin/bash
# Complete QwenVL Training Launch Script with Full Parameter Documentation
# ======================
# Distributed Configuration
# ======================
MASTER_ADDR="127.0.0.1" # [Required] Master node IP for multi-GPU training
MASTER_PORT=$(shuf -i 20000-29999 -n 1) # Random port to avoid conflicts
NPROC_PER_NODE=$(nvidia-smi --list-gpus | wc -l) # Automatically detects available GPUs
# ======================
# Path Configuration
# ======================
MODEL_PATH="/path/to/Qwen2.5-VL-3B-Instruct" # [ModelArguments] Pretrained model path
OUTPUT_DIR="./checkpoints" # Directory for saving checkpoints
CACHE_DIR="./cache" # [TrainingArguments] Cache directory for models
# ======================
# Model Configuration
# ======================
DATASETS="your_dataset%100" # [DataArguments] Dataset with sampling rate
# ======================
# Training Hyperparameters
# ======================
torchrun --nproc_per_node=$NPROC_PER_NODE \
--master_addr=$MASTER_ADDR \
--master_port=$MASTER_PORT \
qwenvl/train/train_qwen.py \
# Core Arguments
--model_name_or_path $MODEL_PATH \ # [ModelArguments] Model identifier
--tune_mm_llm True \ # [TrainingArguments] Train LLM or not
--tune_mm_vision False \ # [TrainingArguments] Train VIT or not
--tune_mm_mlp False \ # [TrainingArguments] Train MLP or not
--dataset_use $DATASETS \ # [DataArguments] Dataset specification
--output_dir $OUTPUT_DIR \ # Output directory for checkpoints
--cache_dir $CACHE_DIR \ # [TrainingArguments] Model cache location
# Precision & Memory
--bf16 \ # Use bfloat16 precision (Ampere+ GPUs)
--per_device_train_batch_size 4 \ # Batch size per GPU
--gradient_accumulation_steps 4 \ # Effective batch size multiplier
# Learning Rate Configuration
--learning_rate 2e-7 \ # Base learning rate
--mm_projector_lr 1e-5 \ # [TrainingArguments] Projector-specific LR
--vision_tower_lr 1e-6 \ # [TrainingArguments] Vision encoder LR
--optim adamw_torch \ # [TrainingArguments] Optimizer selection
# Sequence Configuration
--model_max_length 4096 \ # [TrainingArguments] Max sequence length
--data_flatten True \ # [DataArguments] Concatenate batch sequences
--data_packing True \ # [DataArguments] Using packing data
# Image Processing
--max_pixels 576\*28\*28 \ # [DataArguments] Max image pixels (H*W) for image
--min_pixels 16\*28\*28 \ # [DataArguments] Min image pixels for image
# Video Processing
--video_fps 2 \ # [DataArguments] video fps
--video_max_frames 8 \ # [DataArguments] Max frames per video
--video_min_frames 4 \ # [DataArguments] Min frames per video
--video_max_pixels 1664\*28\*28 \ # [DataArguments] Max pixels per video
--video_min_pixels 256\*28\*28 \ # [DataArguments] Min pixels per video
# Training Schedule
--num_train_epochs 3 \ # Total training epochs
--warmup_ratio 0.03 \ # LR warmup proportion
--lr_scheduler_type "cosine" \ # Learning rate schedule
--weight_decay 0.01 \ # L2 regularization strength
# Logging & Checkpoints
--logging_steps 10 \ # Log metrics interval
--save_steps 500 \ # Checkpoint save interval
--save_total_limit 3 \ # Max checkpoints to keep
# Lora Config
--lora_enable True \ # [TrainingArguments] Enable LoRA
--lora_r 8 \ # [TrainingArguments] LoRA r
--lora_alpha 16 \ # [TrainingArguments] LoRA alpha
--lora_dropout 0.0 \ # [TrainingArguments] LoRA dropout
# Advanced Options
--deepspeed zero3.json \ # DeepSpeed configuration
```
The script arguments fall into several categories; a few notes:
- Flags control which components are tuned (`tune_mm_vision`, `tune_mm_mlp`, `tune_mm_llm`). When training on both image and video data, set `tune_mm_vision=False`.
- The `data_flatten` flag concatenates the samples in a batch into a single sequence.
- `data_packing` requires preprocessing the data with `tools/pack_data.py`.
- Training hyperparameters: the suggested learning rate range is 2e-7 to 1e-6.
- Training resolution is critical for model performance, so set `--max_pixels` and `--min_pixels` appropriately.
- Training the Qwen2.5-VL-32B model requires 8x 80GB GPUs; refer to `scripts/sft_32b.sh`.
- `"_attn_implementation": "flash_attention_2"` can be added to the model's config.json to enable flash attention.
- The Qwen3VL MoE model does not support DeepSpeed ZeRO-3, and Hugging Face's official implementation does not currently support the load-balancing loss.