Commit 3d735feb authored by luopl's avatar luopl
Browse files

"Initial commit"

parents
Pipeline #3074 canceled with stages
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
from typing import Union, Tuple, List
from qwen_agent.tools.base import BaseTool, register_tool
@register_tool("mobile_use")
class MobileUse(BaseTool):
    """Agent tool for touchscreen interaction with a mobile device.

    Concrete backends subclass this and implement the underscore-prefixed
    action handlers (e.g. via adb or an emulator); this base class only
    declares the tool-call schema and dispatches validated calls.
    """

    @property
    def description(self):
        # The description embeds the live screen resolution so the model can
        # emit pixel coordinates that fit the device.
        return f"""
Use a touchscreen to interact with a mobile device, and take screenshots.
* This is an interface to a mobile device with touchscreen. You can perform actions like clicking, typing, swiping, etc.
* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions.
* The screen's resolution is {self.display_width_px}x{self.display_height_px}.
* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.
""".strip()

    # JSON-schema style declaration of the tool-call arguments.
    parameters = {
        "properties": {
            "action": {
                "description": """
The action to perform. The available actions are:
* `key`: Perform a key event on the mobile device.
- This supports adb's `keyevent` syntax.
- Examples: "volume_up", "volume_down", "power", "camera", "clear".
* `click`: Click the point on the screen with coordinate (x, y).
* `long_press`: Press the point on the screen with coordinate (x, y) for specified seconds.
* `swipe`: Swipe from the starting point with coordinate (x, y) to the end point with coordinates2 (x2, y2).
* `type`: Input the specified text into the activated input box.
* `system_button`: Press the system button.
* `open`: Open an app on the device.
* `wait`: Wait specified seconds for the change to happen.
* `terminate`: Terminate the current task and report its completion status.
""".strip(),
                "enum": [
                    "key",
                    "click",
                    "long_press",
                    "swipe",
                    "type",
                    "system_button",
                    "open",
                    "wait",
                    "terminate",
                ],
                "type": "string",
            },
            "coordinate": {
                "description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=click`, `action=long_press`, and `action=swipe`.",
                "type": "array",
            },
            "coordinate2": {
                "description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=swipe`.",
                "type": "array",
            },
            "text": {
                "description": "Required only by `action=key`, `action=type`, and `action=open`.",
                "type": "string",
            },
            "time": {
                "description": "The seconds to wait. Required only by `action=long_press` and `action=wait`.",
                "type": "number",
            },
            "button": {
                "description": "Back means returning to the previous interface, Home means returning to the desktop, Menu means opening the application background menu, and Enter means pressing the enter. Required only by `action=system_button`",
                "enum": [
                    "Back",
                    "Home",
                    "Menu",
                    "Enter",
                ],
                "type": "string",
            },
            "status": {
                "description": "The status of the task. Required only by `action=terminate`.",
                "type": "string",
                "enum": ["success", "failure"],
            },
        },
        "required": ["action"],
        "type": "object",
    }

    def __init__(self, cfg=None):
        # The screen geometry is required by `description`; fail early with a
        # clear message instead of an opaque TypeError/KeyError (the original
        # crashed when called with the default cfg=None).
        cfg = cfg or {}
        if "display_width_px" not in cfg or "display_height_px" not in cfg:
            raise ValueError(
                "MobileUse requires 'display_width_px' and 'display_height_px' in cfg"
            )
        self.display_width_px = cfg["display_width_px"]
        self.display_height_px = cfg["display_height_px"]
        super().__init__(cfg)

    def call(self, params: Union[str, dict], **kwargs):
        """Validate `params` (JSON string or dict) and dispatch to the handler.

        Raises:
            ValueError: If `action` is not one of the supported actions.
            KeyError: If a required argument for the action is missing.
        """
        params = self._verify_json_format_args(params)
        action = params["action"]
        if action == "key":
            return self._key(params["text"])
        elif action == "click":
            return self._click(
                coordinate=params["coordinate"]
            )
        elif action == "long_press":
            return self._long_press(
                coordinate=params["coordinate"], time=params["time"]
            )
        elif action == "swipe":
            return self._swipe(
                coordinate=params["coordinate"], coordinate2=params["coordinate2"]
            )
        elif action == "type":
            return self._type(params["text"])
        elif action == "system_button":
            return self._system_button(params["button"])
        elif action == "open":
            return self._open(params["text"])
        elif action == "wait":
            return self._wait(params["time"])
        elif action == "terminate":
            return self._terminate(params["status"])
        else:
            raise ValueError(f"Unknown action: {action}")

    # --- Action handlers: implemented by device-specific subclasses. ---

    def _key(self, text: str):
        raise NotImplementedError()

    def _click(self, coordinate: Tuple[int, int]):
        raise NotImplementedError()

    def _long_press(self, coordinate: Tuple[int, int], time: int):
        raise NotImplementedError()

    def _swipe(self, coordinate: Tuple[int, int], coordinate2: Tuple[int, int]):
        raise NotImplementedError()

    def _type(self, text: str):
        raise NotImplementedError()

    def _system_button(self, button: str):
        raise NotImplementedError()

    def _open(self, text: str):
        raise NotImplementedError()

    def _wait(self, time: int):
        raise NotImplementedError()

    def _terminate(self, status: str):
        raise NotImplementedError()
@register_tool("computer_use")
class ComputerUse(BaseTool):
    """Agent tool for mouse/keyboard interaction with a desktop GUI.

    Concrete backends subclass this and implement the underscore-prefixed
    action handlers; this base class only declares the tool-call schema and
    dispatches validated calls.
    """

    @property
    def description(self):
        # The description embeds the live screen resolution so the model can
        # emit pixel coordinates that fit the display.
        return f"""
Use a mouse and keyboard to interact with a computer, and take screenshots.
* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.
* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.
* The screen's resolution is {self.display_width_px}x{self.display_height_px}.
* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.
* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.
* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges.
""".strip()

    # JSON-schema style declaration of the tool-call arguments.
    parameters = {
        "properties": {
            "action": {
                "description": """
The action to perform. The available actions are:
* `key`: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.
* `type`: Type a string of text on the keyboard.
* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.
* `left_click`: Click the left mouse button at a specified (x, y) pixel coordinate on the screen.
* `left_click_drag`: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.
* `right_click`: Click the right mouse button at a specified (x, y) pixel coordinate on the screen.
* `middle_click`: Click the middle mouse button at a specified (x, y) pixel coordinate on the screen.
* `double_click`: Double-click the left mouse button at a specified (x, y) pixel coordinate on the screen.
* `triple_click`: Triple-click the left mouse button at a specified (x, y) pixel coordinate on the screen (simulated as double-click since it's the closest action).
* `scroll`: Performs a scroll of the mouse scroll wheel.
* `hscroll`: Performs a horizontal scroll (mapped to regular scroll).
* `wait`: Wait specified seconds for the change to happen.
* `terminate`: Terminate the current task and report its completion status.
* `answer`: Answer a question.
""".strip(),
                "enum": [
                    "key",
                    "type",
                    "mouse_move",
                    "left_click",
                    "left_click_drag",
                    "right_click",
                    "middle_click",
                    "double_click",
                    "triple_click",
                    "scroll",
                    "hscroll",
                    "wait",
                    "terminate",
                    "answer",
                ],
                "type": "string",
            },
            "keys": {
                "description": "Required only by `action=key`.",
                "type": "array",
            },
            "text": {
                "description": "Required only by `action=type` and `action=answer`.",
                "type": "string",
            },
            "coordinate": {
                "description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to.",
                "type": "array",
            },
            "pixels": {
                "description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by `action=scroll` and `action=hscroll`.",
                "type": "number",
            },
            "time": {
                "description": "The seconds to wait. Required only by `action=wait`.",
                "type": "number",
            },
            "status": {
                "description": "The status of the task. Required only by `action=terminate`.",
                "type": "string",
                "enum": ["success", "failure"],
            },
        },
        "required": ["action"],
        "type": "object",
    }

    def __init__(self, cfg=None):
        # The screen geometry is required by `description`; fail early with a
        # clear message instead of an opaque TypeError/KeyError (the original
        # crashed when called with the default cfg=None).
        cfg = cfg or {}
        if "display_width_px" not in cfg or "display_height_px" not in cfg:
            raise ValueError(
                "ComputerUse requires 'display_width_px' and 'display_height_px' in cfg"
            )
        self.display_width_px = cfg["display_width_px"]
        self.display_height_px = cfg["display_height_px"]
        super().__init__(cfg)

    def call(self, params: Union[str, dict], **kwargs):
        """Validate `params` (JSON string or dict) and dispatch to the handler.

        Raises:
            ValueError: If `action` is not one of the supported actions.
            KeyError: If a required argument for the action is missing.
        """
        params = self._verify_json_format_args(params)
        action = params["action"]
        if action in ["left_click", "right_click", "middle_click", "double_click", "triple_click"]:
            return self._mouse_click(action)
        elif action == "key":
            return self._key(params["keys"])
        elif action == "type":
            return self._type(params["text"])
        elif action == "mouse_move":
            return self._mouse_move(params["coordinate"])
        elif action == "left_click_drag":
            return self._left_click_drag(params["coordinate"])
        elif action == "scroll":
            return self._scroll(params["pixels"])
        elif action == "hscroll":
            return self._hscroll(params["pixels"])
        elif action == "answer":
            return self._answer(params["text"])
        elif action == "wait":
            return self._wait(params["time"])
        elif action == "terminate":
            return self._terminate(params["status"])
        else:
            raise ValueError(f"Invalid action: {action}")

    # --- Action handlers: implemented by backend-specific subclasses. ---

    def _mouse_click(self, button: str):
        raise NotImplementedError()

    def _key(self, keys: List[str]):
        raise NotImplementedError()

    def _type(self, text: str):
        raise NotImplementedError()

    def _mouse_move(self, coordinate: Tuple[int, int]):
        raise NotImplementedError()

    def _left_click_drag(self, coordinate: Tuple[int, int]):
        raise NotImplementedError()

    def _scroll(self, pixels: int):
        raise NotImplementedError()

    def _hscroll(self, pixels: int):
        raise NotImplementedError()

    def _answer(self, text: str):
        raise NotImplementedError()

    def _wait(self, time: int):
        raise NotImplementedError()

    def _terminate(self, status: str):
        raise NotImplementedError()
\ No newline at end of file
import os
from playwright.sync_api import sync_playwright
import argparse
from PIL import Image
import time
def take_screenshot(url, output_file="screenshot.png", blank_size=(1280, 960)):
    """Render *url* (or a local HTML file path) in headless Chromium and save a
    full-page PNG screenshot to *output_file*.

    Args:
        url: A URL or local file path; local paths are converted to file:// URLs.
        output_file: Destination path for the PNG.
        blank_size: (width, height) of the white placeholder image written when
            rendering fails (generalized from the previously hard-coded 1280x960).
    """
    # Convert a local filesystem path into a file:// URL the browser can load.
    if os.path.exists(url):
        url = "file://" + os.path.abspath(url)
    try:
        with sync_playwright() as p:
            # --disable-web-security lets local pages load cross-origin assets.
            browser = p.chromium.launch(headless=True, args=["--disable-web-security"])
            page = browser.new_page()
            page.goto(url, timeout=60000)
            # animations="disabled" makes the capture deterministic.
            page.screenshot(path=output_file, full_page=True, animations="disabled", timeout=60000)
            browser.close()
    except Exception as e:
        # Best-effort fallback: emit a blank placeholder instead of failing the
        # whole batch over one bad page.
        print(f"Failed to take screenshot due to: {e}. Generating a blank image.")
        img = Image.new("RGB", blank_size, color="white")
        img.save(output_file)
if __name__ == "__main__":
    # Command-line entry point: render the given HTML file to a PNG screenshot.
    cli = argparse.ArgumentParser(description="Process two path strings.")
    cli.add_argument("--html", type=str)
    cli.add_argument("--png", type=str)
    opts = cli.parse_args()
    take_screenshot(opts.html, opts.png)
This diff is collapsed.
This diff is collapsed.
doc/dog.jpg

46.1 KB

# MathVision Benchmark Evaluation
This directory contains the implementation for evaluating vision-language models on the MathVision benchmark using vLLM for high-speed inference.
## Overview
MathVision is a mathematical visual reasoning benchmark that evaluates models' ability to solve mathematical problems based on visual information. This implementation provides:
- **High-speed inference** using vLLM with automatic batch optimization
- **Two-stage evaluation** using rule-based and GPT-4o-based answer extraction
- **Support for thinking models** with extended reasoning capabilities
- **Modular code structure** for easy maintenance and extension
## Project Structure
```
MathVision/
├── run_mathv.py # Main script for inference and evaluation
├── dataset_utils.py # Dataset loading and preprocessing utilities
├── eval_utils.py # Evaluation logic and answer extraction
├── common_utils.py # Common utilities for image processing, file I/O
├── infer_instruct.sh # Inference script for instruct models
├── infer_think.sh # Inference script for thinking models
├── eval_instruct.sh # Evaluation script for instruct model results
├── eval_think.sh # Evaluation script for thinking model results
├── requirements.txt # Python dependencies
└── README.md # This file
```
## Requirements
### Python Dependencies
```bash
pip install -r requirements.txt
```
Key dependencies:
- `vllm` - High-speed LLM inference engine
- `transformers` - HuggingFace transformers
- `qwen_vl_utils` - Qwen VL utilities for vision processing
- `pandas`, `numpy` - Data processing
- `Pillow` - Image processing
- `latex2sympy2` - LaTeX to symbolic math conversion (optional)
- `openpyxl` - Excel file handling
- `requests` - API calls for evaluation
### Environment Variables
For evaluation, you need to set up API credentials for the judge model:
**Option 1: DashScope API (Recommended)**
```bash
export CHATGPT_DASHSCOPE_API_KEY="your-api-key"
export DASHSCOPE_API_BASE="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions"
```
**Option 2: Custom OpenAI-compatible API**
```bash
export MIT_SPIDER_TOKEN="your-api-key"
export MIT_SPIDER_URL="your-api-endpoint"
```
## Quick Start
### 1. Inference
Run inference on MathVision dataset using an instruct model:
```bash
bash infer_instruct.sh
```
Or customize the inference:
```bash
python run_mathv.py infer \
--model-path /path/to/Qwen3-VL-Instruct \
--data-dir /path/to/data \
--dataset MathVision \
--output-file results/predictions.jsonl \
--max-new-tokens 32768 \
--temperature 0.7 \
--top-p 0.8 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 1.5
```
For thinking models with extended reasoning:
```bash
bash infer_think.sh
```
### 2. Evaluation
Evaluate the inference results using GPT-4o as a judge:
```bash
bash eval_instruct.sh
```
Or customize the evaluation:
```bash
python run_mathv.py eval \
--data-dir /path/to/data \
--input-file results/predictions.jsonl \
--output-file results/evaluation.csv \
--dataset MathVision \
--eval-model gpt-4o-2024-05-13 \
--api-type dash \
--nproc 16
```
## Detailed Usage
### Inference Mode
**Basic Arguments:**
- `--model-path`: Path to the Qwen3-VL model directory (required)
- `--data-dir`: Directory to store/load MathVision dataset (required)
- `--dataset`: Dataset name (default: `MathVision`)
- `MathVision`: Full dataset with ~3,000 samples
- `MathVision_MINI`: Mini version for quick testing
- `--output-file`: Path to save inference results in JSONL format (required)
**vLLM Arguments:**
- `--tensor-parallel-size`: Number of GPUs for tensor parallelism (default: auto-detect)
- `--gpu-memory-utilization`: GPU memory utilization ratio, 0.0-1.0 (default: 0.9)
- `--max-model-len`: Maximum model context length (default: 128000)
- `--max-images-per-prompt`: Maximum images per prompt (default: 10)
**Generation Parameters:**
- `--max-new-tokens`: Maximum tokens to generate (default: 32768)
- `--temperature`: Sampling temperature (default: 0.7)
- `--top-p`: Top-p sampling (default: 0.8)
- `--top-k`: Top-k sampling (default: 20)
- `--repetition-penalty`: Repetition penalty (default: 1.0)
- `--presence-penalty`: Presence penalty to reduce repetition (default: 1.5)
**Advanced Options:**
- `--use-cot`: Enable Chain-of-Thought prompting for better reasoning
- `--cot-prompt`: Custom CoT prompt (default: " Let's think step by step.")
- `--num-samples`: Number of samples to process (optional, for testing)
### Evaluation Mode
**Basic Arguments:**
- `--data-dir`: Directory containing MathVision dataset (required)
- `--input-file`: Inference results file in JSONL format (required)
- `--output-file`: Path to save evaluation results in CSV format (required)
- `--dataset`: Dataset name, must match inference (default: `MathVision`)
**Judge Model Arguments:**
- `--eval-model`: Judge model name (default: `gpt-4o`)
- Options: `gpt-4o`, `gpt-4o-2024-05-13`, `gpt-3.5-turbo-0125`, etc.
- `--api-type`: API service type (default: `dash`)
- `dash`: DashScope API (Alibaba Cloud)
- `mit`: Custom OpenAI-compatible API
- `--nproc`: Number of parallel workers for evaluation (default: 4)
## Output Files
### Inference Output
The inference script generates a JSONL file where each line contains:
```json
{
"question_id": 123,
"annotation": {
"index": 123,
"question": "What is the area of the triangle?",
"answer": "12",
"category": "Geometry",
"choices": "[]",
...
},
"task": "MathVision",
"result": {
"gen": "The final answer",
"gen_raw": "Raw model output including thinking process"
},
"messages": [...]
}
```
### Evaluation Output
The evaluation script generates multiple files:
1. **Intermediate results** (`*_eval_results.xlsx`): Raw predictions with metadata
2. **Detailed evaluation** (`*_eval_results_eval.xlsx`): Results with extracted answers
- Columns: `index`, `question`, `prediction`, `answer`, `res` (extracted), `log`, `extract_model`, `extract_flag`, `category`
3. **Score summary** (`*_eval_results_eval_score.csv`): Accuracy by category
Example score summary:
```
Subject | tot | prefetch | hit | prefetch_rate | acc
----------------|-----|----------|-----|---------------|------
Overall | 3000| 2400 | 2100| 80.0 | 70.0
Algebra | 800 | 640 | 560 | 80.0 | 70.0
Geometry | 750 | 600 | 525 | 80.0 | 70.0
```
## Model-Specific Configurations
### Instruct Models (e.g., Qwen3-VL-2B-Instruct, Qwen3-VL-30B-Instruct)
Use standard parameters for balanced performance:
```bash
--max-new-tokens 32768
--temperature 0.7
--top-p 0.8
--top-k 20
--repetition-penalty 1.0
--presence-penalty 1.5
```
### Thinking Models (e.g., Qwen3-VL-4B-Thinking, Qwen3-VL-30B-Thinking)
Use extended parameters for deeper reasoning:
```bash
--max-new-tokens 40960
--temperature 1.0
--top-p 0.95
--top-k 20
--repetition-penalty 1.0
--presence-penalty 0.0
```
**Note:** Thinking models output reasoning steps wrapped in `<think>...</think>` tags. The evaluation automatically extracts the final answer after `</think>`.
## Performance Tips
1. **GPU Memory**: Adjust `--gpu-memory-utilization` based on your GPU:
- 0.9: Recommended for most cases
- 0.95: For maximum throughput (may cause OOM)
- 0.7-0.8: If experiencing OOM errors
2. **Batch Size**: vLLM automatically optimizes batch size based on available memory
3. **Tensor Parallelism**: Use `--tensor-parallel-size` for large models:
- 2B/4B models: 1-2 GPUs
- 7B/14B models: 2-4 GPUs
- 30B+ models: 4-8 GPUs
4. **Context Length**: Reduce `--max-model-len` if memory is limited:
- 128000: Default, works well for most cases
- 64000: Reduces memory usage by ~40%
5. **Image Resolution**: MathVision uses optimized resolution (768×28×28 to 5120×28×28)
- Lower min_pixels for faster processing
- Higher max_pixels for better accuracy on complex diagrams
## Troubleshooting
### Common Issues
**1. CUDA Out of Memory**
```bash
# Reduce GPU memory utilization
--gpu-memory-utilization 0.7
# Or reduce context length
--max-model-len 64000
```
**2. vLLM Multiprocessing Issues**
The code automatically sets `VLLM_WORKER_MULTIPROC_METHOD=spawn`. If you still encounter issues:
```bash
export VLLM_WORKER_MULTIPROC_METHOD=spawn
```
**3. Evaluation API Errors**
- Verify API credentials are set correctly
- Check API endpoint connectivity
- Monitor rate limits
- Decrease `--nproc` value if you are being rate-limited; increase it (up to 32) only when your API quota allows more throughput
**4. Dataset Download Issues**
The dataset is automatically downloaded from:
```
https://opencompass.openxlab.space/utils/VLMEval/MathVision.tsv
```
If download fails, manually download and place in `--data-dir`.
**5. Excel Export Errors**
The code automatically removes illegal Excel characters. If you still encounter issues:
- Check `clean_for_excel()` function in `run_mathv.py`
- Ensure `openpyxl` is installed
**6. LaTeX Conversion Errors**
Install `latex2sympy2` for better LaTeX support:
```bash
pip install latex2sympy2
```
## Advanced Usage
### Custom Image Resolution
Edit `run_mathv.py` to modify image resolution:
```python
MIN_PIXELS = 768*28*28 # ~0.6M pixels
MAX_PIXELS = 5120*28*28 # ~4M pixels
```
### Custom Evaluation Prompts
The evaluation uses in-context examples defined in `eval_utils.py`:
- Edit `get_gpt4_ICE()` to customize examples
- Edit `build_mathv_gpt4_prompt()` to modify prompt structure
### Testing with Limited Samples
Use `--num-samples` for quick testing:
```bash
python run_mathv.py infer \
--model-path /path/to/model \
--data-dir /path/to/data \
--dataset MathVision \
--output-file results/test.jsonl \
--num-samples 100
```
### Debugging
Enable debug mode for detailed logs:
```bash
DEBUG=true python run_mathv.py eval ...
```
This processes only the first 5 samples in single-threaded mode.
## Citation
If you use this code or the MathVision benchmark, please cite:
```bibtex
@article{mathvision,
title={Measuring Multimodal Mathematical Reasoning with MATH-Vision Dataset},
author={Ke Wang and Junting Pan and Weikang Shi and Zimu Lu and Mingjie Zhan and Hongsheng Li},
journal={arXiv:2402.14804},
year={2024}
}
```
## License
This code is released under the same license as the Qwen3-VL model.
## Support
For issues and questions:
- GitHub Issues: [Qwen3-VL Repository](https://github.com/QwenLM/Qwen3-VL)
- Documentation: See inline code comments and docstrings
import ast
import base64
import hashlib
import io
import os
from typing import List, Union

import requests
from PIL import Image
def encode_image_to_base64(image, target_size=None):
    """Encode a PIL image as a base64 JPEG string.

    Args:
        image: A PIL Image instance.
        target_size: Optional max dimension in pixels; when given, the image is
            resized so its longer side equals target_size, preserving aspect ratio.

    Returns:
        The JPEG bytes of the image, base64-encoded as a str.
    """
    if target_size is not None:
        width, height = image.size
        # Resize the image while maintaining the aspect ratio.
        if width > height:
            new_width = target_size
            new_height = int(height * target_size / width)
        else:
            new_height = target_size
            new_width = int(width * target_size / height)
        image = image.resize((new_width, new_height))
    # JPEG cannot store an alpha channel or palette; convert such modes to RGB
    # to avoid "cannot write mode RGBA as JPEG" errors on PNG inputs.
    if getattr(image, "mode", "RGB") not in ("RGB", "L"):
        image = image.convert("RGB")
    buffer = io.BytesIO()
    image.save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode('utf-8')
def decode_base64_to_image(base64_string):
    """Decode a base64-encoded string into a PIL Image."""
    raw_bytes = base64.b64decode(base64_string)
    buffer = io.BytesIO(raw_bytes)
    return Image.open(buffer)
def decode_base64_to_image_file(base64_string, output_path):
    """Decode a base64 string into an image and write it to *output_path*."""
    decode_base64_to_image(base64_string).save(output_path)
def download_file(url, local_path, timeout=60):
    """Stream a file from *url* to *local_path*.

    Args:
        url: HTTP(S) URL to download.
        local_path: Destination path on disk.
        timeout: Connect/read timeout in seconds (added so a stalled server
            cannot hang the pipeline indefinitely; the old code had no timeout).

    Raises:
        requests.HTTPError: If the server returns an error status.
    """
    # stream=True avoids loading the whole payload into memory; the context
    # manager guarantees the connection is released even on error.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(local_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
def md5(file_path):
    """Return the hexadecimal MD5 digest of the file at *file_path*."""
    digest = hashlib.md5()
    # Read in fixed-size chunks so large files never sit fully in memory.
    with open(file_path, "rb") as fh:
        while True:
            chunk = fh.read(4096)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()
def toliststr(s):
    """Normalize *s* into a list of strings.

    - A string that looks like a Python list literal (e.g. "['a', 'b']") is
      parsed and each element stringified.
    - Any other string (including the empty string, which crashed the old
      `s[0]` check with IndexError) becomes a one-element list.
    - A list is returned with each element stringified.

    Raises:
        NotImplementedError: For any other input type.
    """
    if isinstance(s, str) and s.startswith('[') and s.endswith(']'):
        # ast.literal_eval is a safe replacement for eval(): it only accepts
        # Python literals, so dataset content cannot execute arbitrary code.
        return [str(x) for x in ast.literal_eval(s)]
    elif isinstance(s, str):
        return [s]
    elif isinstance(s, list):
        return [str(x) for x in s]
    raise NotImplementedError
import os
import pandas as pd
import numpy as np
from typing import Dict, Any
from common_utils import download_file, md5, toliststr, decode_base64_to_image_file
# MathVision dataset URLs and MD5
# TSV download locations hosted by OpenCompass; keys are the dataset names
# accepted by load_dataset().
MATHVISION_DATASET_URL = {
    'MathVision': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision.tsv',
    'MathVision_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision_MINI.tsv'
}
# Expected MD5 checksums, used to detect stale or corrupt local copies and
# trigger a re-download.
MATHVISION_DATASET_MD5 = {
    'MathVision': '93f6de14f7916e598aa1b7165589831e',
    'MathVision_MINI': '060fe4fa5d868987ce179307bd5f8a33'
}
def load_dataset(dataset_name='MathVision'):
    """Load the MathVision dataset as a pandas DataFrame.

    Downloads the TSV from OpenCompass when the local copy is missing or its
    MD5 does not match, then normalizes the index/image/image_path columns.
    """
    # NOTE(review): requires the LMUData environment variable to point at the
    # dataset cache directory — raises KeyError if unset; confirm callers set it.
    data_root = os.path.join(os.environ['LMUData'])
    os.makedirs(data_root, exist_ok=True)
    file_name = f"{dataset_name}.tsv"
    data_path = os.path.join(data_root, file_name)
    # Download if not exists or MD5 doesn't match
    if dataset_name in MATHVISION_DATASET_MD5:
        expected_md5 = MATHVISION_DATASET_MD5[dataset_name]
        if not os.path.exists(data_path) or md5(data_path) != expected_md5:
            print(f"Downloading {dataset_name} dataset...")
            download_file(MATHVISION_DATASET_URL[dataset_name], data_path)
    # Load the dataset
    data = pd.read_csv(data_path, sep='\t')
    # Process the dataset: index is stringified first so it can serve as a
    # dict key for the image cross-reference resolution below.
    data['index'] = [str(x) for x in data['index']]
    # Handle image data
    if 'image' in data:
        data['image'] = [str(x) for x in data['image']]
        image_map = {x: y for x, y in zip(data['index'], data['image'])}
        # Short values (<= 64 chars) are cross-references to another row's
        # index rather than base64 payloads; resolve them to the real image.
        # The assert enforces that references are one level deep.
        for k in image_map:
            if len(image_map[k]) <= 64:
                idx = image_map[k]
                assert idx in image_map and len(image_map[idx]) > 64
                image_map[k] = image_map[idx]
        images = [toliststr(image_map[k]) for k in data['index']]
        # Unwrap single-image lists to a bare string for convenience.
        data['image'] = [x[0] if len(x) == 1 else x for x in images]
    # Handle image paths (same single-element unwrapping as above)
    if 'image_path' in data:
        paths = [toliststr(x) for x in data['image_path']]
        data['image_path'] = [x[0] if len(x) == 1 else x for x in paths]
    # Convert index to int if possible
    if np.all([isinstance(x, int) or x.isdigit() for x in data['index']]):
        data['index'] = [int(x) for x in data['index']]
    return data
def dump_image(line, img_root):
    """Materialize a record's image data under *img_root* and return the
    list of file paths (always a list, even for a single image)."""
    os.makedirs(img_root, exist_ok=True)
    if 'image' not in line:
        # No embedded payload: the record must already carry file paths.
        assert 'image_path' in line
        return toliststr(line['image_path'])
    if isinstance(line['image'], list):
        # Multiple images: pair each base64 payload with its declared name.
        assert 'image_path' in line
        saved_paths = []
        for payload, file_name in zip(line['image'], line['image_path']):
            full_path = os.path.join(img_root, file_name)
            if not os.path.exists(full_path):
                decode_base64_to_image_file(payload, full_path)
            saved_paths.append(full_path)
        return saved_paths
    # Single image: name it after the record index.
    single_path = os.path.join(img_root, f"{line['index']}.jpg")
    if not os.path.exists(single_path):
        decode_base64_to_image_file(line['image'], single_path)
    return [single_path]
#!/bin/bash
# MathVision Evaluation Script (Instruct Model)
# This script evaluates the inference results using GPT-4o

# Edit these paths to match your environment.
DATA_DIR="/path/to/mathvision_data"
INPUT_FILE="results/mathvision_predictions.jsonl"
OUTPUT_FILE="results/mathvision_eval_results.csv"

python run_mathv.py eval \
    --data-dir "${DATA_DIR}" \
    --input-file "${INPUT_FILE}" \
    --output-file "${OUTPUT_FILE}" \
    --dataset MathVision \
    --eval-model gpt-4o-2024-05-13 \
    --api-type dash \
    --nproc 16
#!/bin/bash
# MathVision Evaluation Script (Thinking Model)
# This script evaluates the inference results from thinking model using GPT-4o

# Edit these paths to match your environment.
DATA_DIR="/path/to/mathvision_data"
INPUT_FILE="results/mathvision_predictions_thinking.jsonl"
OUTPUT_FILE="results/mathvision_eval_results_thinking.csv"

python run_mathv.py eval \
    --data-dir "${DATA_DIR}" \
    --input-file "${INPUT_FILE}" \
    --output-file "${OUTPUT_FILE}" \
    --dataset MathVision \
    --eval-model gpt-4o-2024-05-13 \
    --api-type dash \
    --nproc 16
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment