Commit 3d735feb authored by luopl's avatar luopl
Browse files

"Initial commit"

parents
Pipeline #3074 canceled with stages
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
from typing import Union, Tuple, List
from qwen_agent.tools.base import BaseTool, register_tool
@register_tool("mobile_use")
class MobileUse(BaseTool):
    """Agent tool for touchscreen interaction with a mobile device.

    Concrete backends subclass this and implement the underscore-prefixed
    action handlers (e.g. via adb or an emulator); this base class only
    declares the tool-call schema and dispatches validated calls.
    """

    @property
    def description(self):
        # The description embeds the live screen resolution so the model can
        # emit pixel coordinates that fit the device.
        return f"""
Use a touchscreen to interact with a mobile device, and take screenshots.
* This is an interface to a mobile device with touchscreen. You can perform actions like clicking, typing, swiping, etc.
* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions.
* The screen's resolution is {self.display_width_px}x{self.display_height_px}.
* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.
""".strip()

    # JSON-schema style declaration of the tool-call arguments.
    parameters = {
        "properties": {
            "action": {
                "description": """
The action to perform. The available actions are:
* `key`: Perform a key event on the mobile device.
- This supports adb's `keyevent` syntax.
- Examples: "volume_up", "volume_down", "power", "camera", "clear".
* `click`: Click the point on the screen with coordinate (x, y).
* `long_press`: Press the point on the screen with coordinate (x, y) for specified seconds.
* `swipe`: Swipe from the starting point with coordinate (x, y) to the end point with coordinates2 (x2, y2).
* `type`: Input the specified text into the activated input box.
* `system_button`: Press the system button.
* `open`: Open an app on the device.
* `wait`: Wait specified seconds for the change to happen.
* `terminate`: Terminate the current task and report its completion status.
""".strip(),
                "enum": [
                    "key",
                    "click",
                    "long_press",
                    "swipe",
                    "type",
                    "system_button",
                    "open",
                    "wait",
                    "terminate",
                ],
                "type": "string",
            },
            "coordinate": {
                "description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=click`, `action=long_press`, and `action=swipe`.",
                "type": "array",
            },
            "coordinate2": {
                "description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=swipe`.",
                "type": "array",
            },
            "text": {
                "description": "Required only by `action=key`, `action=type`, and `action=open`.",
                "type": "string",
            },
            "time": {
                "description": "The seconds to wait. Required only by `action=long_press` and `action=wait`.",
                "type": "number",
            },
            "button": {
                "description": "Back means returning to the previous interface, Home means returning to the desktop, Menu means opening the application background menu, and Enter means pressing the enter. Required only by `action=system_button`",
                "enum": [
                    "Back",
                    "Home",
                    "Menu",
                    "Enter",
                ],
                "type": "string",
            },
            "status": {
                "description": "The status of the task. Required only by `action=terminate`.",
                "type": "string",
                "enum": ["success", "failure"],
            },
        },
        "required": ["action"],
        "type": "object",
    }

    def __init__(self, cfg=None):
        # The screen geometry is required by `description`; fail early with a
        # clear message instead of an opaque TypeError/KeyError (the original
        # crashed when called with the default cfg=None).
        cfg = cfg or {}
        if "display_width_px" not in cfg or "display_height_px" not in cfg:
            raise ValueError(
                "MobileUse requires 'display_width_px' and 'display_height_px' in cfg"
            )
        self.display_width_px = cfg["display_width_px"]
        self.display_height_px = cfg["display_height_px"]
        super().__init__(cfg)

    def call(self, params: Union[str, dict], **kwargs):
        """Validate `params` (JSON string or dict) and dispatch to the handler.

        Raises:
            ValueError: If `action` is not one of the supported actions.
            KeyError: If a required argument for the action is missing.
        """
        params = self._verify_json_format_args(params)
        action = params["action"]
        if action == "key":
            return self._key(params["text"])
        elif action == "click":
            return self._click(
                coordinate=params["coordinate"]
            )
        elif action == "long_press":
            return self._long_press(
                coordinate=params["coordinate"], time=params["time"]
            )
        elif action == "swipe":
            return self._swipe(
                coordinate=params["coordinate"], coordinate2=params["coordinate2"]
            )
        elif action == "type":
            return self._type(params["text"])
        elif action == "system_button":
            return self._system_button(params["button"])
        elif action == "open":
            return self._open(params["text"])
        elif action == "wait":
            return self._wait(params["time"])
        elif action == "terminate":
            return self._terminate(params["status"])
        else:
            raise ValueError(f"Unknown action: {action}")

    # --- Action handlers: implemented by device-specific subclasses. ---

    def _key(self, text: str):
        raise NotImplementedError()

    def _click(self, coordinate: Tuple[int, int]):
        raise NotImplementedError()

    def _long_press(self, coordinate: Tuple[int, int], time: int):
        raise NotImplementedError()

    def _swipe(self, coordinate: Tuple[int, int], coordinate2: Tuple[int, int]):
        raise NotImplementedError()

    def _type(self, text: str):
        raise NotImplementedError()

    def _system_button(self, button: str):
        raise NotImplementedError()

    def _open(self, text: str):
        raise NotImplementedError()

    def _wait(self, time: int):
        raise NotImplementedError()

    def _terminate(self, status: str):
        raise NotImplementedError()
@register_tool("computer_use")
class ComputerUse(BaseTool):
    """Agent tool for mouse/keyboard interaction with a desktop GUI.

    Concrete backends subclass this and implement the underscore-prefixed
    action handlers; this base class only declares the tool-call schema and
    dispatches validated calls.
    """

    @property
    def description(self):
        # The description embeds the live screen resolution so the model can
        # emit pixel coordinates that fit the display.
        return f"""
Use a mouse and keyboard to interact with a computer, and take screenshots.
* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.
* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.
* The screen's resolution is {self.display_width_px}x{self.display_height_px}.
* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.
* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.
* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges.
""".strip()

    # JSON-schema style declaration of the tool-call arguments.
    parameters = {
        "properties": {
            "action": {
                "description": """
The action to perform. The available actions are:
* `key`: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.
* `type`: Type a string of text on the keyboard.
* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.
* `left_click`: Click the left mouse button at a specified (x, y) pixel coordinate on the screen.
* `left_click_drag`: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.
* `right_click`: Click the right mouse button at a specified (x, y) pixel coordinate on the screen.
* `middle_click`: Click the middle mouse button at a specified (x, y) pixel coordinate on the screen.
* `double_click`: Double-click the left mouse button at a specified (x, y) pixel coordinate on the screen.
* `triple_click`: Triple-click the left mouse button at a specified (x, y) pixel coordinate on the screen (simulated as double-click since it's the closest action).
* `scroll`: Performs a scroll of the mouse scroll wheel.
* `hscroll`: Performs a horizontal scroll (mapped to regular scroll).
* `wait`: Wait specified seconds for the change to happen.
* `terminate`: Terminate the current task and report its completion status.
* `answer`: Answer a question.
""".strip(),
                "enum": [
                    "key",
                    "type",
                    "mouse_move",
                    "left_click",
                    "left_click_drag",
                    "right_click",
                    "middle_click",
                    "double_click",
                    "triple_click",
                    "scroll",
                    "hscroll",
                    "wait",
                    "terminate",
                    "answer",
                ],
                "type": "string",
            },
            "keys": {
                "description": "Required only by `action=key`.",
                "type": "array",
            },
            "text": {
                "description": "Required only by `action=type` and `action=answer`.",
                "type": "string",
            },
            "coordinate": {
                "description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to.",
                "type": "array",
            },
            "pixels": {
                "description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by `action=scroll` and `action=hscroll`.",
                "type": "number",
            },
            "time": {
                "description": "The seconds to wait. Required only by `action=wait`.",
                "type": "number",
            },
            "status": {
                "description": "The status of the task. Required only by `action=terminate`.",
                "type": "string",
                "enum": ["success", "failure"],
            },
        },
        "required": ["action"],
        "type": "object",
    }

    def __init__(self, cfg=None):
        # The screen geometry is required by `description`; fail early with a
        # clear message instead of an opaque TypeError/KeyError (the original
        # crashed when called with the default cfg=None).
        cfg = cfg or {}
        if "display_width_px" not in cfg or "display_height_px" not in cfg:
            raise ValueError(
                "ComputerUse requires 'display_width_px' and 'display_height_px' in cfg"
            )
        self.display_width_px = cfg["display_width_px"]
        self.display_height_px = cfg["display_height_px"]
        super().__init__(cfg)

    def call(self, params: Union[str, dict], **kwargs):
        """Validate `params` (JSON string or dict) and dispatch to the handler.

        Raises:
            ValueError: If `action` is not one of the supported actions.
            KeyError: If a required argument for the action is missing.
        """
        params = self._verify_json_format_args(params)
        action = params["action"]
        if action in ["left_click", "right_click", "middle_click", "double_click", "triple_click"]:
            return self._mouse_click(action)
        elif action == "key":
            return self._key(params["keys"])
        elif action == "type":
            return self._type(params["text"])
        elif action == "mouse_move":
            return self._mouse_move(params["coordinate"])
        elif action == "left_click_drag":
            return self._left_click_drag(params["coordinate"])
        elif action == "scroll":
            return self._scroll(params["pixels"])
        elif action == "hscroll":
            return self._hscroll(params["pixels"])
        elif action == "answer":
            return self._answer(params["text"])
        elif action == "wait":
            return self._wait(params["time"])
        elif action == "terminate":
            return self._terminate(params["status"])
        else:
            raise ValueError(f"Invalid action: {action}")

    # --- Action handlers: implemented by backend-specific subclasses. ---

    def _mouse_click(self, button: str):
        raise NotImplementedError()

    def _key(self, keys: List[str]):
        raise NotImplementedError()

    def _type(self, text: str):
        raise NotImplementedError()

    def _mouse_move(self, coordinate: Tuple[int, int]):
        raise NotImplementedError()

    def _left_click_drag(self, coordinate: Tuple[int, int]):
        raise NotImplementedError()

    def _scroll(self, pixels: int):
        raise NotImplementedError()

    def _hscroll(self, pixels: int):
        raise NotImplementedError()

    def _answer(self, text: str):
        raise NotImplementedError()

    def _wait(self, time: int):
        raise NotImplementedError()

    def _terminate(self, status: str):
        raise NotImplementedError()
\ No newline at end of file
import os
from playwright.sync_api import sync_playwright
import argparse
from PIL import Image
import time
def take_screenshot(url, output_file="screenshot.png", blank_size=(1280, 960)):
    """Render *url* (or a local HTML file path) in headless Chromium and save a
    full-page PNG screenshot to *output_file*.

    Args:
        url: A URL or local file path; local paths are converted to file:// URLs.
        output_file: Destination path for the PNG.
        blank_size: (width, height) of the white placeholder image written when
            rendering fails (generalized from the previously hard-coded 1280x960).
    """
    # Convert a local filesystem path into a file:// URL the browser can load.
    if os.path.exists(url):
        url = "file://" + os.path.abspath(url)
    try:
        with sync_playwright() as p:
            # --disable-web-security lets local pages load cross-origin assets.
            browser = p.chromium.launch(headless=True, args=["--disable-web-security"])
            page = browser.new_page()
            page.goto(url, timeout=60000)
            # animations="disabled" makes the capture deterministic.
            page.screenshot(path=output_file, full_page=True, animations="disabled", timeout=60000)
            browser.close()
    except Exception as e:
        # Best-effort fallback: emit a blank placeholder instead of failing the
        # whole batch over one bad page.
        print(f"Failed to take screenshot due to: {e}. Generating a blank image.")
        img = Image.new("RGB", blank_size, color="white")
        img.save(output_file)
if __name__ == "__main__":
    # Command-line entry point: render the given HTML file to a PNG screenshot.
    cli = argparse.ArgumentParser(description="Process two path strings.")
    cli.add_argument("--html", type=str)
    cli.add_argument("--png", type=str)
    opts = cli.parse_args()
    take_screenshot(opts.html, opts.png)
This diff is collapsed.
This diff is collapsed.
doc/dog.jpg

46.1 KB

# MathVision Benchmark Evaluation
This directory contains the implementation for evaluating vision-language models on the MathVision benchmark using vLLM for high-speed inference.
## Overview
MathVision is a mathematical visual reasoning benchmark that evaluates models' ability to solve mathematical problems based on visual information. This implementation provides:
- **High-speed inference** using vLLM with automatic batch optimization
- **Two-stage evaluation** using rule-based and GPT-4o-based answer extraction
- **Support for thinking models** with extended reasoning capabilities
- **Modular code structure** for easy maintenance and extension
## Project Structure
```
MathVision/
├── run_mathv.py # Main script for inference and evaluation
├── dataset_utils.py # Dataset loading and preprocessing utilities
├── eval_utils.py # Evaluation logic and answer extraction
├── common_utils.py # Common utilities for image processing, file I/O
├── infer_instruct.sh # Inference script for instruct models
├── infer_think.sh # Inference script for thinking models
├── eval_instruct.sh # Evaluation script for instruct model results
├── eval_think.sh # Evaluation script for thinking model results
├── requirements.txt # Python dependencies
└── README.md # This file
```
## Requirements
### Python Dependencies
```bash
pip install -r requirements.txt
```
Key dependencies:
- `vllm` - High-speed LLM inference engine
- `transformers` - HuggingFace transformers
- `qwen_vl_utils` - Qwen VL utilities for vision processing
- `pandas`, `numpy` - Data processing
- `Pillow` - Image processing
- `latex2sympy2` - LaTeX to symbolic math conversion (optional)
- `openpyxl` - Excel file handling
- `requests` - API calls for evaluation
### Environment Variables
For evaluation, you need to set up API credentials for the judge model:
**Option 1: DashScope API (Recommended)**
```bash
export CHATGPT_DASHSCOPE_API_KEY="your-api-key"
export DASHSCOPE_API_BASE="https://dashscope.aliyuncs.com/compatible-mode/v1/chat/completions"
```
**Option 2: Custom OpenAI-compatible API**
```bash
export MIT_SPIDER_TOKEN="your-api-key"
export MIT_SPIDER_URL="your-api-endpoint"
```
## Quick Start
### 1. Inference
Run inference on MathVision dataset using an instruct model:
```bash
bash infer_instruct.sh
```
Or customize the inference:
```bash
python run_mathv.py infer \
--model-path /path/to/Qwen3-VL-Instruct \
--data-dir /path/to/data \
--dataset MathVision \
--output-file results/predictions.jsonl \
--max-new-tokens 32768 \
--temperature 0.7 \
--top-p 0.8 \
--top-k 20 \
--repetition-penalty 1.0 \
--presence-penalty 1.5
```
For thinking models with extended reasoning:
```bash
bash infer_think.sh
```
### 2. Evaluation
Evaluate the inference results using GPT-4o as a judge:
```bash
bash eval_instruct.sh
```
Or customize the evaluation:
```bash
python run_mathv.py eval \
--data-dir /path/to/data \
--input-file results/predictions.jsonl \
--output-file results/evaluation.csv \
--dataset MathVision \
--eval-model gpt-4o-2024-05-13 \
--api-type dash \
--nproc 16
```
## Detailed Usage
### Inference Mode
**Basic Arguments:**
- `--model-path`: Path to the Qwen3-VL model directory (required)
- `--data-dir`: Directory to store/load MathVision dataset (required)
- `--dataset`: Dataset name (default: `MathVision`)
- `MathVision`: Full dataset with ~3,000 samples
- `MathVision_MINI`: Mini version for quick testing
- `--output-file`: Path to save inference results in JSONL format (required)
**vLLM Arguments:**
- `--tensor-parallel-size`: Number of GPUs for tensor parallelism (default: auto-detect)
- `--gpu-memory-utilization`: GPU memory utilization ratio, 0.0-1.0 (default: 0.9)
- `--max-model-len`: Maximum model context length (default: 128000)
- `--max-images-per-prompt`: Maximum images per prompt (default: 10)
**Generation Parameters:**
- `--max-new-tokens`: Maximum tokens to generate (default: 32768)
- `--temperature`: Sampling temperature (default: 0.7)
- `--top-p`: Top-p sampling (default: 0.8)
- `--top-k`: Top-k sampling (default: 20)
- `--repetition-penalty`: Repetition penalty (default: 1.0)
- `--presence-penalty`: Presence penalty to reduce repetition (default: 1.5)
**Advanced Options:**
- `--use-cot`: Enable Chain-of-Thought prompting for better reasoning
- `--cot-prompt`: Custom CoT prompt (default: " Let's think step by step.")
- `--num-samples`: Number of samples to process (optional, for testing)
### Evaluation Mode
**Basic Arguments:**
- `--data-dir`: Directory containing MathVision dataset (required)
- `--input-file`: Inference results file in JSONL format (required)
- `--output-file`: Path to save evaluation results in CSV format (required)
- `--dataset`: Dataset name, must match inference (default: `MathVision`)
**Judge Model Arguments:**
- `--eval-model`: Judge model name (default: `gpt-4o`)
- Options: `gpt-4o`, `gpt-4o-2024-05-13`, `gpt-3.5-turbo-0125`, etc.
- `--api-type`: API service type (default: `dash`)
- `dash`: DashScope API (Alibaba Cloud)
- `mit`: Custom OpenAI-compatible API
- `--nproc`: Number of parallel workers for evaluation (default: 4)
## Output Files
### Inference Output
The inference script generates a JSONL file where each line contains:
```json
{
"question_id": 123,
"annotation": {
"index": 123,
"question": "What is the area of the triangle?",
"answer": "12",
"category": "Geometry",
"choices": "[]",
...
},
"task": "MathVision",
"result": {
"gen": "The final answer",
"gen_raw": "Raw model output including thinking process"
},
"messages": [...]
}
```
### Evaluation Output
The evaluation script generates multiple files:
1. **Intermediate results** (`*_eval_results.xlsx`): Raw predictions with metadata
2. **Detailed evaluation** (`*_eval_results_eval.xlsx`): Results with extracted answers
- Columns: `index`, `question`, `prediction`, `answer`, `res` (extracted), `log`, `extract_model`, `extract_flag`, `category`
3. **Score summary** (`*_eval_results_eval_score.csv`): Accuracy by category
Example score summary:
```
Subject | tot | prefetch | hit | prefetch_rate | acc
----------------|-----|----------|-----|---------------|------
Overall | 3000| 2400 | 2100| 80.0 | 70.0
Algebra | 800 | 640 | 560 | 80.0 | 70.0
Geometry | 750 | 600 | 525 | 80.0 | 70.0
```
## Model-Specific Configurations
### Instruct Models (e.g., Qwen3-VL-2B-Instruct, Qwen3-VL-30B-Instruct)
Use standard parameters for balanced performance:
```bash
--max-new-tokens 32768
--temperature 0.7
--top-p 0.8
--top-k 20
--repetition-penalty 1.0
--presence-penalty 1.5
```
### Thinking Models (e.g., Qwen3-VL-4B-Thinking, Qwen3-VL-30B-Thinking)
Use extended parameters for deeper reasoning:
```bash
--max-new-tokens 40960
--temperature 1.0
--top-p 0.95
--top-k 20
--repetition-penalty 1.0
--presence-penalty 0.0
```
**Note:** Thinking models output reasoning steps wrapped in `<think>...</think>` tags. The evaluation automatically extracts the final answer after `</think>`.
## Performance Tips
1. **GPU Memory**: Adjust `--gpu-memory-utilization` based on your GPU:
- 0.9: Recommended for most cases
- 0.95: For maximum throughput (may cause OOM)
- 0.7-0.8: If experiencing OOM errors
2. **Batch Size**: vLLM automatically optimizes batch size based on available memory
3. **Tensor Parallelism**: Use `--tensor-parallel-size` for large models:
- 2B/4B models: 1-2 GPUs
- 7B/14B models: 2-4 GPUs
- 30B+ models: 4-8 GPUs
4. **Context Length**: Reduce `--max-model-len` if memory is limited:
- 128000: Default, works well for most cases
- 64000: Reduces memory usage by ~40%
5. **Image Resolution**: MathVision uses optimized resolution (768×28×28 to 5120×28×28)
- Lower min_pixels for faster processing
- Higher max_pixels for better accuracy on complex diagrams
## Troubleshooting
### Common Issues
**1. CUDA Out of Memory**
```bash
# Reduce GPU memory utilization
--gpu-memory-utilization 0.7
# Or reduce context length
--max-model-len 64000
```
**2. vLLM Multiprocessing Issues**
The code automatically sets `VLLM_WORKER_MULTIPROC_METHOD=spawn`. If you still encounter issues:
```bash
export VLLM_WORKER_MULTIPROC_METHOD=spawn
```
**3. Evaluation API Errors**
- Verify API credentials are set correctly
- Check API endpoint connectivity
- Monitor rate limits
- Decrease `--nproc` value if you are being rate-limited; increase it (up to 32) only when your API quota allows more throughput
**4. Dataset Download Issues**
The dataset is automatically downloaded from:
```
https://opencompass.openxlab.space/utils/VLMEval/MathVision.tsv
```
If download fails, manually download and place in `--data-dir`.
**5. Excel Export Errors**
The code automatically removes illegal Excel characters. If you still encounter issues:
- Check `clean_for_excel()` function in `run_mathv.py`
- Ensure `openpyxl` is installed
**6. LaTeX Conversion Errors**
Install `latex2sympy2` for better LaTeX support:
```bash
pip install latex2sympy2
```
## Advanced Usage
### Custom Image Resolution
Edit `run_mathv.py` to modify image resolution:
```python
MIN_PIXELS = 768*28*28 # ~0.6M pixels
MAX_PIXELS = 5120*28*28 # ~4M pixels
```
### Custom Evaluation Prompts
The evaluation uses in-context examples defined in `eval_utils.py`:
- Edit `get_gpt4_ICE()` to customize examples
- Edit `build_mathv_gpt4_prompt()` to modify prompt structure
### Testing with Limited Samples
Use `--num-samples` for quick testing:
```bash
python run_mathv.py infer \
--model-path /path/to/model \
--data-dir /path/to/data \
--dataset MathVision \
--output-file results/test.jsonl \
--num-samples 100
```
### Debugging
Enable debug mode for detailed logs:
```bash
DEBUG=true python run_mathv.py eval ...
```
This processes only the first 5 samples in single-threaded mode.
## Citation
If you use this code or the MathVision benchmark, please cite:
```bibtex
@article{mathvision,
title={Measuring Multimodal Mathematical Reasoning with MATH-Vision Dataset},
author={Ke Wang and Junting Pan and Weikang Shi and Zimu Lu and Mingjie Zhan and Hongsheng Li},
journal={arXiv:2402.14804},
year={2024}
}
```
## License
This code is released under the same license as the Qwen3-VL model.
## Support
For issues and questions:
- GitHub Issues: [Qwen3-VL Repository](https://github.com/QwenLM/Qwen3-VL)
- Documentation: See inline code comments and docstrings
import ast
import base64
import hashlib
import io
import os
from typing import List, Union

import requests
from PIL import Image
def encode_image_to_base64(image, target_size=None):
    """Encode a PIL image as a base64 JPEG string.

    Args:
        image: A PIL Image instance.
        target_size: Optional max dimension in pixels; when given, the image is
            resized so its longer side equals target_size, preserving aspect ratio.

    Returns:
        The JPEG bytes of the image, base64-encoded as a str.
    """
    if target_size is not None:
        width, height = image.size
        # Resize the image while maintaining the aspect ratio.
        if width > height:
            new_width = target_size
            new_height = int(height * target_size / width)
        else:
            new_height = target_size
            new_width = int(width * target_size / height)
        image = image.resize((new_width, new_height))
    # JPEG cannot store an alpha channel or palette; convert such modes to RGB
    # to avoid "cannot write mode RGBA as JPEG" errors on PNG inputs.
    if getattr(image, "mode", "RGB") not in ("RGB", "L"):
        image = image.convert("RGB")
    buffer = io.BytesIO()
    image.save(buffer, format="JPEG")
    return base64.b64encode(buffer.getvalue()).decode('utf-8')
def decode_base64_to_image(base64_string):
    """Decode a base64-encoded string into a PIL Image."""
    raw_bytes = base64.b64decode(base64_string)
    buffer = io.BytesIO(raw_bytes)
    return Image.open(buffer)
def decode_base64_to_image_file(base64_string, output_path):
    """Decode a base64 string into an image and write it to *output_path*."""
    decode_base64_to_image(base64_string).save(output_path)
def download_file(url, local_path, timeout=60):
    """Stream a file from *url* to *local_path*.

    Args:
        url: HTTP(S) URL to download.
        local_path: Destination path on disk.
        timeout: Connect/read timeout in seconds (added so a stalled server
            cannot hang the pipeline indefinitely; the old code had no timeout).

    Raises:
        requests.HTTPError: If the server returns an error status.
    """
    # stream=True avoids loading the whole payload into memory; the context
    # manager guarantees the connection is released even on error.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(local_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
def md5(file_path):
    """Return the hexadecimal MD5 digest of the file at *file_path*."""
    digest = hashlib.md5()
    # Read in fixed-size chunks so large files never sit fully in memory.
    with open(file_path, "rb") as fh:
        while True:
            chunk = fh.read(4096)
            if not chunk:
                break
            digest.update(chunk)
    return digest.hexdigest()
def toliststr(s):
    """Normalize *s* into a list of strings.

    - A string that looks like a Python list literal (e.g. "['a', 'b']") is
      parsed and each element stringified.
    - Any other string (including the empty string, which crashed the old
      `s[0]` check with IndexError) becomes a one-element list.
    - A list is returned with each element stringified.

    Raises:
        NotImplementedError: For any other input type.
    """
    if isinstance(s, str) and s.startswith('[') and s.endswith(']'):
        # ast.literal_eval is a safe replacement for eval(): it only accepts
        # Python literals, so dataset content cannot execute arbitrary code.
        return [str(x) for x in ast.literal_eval(s)]
    elif isinstance(s, str):
        return [s]
    elif isinstance(s, list):
        return [str(x) for x in s]
    raise NotImplementedError
import os
import pandas as pd
import numpy as np
from typing import Dict, Any
from common_utils import download_file, md5, toliststr, decode_base64_to_image_file
# MathVision dataset URLs and MD5
# TSV download locations hosted by OpenCompass; keys are the dataset names
# accepted by load_dataset().
MATHVISION_DATASET_URL = {
    'MathVision': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision.tsv',
    'MathVision_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision_MINI.tsv'
}
# Expected MD5 checksums, used to detect stale or corrupt local copies and
# trigger a re-download.
MATHVISION_DATASET_MD5 = {
    'MathVision': '93f6de14f7916e598aa1b7165589831e',
    'MathVision_MINI': '060fe4fa5d868987ce179307bd5f8a33'
}
def load_dataset(dataset_name='MathVision'):
    """Load the MathVision dataset as a pandas DataFrame.

    Downloads the TSV from OpenCompass when the local copy is missing or its
    MD5 does not match, then normalizes the index/image/image_path columns.
    """
    # NOTE(review): requires the LMUData environment variable to point at the
    # dataset cache directory — raises KeyError if unset; confirm callers set it.
    data_root = os.path.join(os.environ['LMUData'])
    os.makedirs(data_root, exist_ok=True)
    file_name = f"{dataset_name}.tsv"
    data_path = os.path.join(data_root, file_name)
    # Download if not exists or MD5 doesn't match
    if dataset_name in MATHVISION_DATASET_MD5:
        expected_md5 = MATHVISION_DATASET_MD5[dataset_name]
        if not os.path.exists(data_path) or md5(data_path) != expected_md5:
            print(f"Downloading {dataset_name} dataset...")
            download_file(MATHVISION_DATASET_URL[dataset_name], data_path)
    # Load the dataset
    data = pd.read_csv(data_path, sep='\t')
    # Process the dataset: index is stringified first so it can serve as a
    # dict key for the image cross-reference resolution below.
    data['index'] = [str(x) for x in data['index']]
    # Handle image data
    if 'image' in data:
        data['image'] = [str(x) for x in data['image']]
        image_map = {x: y for x, y in zip(data['index'], data['image'])}
        # Short values (<= 64 chars) are cross-references to another row's
        # index rather than base64 payloads; resolve them to the real image.
        # The assert enforces that references are one level deep.
        for k in image_map:
            if len(image_map[k]) <= 64:
                idx = image_map[k]
                assert idx in image_map and len(image_map[idx]) > 64
                image_map[k] = image_map[idx]
        images = [toliststr(image_map[k]) for k in data['index']]
        # Unwrap single-image lists to a bare string for convenience.
        data['image'] = [x[0] if len(x) == 1 else x for x in images]
    # Handle image paths (same single-element unwrapping as above)
    if 'image_path' in data:
        paths = [toliststr(x) for x in data['image_path']]
        data['image_path'] = [x[0] if len(x) == 1 else x for x in paths]
    # Convert index to int if possible
    if np.all([isinstance(x, int) or x.isdigit() for x in data['index']]):
        data['index'] = [int(x) for x in data['index']]
    return data
def dump_image(line, img_root):
    """Materialize a record's image data under *img_root* and return the
    list of file paths (always a list, even for a single image)."""
    os.makedirs(img_root, exist_ok=True)
    if 'image' not in line:
        # No embedded payload: the record must already carry file paths.
        assert 'image_path' in line
        return toliststr(line['image_path'])
    if isinstance(line['image'], list):
        # Multiple images: pair each base64 payload with its declared name.
        assert 'image_path' in line
        saved_paths = []
        for payload, file_name in zip(line['image'], line['image_path']):
            full_path = os.path.join(img_root, file_name)
            if not os.path.exists(full_path):
                decode_base64_to_image_file(payload, full_path)
            saved_paths.append(full_path)
        return saved_paths
    # Single image: name it after the record index.
    single_path = os.path.join(img_root, f"{line['index']}.jpg")
    if not os.path.exists(single_path):
        decode_base64_to_image_file(line['image'], single_path)
    return [single_path]
#!/bin/bash
# MathVision Evaluation Script (Instruct Model)
# This script evaluates the inference results using GPT-4o

# Edit these paths to match your environment.
DATA_DIR="/path/to/mathvision_data"
INPUT_FILE="results/mathvision_predictions.jsonl"
OUTPUT_FILE="results/mathvision_eval_results.csv"

python run_mathv.py eval \
    --data-dir "${DATA_DIR}" \
    --input-file "${INPUT_FILE}" \
    --output-file "${OUTPUT_FILE}" \
    --dataset MathVision \
    --eval-model gpt-4o-2024-05-13 \
    --api-type dash \
    --nproc 16
#!/bin/bash
# MathVision Evaluation Script (Thinking Model)
# This script evaluates the inference results from thinking model using GPT-4o

# Edit these paths to match your environment.
DATA_DIR="/path/to/mathvision_data"
INPUT_FILE="results/mathvision_predictions_thinking.jsonl"
OUTPUT_FILE="results/mathvision_eval_results_thinking.csv"

python run_mathv.py eval \
    --data-dir "${DATA_DIR}" \
    --input-file "${INPUT_FILE}" \
    --output-file "${OUTPUT_FILE}" \
    --dataset MathVision \
    --eval-model gpt-4o-2024-05-13 \
    --api-type dash \
    --nproc 16
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment