Commit 26e59280 authored by wanglch

Initial commit
import math
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer, AutoConfig
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
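# dynamic_preprocess: pick the tile grid (rows x cols, with between min_num and max_num
# tiles) whose aspect ratio is closest to the input image, resize the image to that grid,
# crop it into image_size x image_size tiles, and optionally append a thumbnail of the
# whole image as an extra tile.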
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
def load_image(image_file, input_size=448, max_num=12):
image = Image.open(image_file).convert('RGB')
transform = build_transform(input_size=input_size)
images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values
def split_model(model_name):
device_map = {}
world_size = torch.cuda.device_count()
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
num_layers = config.llm_config.num_hidden_layers
# Since the first GPU will be used for ViT, treat it as half a GPU.
num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
num_layers_per_gpu = [num_layers_per_gpu] * world_size
num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
layer_cnt = 0
for i, num_layer in enumerate(num_layers_per_gpu):
for j in range(num_layer):
device_map[f'language_model.model.layers.{layer_cnt}'] = i
layer_cnt += 1
device_map['vision_model'] = 0
device_map['mlp1'] = 0
device_map['language_model.model.tok_embeddings'] = 0
device_map['language_model.model.embed_tokens'] = 0
device_map['language_model.output'] = 0
device_map['language_model.model.norm'] = 0
device_map['language_model.model.rotary_emb'] = 0
device_map['language_model.lm_head'] = 0
device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
return device_map
# Note: the 80GB-GPU guidance in the official examples (two GPUs with `load_in_8bit=True`,
# three or more with `load_in_8bit=False`) targets the largest InternVL models;
# InternVL3-1B in bf16 fits comfortably on a single GPU.
path = '/home/wanglch/InternVL/InternVL3-1B/'
device_map = split_model(path)  # pass the local checkpoint path so AutoConfig can find its config
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
load_in_8bit=False,
low_cpu_mem_usage=True,
use_flash_attn=False,
trust_remote_code=True,
device_map=device_map).eval()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
# set the max number of tiles in `max_num`
pixel_values = load_image('/home/wanglch/Images/8.jpg', max_num=12).to(torch.bfloat16).cuda()
generation_config = dict(max_new_tokens=1024, do_sample=True)
# single-image single-round conversation (单图单轮对话)
question = '<image>\n提取图片中的文字信息,并保留文字信息的位置'  # "Extract the text in the image and preserve its positional layout."
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')
# InternVL-Chat
This folder contains the implementation of InternVL-Chat.
## 📖 Documents
### 🌟 **Get Started**
- **Installation**: 🌱 [Installation Guide](https://internvl.readthedocs.io/en/latest/get_started/installation.html) | 📄 [requirements.txt](./requirements.txt)
- **Chat Data Format**: 📝 [Meta File](https://internvl.readthedocs.io/en/latest/get_started/chat_data_format.html#meta-file) | ✏️ [Text](https://internvl.readthedocs.io/en/latest/get_started/chat_data_format.html#pure-text-data) | 🖼️ [Single-Image](https://internvl.readthedocs.io/en/latest/get_started/chat_data_format.html#single-image-data) | 🖼️🖼️ [Multi-Image](https://internvl.readthedocs.io/en/latest/get_started/chat_data_format.html#multi-image-data) | 🎥 [Video](https://internvl.readthedocs.io/en/latest/get_started/chat_data_format.html#video-data)
- **Local Chat Demo**: 🤖 [Streamlit Demo](https://internvl.readthedocs.io/en/latest/get_started/local_chat_demo.html#streamlit-demo)
- **InternVL-Chat API**: 🌐 [InternVL2-Pro](https://internvl.readthedocs.io/en/latest/get_started/internvl_chat_api.html#official-api-of-internvl2-pro)
- **Tutorials**: 🚀 [Enhancing InternVL2 on COCO Caption Using LoRA Fine-Tuning](https://internvl.readthedocs.io/en/latest/tutorials/coco_caption_finetune.html)
### 🏆 **InternVL Family**
- **InternVL 2.5**: 📖 [Introduction](https://internvl.readthedocs.io/en/latest/internvl2.5/introduction.html) | ⚡ [Quick Start](https://internvl.readthedocs.io/en/latest/internvl2.5/quick_start.html) | ✨ [Finetune](https://internvl.readthedocs.io/en/latest/internvl2.5/finetune.html) | 📊 [Evaluation](https://internvl.readthedocs.io/en/latest/internvl2.5/evaluation.html) | 📦 [Deployment](https://internvl.readthedocs.io/en/latest/internvl2.5/deployment.html) | 🎯 [Preference Optimization](https://internvl.readthedocs.io/en/latest/internvl2.5/preference_optimization.html)
- **InternVL 2.0**: 📖 [Introduction](https://internvl.readthedocs.io/en/latest/internvl2.0/introduction.html) | ⚡ [Quick Start](https://internvl.readthedocs.io/en/latest/internvl2.0/quick_start.html) | ✨ [Finetune](https://internvl.readthedocs.io/en/latest/internvl2.0/finetune.html) | 📊 [Evaluation](https://internvl.readthedocs.io/en/latest/internvl2.0/evaluation.html) | 📦 [Deployment](https://internvl.readthedocs.io/en/latest/internvl2.0/deployment.html) | 🎯 [Preference Optimization](https://internvl.readthedocs.io/en/latest/internvl2.0/preference_optimization.html)
- **InternVL 1.5**: 📖 [Introduction](https://internvl.readthedocs.io/en/latest/internvl1.5/introduction.html) | ⚡ [Quick Start](https://internvl.readthedocs.io/en/latest/internvl1.5/quick_start.html) | ✨ [Finetune](https://internvl.readthedocs.io/en/latest/internvl1.5/finetune.html) | 📊 [Evaluation](https://internvl.readthedocs.io/en/latest/internvl1.5/evaluation.html) | 📦 [Deployment](https://internvl.readthedocs.io/en/latest/internvl1.5/deployment.html)
- **InternVL 1.2**: 📖 [Introduction](https://internvl.readthedocs.io/en/latest/internvl1.2/introduction.html) | ⚡ [Quick Start](https://internvl.readthedocs.io/en/latest/internvl1.2/quick_start.html) | ✨ [Finetune](https://internvl.readthedocs.io/en/latest/internvl1.2/finetune.html) | 📊 [Evaluation](https://internvl.readthedocs.io/en/latest/internvl1.2/evaluation.html)
- **InternVL 1.1**: 📖 [Introduction](https://internvl.readthedocs.io/en/latest/internvl1.1/introduction.html) | ⚡ [Quick Start](https://internvl.readthedocs.io/en/latest/internvl1.1/quick_start.html) | 📊 [Evaluation](https://internvl.readthedocs.io/en/latest/internvl1.1/evaluation.html)
# Introduction
We are excited to introduce **InternVL 2.5**, an advanced multimodal large language model (MLLM) series that builds upon InternVL 2.0, maintaining its core model architecture while introducing significant enhancements in training and testing strategies as well as data quality.
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/5HDAGOQOZvS1EtI107Ac-.png)
## InternVL 2.5 Family
In the following table, we provide an overview of the InternVL 2.5 series.
| Model Name | Vision Part | Language Part | HF Link |
| :-------------: | :-------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------: | :---------------------------------------------------------: |
| InternVL2_5-1B | [InternViT-300M-448px-V2_5](https://huggingface.co/OpenGVLab/InternViT-300M-448px-V2_5) | [Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL2_5-1B) |
| InternVL2_5-2B | [InternViT-300M-448px-V2_5](https://huggingface.co/OpenGVLab/InternViT-300M-448px-V2_5) | [internlm2_5-1_8b-chat](https://huggingface.co/internlm/internlm2_5-1_8b-chat) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL2_5-2B) |
| InternVL2_5-4B | [InternViT-300M-448px-V2_5](https://huggingface.co/OpenGVLab/InternViT-300M-448px-V2_5) | [Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL2_5-4B) |
| InternVL2_5-8B | [InternViT-300M-448px-V2_5](https://huggingface.co/OpenGVLab/InternViT-300M-448px-V2_5) | [internlm2_5-7b-chat](https://huggingface.co/internlm/internlm2_5-7b-chat) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL2_5-8B) |
| InternVL2_5-26B | [InternViT-6B-448px-V2_5](https://huggingface.co/OpenGVLab/InternViT-6B-448px-V2_5) | [internlm2_5-20b-chat](https://huggingface.co/internlm/internlm2_5-20b-chat) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL2_5-26B) |
| InternVL2_5-38B | [InternViT-6B-448px-V2_5](https://huggingface.co/OpenGVLab/InternViT-6B-448px-V2_5) | [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL2_5-38B) |
| InternVL2_5-78B | [InternViT-6B-448px-V2_5](https://huggingface.co/OpenGVLab/InternViT-6B-448px-V2_5) | [Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL2_5-78B) |
## Model Architecture
As shown in the following figure, InternVL 2.5 retains the same model architecture as its predecessors, InternVL 1.5 and 2.0, following the "ViT-MLP-LLM" paradigm. In this new version, we integrate an incrementally pre-trained InternViT with various pre-trained LLMs, including InternLM 2.5 and Qwen 2.5, using a randomly initialized MLP projector.
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/BiiyXN6NOk0p-3rl3ueyL.png)
As in previous versions, we apply a pixel unshuffle operation that reduces the number of visual tokens to one quarter of the original. We also adopt a dynamic-resolution strategy similar to InternVL 1.5's, dividing images into tiles of 448×448 pixels. The key difference, starting from InternVL 2.0, is the added support for multi-image and video data.
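To make the pixel unshuffle step concrete, here is a minimal PyTorch sketch; the 2×2 folding and the 448-pixel tile / 32×32 patch grid follow the description above, while the function name and tensor layout are illustrative rather than the model's exact implementation:

```python
import torch

def pixel_unshuffle(x: torch.Tensor) -> torch.Tensor:
    # x: ViT patch features laid out as [batch, height, width, channels].
    # Folding each 2x2 neighborhood of patches into the channel dimension
    # reduces the number of visual tokens to one quarter (h*w -> (h/2)*(w/2)).
    n, h, w, c = x.shape
    x = x.view(n, h // 2, 2, w // 2, 2, c)        # split h and w into 2x2 blocks
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous()  # move the block dims next to channels
    return x.view(n, h // 2, w // 2, c * 4)       # fold each 2x2 block into channels

# A 448x448 tile gives a 32x32 grid of ViT patches; after unshuffle it becomes
# 16x16 = 256 visual tokens per tile, which the MLP projector then maps into
# the LLM embedding space.
feats = torch.randn(1, 32, 32, 1024)
print(pixel_unshuffle(feats).shape)  # torch.Size([1, 16, 16, 4096])
```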
## Training Strategy
### Dynamic High-Resolution for Multimodal Data
In InternVL 2.0 and 2.5, we extend the dynamic high-resolution training approach, enhancing its capabilities to handle multi-image and video datasets.
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/xoMY6rwRrNxbAGYPNyU8g.png)
- For single-image datasets, the total tile budget `n_max` is allocated to a single image for maximum resolution. Visual tokens are enclosed in `<img>` and `</img>` tags.
- For multi-image datasets, the total tile budget `n_max` is distributed across all images in a sample (see the sketch after this list). Each image is labeled with auxiliary tags like `Image-1` and enclosed in `<img>` and `</img>` tags.
- For videos, each frame is resized to 448×448. Frames are labeled with tags like `Frame-1` and enclosed in `<img>` and `</img>` tags, similar to images.
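A minimal sketch of how a shared tile budget could be applied to a multi-image sample is shown below; the even split across images is an assumption for illustration, and `dynamic_preprocess` refers to the helper defined in the inference snippets of this repository:

```python
from PIL import Image

def tile_multi_image_sample(image_files, n_max=12, image_size=448):
    # Share the tile budget evenly across the images of one sample.  The even
    # split is an illustrative assumption; the training code may allocate
    # tiles differently.
    per_image = max(1, n_max // len(image_files))
    tiles, num_tiles_per_image = [], []
    for file in image_files:
        img = Image.open(file).convert('RGB')
        img_tiles = dynamic_preprocess(img, image_size=image_size,
                                       use_thumbnail=True, max_num=per_image)
        tiles.extend(img_tiles)
        num_tiles_per_image.append(len(img_tiles))
    return tiles, num_tiles_per_image

# For video data, each frame is instead resized to a single 448x448 tile (max_num=1).
```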
### Single Model Training Pipeline
The training pipeline for a single model in InternVL 2.5 is structured across three stages, designed to enhance the model's visual perception and multimodal capabilities.
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/5NduZeCPLgPJTFr0RGTq3.png)
- **Stage 1: MLP Warmup.** In this stage, only the MLP projector is trained while the vision encoder and language model are frozen (a short PyTorch freezing sketch follows this list). A dynamic high-resolution training strategy is applied for better performance, despite increased cost. This phase ensures robust cross-modal alignment and prepares the model for stable multimodal training.
- **Stage 1.5: ViT Incremental Learning (Optional).** This stage allows incremental training of the vision encoder and MLP projector using the same data as Stage 1. It enhances the encoder’s ability to handle rare domains like multilingual OCR and mathematical charts. Once trained, the encoder can be reused across LLMs without retraining, making this stage optional unless new domains are introduced.
- **Stage 2: Full Model Instruction Tuning.** The entire model is trained on high-quality multimodal instruction datasets. Strict data quality controls are enforced to prevent degradation of the LLM, as noisy data can cause issues like repetitive or incorrect outputs. After this stage, the training process is complete.
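As referenced in Stage 1 above, the sketch below shows what the MLP-warmup freezing amounts to in PyTorch; it assumes the submodule names used elsewhere in this repository (`vision_model`, `mlp1`, `language_model`) and is not the project's actual training script:

```python
import torch
from transformers import AutoModel

# Minimal sketch of the Stage-1 "MLP warmup" parameter freezing.
model = AutoModel.from_pretrained('OpenGVLab/InternVL2_5-8B',
                                  torch_dtype=torch.bfloat16,
                                  trust_remote_code=True)
for p in model.parameters():
    p.requires_grad = False          # freeze the vision encoder and the LLM ...
for p in model.mlp1.parameters():
    p.requires_grad = True           # ... and train only the MLP projector

num_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'trainable parameters: {num_trainable / 1e6:.1f}M')
```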
### Progressive Scaling Strategy
We introduce a progressive scaling strategy to align the vision encoder with LLMs efficiently. This approach trains with smaller LLMs first (e.g., 20B) to optimize foundational visual capabilities and cross-modal alignment before transferring the vision encoder to larger LLMs (e.g., 72B) without retraining. This reuse skips intermediate stages for larger models.
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64006c09330a45b03605bba3/UoNUyS7ctN5pBxNv9KnzH.png)
Compared to Qwen2-VL's 1.4 trillion tokens, InternVL2.5-78B uses only 120 billion tokens—less than one-tenth. This strategy minimizes redundancy, maximizes pre-trained component reuse, and enables efficient training for complex vision-language tasks.
### Training Enhancements
To improve real-world adaptability and performance, we introduce two key techniques, briefly sketched after the list:
- **Random JPEG Compression**: Random JPEG compression with quality levels between 75 and 100 is applied as a data augmentation technique. This simulates image degradation from internet sources, enhancing the model's robustness to noisy images.
- **Loss Reweighting**: To balance the NTP loss across responses of different lengths, we use a reweighting strategy called **square averaging**. This method balances contributions from responses of varying lengths, mitigating biases toward longer or shorter responses.
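The sketch below illustrates both techniques. The JPEG quality range follows the text above; the square-averaging weight is written as 1/sqrt(number of response tokens), which is our reading of the term and should be checked against the technical report:

```python
# Illustrative sketches of the two enhancements; the exact parameters used in
# training may differ.
import io
import random
import torch
from PIL import Image

def random_jpeg_compression(img: Image.Image, p: float = 0.5) -> Image.Image:
    # Re-encode the image as JPEG with a random quality in [75, 100] to simulate
    # the degradation of images collected from the internet.
    if random.random() > p:
        return img
    buf = io.BytesIO()
    img.convert('RGB').save(buf, format='JPEG', quality=random.randint(75, 100))
    buf.seek(0)
    return Image.open(buf).convert('RGB')

def square_average_weights(response_lengths):
    # "Square averaging": each response is weighted by 1 / sqrt(#tokens), so long
    # responses neither dominate (token averaging) nor vanish (sample averaging).
    lengths = torch.tensor(response_lengths, dtype=torch.float32)
    return 1.0 / lengths.sqrt()

print(square_average_weights([10, 100, 1000]))  # tensor([0.3162, 0.1000, 0.0316])
```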
## Data Organization
### Dataset Configuration
In InternVL 2.0 and 2.5, the organization of the training data is controlled by several key parameters to optimize the balance and distribution of datasets during training.
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/2LJe24b1ua3gjI9gDitVl.png)
- **Data Augmentation:** JPEG compression is applied conditionally: enabled for image datasets to enhance robustness and disabled for video datasets to maintain consistent frame quality.
- **Maximum Tile Number:** The parameter `n_max` controls the maximum tiles per dataset. For example, higher values (24–36) are used for multi-image or high-resolution data, lower values (6–12) for standard images, and 1 for videos.
- **Repeat Factor:** The repeat factor `r` adjusts dataset sampling frequency. Values below 1 reduce a dataset's weight, while values above 1 increase it. This ensures balanced training across tasks and prevents overfitting or underfitting (see the sampling sketch after this list).
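As noted above, the sketch below shows one way the repeat factor and per-dataset tile limits could be realized as sampling weights; the field names and numbers are illustrative assumptions, not the actual meta-file schema:

```python
import random

# Hypothetical per-dataset configuration: size, repeat factor r, and max tile number.
datasets = {
    'doc_ocr':     {'size': 200_000, 'repeat': 1.5, 'max_tiles': 24},
    'general_vqa': {'size': 800_000, 'repeat': 1.0, 'max_tiles': 12},
    'video_qa':    {'size': 300_000, 'repeat': 0.5, 'max_tiles': 1},
}

# Effective number of samples drawn from each dataset per epoch.
effective = {name: int(cfg['size'] * cfg['repeat']) for name, cfg in datasets.items()}
total = sum(effective.values())

def sample_dataset():
    # Draw a dataset with probability proportional to size * repeat factor.
    return random.choices(list(effective), weights=list(effective.values()), k=1)[0]

print({name: n / total for name, n in effective.items()})
print(sample_dataset())
```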
### Data Filtering Pipeline
During development, we found that LLMs are highly sensitive to data noise, with even small anomalies—like outliers or repetitive data—causing abnormal behavior during inference. Repetitive generation, especially in long-form or CoT reasoning tasks, proved particularly harmful.
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/aka8ZRiKF3ajdyZBnNFZI.png)
To address this challenge and support future research, we designed an efficient data filtering pipeline to remove low-quality samples.
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/70l1UxnX-Arn0NoOGwpth.png)
The pipeline comprises two modules; a sketch of the heuristic rule-based stage follows the two lists below. For **pure-text data**, three key strategies are used:
1. **LLM-Based Quality Scoring**: Each sample is scored (0–10) using a pre-trained LLM with domain-specific prompts. Samples scoring below a threshold (e.g., 7) are removed to ensure high-quality data.
2. **Repetition Detection**: Repetitive samples are flagged using LLM-based prompts and manually reviewed. Samples scoring below a stricter threshold (e.g., 3) are excluded to avoid repetitive patterns.
3. **Heuristic Rule-Based Filtering**: Anomalies like abnormal sentence lengths or duplicate lines are detected using rules. Flagged samples undergo manual verification to ensure accuracy before removal.
For **multimodal data**, two strategies are used:
1. **Repetition Detection**: Repetitive samples in non-academic datasets are flagged and manually reviewed to prevent pattern loops. High-quality datasets are exempt from this process.
2. **Heuristic Rule-Based Filtering**: Similar rules are applied to detect visual anomalies, with flagged data verified manually to maintain integrity.
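To make the heuristic rule-based stage concrete, here is a minimal sketch; the specific rules and thresholds are illustrative assumptions, not the filters used in practice:

```python
# Illustrative heuristic filters; real rules and thresholds are chosen empirically
# and flagged samples are verified manually before removal.
def has_repeated_lines(text: str, max_repeats: int = 3) -> bool:
    # Flag samples whose answer repeats the same non-trivial line many times.
    counts = {}
    for line in (l.strip() for l in text.splitlines() if l.strip()):
        counts[line] = counts.get(line, 0) + 1
    return max(counts.values(), default=0) > max_repeats

def has_abnormal_sentence_length(text: str, max_words: int = 200) -> bool:
    # Flag run-on "sentences" that are far longer than normal prose.
    return any(len(s.split()) > max_words for s in text.split('.'))

def keep_sample(sample: dict) -> bool:
    answer = sample['conversations'][-1]['value']
    return not (has_repeated_lines(answer) or has_abnormal_sentence_length(answer))

print(keep_sample({'conversations': [{'value': 'Q'}, {'value': 'A clean answer.'}]}))  # True
```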
### Training Data
As shown in the following figure, from InternVL 1.5 to 2.0 and then to 2.5, the fine-tuning data mixture has undergone iterative improvements in scale, quality, and diversity. For more information about the training data, please refer to our technical report.
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/GaTY9Lde02YzclASMthDa.png)
## Evaluation on Multimodal Capability
### Multimodal Reasoning and Mathematics
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/ihFWMRHbF0lpFTkLqnnj1.png)
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/Nrzq0kjlitjp_jrJCqtwX.png)
### OCR, Chart, and Document Understanding
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/3yCMoLjlbsqY7ZJViGzih.png)
### Multi-Image & Real-World Comprehension
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/DSnalmEyhDVQ9GE0GPCla.png)
### Comprehensive Multimodal & Hallucination Evaluation
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/Z7Raj3TGDiV1H81pDHtoG.png)
### Visual Grounding
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/lPcIrng8MPSg_PM1hpDPt.png)
### Multimodal Multilingual Understanding
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/BPpbAOX36RV8RTnm3j-gs.png)
### Video Understanding
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64006c09330a45b03605bba3/tcwH-i1qc8H16En-7AZ5M.png)
## Evaluation on Language Capability
Training InternVL 2.0 models led to a decline in pure language capabilities. InternVL 2.5 addresses this by collecting more high-quality open-source data and filtering out low-quality data, achieving better preservation of pure language performance.
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/mxuSKvSY-kfI8zePpXj6y.png)
## Quick Start
We provide an example code to run `InternVL2_5-8B` using `transformers`.
> Please use transformers>=4.37.2 to ensure the model works normally.
### Model Loading
#### 16-bit (bf16 / fp16)
```python
import torch
from transformers import AutoTokenizer, AutoModel
path = "OpenGVLab/InternVL2_5-8B"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True).eval().cuda()
```
#### BNB 8-bit Quantization
```python
import torch
from transformers import AutoTokenizer, AutoModel
path = "OpenGVLab/InternVL2_5-8B"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
load_in_8bit=True,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True).eval()
```
#### Multiple GPUs
The reason for writing the code this way is to avoid errors that occur during multi-GPU inference due to tensors not being on the same device. By ensuring that the first and last layers of the large language model (LLM) are on the same device, we prevent such errors.
```python
import math
import torch
from transformers import AutoTokenizer, AutoModel
def split_model(model_name):
device_map = {}
world_size = torch.cuda.device_count()
num_layers = {
'InternVL2_5-1B': 24, 'InternVL2_5-2B': 24, 'InternVL2_5-4B': 36, 'InternVL2_5-8B': 32,
'InternVL2_5-26B': 48, 'InternVL2_5-38B': 64, 'InternVL2_5-78B': 80}[model_name]
# Since the first GPU will be used for ViT, treat it as half a GPU.
num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
num_layers_per_gpu = [num_layers_per_gpu] * world_size
num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
layer_cnt = 0
for i, num_layer in enumerate(num_layers_per_gpu):
for j in range(num_layer):
device_map[f'language_model.model.layers.{layer_cnt}'] = i
layer_cnt += 1
device_map['vision_model'] = 0
device_map['mlp1'] = 0
device_map['language_model.model.tok_embeddings'] = 0
device_map['language_model.model.embed_tokens'] = 0
device_map['language_model.output'] = 0
device_map['language_model.model.norm'] = 0
device_map['language_model.lm_head'] = 0
device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
return device_map
path = "OpenGVLab/InternVL2_5-8B"
device_map = split_model('InternVL2_5-8B')
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True,
device_map=device_map).eval()
```
### Inference with Transformers
```python
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
def load_image(image_file, input_size=448, max_num=12):
image = Image.open(image_file).convert('RGB')
transform = build_transform(input_size=input_size)
images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values
# If you want to load a model using multiple GPUs, please refer to the `Multiple GPUs` section.
path = 'OpenGVLab/InternVL2_5-8B'
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
# set the max number of tiles in `max_num`
pixel_values = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
generation_config = dict(max_new_tokens=1024, do_sample=True)
# pure-text conversation (纯文本对话)
question = 'Hello, who are you?'
response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'Can you tell me a story?'
response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# single-image single-round conversation (单图单轮对话)
question = '<image>\nPlease describe the image shortly.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')
# single-image multi-round conversation (单图多轮对话)
question = '<image>\nPlease describe the image in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'Please write a poem according to the image.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
question = '<image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
num_patches_list=num_patches_list,
history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
num_patches_list=num_patches_list,
history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# batch inference, single image per sample (单图批处理)
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
responses = model.batch_chat(tokenizer, pixel_values,
num_patches_list=num_patches_list,
questions=questions,
generation_config=generation_config)
for question, response in zip(questions, responses):
print(f'User: {question}\nAssistant: {response}')
# video multi-round conversation (视频多轮对话)
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
if bound:
start, end = bound[0], bound[1]
else:
start, end = -100000, 100000
start_idx = max(first_idx, round(start * fps))
end_idx = min(round(end * fps), max_frame)
seg_size = float(end_idx - start_idx) / num_segments
frame_indices = np.array([
int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
for idx in range(num_segments)
])
return frame_indices
def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
max_frame = len(vr) - 1
fps = float(vr.get_avg_fps())
pixel_values_list, num_patches_list = [], []
transform = build_transform(input_size=input_size)
frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
for frame_index in frame_indices:
img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
pixel_values = [transform(tile) for tile in img]
pixel_values = torch.stack(pixel_values)
num_patches_list.append(pixel_values.shape[0])
pixel_values_list.append(pixel_values)
pixel_values = torch.cat(pixel_values_list)
return pixel_values, num_patches_list
video_path = './examples/red-panda.mp4'
pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
pixel_values = pixel_values.to(torch.bfloat16).cuda()
video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
question = video_prefix + 'What is the red panda doing?'
# Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
num_patches_list=num_patches_list, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'Describe this video in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
num_patches_list=num_patches_list, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
```
#### Streaming Output
Alternatively, you can use the following code to stream the output.
```python
from transformers import TextIteratorStreamer
from threading import Thread
# Initialize the streamer
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
# Define the generation configuration
generation_config = dict(max_new_tokens=1024, do_sample=False, streamer=streamer)
# Start the model chat in a separate thread
thread = Thread(target=model.chat, kwargs=dict(
tokenizer=tokenizer, pixel_values=pixel_values, question=question,
history=None, return_history=False, generation_config=generation_config,
))
thread.start()
# Initialize an empty string to store the generated text
generated_text = ''
# Loop through the streamer to get the new text as it is generated
for new_text in streamer:
if new_text == model.conv_template.sep:
break
generated_text += new_text
print(new_text, end='', flush=True) # Print each new chunk of generated text on the same line
```
## Finetune
Many repositories now support fine-tuning of the InternVL series models, including [InternVL](https://github.com/OpenGVLab/InternVL), [SWIFT](https://github.com/modelscope/ms-swift), [XTuner](https://github.com/InternLM/xtuner), and others. Please refer to their documentation for more details on fine-tuning.
## Deployment
### LMDeploy
LMDeploy is a toolkit for compressing, deploying, and serving LLMs & VLMs.
```sh
pip install "lmdeploy>=0.6.4" --no-deps
```
LMDeploy abstracts the complex inference process of multi-modal Vision-Language Models (VLM) into an easy-to-use pipeline, similar to the Large Language Model (LLM) inference pipeline.
#### A 'Hello, world' Example
```python
from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image
model = 'OpenGVLab/InternVL2_5-8B'
image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))
response = pipe(('describe this image', image))
print(response.text)
```
If an `ImportError` occurs while running this example, install the missing dependencies as prompted.
#### Multi-images Inference
When dealing with multiple images, you can put them all in one list. Keep in mind that multiple images will lead to a higher number of input tokens, and as a result, the size of the context window typically needs to be increased.
```python
from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image
from lmdeploy.vl.constants import IMAGE_TOKEN
model = 'OpenGVLab/InternVL2_5-8B'
pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))
image_urls=[
'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg',
'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/det.jpg'
]
images = [load_image(img_url) for img_url in image_urls]
# Numbering images improves multi-image conversations
response = pipe((f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\ndescribe these two images', images))
print(response.text)
```
#### Batch Prompts Inference
Batch-prompt inference is straightforward: just place the prompts in a list:
```python
from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image
model = 'OpenGVLab/InternVL2_5-8B'
pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))
image_urls=[
"https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg",
"https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/det.jpg"
]
prompts = [('describe this image', load_image(img_url)) for img_url in image_urls]
response = pipe(prompts)
print(response)
```
#### Multi-turn Conversation
There are two ways to run multi-turn conversations with the pipeline. One is to construct messages in the OpenAI format and use the method introduced above; the other is to use the `pipeline.chat` interface.
```python
from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig
from lmdeploy.vl import load_image
model = 'OpenGVLab/InternVL2_5-8B'
pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))
image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg')
gen_config = GenerationConfig(top_k=40, top_p=0.8, temperature=0.8)
sess = pipe.chat(('describe this image', image), gen_config=gen_config)
print(sess.response.text)
sess = pipe.chat('What is the woman doing?', session=sess, gen_config=gen_config)
print(sess.response.text)
```
#### Service
LMDeploy's `api_server` enables models to be easily packed into services with a single command. The provided RESTful APIs are compatible with OpenAI's interfaces. Below is an example of starting the service:
```shell
lmdeploy serve api_server OpenGVLab/InternVL2_5-8B --server-port 23333
```
To use the OpenAI-style interface, install the OpenAI Python package:
```shell
pip install openai
```
Then, use the code below to make the API call:
```python
from openai import OpenAI
client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
model_name = client.models.list().data[0].id
response = client.chat.completions.create(
model=model_name,
messages=[{
'role':
'user',
'content': [{
'type': 'text',
'text': 'describe this image',
}, {
'type': 'image_url',
'image_url': {
'url':
'https://modelscope.oss-cn-beijing.aliyuncs.com/resource/tiger.jpeg',
},
}],
}],
temperature=0.8,
top_p=0.8)
print(response)
```
# README for Evaluation
Here we list the codebases used to obtain the evaluation results in the InternVL 2.5 technical report.
## Multimodal Reasoning and Mathematics
| Benchmark Name | Codebase |
| -------------- | -------------------------------------------------------- |
| MMMU | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MMMU-Pro | [This Codebase](./mmmu_pro) |
| MathVista | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MATH-Vision | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MathVerse | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| OlympiadBench | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
## OCR, Chart, and Document Understanding
| Benchmark Name | Codebase |
| ----------------- | -------------------------------------------------------- |
| AI2D with mask | [This Codebase](./vqa) |
| AI2D without mask | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| ChartQA | [This Codebase](./vqa) |
| DocVQA | [This Codebase](./vqa) |
| InfoVQA | [This Codebase](./vqa) |
| OCRBench | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| SEED-2-Plus | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| CharXiv | [CharXiv](https://github.com/princeton-nlp/CharXiv) |
| VCR | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
## Multi-Image Understanding
| Benchmark Name | Codebase |
| -------------- | -------------------------------------------------------- |
| BLINK | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| Mantis Eval | [This Codebase](./mantis_eval) |
| MMIU | [This Codebase](./mmiu) |
| MuirBench | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MMT-Bench | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MIRB | [This Codebase](./mirb) |
## Real-World Comprehension
| Benchmark Name | Codebase |
| -------------- | -------------------------------------------------------- |
| RealWorldQA | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MME-RealWorld | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| WildVision | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| R-Bench | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
## Comprehensive Multimodal Evaluation
| Benchmark Name | Codebase |
| -------------- | -------------------------------------------------------- |
| MME | [This Codebase](./mme) |
| MMBench | [This Codebase](./mmbench) |
| MMBench v1.1 | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MMVet | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MMVet v2 | [This Codebase](./mmvetv2) |
| MMStar | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
## Multimodal Hallucination Evaluation
| Benchmark Name | Codebase |
| -------------- | -------------------------------------------------------- |
| HallBench | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MMHal-Bench | [This Codebase](./mmhal) |
| CRPE | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| POPE | [This Codebase](./pope) |
## Visual Grounding
| Benchmark Name | Codebase |
| -------------- | -------------------------- |
| RefCOCO | [This Codebase](./refcoco) |
| RefCOCO+ | [This Codebase](./refcoco) |
| RefCOCOg | [This Codebase](./refcoco) |
## Multimodal Multilingual Understanding
| Benchmark Name | Codebase |
| -------------------- | -------------------------------------------------------- |
| MMMB | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| Multilingual MMBench | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MTVQA | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
## Video Understanding
| Benchmark Name | Codebase |
| -------------- | -------------------------------------------------------- |
| Video-MME | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MVBench | [This Codebase](./mvbench) |
| MMBench-Video | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MLVU | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| LongVideoBench | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| CG-Bench | provided by authors |
# README for Evaluation
## 🌟 Overview
This script provides an evaluation pipeline for image captioning across three datasets: `COCO`, `Flickr30k`, and `NoCaps`.
## 🗂️ Data Preparation
Before downloading the data, please create the `InternVL/internvl_chat/data` folder.
### COCO Karpathy Test
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/coco && cd data/coco
# Step 2: Download and unzip image files
wget http://images.cocodataset.org/zips/train2014.zip && unzip train2014.zip
wget http://images.cocodataset.org/zips/val2014.zip && unzip val2014.zip
wget http://images.cocodataset.org/zips/test2015.zip && unzip test2015.zip
# Step 3: Download and place the annotation files
mkdir -p annotations && cd annotations/
wget https://github.com/OpenGVLab/InternVL/releases/download/data/coco_karpathy_test.json
wget https://github.com/OpenGVLab/InternVL/releases/download/data/coco_karpathy_test_gt.json
cd ../../..
```
After preparation is complete, the directory structure is:
```shell
data/coco
├── annotations
│ ├── coco_karpathy_test.json
│ └── coco_karpathy_test_gt.json
├── train2014
├── val2014
└── test2015
```
### Flickr30K Karpathy Test
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/flickr30k && cd data/flickr30k
# Step 2: Download and unzip image files
# Download images from https://bryanplummer.com/Flickr30kEntities/
# Step 3: Download and place the annotation files
# Karpathy split annotations can be downloaded from the following link:
wget https://github.com/mehdidc/retrieval_annotations/releases/download/1.0.0/flickr30k_test_karpathy.txt
# This file is provided by the clip-benchmark repository.
# We converted this txt file to JSON format; download the converted file:
wget https://github.com/OpenGVLab/InternVL/releases/download/data/flickr30k_test_karpathy.json
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/flickr30k
├── Images
├── flickr30k_test_karpathy.txt
└── flickr30k_test_karpathy.json
```
### NoCaps Val
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/nocaps && cd data/nocaps
# Step 2: Download and unzip image files
# Download images from https://nocaps.org/download
# Step 3: Download and place the annotation files
# Original annotations can be downloaded from https://nocaps.s3.amazonaws.com/nocaps_val_4500_captions.json
wget https://nocaps.s3.amazonaws.com/nocaps_val_4500_captions.json
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/nocaps
├── images
└── nocaps_val_4500_captions.json
```
## 🏃 Evaluation Execution
> ⚠️ Note: For testing InternVL (1.5, 2.0, 2.5, and later versions), always enable `--dynamic` to perform dynamic resolution testing.
To run the evaluation, execute the following command on an 8-GPU setup:
```shell
torchrun --nproc_per_node=8 eval/caption/evaluate_caption.py --checkpoint ${CHECKPOINT} --datasets ${DATASETS} --dynamic
```
Alternatively, you can run the following simplified command:
```shell
# Test COCO, Flickr30K, and NoCaps
GPUS=8 sh evaluate.sh ${CHECKPOINT} caption --dynamic
# Test COCO only
GPUS=8 sh evaluate.sh ${CHECKPOINT} caption-coco --dynamic
# Test Flickr30K only
GPUS=8 sh evaluate.sh ${CHECKPOINT} caption-flickr30k --dynamic
# Test NoCaps only
GPUS=8 sh evaluate.sh ${CHECKPOINT} caption-nocaps --dynamic
```
### Arguments
The following arguments can be configured for the evaluation script:
| Argument | Type | Default | Description |
| ---------------- | ------ | ------------------------- | ----------------------------------------------------------------------------------------------------------------- |
| `--checkpoint` | `str` | `''` | Path to the model checkpoint. |
| `--datasets` | `str` | `'coco,flickr30k,nocaps'` | Comma-separated list of datasets to evaluate. |
| `--dynamic` | `flag` | `False` | Enables dynamic high resolution preprocessing. |
| `--max-num` | `int` | `6` | Maximum tile number for dynamic high resolution. |
| `--load-in-8bit` | `flag` | `False` | Loads the model weights in 8-bit precision. |
| `--auto` | `flag` | `False` | Automatically splits a large model across 8 GPUs when needed, useful for models too large to fit on a single GPU. |
import argparse
import itertools
import json
import os
import random
import time
from functools import partial
import torch
from internvl.model import load_model_and_tokenizer
from internvl.train.dataset import build_transform, dynamic_preprocess
from PIL import Image
from pycocoevalcap.eval import COCOEvalCap
from pycocotools.coco import COCO
from tqdm import tqdm
ds_collections = {
'flickr30k': {
'root': 'data/flickr30k/',
'annotation': 'data/flickr30k/flickr30k_test_karpathy.json',
'max_new_tokens': 30,
'min_new_tokens': 8,
},
'coco': {
'root': 'data/coco/',
'annotation': ['data/coco/annotations/coco_karpathy_test.json',
'data/coco/annotations/coco_karpathy_test_gt.json'],
'max_new_tokens': 30,
'min_new_tokens': 8,
},
'nocaps': {
'root': 'data/nocaps/images',
'annotation': 'data/nocaps/nocaps_val_4500_captions.json',
'max_new_tokens': 30,
'min_new_tokens': 8,
},
}
class CaptionDataset(torch.utils.data.Dataset):
def __init__(self, name, root, annotation, prompt, input_size=224, dynamic_image_size=False,
use_thumbnail=False, max_num=6):
if name == 'coco':
self.images = json.load(open(annotation))
else:
self.images = json.load(open(annotation))['images']
self.name = name
self.prompt = prompt
self.root = root
self.input_size = input_size
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.max_num = max_num
self.transform = build_transform(is_train=False, input_size=input_size)
def __len__(self):
return len(self.images)
def __getitem__(self, idx):
if self.name == 'coco':
filename = self.images[idx]['image']
image_id = int(filename.split('_')[-1].replace('.jpg', ''))
image_path = os.path.join(self.root, filename)
else:
image_id = self.images[idx]['id']
if 'file_name' in self.images[idx]:
image_path = os.path.join(self.root, self.images[idx]['file_name'])
else:
image_path = os.path.join(self.root, self.images[idx]['image'])
image = Image.open(image_path)
if self.dynamic_image_size:
images = dynamic_preprocess(image, image_size=self.input_size,
use_thumbnail=self.use_thumbnail,
max_num=self.max_num)
else:
images = [image]
pixel_values = [self.transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return {
'image_id': image_id,
'input_text': self.prompt,
'pixel_values': pixel_values
}
def collate_fn(inputs, tokenizer):
pixel_values = torch.cat([_['pixel_values'] for _ in inputs], dim=0)
image_ids = [_['image_id'] for _ in inputs]
input_texts = [_['input_text'] for _ in inputs]
input_tokens = tokenizer(input_texts, return_tensors='pt')
return pixel_values, image_ids, input_tokens.input_ids, input_tokens.attention_mask
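# InferenceSampler: deterministically shards the dataset indices across distributed
# ranks so that each process evaluates a contiguous, disjoint slice of the data.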
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
def evaluate_chat_model():
prompt = 'Provide a one-sentence caption for the provided image.'
print('prompt:', prompt)
random.seed(args.seed)
summaries = []
for ds_name in args.datasets:
annotation = ds_collections[ds_name]['annotation']
if type(annotation) == list:
annotation = annotation[0]
dataset = CaptionDataset(
name=ds_name,
root=ds_collections[ds_name]['root'],
annotation=annotation,
prompt=prompt,
input_size=image_size,
dynamic_image_size=args.dynamic,
use_thumbnail=use_thumbnail,
max_num=args.max_num
)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
image_ids, captions = [], []
for _, (pixel_values, ids, _, _) in tqdm(enumerate(dataloader)):
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
num_beams=args.num_beams,
max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
pred = model.chat(
tokenizer=tokenizer,
pixel_values=pixel_values,
question=prompt,
generation_config=generation_config,
verbose=True
)
image_ids.extend(ids)
captions.extend([pred])
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_ids = [None for _ in range(world_size)]
merged_captions = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_ids, image_ids)
torch.distributed.all_gather_object(merged_captions, captions)
merged_ids = [_ for _ in itertools.chain.from_iterable(merged_ids)]
merged_captions = [_ for _ in itertools.chain.from_iterable(merged_captions)]
average_length = sum(len(x.split()) for x in merged_captions) / len(merged_captions)
print(f'Average caption length: {average_length}')
if torch.distributed.get_rank() == 0:
print(f'Evaluating {ds_name} ...')
results = []
for image_id, caption in zip(merged_ids, merged_captions):
results.append({
'image_id': int(image_id),
'caption': caption,
})
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{ds_name}_{time_prefix}.json'
results_file = os.path.join(args.out_dir, results_file)
json.dump(results, open(results_file, 'w'))
annotation = ds_collections[ds_name]['annotation']
if type(annotation) == list:
annotation = annotation[-1]
coco = COCO(annotation)
coco_result = coco.loadRes(results_file)
coco_eval = COCOEvalCap(coco, coco_result)
coco_eval.evaluate()
summary = coco_eval.eval.items()
print(summary)
summaries.append([args.checkpoint, ds_name, average_length, summary])
torch.distributed.barrier()
out_path = '_'.join(args.checkpoint.split('/')[-2:])
writer = open(os.path.join(args.out_dir, f'{out_path}.txt'), 'a')
print(f"write results to file {os.path.join(args.out_dir, f'{out_path}.txt')}")
for summary in summaries:
print(summary)
writer.write(f'{summary}\n')
writer.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--datasets', type=str, default='coco,flickr30k,nocaps')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=1)
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--out-dir', type=str, default='results')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--dynamic', action='store_true')
parser.add_argument('--max-num', type=int, default=6)
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--load-in-4bit', action='store_true')
parser.add_argument('--auto', action='store_true')
args = parser.parse_args()
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir, exist_ok=True)
args.datasets = args.datasets.split(',')
print('datasets:', args.datasets)
assert args.batch_size == 1, 'Only batch size 1 is supported'
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model, tokenizer = load_model_and_tokenizer(args)
image_size = model.config.force_image_size or model.config.vision_config.image_size
use_thumbnail = model.config.use_thumbnail
total_params = sum(p.numel() for p in model.parameters()) / 1e9
if total_params > 20 or args.dynamic:
args.num_beams = 1
print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}')
else:
print(f'[test] total_params: {total_params}B')
print(f'[test] image_size: {image_size}')
print(f'[test] template: {model.config.template}')
print(f'[test] dynamic_image_size: {args.dynamic}')
print(f'[test] use_thumbnail: {use_thumbnail}')
print(f'[test] max_num: {args.max_num}')
evaluate_chat_model()
import argparse
import itertools
import json
import os
import random
import re
import time
from functools import partial
import torch
from internvl.model import load_model_and_tokenizer
from internvl.train.dataset import build_transform, dynamic_preprocess
from PIL import Image
from tqdm import tqdm
ds_collections = {
'DriveLM_val': {
'root': 'InternVL-Domain-Adaptation-Data/val/drivelm_val.jsonl',
'max_new_tokens': 200,
'min_new_tokens': 1,
'split': 'validation',
'image_root': 'InternVL-Domain-Adaptation-Data/images/drivelm/stitch',
}
}
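# post_process: for each predicted object tag of the form <cX,CAM_NAME,[cx,cy]>, map the
# per-mille coordinates on the 3x2 stitched surround-view image back to pixel coordinates
# in the original 1600x900 camera frame, clamping to the image bounds.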
def post_process(pred):
pred = pred.strip()
pattern = r'<c[^,]*,\s*[^,]*,\s*\[\s*-?[0-9]*\.?[0-9]+\s*,\s*-?[0-9]*\.?[0-9]+\s*\]\s*>'
mapping = {'CAM_FRONT_LEFT': [0, 0], 'CAM_FRONT': [1, 0], 'CAM_FRONT_RIGHT': [2, 0], 'CAM_BACK_LEFT': [0, 1],
'CAM_BACK': [1, 1], 'CAM_BACK_RIGHT': [2, 1]}
patch_size = 448
width = patch_size * 2
height = patch_size
whole_img_width = width * 3
whole_img_height = height * 2
matches = re.findall(pattern, pred)
for object_id in matches:
object_id_c = object_id.replace('<', '').replace('>', '')
try:
ctag = object_id_c.split(',')[0]
cxcy = json.loads(','.join(object_id_c.split(',')[2:]))
cam = object_id_c.split(',')[1]
if cam in mapping:
mx, my = mapping[cam]
# old_wide,old_height = images_size[cam]
old_wide, old_height = 1600, 900
cx, cy = cxcy
cx = (cx / 1000) * whole_img_width
cy = (cy / 1000) * whole_img_height
cx -= mx * width
cy -= my * height
cx = cx / width * old_wide
cy = cy / height * old_height
# cx =max(0,min(old_wide,cx))
# cy =max(0,min(old_height,cy))
cx = round(max(0, min(old_wide, cx)), 1)
cy = round(max(0, min(old_height, cy)), 1)
new_object_id = f'<{ctag},{cam},{cx},{cy}>'
pred = pred.replace(object_id, new_object_id)
except Exception as e:
print(e)
return pred
def collate_fn(batches, tokenizer):
pixel_values = torch.cat([_['pixel_values'] for _ in batches], dim=0)
questions = [_['question'] for _ in batches]
questions_old = [_['question_old'] for _ in batches]
answers = [_['answer'] for _ in batches]
data_ids = [_['data_id'] for _ in batches]
return pixel_values, questions_old, questions, answers, data_ids
class DriveLMDataset(torch.utils.data.Dataset):
def __init__(self, root, split, prompt, image_path, input_size=224, dynamic_image_size=False,
use_thumbnail=False, max_num=6, ):
with open(root, 'r') as f:
self.data = [json.loads(line) for line in f.readlines()]
# data_val = json.load(f)
# merge all dataset
# self.data = concatenate_datasets(sub_dataset_list)
self.prompt = prompt
self.input_size = input_size
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.max_num = max_num
self.transform = build_transform(is_train=False, input_size=input_size)
self.image_path = image_path
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
data = self.data[idx]
data_id = data['id']
question = data['conversations'][0]['value'].strip()
question_old = data['question_old']
image_file = os.path.join(self.image_path, data['image'])
image = Image.open(image_file).convert('RGB')
# question_type = data['question_type']
# choices = eval(data['options'])
answer = data['conversations'][1]['value'].strip()
if self.dynamic_image_size:
pil_image = dynamic_preprocess(image, image_size=self.input_size,
use_thumbnail=self.use_thumbnail,
max_num=self.max_num)
images = pil_image
else:
images = [image]
pixel_values = [self.transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return {
'question_old': question_old,
'question': question,
'pixel_values': pixel_values,
'answer': answer,
'data_id': data_id
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
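        # Split [0, total_size) into contiguous, near-equal shards; the first (total_size % world_size) ranks get one extra sample.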
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
def evaluate_chat_model():
random.seed(args.seed)
prompt = None
for ds_name in args.datasets:
dataset = DriveLMDataset(
root=ds_collections[ds_name]['root'],
split=ds_collections[ds_name]['split'],
prompt=prompt,
image_path=ds_collections[ds_name]['image_root'],
# image_meta = ds_collections[ds_name]["image_meta"],
input_size=image_size,
dynamic_image_size=args.dynamic,
use_thumbnail=use_thumbnail,
max_num=args.max_num
)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
outputs = []
for _, (pixel_values, questions_old, questions, answers, data_ids) in tqdm(enumerate(dataloader)):
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
num_beams=args.num_beams,
max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
pred = model.chat(
tokenizer=tokenizer,
pixel_values=pixel_values,
question=questions[0],
generation_config=generation_config
)
preds = [post_process(pred)]
for question, pred, answer, data_id, question_old in zip(questions, preds, answers, data_ids,
questions_old):
outputs.append({
'question': question_old,
'answer': pred,
'gt_answers': answer,
'id': data_id
})
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_outputs = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
merged_outputs = [json.loads(_) for _ in merged_outputs]
merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
if torch.distributed.get_rank() == 0:
print(f'Evaluating {ds_name} ...')
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{ds_name}_{time_prefix}.json'
output_path = os.path.join(args.out_dir, results_file)
with open(output_path, 'w') as f:
json.dump(merged_outputs, f, indent=4)
print('Results saved to {}'.format(output_path))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--datasets', type=str, default='DriveLM_val')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=1)
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--out-dir', type=str, default='results')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--dynamic', action='store_true')
parser.add_argument('--max-num', type=int, default=12)
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--load-in-4bit', action='store_true')
parser.add_argument('--auto', action='store_true')
args = parser.parse_args()
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir, exist_ok=True)
args.datasets = args.datasets.split(',')
print('datasets:', args.datasets)
assert args.batch_size == 1, 'Only batch size 1 is supported'
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model, tokenizer = load_model_and_tokenizer(args)
image_size = model.config.force_image_size or model.config.vision_config.image_size
use_thumbnail = model.config.use_thumbnail
total_params = sum(p.numel() for p in model.parameters()) / 1e9
if total_params > 20 or args.dynamic:
args.num_beams = 1
print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}')
else:
print(f'[test] total_params: {total_params}B')
print(f'[test] image_size: {image_size}')
print(f'[test] template: {model.config.template}')
print(f'[test] dynamic_image_size: {args.dynamic}')
print(f'[test] use_thumbnail: {use_thumbnail}')
print(f'[test] max_num: {args.max_num}')
evaluate_chat_model()
import argparse
import itertools
import json
import os
import random
import re
import time
from functools import partial
from typing import Literal
import torch
from internvl.model import load_model_and_tokenizer
from internvl.train.dataset import build_transform, dynamic_preprocess
from PIL import Image
from tqdm import tqdm
ds_collections = {
'MME_RealWorld': {
        'root': 'InternVL-Domain-Adaptation-Data/val/MME_RealWorld.json',
'max_new_tokens': 100,
'min_new_tokens': 1,
        'img_root': 'InternVL-Domain-Adaptation-Data/images/MME-RealWorld/data',
'type': 'dev',
'language': 'en'
}
}
def collate_fn(batches, tokenizer):
pixel_values = torch.cat([_['pixel_values'] for _ in batches], dim=0)
questions = [_['question'] for _ in batches]
answers = [_['answer'] for _ in batches]
indexes = [_['index'] for _ in batches]
choices = [_['choice'] for _ in batches]
categorys = [_['category'] for _ in batches]
tasks = [_['task'] for _ in batches]
return pixel_values, questions, answers, indexes, choices, categorys, tasks
class MMERealworldDataset(torch.utils.data.Dataset):
def __init__(self, root, prompt, language, subtask: Literal[
'Monitoring', 'OCR with Complex Context', 'Diagram and Table', 'Autonomous_Driving', 'Remote Sensing'],
img_root, input_size=224, dynamic_image_size=False,
use_thumbnail=False, max_num=6):
with open(root, 'r') as f:
self.data_meta = json.load(f)
self.subtask = subtask
self.data_meta = [item for item in self.data_meta if item['Subtask'] == self.subtask]
self.img_root = img_root
self.prompt = prompt
self.language = language
self.input_size = input_size
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.max_num = max_num
self.transform = build_transform(is_train=False, input_size=input_size)
def __len__(self):
return len(self.data_meta)
def __getitem__(self, idx):
index = self.data_meta[idx]['Question_id']
assert self.data_meta[idx]['Question Type'] == 'Multiple Choice'
image = os.path.join(self.img_root, self.data_meta[idx]['Image'])
question = self.data_meta[idx]['Text']
choices = self.data_meta[idx]['Answer choices']
answer = self.data_meta[idx]['Ground truth']
category = self.data_meta[idx]['Category']
task = self.data_meta[idx]['Task']
# catetory = self.df.iloc[idx]['category']
# l2_catetory = self.df.iloc[idx]['l2-category']
image = Image.open(image).convert('RGB')
if self.dynamic_image_size:
images = dynamic_preprocess(image, image_size=self.input_size,
use_thumbnail=self.use_thumbnail,
max_num=self.max_num)
else:
images = [image]
pixel_values = [self.transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
        if self.language == 'cn':
            question = question + '选项如下所示:\n' + '\n'.join(choices) + '\n' + self.prompt['cn']
        else:
            question = question + 'The choices are listed below:\n' + '\n'.join(choices) + '\n' + self.prompt['en']
return {
'question': question,
'pixel_values': pixel_values,
'answer': answer,
'index': index,
'choice': choices,
'category': category,
'task': task
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
def post_process(s, choices):
s = s.strip()
answer_prefixes = [
'The best answer is',
'The correct answer is',
'The answer is',
'The answer',
        'The best option is',
        'The correct option is',
'Best answer:',
'Best option:',
]
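    # Strip common answer prefixes, then take the first A-E letter; if none is found,
    # try to match the remaining text against the choice strings.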
for answer_prefix in answer_prefixes:
s = s.replace(answer_prefix, '')
if len(s.split()) > 10 and not re.search('[ABCDE]', s):
return ''
matches = re.search(r'[ABCDE]', s)
if matches is None:
for choice in choices:
if s.lower() in choice.lower():
return choice[1]
return ''
return matches[0]
def evaluate(outputs):
results = {'Reasoning': {},
'Perception': {}}
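    # Tally correct/incorrect counts per (task, category); accuracy is reported per category, per task, and overall.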
for data_item in outputs:
cnt = data_item['answer'] == data_item['gt_answers']
category = data_item['category']
task = data_item['task']
if category not in results[task]:
results[task][category] = {'true': cnt, 'false': 1 - cnt}
else:
results[task][category]['true'] += cnt
results[task][category]['false'] += 1 - cnt
cnt_subtask, sum_subtask = 0, 0
for task, tasks_values in results.items():
cnt_task, sum_task = 0, 0
for category, category_dict in tasks_values.items():
cnt_task += category_dict['true']
sum_task += category_dict['false'] + category_dict['true']
acc = category_dict['true'] / (category_dict['false'] + category_dict['true'])
print(f'-' * 4 + f'\t' + 'Acc ' + '{:.4f}'.format(acc) + f'\t{category.capitalize()}')
cnt_subtask += cnt_task
sum_subtask += sum_task
if sum_task == 0:
acc_task = 0
else:
acc_task = cnt_task / sum_task
        print('*' * 32 + ' Acc ' + '{:.4f}'.format(acc_task) + f'\t{task}')
if sum_subtask == 0:
acc_subtasks = 0
else:
acc_subtasks = cnt_subtask / sum_subtask
print(f'+' * 16 + f'\t Acc ' + '{:.4f}'.format(acc_subtasks))
return acc_subtasks
def evaluate_chat_model():
random.seed(args.seed)
for ds_name in args.datasets:
dataset = MMERealworldDataset(
root=ds_collections[ds_name]['root'],
prompt=prompt,
language=ds_collections[ds_name]['language'],
subtask=args.subtask,
img_root=ds_collections[ds_name]['img_root'],
input_size=image_size,
dynamic_image_size=args.dynamic,
use_thumbnail=use_thumbnail,
max_num=args.max_num
)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
outputs = []
for pixel_values, questions, answers, indexes, options, categorys, tasks in tqdm(dataloader):
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
num_beams=args.num_beams,
max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
out = model.chat(
tokenizer=tokenizer,
pixel_values=pixel_values,
question=questions[0],
generation_config=generation_config
)
outs = [out]
preds = [post_process(out, options[0])]
for question, pred, answer, index, out, category, task in zip(questions, preds, answers, indexes, outs,
categorys, tasks):
outputs.append({
'question': question,
'output': out,
'answer': pred,
'gt_answers': answer,
'index': index,
'category': category,
'task': task
})
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_outputs = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
merged_outputs = [json.loads(_) for _ in merged_outputs]
merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
if torch.distributed.get_rank() == 0:
print(f'Evaluating {ds_name} ...')
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{ds_name}_{args.subtask}_{time_prefix}.json'
output_path = os.path.join(args.out_dir, results_file)
with open(output_path, 'w') as f:
json.dump(merged_outputs, f, indent=4)
evaluate(merged_outputs)
print('Results saved to {}'.format(output_path))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--datasets', type=str, default='MME_RealWorld')
parser.add_argument('--subtask', type=str, default='Autonomous_Driving')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=1)
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--out-dir', type=str, default='results')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--dynamic', action='store_true')
parser.add_argument('--max-num', type=int, default=6)
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--load-in-4bit', action='store_true')
parser.add_argument('--auto', action='store_true')
args = parser.parse_args()
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir, exist_ok=True)
args.datasets = args.datasets.split(',')
print('datasets:', args.datasets)
assert args.batch_size == 1, 'Only batch size 1 is supported'
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model, tokenizer = load_model_and_tokenizer(args)
image_size = model.config.force_image_size or model.config.vision_config.image_size
use_thumbnail = model.config.use_thumbnail
total_params = sum(p.numel() for p in model.parameters()) / 1e9
if total_params > 20 or args.dynamic:
args.num_beams = 1
print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}')
else:
print(f'[test] total_params: {total_params}B')
print(f'[test] image_size: {image_size}')
print(f'[test] template: {model.config.template}')
print(f'[test] dynamic_image_size: {args.dynamic}')
print(f'[test] use_thumbnail: {use_thumbnail}')
print(f'[test] max_num: {args.max_num}')
prompt = {
'en': 'Select the best answer to the above multiple-choice question based on the image. \
Respond with only the letter (A, B, C, D, or E) of the correct option. \nThe best answer is:',
'cn': '根据图像选择上述多项选择题的最佳答案。只需回答正确选项的字母(A, B, C, D 或 E)。\n 最佳答案为:',
}
evaluate_chat_model()
import argparse
import json
import re
import torch
from torchvision.ops.boxes import box_area
def calculate_iou(box1, box2):
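    # Axis-aligned IoU using the inclusive-pixel (+1) convention for box widths and heights.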
x1, y1, x2, y2 = box1
x3, y3, x4, y4 = box2
intersection_x1 = max(x1, x3)
intersection_y1 = max(y1, y3)
intersection_x2 = min(x2, x4)
intersection_y2 = min(y2, y4)
intersection_area = max(0, intersection_x2 - intersection_x1 + 1) * max(
0, intersection_y2 - intersection_y1 + 1
)
box1_area = (x2 - x1 + 1) * (y2 - y1 + 1)
box2_area = (x4 - x3 + 1) * (y4 - y3 + 1)
union_area = box1_area + box2_area - intersection_area
iou = intersection_area / union_area
return iou
def box_iou(boxes1, boxes2):
area1 = box_area(boxes1)
area2 = box_area(boxes2)
lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
wh = (rb - lt).clamp(min=0) # [N,M,2]
inter = wh[:, :, 0] * wh[:, :, 1] # [N,M]
union = area1[:, None] + area2 - inter
iou = inter / union
return iou, union
def transform_bbox(bbox, image_size):
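    # Map a box from the model's 0-1000 normalized coordinate space to pixel coordinates, clamped to the image bounds.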
x1, y1, x2, y2 = bbox
W, H = image_size
x1 = min(max(x1 / 1000 * W, 0), W)
x2 = min(max(x2 / 1000 * W, 0), W)
y1 = min(max(y1 / 1000 * H, 0), H)
y2 = min(max(y2 / 1000 * H, 0), H)
return [x1, y1, x2, y2]
def evaluation_metrics(outputs):
correct = 0
incorrect = 0
pattern = r'\[*\[.*?,.*?,.*?,.*?\]\]*'
# pattern = r'\[*\[(.*?),(.*?),(.*?),(.*?)\]\]*'
# print(outputs)
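    # A prediction counts as correct when the first extracted box has IoU > 0.5 with the ground-truth box.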
for output in outputs:
bbox = output['gt_answers']
image_size = output['image_size']
pred = output['answer']
        # find all bracketed box matches
matches = re.findall(pattern, pred)
if len(matches) > 1:
            print('more than one match found')
print(matches)
if len(matches) == 0:
incorrect = incorrect + 1
else:
try:
pred_bbox = json.loads(matches[0])
pred_bbox = transform_bbox(pred_bbox[0], image_size)
iou_score = calculate_iou(pred_bbox, bbox)
if iou_score > 0.5:
correct = correct + 1
else:
incorrect = incorrect + 1
except Exception as e:
print(e)
print(output)
incorrect = incorrect + 1
# else:
# continue
print('correct:', correct)
print('incorrect:', incorrect)
print('Total:', correct + incorrect)
print('Acc@0.5:', (correct / (correct + incorrect)))
return {
'correct:': correct,
'incorrect:': incorrect,
'Total:': correct + incorrect,
'Acc@0.5:': correct / (correct + incorrect)
}
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--output_file', type=str, default='')
args = parser.parse_args()
with open(args.output_file, 'r') as f:
data = json.load(f)
if 'outputs' in data:
data = data['outputs']
outputs = data
results = evaluation_metrics(outputs)
results_file = args.output_file
with open(results_file, 'w') as f:
json.dump({
'results': results,
'outputs': outputs
}, f, indent=4)
import argparse
import itertools
import json
import os
import random
import time
from functools import partial
import torch
from internvl.model import load_model_and_tokenizer
from internvl.train.dataset import build_transform, dynamic_preprocess
from PIL import Image
from tqdm import tqdm
ds_collections = {
'DIOR_RSVG': {
'root': 'InternVL-Domain-Adaptation-Data/val/dior_rsvg_test.json',
'max_new_tokens': 200,
'min_new_tokens': 1,
'type': 'test',
'image_root': 'InternVL-Domain-Adaptation-Data/images/'
},
}
def collate_fn(batches, tokenizer):
pixel_values = torch.cat([_['pixel_values'] for _ in batches], dim=0)
questions = [_['question'] for _ in batches]
answers = [_['answer'] for _ in batches]
image_sizes = [_['image_size'] for _ in batches]
return pixel_values, questions, answers, image_sizes
class GroundingDataset(torch.utils.data.Dataset):
def __init__(self, root, image_root, prompt='', input_size=224, dynamic_image_size=False,
use_thumbnail=False, max_num=6):
with open(root, 'r') as f:
self.ann_data = json.load(f)
self.image_root = image_root
self.input_size = input_size
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.max_num = max_num
self.transform = build_transform(is_train=False, input_size=input_size)
self.prompt = prompt
def __len__(self):
return len(self.ann_data)
def __getitem__(self, idx):
data_item = self.ann_data[idx]
# index = data_item["id"]
image = data_item['image']
question = self.prompt + data_item['prompt']
answer = data_item['bbox']
image_size_ = data_item['size']
# catetory = self.df.iloc[idx]['category']
# l2_catetory = self.df.iloc[idx]['l2-category']
image = Image.open(os.path.join(self.image_root, image)).convert('RGB')
if self.dynamic_image_size:
images = dynamic_preprocess(image, image_size=self.input_size,
use_thumbnail=self.use_thumbnail,
max_num=self.max_num)
else:
images = [image]
pixel_values = [self.transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return {
'question': question,
'pixel_values': pixel_values,
'answer': answer,
'image_size': image_size_
}
def calculate_iou(box1, box2):
x1, y1, x2, y2 = box1
x3, y3, x4, y4 = box2
intersection_x1 = max(x1, x3)
intersection_y1 = max(y1, y3)
intersection_x2 = min(x2, x4)
intersection_y2 = min(y2, y4)
intersection_area = max(0, intersection_x2 - intersection_x1 + 1) * max(
0, intersection_y2 - intersection_y1 + 1
)
box1_area = (x2 - x1 + 1) * (y2 - y1 + 1)
box2_area = (x4 - x3 + 1) * (y4 - y3 + 1)
union_area = box1_area + box2_area - intersection_area
iou = intersection_area / union_area
return iou
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
def evaluate_chat_model():
random.seed(args.seed)
for ds_name in args.datasets:
dataset = GroundingDataset(
root=ds_collections[ds_name]['root'],
image_root=ds_collections[ds_name]['image_root'],
prompt=prompt_prefix,
input_size=image_size,
dynamic_image_size=args.dynamic,
use_thumbnail=use_thumbnail,
max_num=args.max_num
)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
outputs = []
for _, (pixel_values, questions, answers, image_sizes) in tqdm(enumerate(dataloader)):
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
num_beams=args.num_beams,
max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
pred = model.chat(
tokenizer=tokenizer,
pixel_values=pixel_values,
question=questions[0],
generation_config=generation_config
)
preds = [pred]
for question, pred, answer, image_size_ in zip(questions, preds, answers, image_sizes):
outputs.append({
'question': question,
'answer': pred,
'gt_answers': answer,
'image_size': image_size_
})
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_outputs = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
merged_outputs = [json.loads(_) for _ in merged_outputs]
merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
if torch.distributed.get_rank() == 0:
print(f'Evaluating {ds_name} ...')
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{ds_name}_{time_prefix}.json'
output_path = os.path.join(args.out_dir, results_file)
with open(output_path, 'w') as f:
json.dump({'outputs': merged_outputs}, f, indent=4)
print('Results saved to {}'.format(output_path))
cmd = f'python eval/rs_det/caculate.py --output_file {output_path}'
print(cmd)
os.system(cmd)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--datasets', type=str, default='DIOR_RSVG')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=1)
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--out-dir', type=str, default='results')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--dynamic', action='store_true')
parser.add_argument('--max-num', type=int, default=6)
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--load-in-4bit', action='store_true')
parser.add_argument('--auto', action='store_true')
args = parser.parse_args()
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir, exist_ok=True)
args.datasets = args.datasets.split(',')
print('datasets:', args.datasets)
assert args.batch_size == 1, 'Only batch size 1 is supported'
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model, tokenizer = load_model_and_tokenizer(args)
image_size = model.config.force_image_size or model.config.vision_config.image_size
use_thumbnail = model.config.use_thumbnail
total_params = sum(p.numel() for p in model.parameters()) / 1e9
if total_params > 20 or args.dynamic:
args.num_beams = 1
print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}')
else:
print(f'[test] total_params: {total_params}B')
print(f'[test] image_size: {image_size}')
print(f'[test] template: {model.config.template}')
print(f'[test] dynamic_image_size: {args.dynamic}')
print(f'[test] use_thumbnail: {use_thumbnail}')
print(f'[test] max_num: {args.max_num}')
prompt_prefix = 'Detect '
# prompt_prefix = "Please provide the bounding box coordinate of the region this sentence describes: "
evaluate_chat_model()
import argparse
import itertools
import json
import os
import random
import time
from functools import partial
import torch
from internvl.model import load_model_and_tokenizer
from internvl.train.dataset import build_transform, dynamic_preprocess
from PIL import Image
from tqdm import tqdm
ds_collections = {
'RSVQA_H_TEST2': {
'root': 'InternVL-Domain-Adaptation-Data/val/rsvqa_h_test_2_instruct.json',
'max_new_tokens': 50,
'min_new_tokens': 1,
'type': 'test',
'image_root': 'InternVL-Domain-Adaptation-Data/images/RSVQA-H/Data'
},
'RSVQA_H_TEST1': {
        'root': 'InternVL-Domain-Adaptation-Data/val/rsvqa_h_test_1_instruct.json',
'max_new_tokens': 50,
'min_new_tokens': 1,
'type': 'test',
'image_root': 'InternVL-Domain-Adaptation-Data/images/RSVQA-H/Data'
},
'RSVQA_L': {
'root': 'InternVL-Domain-Adaptation-Data/val/rsvqa_l_test_instruct.json',
'max_new_tokens': 50,
'min_new_tokens': 1,
'type': 'test',
'image_root': 'InternVL-Domain-Adaptation-Data/images/RSVQA_L/Images_LR'
},
}
def collate_fn(batches, tokenizer):
pixel_values = torch.cat([_['pixel_values'] for _ in batches], dim=0)
questions = [_['question'] for _ in batches]
answers = [_['answer'] for _ in batches]
indexes = [_['index'] for _ in batches]
question_types = [_['question_type'] for _ in batches]
return pixel_values, questions, answers, indexes, question_types
class RSVQADataset(torch.utils.data.Dataset):
def __init__(self, root, prompt, image_root, input_size=224, dynamic_image_size=False,
use_thumbnail=False, max_num=6):
with open(root, 'r') as f:
self.ann_data = json.load(f)
self.prompt = prompt
self.image_root = image_root
self.input_size = input_size
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.max_num = max_num
self.transform = build_transform(is_train=False, input_size=input_size)
def __len__(self):
return len(self.ann_data)
def __getitem__(self, idx):
data_item = self.ann_data[idx]
index = data_item['id']
image = data_item['image']
question = data_item['question'] + '\n' + self.prompt
answer = data_item['gt_answer']
question_type = data_item['type']
# catetory = self.df.iloc[idx]['category']
# l2_catetory = self.df.iloc[idx]['l2-category']
image = Image.open(os.path.join(self.image_root, image)).convert('RGB')
if self.dynamic_image_size:
images = dynamic_preprocess(image, image_size=self.input_size,
use_thumbnail=self.use_thumbnail,
max_num=self.max_num)
else:
images = [image]
pixel_values = [self.transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return {
'question': question,
'pixel_values': pixel_values,
'answer': answer,
'index': index,
'question_type': question_type
}
def evaluation_metrics(outputs):
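    # Compare the ground truth against the first comma-separated token of the response, lower-cased with periods stripped.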
correct = 0
incorrect = 0
for output in outputs:
gt = output['gt_answers']
answer = output['answer'].split(',')[0].lower().replace('.', '')
if gt == answer:
correct = correct + 1
else:
incorrect = incorrect + 1
# else:
# continue
print('correct:', correct)
print('incorrect:', incorrect)
print('Total:', correct + incorrect)
print('Acc:', (correct / (correct + incorrect)))
return {
'correct:': correct,
'incorrect:': incorrect,
'Total:': correct + incorrect,
'Acc:': correct / (correct + incorrect)
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
def evaluate_chat_model():
random.seed(args.seed)
for ds_name in args.datasets:
dataset = RSVQADataset(
root=ds_collections[ds_name]['root'],
prompt=prompt,
image_root=ds_collections[ds_name]['image_root'],
input_size=image_size,
dynamic_image_size=args.dynamic,
use_thumbnail=use_thumbnail,
max_num=args.max_num
)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
outputs = []
for _, (pixel_values, questions, answers, indexes, question_types) in tqdm(enumerate(dataloader)):
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
num_beams=args.num_beams,
max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
pred = model.chat(
tokenizer=tokenizer,
pixel_values=pixel_values,
question=questions[0],
generation_config=generation_config
)
preds = [pred]
for question, pred, answer, index, question_type in zip(questions, preds, answers, indexes, question_types):
outputs.append({
'question': question,
'response': pred,
'gt_answer': answer,
'index': int(index),
'question_type': question_type
})
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_outputs = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
merged_outputs = [json.loads(_) for _ in merged_outputs]
merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
if torch.distributed.get_rank() == 0:
print(f'Evaluating {ds_name} ...')
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{ds_name}_{time_prefix}.json'
output_path = os.path.join(args.out_dir, results_file)
with open(output_path, 'w') as f:
json.dump(merged_outputs, f, indent=4)
cmd = f'python eval/rs_vqa/score.py --output_file {output_path}'
print(cmd)
os.system(cmd)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--datasets', type=str, default='RSVQA_H_TEST2')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=1)
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--out-dir', type=str, default='results')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--dynamic', action='store_true')
parser.add_argument('--max-num', type=int, default=6)
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--load-in-4bit', action='store_true')
parser.add_argument('--auto', action='store_true')
args = parser.parse_args()
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir, exist_ok=True)
args.datasets = args.datasets.split(',')
print('datasets:', args.datasets)
assert args.batch_size == 1, 'Only batch size 1 is supported'
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model, tokenizer = load_model_and_tokenizer(args)
image_size = model.config.force_image_size or model.config.vision_config.image_size
use_thumbnail = model.config.use_thumbnail
total_params = sum(p.numel() for p in model.parameters()) / 1e9
if total_params > 20 or args.dynamic:
args.num_beams = 1
print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}')
else:
print(f'[test] total_params: {total_params}B')
print(f'[test] image_size: {image_size}')
print(f'[test] template: {model.config.template}')
print(f'[test] dynamic_image_size: {args.dynamic}')
print(f'[test] use_thumbnail: {use_thumbnail}')
print(f'[test] max_num: {args.max_num}')
prompt = 'Answer the question using a single word or phrase.'
evaluate_chat_model()
import argparse
import json
def is_correct_count(response, answer):
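    # Bucketed scoring: a predicted count is treated as correct if it falls in the same
    # magnitude bucket as the ground truth (exactly 0, 1-100, 101-1000, or >1000).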
try:
response = int(response) if response is not None else 0
answer = int(answer)
except ValueError:
return False
if response == 0 and answer == 0:
return True
elif 0 < response <= 100 and 0 < answer <= 100:
return True
elif 100 < response <= 1000 and 100 < answer <= 1000:
return True
elif response > 1000 and answer > 1000:
return True
return False
def is_correct_area(response, answer):
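    # Strip the trailing 'm2' unit from the ground-truth area, then apply the same bucketed comparison as counts.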
try:
response = int(response) if response is not None else 0
answer = int(answer.rstrip('m2'))
except ValueError:
return False
return is_correct_count(response, answer)
def calculate_scores(data):
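    # Accuracy per question type: 'count' and 'area' use bucketed matching, all other types use
    # case-insensitive string equality; total_score_useful excludes the count/area types.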
type_counts = {}
type_correct = {}
for entry in data:
question_type = entry['question_type']
response = entry['response']
answer = entry['gt_answer']
if question_type not in type_counts:
type_counts[question_type] = 0
type_correct[question_type] = 0
type_counts[question_type] += 1
if question_type == 'count':
if is_correct_count(response, answer):
type_correct[question_type] += 1
elif question_type == 'area':
if is_correct_area(response, answer):
type_correct[question_type] += 1
else:
if response and response.lower() == answer.lower():
type_correct[question_type] += 1
type_scores = {}
for question_type in type_counts:
score = type_correct[question_type] / type_counts[question_type]
type_scores[question_type] = round(score, 4)
total_correct = sum(type_correct.values())
total_count = sum(type_counts.values())
total_score = round(total_correct / total_count, 4) if total_count > 0 else 0.0
total_correct_useful = sum([v for k, v in type_correct.items() if k not in ['count', 'area']])
total_count_useful = sum([v for k, v in type_counts.items() if k not in ['count', 'area']])
total_score_useful = round(total_correct_useful / total_count_useful, 4) if total_count_useful > 0 else 0.0
print(f'{type_scores=}')
print(f'{total_score_useful=}')
return type_scores, total_score, total_score_useful, type_counts
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--output_file', type=str, default='')
args = parser.parse_args()
with open(args.output_file, 'r') as f:
data = json.load(f)
if 'outputs' in data:
data = data['outputs']
type_scores, total_score, total_score_useful, type_counts = calculate_scores(data)
results = {
'type_scores': type_scores,
'type_counts': type_counts,
'total_score': total_score,
'total_score_useful': total_score_useful,
'outputs': data
}
with open(args.output_file, 'w') as f:
json.dump(results, f, indent=4)
# README for Evaluation
## 🌟 Overview
This script provides an evaluation pipeline for `LLaVA-Bench`.
For scoring, we use **GPT-4-0613** as the evaluation model.
While the provided code can run this benchmark, we recommend using [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) if you aim to align your results with our technical report.
## 🗂️ Data Preparation
Before starting to download the data, please create the `InternVL/internvl_chat/data` folder.
### LLaVA-Bench
Follow the instructions below to prepare the data:
```shell
# Step 1: Download the dataset
cd data/
git clone https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild
cd ../
```
After preparation is complete, the directory structure is:
```shell
data/llava-bench-in-the-wild
├── images
├── answers_gpt4.jsonl
├── bard_0718.jsonl
├── bing_chat_0629.jsonl
├── context.jsonl
├── questions.jsonl
└── README.md
```
## 🏃 Evaluation Execution
> ⚠️ Note: For testing InternVL (1.5, 2.0, 2.5, and later versions), always enable `--dynamic` to perform dynamic resolution testing.
To run the evaluation, execute the following commands on a single-GPU setup:
```shell
# Step 1: Remove old inference results if they exist
rm -rf results/llava_bench_results_review.jsonl
# Step 2: Run the evaluation
torchrun --nproc_per_node=1 eval/llava_bench/evaluate_llava_bench.py --checkpoint ${CHECKPOINT} --dynamic
# Step 3: Scoring the results using gpt-4-0613
export OPENAI_API_KEY="your_openai_api_key"
python -u eval/llava_bench/eval_gpt_review_bench.py \
--question data/llava-bench-in-the-wild/questions.jsonl \
--context data/llava-bench-in-the-wild/context.jsonl \
--rule eval/llava_bench/rule.json \
--answer-list \
data/llava-bench-in-the-wild/answers_gpt4.jsonl \
results/llava_bench_results.jsonl \
--output \
results/llava_bench_results_review.jsonl
python -u eval/llava_bench/summarize_gpt_review.py -f results/llava_bench_results_review.jsonl
```
Alternatively, you can run the following simplified command:
```shell
export OPENAI_API_KEY="your_openai_api_key"
GPUS=1 sh evaluate.sh ${CHECKPOINT} llava-bench --dynamic
```
### Arguments
The following arguments can be configured for the evaluation script:
| Argument | Type | Default | Description |
| ---------------- | ------ | --------------- | ----------------------------------------------------------------------------------------------------------------- |
| `--checkpoint` | `str` | `''` | Path to the model checkpoint. |
| `--datasets` | `str` | `'llava_bench'` | Comma-separated list of datasets to evaluate. |
| `--dynamic` | `flag` | `False` | Enables dynamic high resolution preprocessing. |
| `--max-num` | `int` | `6` | Maximum tile number for dynamic high resolution. |
| `--load-in-8bit` | `flag` | `False` | Loads the model weights in 8-bit precision. |
| `--auto` | `flag` | `False` | Automatically splits a large model across 8 GPUs when needed, useful for models too large to fit on a single GPU. |
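
For reference, a single run that combines several of the arguments above might look like the following (illustrative only; substitute your own checkpoint path):

```shell
# Illustrative example: dynamic high-resolution evaluation with 8-bit weights and up to 12 tiles
torchrun --nproc_per_node=1 eval/llava_bench/evaluate_llava_bench.py \
    --checkpoint ${CHECKPOINT} --dynamic --max-num 12 --load-in-8bit --out-dir results
```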
import argparse
import json
import os
import time
import openai
NUM_SECONDS_TO_SLEEP = 0.5
def get_eval(content: str, max_tokens: int):
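    # Ask gpt-4-0613 to review the answer pair, retrying on API errors with a short sleep between attempts.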
while True:
try:
completion = openai.chat.completions.create(
model='gpt-4-0613',
messages=[{
'role': 'system',
'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
}, {
'role': 'user',
'content': content,
}],
temperature=0.2, # TODO: figure out which temperature is best for evaluation
max_tokens=max_tokens,
)
break
except Exception as e:
print(e)
time.sleep(NUM_SECONDS_TO_SLEEP)
return completion.choices[0].message.content
def parse_score(review):
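    # Expect the review's first line to contain the two scores; return [-1, -1] when they cannot be parsed.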
try:
score_pair = review.split('\n')[0]
score_pair = score_pair.replace(',', ' ')
sp = score_pair.split(' ')
if len(sp) == 2:
return [float(sp[0]), float(sp[1])]
else:
print('error', review)
return [-1, -1]
except Exception as e:
print(e)
print('error', review)
return [-1, -1]
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
parser.add_argument('-q', '--question')
parser.add_argument('-c', '--context')
parser.add_argument('-a', '--answer-list', nargs='+', default=[])
parser.add_argument('-r', '--rule')
parser.add_argument('-o', '--output')
parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
args = parser.parse_args()
f_q = open(os.path.expanduser(args.question))
f_ans1 = open(os.path.expanduser(args.answer_list[0]))
f_ans2 = open(os.path.expanduser(args.answer_list[1]))
rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))
if os.path.isfile(os.path.expanduser(args.output)):
cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))]
else:
cur_reviews = []
review_file = open(f'{args.output}', 'a')
context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
image_to_context = {context['image']: context for context in context_list}
handles = []
idx = 0
for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
ques = json.loads(ques_js)
ans1 = json.loads(ans1_js)
ans2 = json.loads(ans2_js)
print(ques, ans1, ans2)
inst = image_to_context[ques['image']]
if isinstance(inst['caption'], list):
cap_str = '\n'.join(inst['caption'])
else:
cap_str = inst['caption']
category = 'llava_bench_' + json.loads(ques_js)['category']
if category in rule_dict:
rule = rule_dict[category]
else:
assert False, f'Visual QA category not found in rule file: {category}.'
prompt = rule['prompt']
role = rule['role']
content = (f'[Context]\n{cap_str}\n\n'
f'[Question]\n{ques["text"]}\n\n'
f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
f'[System]\n{prompt}\n\n')
cur_js = {
'id': idx + 1,
'question_id': ques['question_id'],
'answer1_id': ans1.get('answer_id', ans1['question_id']),
            'answer2_id': ans2.get('answer_id', ans2['question_id']),
'category': category
}
if idx >= len(cur_reviews):
review = get_eval(content, args.max_tokens)
scores = parse_score(review)
cur_js['content'] = review
cur_js['tuple'] = scores
review_file.write(json.dumps(cur_js) + '\n')
review_file.flush()
else:
print(f'Skipping {idx} as we already have it.')
idx += 1
print(idx)
review_file.close()
import argparse
import json
import os
import random
import torch
from internvl.model import load_model_and_tokenizer
from internvl.train.dataset import build_transform, dynamic_preprocess
from PIL import Image
from tqdm import tqdm
ds_collections = {
'llava_bench': {
'root': 'data/llava-bench-in-the-wild/images',
'question': 'data/llava-bench-in-the-wild/questions.jsonl',
'max_new_tokens': 1000,
'min_new_tokens': 1,
},
}
class VQADataset(torch.utils.data.Dataset):
def __init__(self, root, data, prompt, input_size=224, dynamic_image_size=False,
use_thumbnail=False, max_num=6):
self.root = root
self.data = open(data).readlines()
self.prompt = prompt
self.input_size = input_size
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.max_num = max_num
self.transform = build_transform(is_train=False, input_size=input_size)
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
data = json.loads(self.data[idx].strip())
image, question, question_id, annotation = data['image'], data[
'text'], data['question_id'], data.get('answer', None)
image = os.path.join(self.root, image)
image = Image.open(image).convert('RGB')
if self.dynamic_image_size:
images = dynamic_preprocess(image, image_size=self.input_size,
use_thumbnail=self.use_thumbnail,
max_num=self.max_num)
else:
images = [image]
pixel_values = [self.transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
question = question + self.prompt
return question_id, question, pixel_values, annotation
def evaluate_chat_model():
random.seed(args.seed)
for ds_name in args.datasets:
dataset = VQADataset(
root=ds_collections[ds_name]['root'],
data=ds_collections[ds_name]['question'],
prompt=' Please give a detailed answer.',
input_size=image_size,
dynamic_image_size=args.dynamic,
use_thumbnail=use_thumbnail,
max_num=args.max_num
)
outputs = []
for _, (question_id, question, pixel_values, annotations) in tqdm(enumerate(dataset)):
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
num_beams=args.num_beams,
max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
pred = model.chat(
tokenizer=tokenizer,
pixel_values=pixel_values,
question=question,
generation_config=generation_config,
verbose=True
)
outputs.append({
'question_id': question_id,
'text': pred,
'model_id': model_id,
'metadata': {}
})
print(f'Evaluating {ds_name} ...')
results_file = 'llava_bench_results.jsonl'
results_file = os.path.join(args.out_dir, results_file)
writer = open(results_file, 'w')
for item in outputs:
writer.write(json.dumps(item) + '\n')
writer.close()
print('Results saved to {}'.format(results_file))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--datasets', type=str, default='llava_bench')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=1)
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--out-dir', type=str, default='results')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--dynamic', action='store_true')
parser.add_argument('--max-num', type=int, default=6)
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--load-in-4bit', action='store_true')
parser.add_argument('--auto', action='store_true')
args = parser.parse_args()
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir, exist_ok=True)
args.datasets = args.datasets.split(',')
print('datasets:', args.datasets)
assert args.batch_size == 1, 'Only batch size 1 is supported'
model, tokenizer = load_model_and_tokenizer(args)
image_size = model.config.force_image_size or model.config.vision_config.image_size
use_thumbnail = model.config.use_thumbnail
total_params = sum(p.numel() for p in model.parameters()) / 1e9
if total_params > 20 or args.dynamic:
args.num_beams = 1
print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}')
else:
print(f'[test] total_params: {total_params}B')
print(f'[test] image_size: {image_size}')
print(f'[test] template: {model.config.template}')
print(f'[test] dynamic_image_size: {args.dynamic}')
print(f'[test] use_thumbnail: {use_thumbnail}')
print(f'[test] max_num: {args.max_num}')
model_id = '_'.join(args.checkpoint.split('/')[-2:])
evaluate_chat_model()
{
"coding": {"role": "Assistant", "prompt": "Your task is to evaluate the coding abilities of the above two assistants. They have been asked to implement a program to solve a given problem. Please review their code submissions, paying close attention to their problem-solving approach, code structure, readability, and the inclusion of helpful comments.\n\nPlease ensure that the assistants' submissions:\n\n1. Correctly implement the given problem statement.\n2. Contain accurate and efficient code.\n3. Include clear and concise comments that explain the code's logic and functionality.\n4. Adhere to proper coding standards and best practices.\n\nOnce you have carefully reviewed both submissions, provide detailed feedback on their strengths and weaknesses, along with any suggestions for improvement. You should first output a single line containing two scores on the scale of 1-10 (1: no code/no sense; 10: perfect) for Assistant 1 and 2, respectively. Then give extra comments starting from the next line."},
"math": {"role": "Assistant", "prompt": "We would like to request your feedback on the mathematical proficiency of two AI assistants regarding the given user question.\nFirstly, please solve the problem independently, without referring to the answers provided by Assistant 1 and Assistant 2.\nAfterward, please examine the problem-solving process of Assistant 1 and Assistant 2 step-by-step to ensure their correctness, identifying any incorrect steps if present. Your evaluation should take into account not only the answer but also the problem-solving steps.\nFinally, please output a Python tuple containing two numerical scores for Assistant 1 and Assistant 2, ranging from 1 to 10, respectively. If applicable, explain the reasons for any variations in their scores and determine which assistant performed better."},
"default": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
"conv": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
"detail": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
"complex": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
"llava_bench_conv": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
"llava_bench_detail": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
"llava_bench_complex": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}
}
import argparse
import json
import os
from collections import defaultdict
import numpy as np
def parse_args():
parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
parser.add_argument('-d', '--dir', default=None)
parser.add_argument('-v', '--version', default=None)
parser.add_argument('-s', '--select', nargs='*', default=None)
parser.add_argument('-f', '--files', nargs='*', default=[])
parser.add_argument('-i', '--ignore', nargs='*', default=[])
return parser.parse_args()
if __name__ == '__main__':
args = parse_args()
if args.ignore is not None:
args.ignore = [int(x) for x in args.ignore]
if len(args.files) > 0:
review_files = args.files
else:
review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (
x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith(
'review_') or 'review' in args.dir)]
for review_file in sorted(review_files):
config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '')
if args.select is not None and any(x not in config for x in args.select):
continue
if '0613' in config:
version = '0613'
else:
version = '0314'
if args.version is not None and args.version != version:
continue
scores = defaultdict(list)
print(config)
with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f:
for review_str in f:
review = json.loads(review_str)
if review['question_id'] in args.ignore:
continue
if 'category' in review:
scores[review['category']].append(review['tuple'])
scores['all'].append(review['tuple'])
else:
if 'tuple' in review:
scores['all'].append(review['tuple'])
else:
scores['all'].append(review['score'])
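        # stats holds the mean (reference score, model score); the printed percentage is the model's score relative to the reference.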
for k, v in sorted(scores.items()):
stats = np.asarray(v).mean(0).tolist()
stats = [round(x, 3) for x in stats]
print(k, stats, round(stats[1] / stats[0] * 100, 1))
print(k, round(stats[1] / stats[0] * 100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1))
print('=================================')