Commit 26e59280 authored by wanglch

Initial commit
import math
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer, AutoConfig
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
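# dynamic_preprocess: pick the tile grid (rows x cols, with between min_num and max_num
# tiles) whose aspect ratio is closest to the input image, resize the image to that grid,
# crop it into image_size x image_size tiles, and optionally append a thumbnail of the
# whole image as an extra tile.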
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
def load_image(image_file, input_size=448, max_num=12):
image = Image.open(image_file).convert('RGB')
transform = build_transform(input_size=input_size)
images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values
def split_model(model_name):
device_map = {}
world_size = torch.cuda.device_count()
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
num_layers = config.llm_config.num_hidden_layers
# Since the first GPU will be used for ViT, treat it as half a GPU.
num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
num_layers_per_gpu = [num_layers_per_gpu] * world_size
num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
layer_cnt = 0
for i, num_layer in enumerate(num_layers_per_gpu):
for j in range(num_layer):
device_map[f'language_model.model.layers.{layer_cnt}'] = i
layer_cnt += 1
device_map['vision_model'] = 0
device_map['mlp1'] = 0
device_map['language_model.model.tok_embeddings'] = 0
device_map['language_model.model.embed_tokens'] = 0
device_map['language_model.output'] = 0
device_map['language_model.model.norm'] = 0
device_map['language_model.model.rotary_emb'] = 0
device_map['language_model.lm_head'] = 0
device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
return device_map
# Note: the 80GB-GPU guidance in the official examples (two GPUs with `load_in_8bit=True`,
# three or more with `load_in_8bit=False`) targets the largest InternVL models;
# InternVL3-1B in bf16 fits comfortably on a single GPU.
path = '/home/wanglch/InternVL/InternVL3-1B/'
device_map = split_model(path)  # pass the local checkpoint path so AutoConfig can find its config
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
load_in_8bit=False,
low_cpu_mem_usage=True,
use_flash_attn=False,
trust_remote_code=True,
device_map=device_map).eval()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
# set the max number of tiles in `max_num`
pixel_values = load_image('/home/wanglch/Images/8.jpg', max_num=12).to(torch.bfloat16).cuda()
generation_config = dict(max_new_tokens=1024, do_sample=True)
# single-image single-round conversation (单图单轮对话)
question = '<image>\n提取图片中的文字信息,并保留文字信息的位置'  # "Extract the text in the image and preserve its positional layout."
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')
# InternVL-Chat
This folder contains the implementation of InternVL-Chat.
## 📖 Documents
### 🌟 **Get Started**
- **Installation**: 🌱 [Installation Guide](https://internvl.readthedocs.io/en/latest/get_started/installation.html) | 📄 [requirements.txt](./requirements.txt)
- **Chat Data Format**: 📝 [Meta File](https://internvl.readthedocs.io/en/latest/get_started/chat_data_format.html#meta-file) | ✏️ [Text](https://internvl.readthedocs.io/en/latest/get_started/chat_data_format.html#pure-text-data) | 🖼️ [Single-Image](https://internvl.readthedocs.io/en/latest/get_started/chat_data_format.html#single-image-data) | 🖼️🖼️ [Multi-Image](https://internvl.readthedocs.io/en/latest/get_started/chat_data_format.html#multi-image-data) | 🎥 [Video](https://internvl.readthedocs.io/en/latest/get_started/chat_data_format.html#video-data)
- **Local Chat Demo**: 🤖 [Streamlit Demo](https://internvl.readthedocs.io/en/latest/get_started/local_chat_demo.html#streamlit-demo)
- **InternVL-Chat API**: 🌐 [InternVL2-Pro](https://internvl.readthedocs.io/en/latest/get_started/internvl_chat_api.html#official-api-of-internvl2-pro)
- **Tutorials**: 🚀 [Enhancing InternVL2 on COCO Caption Using LoRA Fine-Tuning](https://internvl.readthedocs.io/en/latest/tutorials/coco_caption_finetune.html)
### 🏆 **InternVL Family**
- **InternVL 2.5**: 📖 [Introduction](https://internvl.readthedocs.io/en/latest/internvl2.5/introduction.html) | ⚡ [Quick Start](https://internvl.readthedocs.io/en/latest/internvl2.5/quick_start.html) | ✨ [Finetune](https://internvl.readthedocs.io/en/latest/internvl2.5/finetune.html) | 📊 [Evaluation](https://internvl.readthedocs.io/en/latest/internvl2.5/evaluation.html) | 📦 [Deployment](https://internvl.readthedocs.io/en/latest/internvl2.5/deployment.html) | 🎯 [Preference Optimization](https://internvl.readthedocs.io/en/latest/internvl2.5/preference_optimization.html)
- **InternVL 2.0**: 📖 [Introduction](https://internvl.readthedocs.io/en/latest/internvl2.0/introduction.html) | ⚡ [Quick Start](https://internvl.readthedocs.io/en/latest/internvl2.0/quick_start.html) | ✨ [Finetune](https://internvl.readthedocs.io/en/latest/internvl2.0/finetune.html) | 📊 [Evaluation](https://internvl.readthedocs.io/en/latest/internvl2.0/evaluation.html) | 📦 [Deployment](https://internvl.readthedocs.io/en/latest/internvl2.0/deployment.html) | 🎯 [Preference Optimization](https://internvl.readthedocs.io/en/latest/internvl2.0/preference_optimization.html)
- **InternVL 1.5**: 📖 [Introduction](https://internvl.readthedocs.io/en/latest/internvl1.5/introduction.html) | ⚡ [Quick Start](https://internvl.readthedocs.io/en/latest/internvl1.5/quick_start.html) | ✨ [Finetune](https://internvl.readthedocs.io/en/latest/internvl1.5/finetune.html) | 📊 [Evaluation](https://internvl.readthedocs.io/en/latest/internvl1.5/evaluation.html) | 📦 [Deployment](https://internvl.readthedocs.io/en/latest/internvl1.5/deployment.html)
- **InternVL 1.2**: 📖 [Introduction](https://internvl.readthedocs.io/en/latest/internvl1.2/introduction.html) | ⚡ [Quick Start](https://internvl.readthedocs.io/en/latest/internvl1.2/quick_start.html) | ✨ [Finetune](https://internvl.readthedocs.io/en/latest/internvl1.2/finetune.html) | 📊 [Evaluation](https://internvl.readthedocs.io/en/latest/internvl1.2/evaluation.html)
- **InternVL 1.1**: 📖 [Introduction](https://internvl.readthedocs.io/en/latest/internvl1.1/introduction.html) | ⚡ [Quick Start](https://internvl.readthedocs.io/en/latest/internvl1.1/quick_start.html) | 📊 [Evaluation](https://internvl.readthedocs.io/en/latest/internvl1.1/evaluation.html)
# Introduction
We are excited to introduce **InternVL 2.5**, an advanced multimodal large language model (MLLM) series that builds upon InternVL 2.0, maintaining its core model architecture while introducing significant enhancements in training and testing strategies as well as data quality.
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/5HDAGOQOZvS1EtI107Ac-.png)
## InternVL 2.5 Family
In the following table, we provide an overview of the InternVL 2.5 series.
| Model Name | Vision Part | Language Part | HF Link |
| :-------------: | :-------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------: | :---------------------------------------------------------: |
| InternVL2_5-1B | [InternViT-300M-448px-V2_5](https://huggingface.co/OpenGVLab/InternViT-300M-448px-V2_5) | [Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL2_5-1B) |
| InternVL2_5-2B | [InternViT-300M-448px-V2_5](https://huggingface.co/OpenGVLab/InternViT-300M-448px-V2_5) | [internlm2_5-1_8b-chat](https://huggingface.co/internlm/internlm2_5-1_8b-chat) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL2_5-2B) |
| InternVL2_5-4B | [InternViT-300M-448px-V2_5](https://huggingface.co/OpenGVLab/InternViT-300M-448px-V2_5) | [Qwen2.5-3B-Instruct](https://huggingface.co/Qwen/Qwen2.5-3B-Instruct) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL2_5-4B) |
| InternVL2_5-8B | [InternViT-300M-448px-V2_5](https://huggingface.co/OpenGVLab/InternViT-300M-448px-V2_5) | [internlm2_5-7b-chat](https://huggingface.co/internlm/internlm2_5-7b-chat) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL2_5-8B) |
| InternVL2_5-26B | [InternViT-6B-448px-V2_5](https://huggingface.co/OpenGVLab/InternViT-6B-448px-V2_5) | [internlm2_5-20b-chat](https://huggingface.co/internlm/internlm2_5-20b-chat) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL2_5-26B) |
| InternVL2_5-38B | [InternViT-6B-448px-V2_5](https://huggingface.co/OpenGVLab/InternViT-6B-448px-V2_5) | [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL2_5-38B) |
| InternVL2_5-78B | [InternViT-6B-448px-V2_5](https://huggingface.co/OpenGVLab/InternViT-6B-448px-V2_5) | [Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct) | [🤗 link](https://huggingface.co/OpenGVLab/InternVL2_5-78B) |
## Model Architecture
As shown in the following figure, InternVL 2.5 retains the same model architecture as its predecessors, InternVL 1.5 and 2.0, following the "ViT-MLP-LLM" paradigm. In this new version, we integrate an incrementally pre-trained InternViT with various pre-trained LLMs, including InternLM 2.5 and Qwen 2.5, using a randomly initialized MLP projector.
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/BiiyXN6NOk0p-3rl3ueyL.png)
As in previous versions, we apply a pixel unshuffle operation that reduces the number of visual tokens to one quarter of the original. We also adopt a dynamic-resolution strategy similar to InternVL 1.5's, dividing images into tiles of 448×448 pixels. The key difference, starting from InternVL 2.0, is the added support for multi-image and video data.
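To make the pixel unshuffle step concrete, here is a minimal PyTorch sketch; the 2×2 folding and the 448-pixel tile / 32×32 patch grid follow the description above, while the function name and tensor layout are illustrative rather than the model's exact implementation:

```python
import torch

def pixel_unshuffle(x: torch.Tensor) -> torch.Tensor:
    # x: ViT patch features laid out as [batch, height, width, channels].
    # Folding each 2x2 neighborhood of patches into the channel dimension
    # reduces the number of visual tokens to one quarter (h*w -> (h/2)*(w/2)).
    n, h, w, c = x.shape
    x = x.view(n, h // 2, 2, w // 2, 2, c)        # split h and w into 2x2 blocks
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous()  # move the block dims next to channels
    return x.view(n, h // 2, w // 2, c * 4)       # fold each 2x2 block into channels

# A 448x448 tile gives a 32x32 grid of ViT patches; after unshuffle it becomes
# 16x16 = 256 visual tokens per tile, which the MLP projector then maps into
# the LLM embedding space.
feats = torch.randn(1, 32, 32, 1024)
print(pixel_unshuffle(feats).shape)  # torch.Size([1, 16, 16, 4096])
```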
## Training Strategy
### Dynamic High-Resolution for Multimodal Data
In InternVL 2.0 and 2.5, we extend the dynamic high-resolution training approach, enhancing its capabilities to handle multi-image and video datasets.
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/xoMY6rwRrNxbAGYPNyU8g.png)
- For single-image datasets, the total tile budget `n_max` is allocated to a single image for maximum resolution. Visual tokens are enclosed in `<img>` and `</img>` tags.
- For multi-image datasets, the total tile budget `n_max` is distributed across all images in a sample (see the sketch after this list). Each image is labeled with auxiliary tags like `Image-1` and enclosed in `<img>` and `</img>` tags.
- For videos, each frame is resized to 448×448. Frames are labeled with tags like `Frame-1` and enclosed in `<img>` and `</img>` tags, similar to images.
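A minimal sketch of how a shared tile budget could be applied to a multi-image sample is shown below; the even split across images is an assumption for illustration, and `dynamic_preprocess` refers to the helper defined in the inference snippets of this repository:

```python
from PIL import Image

def tile_multi_image_sample(image_files, n_max=12, image_size=448):
    # Share the tile budget evenly across the images of one sample.  The even
    # split is an illustrative assumption; the training code may allocate
    # tiles differently.
    per_image = max(1, n_max // len(image_files))
    tiles, num_tiles_per_image = [], []
    for file in image_files:
        img = Image.open(file).convert('RGB')
        img_tiles = dynamic_preprocess(img, image_size=image_size,
                                       use_thumbnail=True, max_num=per_image)
        tiles.extend(img_tiles)
        num_tiles_per_image.append(len(img_tiles))
    return tiles, num_tiles_per_image

# For video data, each frame is instead resized to a single 448x448 tile (max_num=1).
```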
### Single Model Training Pipeline
The training pipeline for a single model in InternVL 2.5 is structured across three stages, designed to enhance the model's visual perception and multimodal capabilities.
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/5NduZeCPLgPJTFr0RGTq3.png)
- **Stage 1: MLP Warmup.** In this stage, only the MLP projector is trained while the vision encoder and language model are frozen (a short PyTorch freezing sketch follows this list). A dynamic high-resolution training strategy is applied for better performance, despite increased cost. This phase ensures robust cross-modal alignment and prepares the model for stable multimodal training.
- **Stage 1.5: ViT Incremental Learning (Optional).** This stage allows incremental training of the vision encoder and MLP projector using the same data as Stage 1. It enhances the encoder’s ability to handle rare domains like multilingual OCR and mathematical charts. Once trained, the encoder can be reused across LLMs without retraining, making this stage optional unless new domains are introduced.
- **Stage 2: Full Model Instruction Tuning.** The entire model is trained on high-quality multimodal instruction datasets. Strict data quality controls are enforced to prevent degradation of the LLM, as noisy data can cause issues like repetitive or incorrect outputs. After this stage, the training process is complete.
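As referenced in Stage 1 above, the sketch below shows what the MLP-warmup freezing amounts to in PyTorch; it assumes the submodule names used elsewhere in this repository (`vision_model`, `mlp1`, `language_model`) and is not the project's actual training script:

```python
import torch
from transformers import AutoModel

# Minimal sketch of the Stage-1 "MLP warmup" parameter freezing.
model = AutoModel.from_pretrained('OpenGVLab/InternVL2_5-8B',
                                  torch_dtype=torch.bfloat16,
                                  trust_remote_code=True)
for p in model.parameters():
    p.requires_grad = False          # freeze the vision encoder and the LLM ...
for p in model.mlp1.parameters():
    p.requires_grad = True           # ... and train only the MLP projector

num_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'trainable parameters: {num_trainable / 1e6:.1f}M')
```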
### Progressive Scaling Strategy
We introduce a progressive scaling strategy to align the vision encoder with LLMs efficiently. This approach trains with smaller LLMs first (e.g., 20B) to optimize foundational visual capabilities and cross-modal alignment before transferring the vision encoder to larger LLMs (e.g., 72B) without retraining. This reuse skips intermediate stages for larger models.
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64006c09330a45b03605bba3/UoNUyS7ctN5pBxNv9KnzH.png)
Compared to Qwen2-VL's 1.4 trillion tokens, InternVL2.5-78B uses only 120 billion tokens—less than one-tenth. This strategy minimizes redundancy, maximizes pre-trained component reuse, and enables efficient training for complex vision-language tasks.
### Training Enhancements
To improve real-world adaptability and performance, we introduce two key techniques, briefly sketched after the list:
- **Random JPEG Compression**: Random JPEG compression with quality levels between 75 and 100 is applied as a data augmentation technique. This simulates image degradation from internet sources, enhancing the model's robustness to noisy images.
- **Loss Reweighting**: To balance the NTP loss across responses of different lengths, we use a reweighting strategy called **square averaging**. This method balances contributions from responses of varying lengths, mitigating biases toward longer or shorter responses.
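The sketch below illustrates both techniques. The JPEG quality range follows the text above; the square-averaging weight is written as 1/sqrt(number of response tokens), which is our reading of the term and should be checked against the technical report:

```python
# Illustrative sketches of the two enhancements; the exact parameters used in
# training may differ.
import io
import random
import torch
from PIL import Image

def random_jpeg_compression(img: Image.Image, p: float = 0.5) -> Image.Image:
    # Re-encode the image as JPEG with a random quality in [75, 100] to simulate
    # the degradation of images collected from the internet.
    if random.random() > p:
        return img
    buf = io.BytesIO()
    img.convert('RGB').save(buf, format='JPEG', quality=random.randint(75, 100))
    buf.seek(0)
    return Image.open(buf).convert('RGB')

def square_average_weights(response_lengths):
    # "Square averaging": each response is weighted by 1 / sqrt(#tokens), so long
    # responses neither dominate (token averaging) nor vanish (sample averaging).
    lengths = torch.tensor(response_lengths, dtype=torch.float32)
    return 1.0 / lengths.sqrt()

print(square_average_weights([10, 100, 1000]))  # tensor([0.3162, 0.1000, 0.0316])
```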
## Data Organization
### Dataset Configuration
In InternVL 2.0 and 2.5, the organization of the training data is controlled by several key parameters to optimize the balance and distribution of datasets during training.
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/2LJe24b1ua3gjI9gDitVl.png)
- **Data Augmentation:** JPEG compression is applied conditionally: enabled for image datasets to enhance robustness and disabled for video datasets to maintain consistent frame quality.
- **Maximum Tile Number:** The parameter `n_max` controls the maximum tiles per dataset. For example, higher values (24–36) are used for multi-image or high-resolution data, lower values (6–12) for standard images, and 1 for videos.
- **Repeat Factor:** The repeat factor `r` adjusts dataset sampling frequency. Values below 1 reduce a dataset's weight, while values above 1 increase it. This ensures balanced training across tasks and prevents overfitting or underfitting (see the sampling sketch after this list).
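As noted above, the sketch below shows one way the repeat factor and per-dataset tile limits could be realized as sampling weights; the field names and numbers are illustrative assumptions, not the actual meta-file schema:

```python
import random

# Hypothetical per-dataset configuration: size, repeat factor r, and max tile number.
datasets = {
    'doc_ocr':     {'size': 200_000, 'repeat': 1.5, 'max_tiles': 24},
    'general_vqa': {'size': 800_000, 'repeat': 1.0, 'max_tiles': 12},
    'video_qa':    {'size': 300_000, 'repeat': 0.5, 'max_tiles': 1},
}

# Effective number of samples drawn from each dataset per epoch.
effective = {name: int(cfg['size'] * cfg['repeat']) for name, cfg in datasets.items()}
total = sum(effective.values())

def sample_dataset():
    # Draw a dataset with probability proportional to size * repeat factor.
    return random.choices(list(effective), weights=list(effective.values()), k=1)[0]

print({name: n / total for name, n in effective.items()})
print(sample_dataset())
```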
### Data Filtering Pipeline
During development, we found that LLMs are highly sensitive to data noise, with even small anomalies—like outliers or repetitive data—causing abnormal behavior during inference. Repetitive generation, especially in long-form or CoT reasoning tasks, proved particularly harmful.
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/aka8ZRiKF3ajdyZBnNFZI.png)
To address this challenge and support future research, we designed an efficient data filtering pipeline to remove low-quality samples.
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/70l1UxnX-Arn0NoOGwpth.png)
The pipeline comprises two modules; a sketch of the heuristic rule-based stage follows the two lists below. For **pure-text data**, three key strategies are used:
1. **LLM-Based Quality Scoring**: Each sample is scored (0–10) using a pre-trained LLM with domain-specific prompts. Samples scoring below a threshold (e.g., 7) are removed to ensure high-quality data.
2. **Repetition Detection**: Repetitive samples are flagged using LLM-based prompts and manually reviewed. Samples scoring below a stricter threshold (e.g., 3) are excluded to avoid repetitive patterns.
3. **Heuristic Rule-Based Filtering**: Anomalies like abnormal sentence lengths or duplicate lines are detected using rules. Flagged samples undergo manual verification to ensure accuracy before removal.
For **multimodal data**, two strategies are used:
1. **Repetition Detection**: Repetitive samples in non-academic datasets are flagged and manually reviewed to prevent pattern loops. High-quality datasets are exempt from this process.
2. **Heuristic Rule-Based Filtering**: Similar rules are applied to detect visual anomalies, with flagged data verified manually to maintain integrity.
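To make the heuristic rule-based stage concrete, here is a minimal sketch; the specific rules and thresholds are illustrative assumptions, not the filters used in practice:

```python
# Illustrative heuristic filters; real rules and thresholds are chosen empirically
# and flagged samples are verified manually before removal.
def has_repeated_lines(text: str, max_repeats: int = 3) -> bool:
    # Flag samples whose answer repeats the same non-trivial line many times.
    counts = {}
    for line in (l.strip() for l in text.splitlines() if l.strip()):
        counts[line] = counts.get(line, 0) + 1
    return max(counts.values(), default=0) > max_repeats

def has_abnormal_sentence_length(text: str, max_words: int = 200) -> bool:
    # Flag run-on "sentences" that are far longer than normal prose.
    return any(len(s.split()) > max_words for s in text.split('.'))

def keep_sample(sample: dict) -> bool:
    answer = sample['conversations'][-1]['value']
    return not (has_repeated_lines(answer) or has_abnormal_sentence_length(answer))

print(keep_sample({'conversations': [{'value': 'Q'}, {'value': 'A clean answer.'}]}))  # True
```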
### Training Data
As shown in the following figure, from InternVL 1.5 to 2.0 and then to 2.5, the fine-tuning data mixture has undergone iterative improvements in scale, quality, and diversity. For more information about the training data, please refer to our technical report.
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/GaTY9Lde02YzclASMthDa.png)
## Evaluation on Multimodal Capability
### Multimodal Reasoning and Mathematics
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/ihFWMRHbF0lpFTkLqnnj1.png)
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/Nrzq0kjlitjp_jrJCqtwX.png)
### OCR, Chart, and Document Understanding
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/3yCMoLjlbsqY7ZJViGzih.png)
### Multi-Image & Real-World Comprehension
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/DSnalmEyhDVQ9GE0GPCla.png)
### Comprehensive Multimodal & Hallucination Evaluation
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/Z7Raj3TGDiV1H81pDHtoG.png)
### Visual Grounding
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/lPcIrng8MPSg_PM1hpDPt.png)
### Multimodal Multilingual Understanding
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/BPpbAOX36RV8RTnm3j-gs.png)
### Video Understanding
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64006c09330a45b03605bba3/tcwH-i1qc8H16En-7AZ5M.png)
## Evaluation on Language Capability
Training InternVL 2.0 models led to a decline in pure language capabilities. InternVL 2.5 addresses this by collecting more high-quality open-source data and filtering out low-quality data, achieving better preservation of pure language performance.
![image/png](https://cdn-uploads.huggingface.co/production/uploads/64119264f0f81eb569e0d569/mxuSKvSY-kfI8zePpXj6y.png)
## Quick Start
We provide an example code to run `InternVL2_5-8B` using `transformers`.
> Please use transformers>=4.37.2 to ensure the model works normally.
### Model Loading
#### 16-bit (bf16 / fp16)
```python
import torch
from transformers import AutoTokenizer, AutoModel
path = "OpenGVLab/InternVL2_5-8B"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True).eval().cuda()
```
#### BNB 8-bit Quantization
```python
import torch
from transformers import AutoTokenizer, AutoModel
path = "OpenGVLab/InternVL2_5-8B"
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
load_in_8bit=True,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True).eval()
```
#### Multiple GPUs
The reason for writing the code this way is to avoid errors that occur during multi-GPU inference due to tensors not being on the same device. By ensuring that the first and last layers of the large language model (LLM) are on the same device, we prevent such errors.
```python
import math
import torch
from transformers import AutoTokenizer, AutoModel
def split_model(model_name):
device_map = {}
world_size = torch.cuda.device_count()
num_layers = {
'InternVL2_5-1B': 24, 'InternVL2_5-2B': 24, 'InternVL2_5-4B': 36, 'InternVL2_5-8B': 32,
'InternVL2_5-26B': 48, 'InternVL2_5-38B': 64, 'InternVL2_5-78B': 80}[model_name]
# Since the first GPU will be used for ViT, treat it as half a GPU.
num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
num_layers_per_gpu = [num_layers_per_gpu] * world_size
num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
layer_cnt = 0
for i, num_layer in enumerate(num_layers_per_gpu):
for j in range(num_layer):
device_map[f'language_model.model.layers.{layer_cnt}'] = i
layer_cnt += 1
device_map['vision_model'] = 0
device_map['mlp1'] = 0
device_map['language_model.model.tok_embeddings'] = 0
device_map['language_model.model.embed_tokens'] = 0
device_map['language_model.output'] = 0
device_map['language_model.model.norm'] = 0
device_map['language_model.lm_head'] = 0
device_map[f'language_model.model.layers.{num_layers - 1}'] = 0
return device_map
path = "OpenGVLab/InternVL2_5-8B"
device_map = split_model('InternVL2_5-8B')
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True,
device_map=device_map).eval()
```
### Inference with Transformers
```python
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
T.ToTensor(),
T.Normalize(mean=MEAN, std=STD)
])
return transform
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
best_ratio_diff = float('inf')
best_ratio = (1, 1)
area = width * height
for ratio in target_ratios:
target_aspect_ratio = ratio[0] / ratio[1]
ratio_diff = abs(aspect_ratio - target_aspect_ratio)
if ratio_diff < best_ratio_diff:
best_ratio_diff = ratio_diff
best_ratio = ratio
elif ratio_diff == best_ratio_diff:
if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
best_ratio = ratio
return best_ratio
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
orig_width, orig_height = image.size
aspect_ratio = orig_width / orig_height
# calculate the existing image aspect ratio
target_ratios = set(
(i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
i * j <= max_num and i * j >= min_num)
target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
# find the closest aspect ratio to the target
target_aspect_ratio = find_closest_aspect_ratio(
aspect_ratio, target_ratios, orig_width, orig_height, image_size)
# calculate the target width and height
target_width = image_size * target_aspect_ratio[0]
target_height = image_size * target_aspect_ratio[1]
blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
# resize the image
resized_img = image.resize((target_width, target_height))
processed_images = []
for i in range(blocks):
box = (
(i % (target_width // image_size)) * image_size,
(i // (target_width // image_size)) * image_size,
((i % (target_width // image_size)) + 1) * image_size,
((i // (target_width // image_size)) + 1) * image_size
)
# split the image
split_img = resized_img.crop(box)
processed_images.append(split_img)
assert len(processed_images) == blocks
if use_thumbnail and len(processed_images) != 1:
thumbnail_img = image.resize((image_size, image_size))
processed_images.append(thumbnail_img)
return processed_images
def load_image(image_file, input_size=448, max_num=12):
image = Image.open(image_file).convert('RGB')
transform = build_transform(input_size=input_size)
images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
pixel_values = [transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return pixel_values
# If you want to load a model using multiple GPUs, please refer to the `Multiple GPUs` section.
path = 'OpenGVLab/InternVL2_5-8B'
model = AutoModel.from_pretrained(
path,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
use_flash_attn=True,
trust_remote_code=True).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)
# set the max number of tiles in `max_num`
pixel_values = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
generation_config = dict(max_new_tokens=1024, do_sample=True)
# pure-text conversation (纯文本对话)
question = 'Hello, who are you?'
response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'Can you tell me a story?'
response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# single-image single-round conversation (单图单轮对话)
question = '<image>\nPlease describe the image shortly.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')
# single-image multi-round conversation (单图多轮对话)
question = '<image>\nPlease describe the image in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'Please write a poem according to the image.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# multi-image multi-round conversation, combined images (多图多轮对话,拼接图像)
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
question = '<image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# multi-image multi-round conversation, separate images (多图多轮对话,独立图像)
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
num_patches_list=num_patches_list,
history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
num_patches_list=num_patches_list,
history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
# batch inference, single image per sample (单图批处理)
pixel_values1 = load_image('./examples/image1.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./examples/image2.jpg', max_num=12).to(torch.bfloat16).cuda()
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
responses = model.batch_chat(tokenizer, pixel_values,
num_patches_list=num_patches_list,
questions=questions,
generation_config=generation_config)
for question, response in zip(questions, responses):
print(f'User: {question}\nAssistant: {response}')
# video multi-round conversation (视频多轮对话)
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
if bound:
start, end = bound[0], bound[1]
else:
start, end = -100000, 100000
start_idx = max(first_idx, round(start * fps))
end_idx = min(round(end * fps), max_frame)
seg_size = float(end_idx - start_idx) / num_segments
frame_indices = np.array([
int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
for idx in range(num_segments)
])
return frame_indices
def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
max_frame = len(vr) - 1
fps = float(vr.get_avg_fps())
pixel_values_list, num_patches_list = [], []
transform = build_transform(input_size=input_size)
frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
for frame_index in frame_indices:
img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
pixel_values = [transform(tile) for tile in img]
pixel_values = torch.stack(pixel_values)
num_patches_list.append(pixel_values.shape[0])
pixel_values_list.append(pixel_values)
pixel_values = torch.cat(pixel_values_list)
return pixel_values, num_patches_list
video_path = './examples/red-panda.mp4'
pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
pixel_values = pixel_values.to(torch.bfloat16).cuda()
video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
question = video_prefix + 'What is the red panda doing?'
# Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
num_patches_list=num_patches_list, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')
question = 'Describe this video in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
num_patches_list=num_patches_list, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
```
#### Streaming Output
Alternatively, you can use the following code to stream the output.
```python
from transformers import TextIteratorStreamer
from threading import Thread
# Initialize the streamer
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
# Define the generation configuration
generation_config = dict(max_new_tokens=1024, do_sample=False, streamer=streamer)
# Start the model chat in a separate thread
thread = Thread(target=model.chat, kwargs=dict(
tokenizer=tokenizer, pixel_values=pixel_values, question=question,
history=None, return_history=False, generation_config=generation_config,
))
thread.start()
# Initialize an empty string to store the generated text
generated_text = ''
# Loop through the streamer to get the new text as it is generated
for new_text in streamer:
if new_text == model.conv_template.sep:
break
generated_text += new_text
print(new_text, end='', flush=True) # Print each new chunk of generated text on the same line
```
## Finetune
Many repositories now support fine-tuning of the InternVL series models, including [InternVL](https://github.com/OpenGVLab/InternVL), [SWIFT](https://github.com/modelscope/ms-swift), [XTuner](https://github.com/InternLM/xtuner), and others. Please refer to their documentation for more details on fine-tuning.
## Deployment
### LMDeploy
LMDeploy is a toolkit for compressing, deploying, and serving LLMs & VLMs.
```sh
pip install "lmdeploy>=0.6.4" --no-deps
```
LMDeploy abstracts the complex inference process of multi-modal Vision-Language Models (VLM) into an easy-to-use pipeline, similar to the Large Language Model (LLM) inference pipeline.
#### A 'Hello, world' Example
```python
from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image
model = 'OpenGVLab/InternVL2_5-8B'
image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))
response = pipe(('describe this image', image))
print(response.text)
```
If an `ImportError` occurs while running this example, install the missing dependencies as prompted.
#### Multi-images Inference
When dealing with multiple images, you can put them all in one list. Keep in mind that multiple images will lead to a higher number of input tokens, and as a result, the size of the context window typically needs to be increased.
```python
from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image
from lmdeploy.vl.constants import IMAGE_TOKEN
model = 'OpenGVLab/InternVL2_5-8B'
pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))
image_urls=[
'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg',
'https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/det.jpg'
]
images = [load_image(img_url) for img_url in image_urls]
# Numbering images improves multi-image conversations
response = pipe((f'Image-1: {IMAGE_TOKEN}\nImage-2: {IMAGE_TOKEN}\ndescribe these two images', images))
print(response.text)
```
#### Batch Prompts Inference
Batch-prompt inference is straightforward: just place the prompts in a list:
```python
from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image
model = 'OpenGVLab/InternVL2_5-8B'
pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))
image_urls=[
"https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg",
"https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/det.jpg"
]
prompts = [('describe this image', load_image(img_url)) for img_url in image_urls]
response = pipe(prompts)
print(response)
```
#### Multi-turn Conversation
There are two ways to run multi-turn conversations with the pipeline. One is to construct messages in the OpenAI format and use the method introduced above; the other is to use the `pipeline.chat` interface.
```python
from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig
from lmdeploy.vl import load_image
model = 'OpenGVLab/InternVL2_5-8B'
pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))
image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/demo/resources/human-pose.jpg')
gen_config = GenerationConfig(top_k=40, top_p=0.8, temperature=0.8)
sess = pipe.chat(('describe this image', image), gen_config=gen_config)
print(sess.response.text)
sess = pipe.chat('What is the woman doing?', session=sess, gen_config=gen_config)
print(sess.response.text)
```
#### Service
LMDeploy's `api_server` enables models to be easily packed into services with a single command. The provided RESTful APIs are compatible with OpenAI's interfaces. Below is an example of starting the service:
```shell
lmdeploy serve api_server OpenGVLab/InternVL2_5-8B --server-port 23333
```
To use the OpenAI-style interface, install the OpenAI Python package:
```shell
pip install openai
```
Then, use the code below to make the API call:
```python
from openai import OpenAI
client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
model_name = client.models.list().data[0].id
response = client.chat.completions.create(
model=model_name,
messages=[{
'role':
'user',
'content': [{
'type': 'text',
'text': 'describe this image',
}, {
'type': 'image_url',
'image_url': {
'url':
'https://modelscope.oss-cn-beijing.aliyuncs.com/resource/tiger.jpeg',
},
}],
}],
temperature=0.8,
top_p=0.8)
print(response)
```
# README for Evaluation
Here we list the codebases used to obtain the evaluation results in the InternVL 2.5 technical report.
## Multimodal Reasoning and Mathematics
| Benchmark Name | Codebase |
| -------------- | -------------------------------------------------------- |
| MMMU | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MMMU-Pro | [This Codebase](./mmmu_pro) |
| MathVista | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MATH-Vision | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MathVerse | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| OlympiadBench | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
## OCR, Chart, and Document Understanding
| Benchmark Name | Codebase |
| ----------------- | -------------------------------------------------------- |
| AI2D with mask | [This Codebase](./vqa) |
| AI2D without mask | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| ChartQA | [This Codebase](./vqa) |
| DocVQA | [This Codebase](./vqa) |
| InfoVQA | [This Codebase](./vqa) |
| OCRBench | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| SEED-2-Plus | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| CharXiv | [CharXiv](https://github.com/princeton-nlp/CharXiv) |
| VCR | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
## Multi-Image Understanding
| Benchmark Name | Codebase |
| -------------- | -------------------------------------------------------- |
| BLINK | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| Mantis Eval | [This Codebase](./mantis_eval) |
| MMIU | [This Codebase](./mmiu) |
| MuirBench | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MMT-Bench | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MIRB | [This Codebase](./mirb) |
## Real-World Comprehension
| Benchmark Name | Codebase |
| -------------- | -------------------------------------------------------- |
| RealWorldQA | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MME-RealWorld | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| WildVision | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| R-Bench | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
## Comprehensive Multimodal Evaluation
| Benchmark Name | Codebase |
| -------------- | -------------------------------------------------------- |
| MME | [This Codebase](./mme) |
| MMBench | [This Codebase](./mmbench) |
| MMBench v1.1 | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MMVet | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MMVet v2 | [This Codebase](./mmvetv2) |
| MMStar | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
## Multimodal Hallucination Evaluation
| Benchmark Name | Codebase |
| -------------- | -------------------------------------------------------- |
| HallBench | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MMHal-Bench | [This Codebase](./mmhal) |
| CRPE | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| POPE | [This Codebase](./pope) |
## Visual Grounding
| Benchmark Name | Codebase |
| -------------- | -------------------------- |
| RefCOCO | [This Codebase](./refcoco) |
| RefCOCO+ | [This Codebase](./refcoco) |
| RefCOCOg | [This Codebase](./refcoco) |
## Multimodal Multilingual Understanding
| Benchmark Name | Codebase |
| -------------------- | -------------------------------------------------------- |
| MMMB | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| Multilingual MMBench | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MTVQA | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
## Video Understanding
| Benchmark Name | Codebase |
| -------------- | -------------------------------------------------------- |
| Video-MME | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MVBench | [This Codebase](./mvbench) |
| MMBench-Video | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| MLVU | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| LongVideoBench | [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) |
| CG-Bench | provided by authors |
# README for Evaluation
## 🌟 Overview
This script provides an evaluation pipeline for image captioning across three datasets: `COCO`, `Flickr30k`, and `NoCaps`.
## 🗂️ Data Preparation
Before downloading the data, please create the `InternVL/internvl_chat/data` folder.
### COCO Karpathy Test
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/coco && cd data/coco
# Step 2: Download and unzip image files
wget http://images.cocodataset.org/zips/train2014.zip && unzip train2014.zip
wget http://images.cocodataset.org/zips/val2014.zip && unzip val2014.zip
wget http://images.cocodataset.org/zips/test2015.zip && unzip test2015.zip
# Step 3: Download and place the annotation files
mkdir -p annotations && cd annotations/
wget https://github.com/OpenGVLab/InternVL/releases/download/data/coco_karpathy_test.json
wget https://github.com/OpenGVLab/InternVL/releases/download/data/coco_karpathy_test_gt.json
cd ../../..
```
After preparation is complete, the directory structure is:
```shell
data/coco
├── annotations
│ ├── coco_karpathy_test.json
│ └── coco_karpathy_test_gt.json
├── train2014
├── val2014
└── test2015
```
### Flickr30K Karpathy Test
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/flickr30k && cd data/flickr30k
# Step 2: Download and unzip image files
# Download images from https://bryanplummer.com/Flickr30kEntities/
# Step 3: Download and place the annotation files
# Karpathy split annotations can be downloaded from the following link:
wget https://github.com/mehdidc/retrieval_annotations/releases/download/1.0.0/flickr30k_test_karpathy.txt
# This file is provided by the clip-benchmark repository.
# We converted this txt file to JSON format; download the converted file:
wget https://github.com/OpenGVLab/InternVL/releases/download/data/flickr30k_test_karpathy.json
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/flickr30k
├── Images
├── flickr30k_test_karpathy.txt
└── flickr30k_test_karpathy.json
```
### NoCaps Val
Follow the instructions below to prepare the data:
```shell
# Step 1: Create the data directory
mkdir -p data/nocaps && cd data/nocaps
# Step 2: Download and unzip image files
# Download images from https://nocaps.org/download
# Step 3: Download and place the annotation files
# Original annotations can be downloaded from https://nocaps.s3.amazonaws.com/nocaps_val_4500_captions.json
wget https://nocaps.s3.amazonaws.com/nocaps_val_4500_captions.json
cd ../..
```
After preparation is complete, the directory structure is:
```shell
data/nocaps
├── images
└── nocaps_val_4500_captions.json
```
## 🏃 Evaluation Execution
> ⚠️ Note: For testing InternVL (1.5, 2.0, 2.5, and later versions), always enable `--dynamic` to perform dynamic resolution testing.
To run the evaluation, execute the following command on an 8-GPU setup:
```shell
torchrun --nproc_per_node=8 eval/caption/evaluate_caption.py --checkpoint ${CHECKPOINT} --datasets ${DATASETS} --dynamic
```
Alternatively, you can run the following simplified command:
```shell
# Test COCO, Flickr30K, and NoCaps
GPUS=8 sh evaluate.sh ${CHECKPOINT} caption --dynamic
# Test COCO only
GPUS=8 sh evaluate.sh ${CHECKPOINT} caption-coco --dynamic
# Test Flickr30K only
GPUS=8 sh evaluate.sh ${CHECKPOINT} caption-flickr30k --dynamic
# Test NoCaps only
GPUS=8 sh evaluate.sh ${CHECKPOINT} caption-nocaps --dynamic
```
### Arguments
The following arguments can be configured for the evaluation script:
| Argument | Type | Default | Description |
| ---------------- | ------ | ------------------------- | ----------------------------------------------------------------------------------------------------------------- |
| `--checkpoint` | `str` | `''` | Path to the model checkpoint. |
| `--datasets` | `str` | `'coco,flickr30k,nocaps'` | Comma-separated list of datasets to evaluate. |
| `--dynamic` | `flag` | `False` | Enables dynamic high resolution preprocessing. |
| `--max-num` | `int` | `6` | Maximum tile number for dynamic high resolution. |
| `--load-in-8bit` | `flag` | `False` | Loads the model weights in 8-bit precision. |
| `--auto` | `flag` | `False` | Automatically splits a large model across 8 GPUs when needed, useful for models too large to fit on a single GPU. |
import argparse
import itertools
import json
import os
import random
import time
from functools import partial
import torch
from internvl.model import load_model_and_tokenizer
from internvl.train.dataset import build_transform, dynamic_preprocess
from PIL import Image
from pycocoevalcap.eval import COCOEvalCap
from pycocotools.coco import COCO
from tqdm import tqdm
ds_collections = {
'flickr30k': {
'root': 'data/flickr30k/',
'annotation': 'data/flickr30k/flickr30k_test_karpathy.json',
'max_new_tokens': 30,
'min_new_tokens': 8,
},
'coco': {
'root': 'data/coco/',
'annotation': ['data/coco/annotations/coco_karpathy_test.json',
'data/coco/annotations/coco_karpathy_test_gt.json'],
'max_new_tokens': 30,
'min_new_tokens': 8,
},
'nocaps': {
'root': 'data/nocaps/images',
'annotation': 'data/nocaps/nocaps_val_4500_captions.json',
'max_new_tokens': 30,
'min_new_tokens': 8,
},
}
class CaptionDataset(torch.utils.data.Dataset):
def __init__(self, name, root, annotation, prompt, input_size=224, dynamic_image_size=False,
use_thumbnail=False, max_num=6):
if name == 'coco':
self.images = json.load(open(annotation))
else:
self.images = json.load(open(annotation))['images']
self.name = name
self.prompt = prompt
self.root = root
self.input_size = input_size
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.max_num = max_num
self.transform = build_transform(is_train=False, input_size=input_size)
def __len__(self):
return len(self.images)
def __getitem__(self, idx):
if self.name == 'coco':
filename = self.images[idx]['image']
image_id = int(filename.split('_')[-1].replace('.jpg', ''))
image_path = os.path.join(self.root, filename)
else:
image_id = self.images[idx]['id']
if 'file_name' in self.images[idx]:
image_path = os.path.join(self.root, self.images[idx]['file_name'])
else:
image_path = os.path.join(self.root, self.images[idx]['image'])
image = Image.open(image_path)
if self.dynamic_image_size:
images = dynamic_preprocess(image, image_size=self.input_size,
use_thumbnail=self.use_thumbnail,
max_num=self.max_num)
else:
images = [image]
pixel_values = [self.transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return {
'image_id': image_id,
'input_text': self.prompt,
'pixel_values': pixel_values
}
def collate_fn(inputs, tokenizer):
pixel_values = torch.cat([_['pixel_values'] for _ in inputs], dim=0)
image_ids = [_['image_id'] for _ in inputs]
input_texts = [_['input_text'] for _ in inputs]
input_tokens = tokenizer(input_texts, return_tensors='pt')
return pixel_values, image_ids, input_tokens.input_ids, input_tokens.attention_mask
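# InferenceSampler: deterministically shards the dataset indices across distributed
# ranks so that each process evaluates a contiguous, disjoint slice of the data.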
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
def evaluate_chat_model():
prompt = 'Provide a one-sentence caption for the provided image.'
print('prompt:', prompt)
random.seed(args.seed)
summaries = []
for ds_name in args.datasets:
annotation = ds_collections[ds_name]['annotation']
if type(annotation) == list:
annotation = annotation[0]
dataset = CaptionDataset(
name=ds_name,
root=ds_collections[ds_name]['root'],
annotation=annotation,
prompt=prompt,
input_size=image_size,
dynamic_image_size=args.dynamic,
use_thumbnail=use_thumbnail,
max_num=args.max_num
)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
image_ids, captions = [], []
for _, (pixel_values, ids, _, _) in tqdm(enumerate(dataloader)):
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
num_beams=args.num_beams,
max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
pred = model.chat(
tokenizer=tokenizer,
pixel_values=pixel_values,
question=prompt,
generation_config=generation_config,
verbose=True
)
image_ids.extend(ids)
captions.extend([pred])
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_ids = [None for _ in range(world_size)]
merged_captions = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_ids, image_ids)
torch.distributed.all_gather_object(merged_captions, captions)
merged_ids = [_ for _ in itertools.chain.from_iterable(merged_ids)]
merged_captions = [_ for _ in itertools.chain.from_iterable(merged_captions)]
average_length = sum(len(x.split()) for x in merged_captions) / len(merged_captions)
print(f'Average caption length: {average_length}')
if torch.distributed.get_rank() == 0:
print(f'Evaluating {ds_name} ...')
results = []
for image_id, caption in zip(merged_ids, merged_captions):
results.append({
'image_id': int(image_id),
'caption': caption,
})
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{ds_name}_{time_prefix}.json'
results_file = os.path.join(args.out_dir, results_file)
json.dump(results, open(results_file, 'w'))
annotation = ds_collections[ds_name]['annotation']
if type(annotation) == list:
annotation = annotation[-1]
coco = COCO(annotation)
coco_result = coco.loadRes(results_file)
coco_eval = COCOEvalCap(coco, coco_result)
coco_eval.evaluate()
summary = coco_eval.eval.items()
print(summary)
summaries.append([args.checkpoint, ds_name, average_length, summary])
torch.distributed.barrier()
out_path = '_'.join(args.checkpoint.split('/')[-2:])
writer = open(os.path.join(args.out_dir, f'{out_path}.txt'), 'a')
print(f"write results to file {os.path.join(args.out_dir, f'{out_path}.txt')}")
for summary in summaries:
print(summary)
writer.write(f'{summary}\n')
writer.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--datasets', type=str, default='coco,flickr30k,nocaps')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=1)
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--out-dir', type=str, default='results')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--dynamic', action='store_true')
parser.add_argument('--max-num', type=int, default=6)
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--load-in-4bit', action='store_true')
parser.add_argument('--auto', action='store_true')
args = parser.parse_args()
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir, exist_ok=True)
args.datasets = args.datasets.split(',')
print('datasets:', args.datasets)
assert args.batch_size == 1, 'Only batch size 1 is supported'
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model, tokenizer = load_model_and_tokenizer(args)
image_size = model.config.force_image_size or model.config.vision_config.image_size
use_thumbnail = model.config.use_thumbnail
total_params = sum(p.numel() for p in model.parameters()) / 1e9
if total_params > 20 or args.dynamic:
args.num_beams = 1
print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}')
else:
print(f'[test] total_params: {total_params}B')
print(f'[test] image_size: {image_size}')
print(f'[test] template: {model.config.template}')
print(f'[test] dynamic_image_size: {args.dynamic}')
print(f'[test] use_thumbnail: {use_thumbnail}')
print(f'[test] max_num: {args.max_num}')
evaluate_chat_model()
import argparse
import itertools
import json
import os
import random
import re
import time
from functools import partial
import torch
from internvl.model import load_model_and_tokenizer
from internvl.train.dataset import build_transform, dynamic_preprocess
from PIL import Image
from tqdm import tqdm
ds_collections = {
'DriveLM_val': {
'root': 'InternVL-Domain-Adaptation-Data/val/drivelm_val.jsonl',
'max_new_tokens': 200,
'min_new_tokens': 1,
'split': 'validation',
'image_root': 'InternVL-Domain-Adaptation-Data/images/drivelm/stitch',
}
}
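# post_process: for each predicted object tag of the form <cX,CAM_NAME,[cx,cy]>, map the
# per-mille coordinates on the 3x2 stitched surround-view image back to pixel coordinates
# in the original 1600x900 camera frame, clamping to the image bounds.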
def post_process(pred):
pred = pred.strip()
pattern = r'<c[^,]*,\s*[^,]*,\s*\[\s*-?[0-9]*\.?[0-9]+\s*,\s*-?[0-9]*\.?[0-9]+\s*\]\s*>'
mapping = {'CAM_FRONT_LEFT': [0, 0], 'CAM_FRONT': [1, 0], 'CAM_FRONT_RIGHT': [2, 0], 'CAM_BACK_LEFT': [0, 1],
'CAM_BACK': [1, 1], 'CAM_BACK_RIGHT': [2, 1]}
patch_size = 448
width = patch_size * 2
height = patch_size
whole_img_width = width * 3
whole_img_height = height * 2
matches = re.findall(pattern, pred)
for object_id in matches:
object_id_c = object_id.replace('<', '').replace('>', '')
try:
ctag = object_id_c.split(',')[0]
cxcy = json.loads(','.join(object_id_c.split(',')[2:]))
cam = object_id_c.split(',')[1]
if cam in mapping:
mx, my = mapping[cam]
# old_wide,old_height = images_size[cam]
old_wide, old_height = 1600, 900
cx, cy = cxcy
cx = (cx / 1000) * whole_img_width
cy = (cy / 1000) * whole_img_height
cx -= mx * width
cy -= my * height
cx = cx / width * old_wide
cy = cy / height * old_height
# cx =max(0,min(old_wide,cx))
# cy =max(0,min(old_height,cy))
cx = round(max(0, min(old_wide, cx)), 1)
cy = round(max(0, min(old_height, cy)), 1)
new_object_id = f'<{ctag},{cam},{cx},{cy}>'
pred = pred.replace(object_id, new_object_id)
except Exception as e:
print(e)
return pred
def collate_fn(batches, tokenizer):
pixel_values = torch.cat([_['pixel_values'] for _ in batches], dim=0)
questions = [_['question'] for _ in batches]
questions_old = [_['question_old'] for _ in batches]
answers = [_['answer'] for _ in batches]
data_ids = [_['data_id'] for _ in batches]
return pixel_values, questions_old, questions, answers, data_ids
class DriveLMDataset(torch.utils.data.Dataset):
def __init__(self, root, split, prompt, image_path, input_size=224, dynamic_image_size=False,
use_thumbnail=False, max_num=6, ):
with open(root, 'r') as f:
self.data = [json.loads(line) for line in f.readlines()]
# data_val = json.load(f)
# merge all dataset
# self.data = concatenate_datasets(sub_dataset_list)
self.prompt = prompt
self.input_size = input_size
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.max_num = max_num
self.transform = build_transform(is_train=False, input_size=input_size)
self.image_path = image_path
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
data = self.data[idx]
data_id = data['id']
question = data['conversations'][0]['value'].strip()
question_old = data['question_old']
image_file = os.path.join(self.image_path, data['image'])
image = Image.open(image_file).convert('RGB')
# question_type = data['question_type']
# choices = eval(data['options'])
answer = data['conversations'][1]['value'].strip()
if self.dynamic_image_size:
pil_image = dynamic_preprocess(image, image_size=self.input_size,
use_thumbnail=self.use_thumbnail,
max_num=self.max_num)
images = pil_image
else:
images = [image]
pixel_values = [self.transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return {
'question_old': question_old,
'question': question,
'pixel_values': pixel_values,
'answer': answer,
'data_id': data_id
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
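        # Split [0, total_size) into contiguous, near-equal shards; the first (total_size % world_size) ranks get one extra sample.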
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
def evaluate_chat_model():
random.seed(args.seed)
prompt = None
for ds_name in args.datasets:
dataset = DriveLMDataset(
root=ds_collections[ds_name]['root'],
split=ds_collections[ds_name]['split'],
prompt=prompt,
image_path=ds_collections[ds_name]['image_root'],
# image_meta = ds_collections[ds_name]["image_meta"],
input_size=image_size,
dynamic_image_size=args.dynamic,
use_thumbnail=use_thumbnail,
max_num=args.max_num
)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
outputs = []
for _, (pixel_values, questions_old, questions, answers, data_ids) in tqdm(enumerate(dataloader)):
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
num_beams=args.num_beams,
max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
pred = model.chat(
tokenizer=tokenizer,
pixel_values=pixel_values,
question=questions[0],
generation_config=generation_config
)
preds = [post_process(pred)]
for question, pred, answer, data_id, question_old in zip(questions, preds, answers, data_ids,
questions_old):
outputs.append({
'question': question_old,
'answer': pred,
'gt_answers': answer,
'id': data_id
})
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_outputs = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
merged_outputs = [json.loads(_) for _ in merged_outputs]
merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
if torch.distributed.get_rank() == 0:
print(f'Evaluating {ds_name} ...')
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{ds_name}_{time_prefix}.json'
output_path = os.path.join(args.out_dir, results_file)
with open(output_path, 'w') as f:
json.dump(merged_outputs, f, indent=4)
print('Results saved to {}'.format(output_path))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--datasets', type=str, default='DriveLM_val')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=1)
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--out-dir', type=str, default='results')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--dynamic', action='store_true')
parser.add_argument('--max-num', type=int, default=12)
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--load-in-4bit', action='store_true')
parser.add_argument('--auto', action='store_true')
args = parser.parse_args()
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir, exist_ok=True)
args.datasets = args.datasets.split(',')
print('datasets:', args.datasets)
assert args.batch_size == 1, 'Only batch size 1 is supported'
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model, tokenizer = load_model_and_tokenizer(args)
image_size = model.config.force_image_size or model.config.vision_config.image_size
use_thumbnail = model.config.use_thumbnail
total_params = sum(p.numel() for p in model.parameters()) / 1e9
if total_params > 20 or args.dynamic:
args.num_beams = 1
print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}')
else:
print(f'[test] total_params: {total_params}B')
print(f'[test] image_size: {image_size}')
print(f'[test] template: {model.config.template}')
print(f'[test] dynamic_image_size: {args.dynamic}')
print(f'[test] use_thumbnail: {use_thumbnail}')
print(f'[test] max_num: {args.max_num}')
evaluate_chat_model()
import argparse
import itertools
import json
import os
import random
import re
import time
from functools import partial
from typing import Literal
import torch
from internvl.model import load_model_and_tokenizer
from internvl.train.dataset import build_transform, dynamic_preprocess
from PIL import Image
from tqdm import tqdm
ds_collections = {
'MME_RealWorld': {
        'root': 'InternVL-Domain-Adaptation-Data/val/MME_RealWorld.json',
'max_new_tokens': 100,
'min_new_tokens': 1,
        'img_root': 'InternVL-Domain-Adaptation-Data/images/MME-RealWorld/data',
'type': 'dev',
'language': 'en'
}
}
def collate_fn(batches, tokenizer):
pixel_values = torch.cat([_['pixel_values'] for _ in batches], dim=0)
questions = [_['question'] for _ in batches]
answers = [_['answer'] for _ in batches]
indexes = [_['index'] for _ in batches]
choices = [_['choice'] for _ in batches]
categorys = [_['category'] for _ in batches]
tasks = [_['task'] for _ in batches]
return pixel_values, questions, answers, indexes, choices, categorys, tasks
class MMERealworldDataset(torch.utils.data.Dataset):
def __init__(self, root, prompt, language, subtask: Literal[
'Monitoring', 'OCR with Complex Context', 'Diagram and Table', 'Autonomous_Driving', 'Remote Sensing'],
img_root, input_size=224, dynamic_image_size=False,
use_thumbnail=False, max_num=6):
with open(root, 'r') as f:
self.data_meta = json.load(f)
self.subtask = subtask
self.data_meta = [item for item in self.data_meta if item['Subtask'] == self.subtask]
self.img_root = img_root
self.prompt = prompt
self.language = language
self.input_size = input_size
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.max_num = max_num
self.transform = build_transform(is_train=False, input_size=input_size)
def __len__(self):
return len(self.data_meta)
def __getitem__(self, idx):
index = self.data_meta[idx]['Question_id']
assert self.data_meta[idx]['Question Type'] == 'Multiple Choice'
image = os.path.join(self.img_root, self.data_meta[idx]['Image'])
question = self.data_meta[idx]['Text']
choices = self.data_meta[idx]['Answer choices']
answer = self.data_meta[idx]['Ground truth']
category = self.data_meta[idx]['Category']
task = self.data_meta[idx]['Task']
# catetory = self.df.iloc[idx]['category']
# l2_catetory = self.df.iloc[idx]['l2-category']
image = Image.open(image).convert('RGB')
if self.dynamic_image_size:
images = dynamic_preprocess(image, image_size=self.input_size,
use_thumbnail=self.use_thumbnail,
max_num=self.max_num)
else:
images = [image]
pixel_values = [self.transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
        if self.language == 'cn':
            question = question + '选项如下所示:\n' + '\n'.join(choices) + '\n' + self.prompt['cn']
        else:
            question = question + 'The choices are listed below:\n' + '\n'.join(choices) + '\n' + self.prompt['en']
return {
'question': question,
'pixel_values': pixel_values,
'answer': answer,
'index': index,
'choice': choices,
'category': category,
'task': task
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
def post_process(s, choices):
s = s.strip()
answer_prefixes = [
'The best answer is',
'The correct answer is',
'The answer is',
'The answer',
        'The best option is',
        'The correct option is',
'Best answer:',
'Best option:',
]
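    # Strip common answer prefixes, then take the first A-E letter; if none is found,
    # try to match the remaining text against the choice strings.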
for answer_prefix in answer_prefixes:
s = s.replace(answer_prefix, '')
if len(s.split()) > 10 and not re.search('[ABCDE]', s):
return ''
matches = re.search(r'[ABCDE]', s)
if matches is None:
for choice in choices:
if s.lower() in choice.lower():
return choice[1]
return ''
return matches[0]
def evaluate(outputs):
results = {'Reasoning': {},
'Perception': {}}
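    # Tally correct/incorrect counts per (task, category); accuracy is reported per category, per task, and overall.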
for data_item in outputs:
cnt = data_item['answer'] == data_item['gt_answers']
category = data_item['category']
task = data_item['task']
if category not in results[task]:
results[task][category] = {'true': cnt, 'false': 1 - cnt}
else:
results[task][category]['true'] += cnt
results[task][category]['false'] += 1 - cnt
cnt_subtask, sum_subtask = 0, 0
for task, tasks_values in results.items():
cnt_task, sum_task = 0, 0
for category, category_dict in tasks_values.items():
cnt_task += category_dict['true']
sum_task += category_dict['false'] + category_dict['true']
acc = category_dict['true'] / (category_dict['false'] + category_dict['true'])
print(f'-' * 4 + f'\t' + 'Acc ' + '{:.4f}'.format(acc) + f'\t{category.capitalize()}')
cnt_subtask += cnt_task
sum_subtask += sum_task
if sum_task == 0:
acc_task = 0
else:
acc_task = cnt_task / sum_task
        print('*' * 32 + ' Acc ' + '{:.4f}'.format(acc_task) + f'\t{task}')
if sum_subtask == 0:
acc_subtasks = 0
else:
acc_subtasks = cnt_subtask / sum_subtask
print(f'+' * 16 + f'\t Acc ' + '{:.4f}'.format(acc_subtasks))
return acc_subtasks
def evaluate_chat_model():
random.seed(args.seed)
for ds_name in args.datasets:
dataset = MMERealworldDataset(
root=ds_collections[ds_name]['root'],
prompt=prompt,
language=ds_collections[ds_name]['language'],
subtask=args.subtask,
img_root=ds_collections[ds_name]['img_root'],
input_size=image_size,
dynamic_image_size=args.dynamic,
use_thumbnail=use_thumbnail,
max_num=args.max_num
)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
outputs = []
for pixel_values, questions, answers, indexes, options, categorys, tasks in tqdm(dataloader):
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
num_beams=args.num_beams,
max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
out = model.chat(
tokenizer=tokenizer,
pixel_values=pixel_values,
question=questions[0],
generation_config=generation_config
)
outs = [out]
preds = [post_process(out, options[0])]
for question, pred, answer, index, out, category, task in zip(questions, preds, answers, indexes, outs,
categorys, tasks):
outputs.append({
'question': question,
'output': out,
'answer': pred,
'gt_answers': answer,
'index': index,
'category': category,
'task': task
})
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_outputs = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
merged_outputs = [json.loads(_) for _ in merged_outputs]
merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
if torch.distributed.get_rank() == 0:
print(f'Evaluating {ds_name} ...')
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{ds_name}_{args.subtask}_{time_prefix}.json'
output_path = os.path.join(args.out_dir, results_file)
with open(output_path, 'w') as f:
json.dump(merged_outputs, f, indent=4)
evaluate(merged_outputs)
print('Results saved to {}'.format(output_path))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--datasets', type=str, default='MME_RealWorld')
parser.add_argument('--subtask', type=str, default='Autonomous_Driving')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=1)
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--out-dir', type=str, default='results')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--dynamic', action='store_true')
parser.add_argument('--max-num', type=int, default=6)
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--load-in-4bit', action='store_true')
parser.add_argument('--auto', action='store_true')
args = parser.parse_args()
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir, exist_ok=True)
args.datasets = args.datasets.split(',')
print('datasets:', args.datasets)
assert args.batch_size == 1, 'Only batch size 1 is supported'
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model, tokenizer = load_model_and_tokenizer(args)
image_size = model.config.force_image_size or model.config.vision_config.image_size
use_thumbnail = model.config.use_thumbnail
total_params = sum(p.numel() for p in model.parameters()) / 1e9
if total_params > 20 or args.dynamic:
args.num_beams = 1
print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}')
else:
print(f'[test] total_params: {total_params}B')
print(f'[test] image_size: {image_size}')
print(f'[test] template: {model.config.template}')
print(f'[test] dynamic_image_size: {args.dynamic}')
print(f'[test] use_thumbnail: {use_thumbnail}')
print(f'[test] max_num: {args.max_num}')
prompt = {
'en': 'Select the best answer to the above multiple-choice question based on the image. \
Respond with only the letter (A, B, C, D, or E) of the correct option. \nThe best answer is:',
'cn': '根据图像选择上述多项选择题的最佳答案。只需回答正确选项的字母(A, B, C, D 或 E)。\n 最佳答案为:',
}
evaluate_chat_model()
import argparse
import json
import re
import torch
from torchvision.ops.boxes import box_area
def calculate_iou(box1, box2):
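    # Axis-aligned IoU using the inclusive-pixel (+1) convention for box widths and heights.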
x1, y1, x2, y2 = box1
x3, y3, x4, y4 = box2
intersection_x1 = max(x1, x3)
intersection_y1 = max(y1, y3)
intersection_x2 = min(x2, x4)
intersection_y2 = min(y2, y4)
intersection_area = max(0, intersection_x2 - intersection_x1 + 1) * max(
0, intersection_y2 - intersection_y1 + 1
)
box1_area = (x2 - x1 + 1) * (y2 - y1 + 1)
box2_area = (x4 - x3 + 1) * (y4 - y3 + 1)
union_area = box1_area + box2_area - intersection_area
iou = intersection_area / union_area
return iou
def box_iou(boxes1, boxes2):
area1 = box_area(boxes1)
area2 = box_area(boxes2)
lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
wh = (rb - lt).clamp(min=0) # [N,M,2]
inter = wh[:, :, 0] * wh[:, :, 1] # [N,M]
union = area1[:, None] + area2 - inter
iou = inter / union
return iou, union
def transform_bbox(bbox, image_size):
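    # Map a box from the model's 0-1000 normalized coordinate space to pixel coordinates, clamped to the image bounds.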
x1, y1, x2, y2 = bbox
W, H = image_size
x1 = min(max(x1 / 1000 * W, 0), W)
x2 = min(max(x2 / 1000 * W, 0), W)
y1 = min(max(y1 / 1000 * H, 0), H)
y2 = min(max(y2 / 1000 * H, 0), H)
return [x1, y1, x2, y2]
def evaluation_metrics(outputs):
correct = 0
incorrect = 0
pattern = r'\[*\[.*?,.*?,.*?,.*?\]\]*'
# pattern = r'\[*\[(.*?),(.*?),(.*?),(.*?)\]\]*'
# print(outputs)
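    # A prediction counts as correct when the first extracted box has IoU > 0.5 with the ground-truth box.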
for output in outputs:
bbox = output['gt_answers']
image_size = output['image_size']
pred = output['answer']
        # find all bracketed box matches
matches = re.findall(pattern, pred)
if len(matches) > 1:
            print('more than one match found')
print(matches)
if len(matches) == 0:
incorrect = incorrect + 1
else:
try:
pred_bbox = json.loads(matches[0])
pred_bbox = transform_bbox(pred_bbox[0], image_size)
iou_score = calculate_iou(pred_bbox, bbox)
if iou_score > 0.5:
correct = correct + 1
else:
incorrect = incorrect + 1
except Exception as e:
print(e)
print(output)
incorrect = incorrect + 1
# else:
# continue
print('correct:', correct)
print('incorrect:', incorrect)
print('Total:', correct + incorrect)
print('Acc@0.5:', (correct / (correct + incorrect)))
return {
'correct:': correct,
'incorrect:': incorrect,
'Total:': correct + incorrect,
'Acc@0.5:': correct / (correct + incorrect)
}
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--output_file', type=str, default='')
args = parser.parse_args()
with open(args.output_file, 'r') as f:
data = json.load(f)
if 'outputs' in data:
data = data['outputs']
outputs = data
results = evaluation_metrics(outputs)
results_file = args.output_file
with open(results_file, 'w') as f:
json.dump({
'results': results,
'outputs': outputs
}, f, indent=4)
import argparse
import itertools
import json
import os
import random
import time
from functools import partial
import torch
from internvl.model import load_model_and_tokenizer
from internvl.train.dataset import build_transform, dynamic_preprocess
from PIL import Image
from tqdm import tqdm
ds_collections = {
'DIOR_RSVG': {
'root': 'InternVL-Domain-Adaptation-Data/val/dior_rsvg_test.json',
'max_new_tokens': 200,
'min_new_tokens': 1,
'type': 'test',
'image_root': 'InternVL-Domain-Adaptation-Data/images/'
},
}
def collate_fn(batches, tokenizer):
pixel_values = torch.cat([_['pixel_values'] for _ in batches], dim=0)
questions = [_['question'] for _ in batches]
answers = [_['answer'] for _ in batches]
image_sizes = [_['image_size'] for _ in batches]
return pixel_values, questions, answers, image_sizes
class GroundingDataset(torch.utils.data.Dataset):
def __init__(self, root, image_root, prompt='', input_size=224, dynamic_image_size=False,
use_thumbnail=False, max_num=6):
with open(root, 'r') as f:
self.ann_data = json.load(f)
self.image_root = image_root
self.input_size = input_size
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.max_num = max_num
self.transform = build_transform(is_train=False, input_size=input_size)
self.prompt = prompt
def __len__(self):
return len(self.ann_data)
def __getitem__(self, idx):
data_item = self.ann_data[idx]
# index = data_item["id"]
image = data_item['image']
question = self.prompt + data_item['prompt']
answer = data_item['bbox']
image_size_ = data_item['size']
# catetory = self.df.iloc[idx]['category']
# l2_catetory = self.df.iloc[idx]['l2-category']
image = Image.open(os.path.join(self.image_root, image)).convert('RGB')
if self.dynamic_image_size:
images = dynamic_preprocess(image, image_size=self.input_size,
use_thumbnail=self.use_thumbnail,
max_num=self.max_num)
else:
images = [image]
pixel_values = [self.transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return {
'question': question,
'pixel_values': pixel_values,
'answer': answer,
'image_size': image_size_
}
def calculate_iou(box1, box2):
x1, y1, x2, y2 = box1
x3, y3, x4, y4 = box2
intersection_x1 = max(x1, x3)
intersection_y1 = max(y1, y3)
intersection_x2 = min(x2, x4)
intersection_y2 = min(y2, y4)
intersection_area = max(0, intersection_x2 - intersection_x1 + 1) * max(
0, intersection_y2 - intersection_y1 + 1
)
box1_area = (x2 - x1 + 1) * (y2 - y1 + 1)
box2_area = (x4 - x3 + 1) * (y4 - y3 + 1)
union_area = box1_area + box2_area - intersection_area
iou = intersection_area / union_area
return iou
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
def evaluate_chat_model():
random.seed(args.seed)
for ds_name in args.datasets:
dataset = GroundingDataset(
root=ds_collections[ds_name]['root'],
image_root=ds_collections[ds_name]['image_root'],
prompt=prompt_prefix,
input_size=image_size,
dynamic_image_size=args.dynamic,
use_thumbnail=use_thumbnail,
max_num=args.max_num
)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
outputs = []
for _, (pixel_values, questions, answers, image_sizes) in tqdm(enumerate(dataloader)):
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
num_beams=args.num_beams,
max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
pred = model.chat(
tokenizer=tokenizer,
pixel_values=pixel_values,
question=questions[0],
generation_config=generation_config
)
preds = [pred]
for question, pred, answer, image_size_ in zip(questions, preds, answers, image_sizes):
outputs.append({
'question': question,
'answer': pred,
'gt_answers': answer,
'image_size': image_size_
})
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_outputs = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
merged_outputs = [json.loads(_) for _ in merged_outputs]
merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
if torch.distributed.get_rank() == 0:
print(f'Evaluating {ds_name} ...')
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{ds_name}_{time_prefix}.json'
output_path = os.path.join(args.out_dir, results_file)
with open(output_path, 'w') as f:
json.dump({'outputs': merged_outputs}, f, indent=4)
print('Results saved to {}'.format(output_path))
cmd = f'python eval/rs_det/caculate.py --output_file {output_path}'
print(cmd)
os.system(cmd)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--datasets', type=str, default='DIOR_RSVG')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=1)
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--out-dir', type=str, default='results')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--dynamic', action='store_true')
parser.add_argument('--max-num', type=int, default=6)
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--load-in-4bit', action='store_true')
parser.add_argument('--auto', action='store_true')
args = parser.parse_args()
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir, exist_ok=True)
args.datasets = args.datasets.split(',')
print('datasets:', args.datasets)
assert args.batch_size == 1, 'Only batch size 1 is supported'
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model, tokenizer = load_model_and_tokenizer(args)
image_size = model.config.force_image_size or model.config.vision_config.image_size
use_thumbnail = model.config.use_thumbnail
total_params = sum(p.numel() for p in model.parameters()) / 1e9
if total_params > 20 or args.dynamic:
args.num_beams = 1
print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}')
else:
print(f'[test] total_params: {total_params}B')
print(f'[test] image_size: {image_size}')
print(f'[test] template: {model.config.template}')
print(f'[test] dynamic_image_size: {args.dynamic}')
print(f'[test] use_thumbnail: {use_thumbnail}')
print(f'[test] max_num: {args.max_num}')
prompt_prefix = 'Detect '
# prompt_prefix = "Please provide the bounding box coordinate of the region this sentence describes: "
evaluate_chat_model()
import argparse
import itertools
import json
import os
import random
import time
from functools import partial
import torch
from internvl.model import load_model_and_tokenizer
from internvl.train.dataset import build_transform, dynamic_preprocess
from PIL import Image
from tqdm import tqdm
ds_collections = {
'RSVQA_H_TEST2': {
'root': 'InternVL-Domain-Adaptation-Data/val/rsvqa_h_test_2_instruct.json',
'max_new_tokens': 50,
'min_new_tokens': 1,
'type': 'test',
'image_root': 'InternVL-Domain-Adaptation-Data/images/RSVQA-H/Data'
},
'RSVQA_H_TEST1': {
        'root': 'InternVL-Domain-Adaptation-Data/val/rsvqa_h_test_1_instruct.json',
'max_new_tokens': 50,
'min_new_tokens': 1,
'type': 'test',
'image_root': 'InternVL-Domain-Adaptation-Data/images/RSVQA-H/Data'
},
'RSVQA_L': {
'root': 'InternVL-Domain-Adaptation-Data/val/rsvqa_l_test_instruct.json',
'max_new_tokens': 50,
'min_new_tokens': 1,
'type': 'test',
'image_root': 'InternVL-Domain-Adaptation-Data/images/RSVQA_L/Images_LR'
},
}
def collate_fn(batches, tokenizer):
pixel_values = torch.cat([_['pixel_values'] for _ in batches], dim=0)
questions = [_['question'] for _ in batches]
answers = [_['answer'] for _ in batches]
indexes = [_['index'] for _ in batches]
question_types = [_['question_type'] for _ in batches]
return pixel_values, questions, answers, indexes, question_types
class RSVQADataset(torch.utils.data.Dataset):
def __init__(self, root, prompt, image_root, input_size=224, dynamic_image_size=False,
use_thumbnail=False, max_num=6):
with open(root, 'r') as f:
self.ann_data = json.load(f)
self.prompt = prompt
self.image_root = image_root
self.input_size = input_size
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.max_num = max_num
self.transform = build_transform(is_train=False, input_size=input_size)
def __len__(self):
return len(self.ann_data)
def __getitem__(self, idx):
data_item = self.ann_data[idx]
index = data_item['id']
image = data_item['image']
question = data_item['question'] + '\n' + self.prompt
answer = data_item['gt_answer']
question_type = data_item['type']
# catetory = self.df.iloc[idx]['category']
# l2_catetory = self.df.iloc[idx]['l2-category']
image = Image.open(os.path.join(self.image_root, image)).convert('RGB')
if self.dynamic_image_size:
images = dynamic_preprocess(image, image_size=self.input_size,
use_thumbnail=self.use_thumbnail,
max_num=self.max_num)
else:
images = [image]
pixel_values = [self.transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
return {
'question': question,
'pixel_values': pixel_values,
'answer': answer,
'index': index,
'question_type': question_type
}
def evaluation_metrics(outputs):
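    # Compare the ground truth against the first comma-separated token of the response, lower-cased with periods stripped.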
correct = 0
incorrect = 0
for output in outputs:
gt = output['gt_answers']
answer = output['answer'].split(',')[0].lower().replace('.', '')
if gt == answer:
correct = correct + 1
else:
incorrect = incorrect + 1
# else:
# continue
print('correct:', correct)
print('incorrect:', incorrect)
print('Total:', correct + incorrect)
print('Acc:', (correct / (correct + incorrect)))
return {
'correct:': correct,
'incorrect:': incorrect,
'Total:': correct + incorrect,
'Acc:': correct / (correct + incorrect)
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size, self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
def evaluate_chat_model():
random.seed(args.seed)
for ds_name in args.datasets:
dataset = RSVQADataset(
root=ds_collections[ds_name]['root'],
prompt=prompt,
image_root=ds_collections[ds_name]['image_root'],
input_size=image_size,
dynamic_image_size=args.dynamic,
use_thumbnail=use_thumbnail,
max_num=args.max_num
)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
outputs = []
for _, (pixel_values, questions, answers, indexes, question_types) in tqdm(enumerate(dataloader)):
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
num_beams=args.num_beams,
max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
pred = model.chat(
tokenizer=tokenizer,
pixel_values=pixel_values,
question=questions[0],
generation_config=generation_config
)
preds = [pred]
for question, pred, answer, index, question_type in zip(questions, preds, answers, indexes, question_types):
outputs.append({
'question': question,
'response': pred,
'gt_answer': answer,
'index': int(index),
'question_type': question_type
})
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_outputs = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
merged_outputs = [json.loads(_) for _ in merged_outputs]
merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
if torch.distributed.get_rank() == 0:
print(f'Evaluating {ds_name} ...')
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{ds_name}_{time_prefix}.json'
output_path = os.path.join(args.out_dir, results_file)
with open(output_path, 'w') as f:
json.dump(merged_outputs, f, indent=4)
cmd = f'python eval/rs_vqa/score.py --output_file {output_path}'
print(cmd)
os.system(cmd)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--datasets', type=str, default='RSVQA_H_TEST2')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=1)
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--out-dir', type=str, default='results')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--dynamic', action='store_true')
parser.add_argument('--max-num', type=int, default=6)
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--load-in-4bit', action='store_true')
parser.add_argument('--auto', action='store_true')
args = parser.parse_args()
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir, exist_ok=True)
args.datasets = args.datasets.split(',')
print('datasets:', args.datasets)
assert args.batch_size == 1, 'Only batch size 1 is supported'
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model, tokenizer = load_model_and_tokenizer(args)
image_size = model.config.force_image_size or model.config.vision_config.image_size
use_thumbnail = model.config.use_thumbnail
total_params = sum(p.numel() for p in model.parameters()) / 1e9
if total_params > 20 or args.dynamic:
args.num_beams = 1
print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}')
else:
print(f'[test] total_params: {total_params}B')
print(f'[test] image_size: {image_size}')
print(f'[test] template: {model.config.template}')
print(f'[test] dynamic_image_size: {args.dynamic}')
print(f'[test] use_thumbnail: {use_thumbnail}')
print(f'[test] max_num: {args.max_num}')
prompt = 'Answer the question using a single word or phrase.'
evaluate_chat_model()
import argparse
import json
def is_correct_count(response, answer):
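    # Bucketed scoring: a predicted count is treated as correct if it falls in the same
    # magnitude bucket as the ground truth (exactly 0, 1-100, 101-1000, or >1000).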
try:
response = int(response) if response is not None else 0
answer = int(answer)
except ValueError:
return False
if response == 0 and answer == 0:
return True
elif 0 < response <= 100 and 0 < answer <= 100:
return True
elif 100 < response <= 1000 and 100 < answer <= 1000:
return True
elif response > 1000 and answer > 1000:
return True
return False
def is_correct_area(response, answer):
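    # Strip the trailing 'm2' unit from the ground-truth area, then apply the same bucketed comparison as counts.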
try:
response = int(response) if response is not None else 0
answer = int(answer.rstrip('m2'))
except ValueError:
return False
return is_correct_count(response, answer)
def calculate_scores(data):
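    # Accuracy per question type: 'count' and 'area' use bucketed matching, all other types use
    # case-insensitive string equality; total_score_useful excludes the count/area types.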
type_counts = {}
type_correct = {}
for entry in data:
question_type = entry['question_type']
response = entry['response']
answer = entry['gt_answer']
if question_type not in type_counts:
type_counts[question_type] = 0
type_correct[question_type] = 0
type_counts[question_type] += 1
if question_type == 'count':
if is_correct_count(response, answer):
type_correct[question_type] += 1
elif question_type == 'area':
if is_correct_area(response, answer):
type_correct[question_type] += 1
else:
if response and response.lower() == answer.lower():
type_correct[question_type] += 1
type_scores = {}
for question_type in type_counts:
score = type_correct[question_type] / type_counts[question_type]
type_scores[question_type] = round(score, 4)
total_correct = sum(type_correct.values())
total_count = sum(type_counts.values())
total_score = round(total_correct / total_count, 4) if total_count > 0 else 0.0
total_correct_useful = sum([v for k, v in type_correct.items() if k not in ['count', 'area']])
total_count_useful = sum([v for k, v in type_counts.items() if k not in ['count', 'area']])
total_score_useful = round(total_correct_useful / total_count_useful, 4) if total_count_useful > 0 else 0.0
print(f'{type_scores=}')
print(f'{total_score_useful=}')
return type_scores, total_score, total_score_useful, type_counts
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--output_file', type=str, default='')
args = parser.parse_args()
with open(args.output_file, 'r') as f:
data = json.load(f)
if 'outputs' in data:
data = data['outputs']
type_scores, total_score, total_score_useful, type_counts = calculate_scores(data)
results = {
'type_scores': type_scores,
'type_counts': type_counts,
'total_score': total_score,
'total_score_useful': total_score_useful,
'outputs': data
}
with open(args.output_file, 'w') as f:
json.dump(results, f, indent=4)
# README for Evaluation
## 🌟 Overview
This script provides an evaluation pipeline for `LLaVA-Bench`.
For scoring, we use **GPT-4-0613** as the evaluation model.
While the provided code can run this benchmark, we recommend using [VLMEvalKit](https://github.com/open-compass/VLMEvalKit) if you aim to align your results with our technical report.
## 🗂️ Data Preparation
Before starting to download the data, please create the `InternVL/internvl_chat/data` folder.
### LLaVA-Bench
Follow the instructions below to prepare the data:
```shell
# Step 1: Download the dataset
cd data/
git clone https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild
cd ../
```
After preparation is complete, the directory structure is:
```shell
data/llava-bench-in-the-wild
├── images
├── answers_gpt4.jsonl
├── bard_0718.jsonl
├── bing_chat_0629.jsonl
├── context.jsonl
├── questions.jsonl
└── README.md
```
## 🏃 Evaluation Execution
> ⚠️ Note: For testing InternVL (1.5, 2.0, 2.5, and later versions), always enable `--dynamic` to perform dynamic resolution testing.
To run the evaluation, execute the following commands on a single-GPU setup:
```shell
# Step 1: Remove old inference results if they exist
rm -rf results/llava_bench_results_review.jsonl
# Step 2: Run the evaluation
torchrun --nproc_per_node=1 eval/llava_bench/evaluate_llava_bench.py --checkpoint ${CHECKPOINT} --dynamic
# Step 3: Scoring the results using gpt-4-0613
export OPENAI_API_KEY="your_openai_api_key"
python -u eval/llava_bench/eval_gpt_review_bench.py \
--question data/llava-bench-in-the-wild/questions.jsonl \
--context data/llava-bench-in-the-wild/context.jsonl \
--rule eval/llava_bench/rule.json \
--answer-list \
data/llava-bench-in-the-wild/answers_gpt4.jsonl \
results/llava_bench_results.jsonl \
--output \
results/llava_bench_results_review.jsonl
python -u eval/llava_bench/summarize_gpt_review.py -f results/llava_bench_results_review.jsonl
```
Alternatively, you can run the following simplified command:
```shell
export OPENAI_API_KEY="your_openai_api_key"
GPUS=1 sh evaluate.sh ${CHECKPOINT} llava-bench --dynamic
```
### Arguments
The following arguments can be configured for the evaluation script:
| Argument | Type | Default | Description |
| ---------------- | ------ | --------------- | ----------------------------------------------------------------------------------------------------------------- |
| `--checkpoint` | `str` | `''` | Path to the model checkpoint. |
| `--datasets` | `str` | `'llava_bench'` | Comma-separated list of datasets to evaluate. |
| `--dynamic` | `flag` | `False` | Enables dynamic high resolution preprocessing. |
| `--max-num` | `int` | `6` | Maximum tile number for dynamic high resolution. |
| `--load-in-8bit` | `flag` | `False` | Loads the model weights in 8-bit precision. |
| `--auto` | `flag` | `False` | Automatically splits a large model across 8 GPUs when needed, useful for models too large to fit on a single GPU. |
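
For reference, a single run that combines several of the arguments above might look like the following (illustrative only; substitute your own checkpoint path):

```shell
# Illustrative example: dynamic high-resolution evaluation with 8-bit weights and up to 12 tiles
torchrun --nproc_per_node=1 eval/llava_bench/evaluate_llava_bench.py \
    --checkpoint ${CHECKPOINT} --dynamic --max-num 12 --load-in-8bit --out-dir results
```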
import argparse
import json
import os
import time
import openai
NUM_SECONDS_TO_SLEEP = 0.5
def get_eval(content: str, max_tokens: int):
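    # Ask gpt-4-0613 to review the answer pair, retrying on API errors with a short sleep between attempts.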
while True:
try:
completion = openai.chat.completions.create(
model='gpt-4-0613',
messages=[{
'role': 'system',
'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
}, {
'role': 'user',
'content': content,
}],
temperature=0.2, # TODO: figure out which temperature is best for evaluation
max_tokens=max_tokens,
)
break
except Exception as e:
print(e)
time.sleep(NUM_SECONDS_TO_SLEEP)
return completion.choices[0].message.content
def parse_score(review):
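    # Expect the review's first line to contain the two scores; return [-1, -1] when they cannot be parsed.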
try:
score_pair = review.split('\n')[0]
score_pair = score_pair.replace(',', ' ')
sp = score_pair.split(' ')
if len(sp) == 2:
return [float(sp[0]), float(sp[1])]
else:
print('error', review)
return [-1, -1]
except Exception as e:
print(e)
print('error', review)
return [-1, -1]
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
parser.add_argument('-q', '--question')
parser.add_argument('-c', '--context')
parser.add_argument('-a', '--answer-list', nargs='+', default=[])
parser.add_argument('-r', '--rule')
parser.add_argument('-o', '--output')
parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
args = parser.parse_args()
f_q = open(os.path.expanduser(args.question))
f_ans1 = open(os.path.expanduser(args.answer_list[0]))
f_ans2 = open(os.path.expanduser(args.answer_list[1]))
rule_dict = json.load(open(os.path.expanduser(args.rule), 'r'))
if os.path.isfile(os.path.expanduser(args.output)):
cur_reviews = [json.loads(line) for line in open(os.path.expanduser(args.output))]
else:
cur_reviews = []
review_file = open(f'{args.output}', 'a')
context_list = [json.loads(line) for line in open(os.path.expanduser(args.context))]
image_to_context = {context['image']: context for context in context_list}
handles = []
idx = 0
for ques_js, ans1_js, ans2_js in zip(f_q, f_ans1, f_ans2):
ques = json.loads(ques_js)
ans1 = json.loads(ans1_js)
ans2 = json.loads(ans2_js)
print(ques, ans1, ans2)
inst = image_to_context[ques['image']]
if isinstance(inst['caption'], list):
cap_str = '\n'.join(inst['caption'])
else:
cap_str = inst['caption']
category = 'llava_bench_' + json.loads(ques_js)['category']
if category in rule_dict:
rule = rule_dict[category]
else:
assert False, f'Visual QA category not found in rule file: {category}.'
prompt = rule['prompt']
role = rule['role']
content = (f'[Context]\n{cap_str}\n\n'
f'[Question]\n{ques["text"]}\n\n'
f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
f'[System]\n{prompt}\n\n')
cur_js = {
'id': idx + 1,
'question_id': ques['question_id'],
'answer1_id': ans1.get('answer_id', ans1['question_id']),
            'answer2_id': ans2.get('answer_id', ans2['question_id']),
'category': category
}
if idx >= len(cur_reviews):
review = get_eval(content, args.max_tokens)
scores = parse_score(review)
cur_js['content'] = review
cur_js['tuple'] = scores
review_file.write(json.dumps(cur_js) + '\n')
review_file.flush()
else:
print(f'Skipping {idx} as we already have it.')
idx += 1
print(idx)
review_file.close()
import argparse
import json
import os
import random
import torch
from internvl.model import load_model_and_tokenizer
from internvl.train.dataset import build_transform, dynamic_preprocess
from PIL import Image
from tqdm import tqdm
ds_collections = {
'llava_bench': {
'root': 'data/llava-bench-in-the-wild/images',
'question': 'data/llava-bench-in-the-wild/questions.jsonl',
'max_new_tokens': 1000,
'min_new_tokens': 1,
},
}
class VQADataset(torch.utils.data.Dataset):
def __init__(self, root, data, prompt, input_size=224, dynamic_image_size=False,
use_thumbnail=False, max_num=6):
self.root = root
self.data = open(data).readlines()
self.prompt = prompt
self.input_size = input_size
self.dynamic_image_size = dynamic_image_size
self.use_thumbnail = use_thumbnail
self.max_num = max_num
self.transform = build_transform(is_train=False, input_size=input_size)
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
data = json.loads(self.data[idx].strip())
image, question, question_id, annotation = data['image'], data[
'text'], data['question_id'], data.get('answer', None)
image = os.path.join(self.root, image)
image = Image.open(image).convert('RGB')
if self.dynamic_image_size:
images = dynamic_preprocess(image, image_size=self.input_size,
use_thumbnail=self.use_thumbnail,
max_num=self.max_num)
else:
images = [image]
pixel_values = [self.transform(image) for image in images]
pixel_values = torch.stack(pixel_values)
question = question + self.prompt
return question_id, question, pixel_values, annotation
def evaluate_chat_model():
random.seed(args.seed)
for ds_name in args.datasets:
dataset = VQADataset(
root=ds_collections[ds_name]['root'],
data=ds_collections[ds_name]['question'],
prompt=' Please give a detailed answer.',
input_size=image_size,
dynamic_image_size=args.dynamic,
use_thumbnail=use_thumbnail,
max_num=args.max_num
)
outputs = []
for _, (question_id, question, pixel_values, annotations) in tqdm(enumerate(dataset)):
pixel_values = pixel_values.to(torch.bfloat16).cuda()
generation_config = dict(
num_beams=args.num_beams,
max_new_tokens=ds_collections[ds_name]['max_new_tokens'],
min_new_tokens=ds_collections[ds_name]['min_new_tokens'],
do_sample=True if args.temperature > 0 else False,
temperature=args.temperature,
)
pred = model.chat(
tokenizer=tokenizer,
pixel_values=pixel_values,
question=question,
generation_config=generation_config,
verbose=True
)
outputs.append({
'question_id': question_id,
'text': pred,
'model_id': model_id,
'metadata': {}
})
print(f'Evaluating {ds_name} ...')
results_file = 'llava_bench_results.jsonl'
results_file = os.path.join(args.out_dir, results_file)
writer = open(results_file, 'w')
for item in outputs:
writer.write(json.dumps(item) + '\n')
writer.close()
print('Results saved to {}'.format(results_file))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--datasets', type=str, default='llava_bench')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--num-beams', type=int, default=1)
parser.add_argument('--temperature', type=float, default=0.0)
parser.add_argument('--out-dir', type=str, default='results')
parser.add_argument('--seed', type=int, default=0)
parser.add_argument('--dynamic', action='store_true')
parser.add_argument('--max-num', type=int, default=6)
parser.add_argument('--load-in-8bit', action='store_true')
parser.add_argument('--load-in-4bit', action='store_true')
parser.add_argument('--auto', action='store_true')
args = parser.parse_args()
if not os.path.exists(args.out_dir):
os.makedirs(args.out_dir, exist_ok=True)
args.datasets = args.datasets.split(',')
print('datasets:', args.datasets)
assert args.batch_size == 1, 'Only batch size 1 is supported'
model, tokenizer = load_model_and_tokenizer(args)
image_size = model.config.force_image_size or model.config.vision_config.image_size
use_thumbnail = model.config.use_thumbnail
total_params = sum(p.numel() for p in model.parameters()) / 1e9
if total_params > 20 or args.dynamic:
args.num_beams = 1
print(f'[test] total_params: {total_params}B, use num_beams: {args.num_beams}')
else:
print(f'[test] total_params: {total_params}B')
print(f'[test] image_size: {image_size}')
print(f'[test] template: {model.config.template}')
print(f'[test] dynamic_image_size: {args.dynamic}')
print(f'[test] use_thumbnail: {use_thumbnail}')
print(f'[test] max_num: {args.max_num}')
model_id = '_'.join(args.checkpoint.split('/')[-2:])
evaluate_chat_model()
{
"coding": {"role": "Assistant", "prompt": "Your task is to evaluate the coding abilities of the above two assistants. They have been asked to implement a program to solve a given problem. Please review their code submissions, paying close attention to their problem-solving approach, code structure, readability, and the inclusion of helpful comments.\n\nPlease ensure that the assistants' submissions:\n\n1. Correctly implement the given problem statement.\n2. Contain accurate and efficient code.\n3. Include clear and concise comments that explain the code's logic and functionality.\n4. Adhere to proper coding standards and best practices.\n\nOnce you have carefully reviewed both submissions, provide detailed feedback on their strengths and weaknesses, along with any suggestions for improvement. You should first output a single line containing two scores on the scale of 1-10 (1: no code/no sense; 10: perfect) for Assistant 1 and 2, respectively. Then give extra comments starting from the next line."},
"math": {"role": "Assistant", "prompt": "We would like to request your feedback on the mathematical proficiency of two AI assistants regarding the given user question.\nFirstly, please solve the problem independently, without referring to the answers provided by Assistant 1 and Assistant 2.\nAfterward, please examine the problem-solving process of Assistant 1 and Assistant 2 step-by-step to ensure their correctness, identifying any incorrect steps if present. Your evaluation should take into account not only the answer but also the problem-solving steps.\nFinally, please output a Python tuple containing two numerical scores for Assistant 1 and Assistant 2, ranging from 1 to 10, respectively. If applicable, explain the reasons for any variations in their scores and determine which assistant performed better."},
"default": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
"conv": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
"detail": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
"complex": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
"llava_bench_conv": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
"llava_bench_detail": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
"llava_bench_complex": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}
}
import argparse
import json
import os
from collections import defaultdict
import numpy as np
def parse_args():
parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
parser.add_argument('-d', '--dir', default=None)
parser.add_argument('-v', '--version', default=None)
parser.add_argument('-s', '--select', nargs='*', default=None)
parser.add_argument('-f', '--files', nargs='*', default=[])
parser.add_argument('-i', '--ignore', nargs='*', default=[])
return parser.parse_args()
if __name__ == '__main__':
args = parse_args()
if args.ignore is not None:
args.ignore = [int(x) for x in args.ignore]
if len(args.files) > 0:
review_files = args.files
else:
review_files = [x for x in os.listdir(args.dir) if x.endswith('.jsonl') and (
x.startswith('gpt4_text') or x.startswith('reviews_') or x.startswith(
'review_') or 'review' in args.dir)]
for review_file in sorted(review_files):
config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '')
if args.select is not None and any(x not in config for x in args.select):
continue
if '0613' in config:
version = '0613'
else:
version = '0314'
if args.version is not None and args.version != version:
continue
scores = defaultdict(list)
print(config)
with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f:
for review_str in f:
review = json.loads(review_str)
if review['question_id'] in args.ignore:
continue
if 'category' in review:
scores[review['category']].append(review['tuple'])
scores['all'].append(review['tuple'])
else:
if 'tuple' in review:
scores['all'].append(review['tuple'])
else:
scores['all'].append(review['score'])
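        # stats holds the mean (reference score, model score); the printed percentage is the model's score relative to the reference.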
for k, v in sorted(scores.items()):
stats = np.asarray(v).mean(0).tolist()
stats = [round(x, 3) for x in stats]
print(k, stats, round(stats[1] / stats[0] * 100, 1))
print(k, round(stats[1] / stats[0] * 100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1))
print('=================================')