Commit 7a60e044 authored by wanglch

Initial commit
[
{
"id": "5",
"image": "/home/wanglch/projects/MiniCPM-V/data/self_build/eval_data/image_5.jpg",
"conversations": [
{
"role": "user",
"content": "<image>\n."
},
{
"role": "assistant",
"content": "There is a rectangular black sign in the image. The text reads HOTEL ENTRANCE written in white. Below the text is an arrow pointing upwards. The sign post is attached to a pink wall. There are some leaves on the wall."
},
{
"role": "user",
"content": "Written all the texts of this picture."
},
{
"role": "assistant",
"content": "图像中有一个矩形黑色标志。文字显示为白色的酒店大门。下方是一个指向上方的箭头。标志被固定在粉红色的墙上。墙上有些叶子。"
}
]
}
]
[
{
"id": "0",
"image": "/home/wanglch/projects/MiniCPM-V/data/self_build/train_data/image_0.png",
"conversations": [
{
"role": "user",
"content": "<image>\n提取这张图中的文字信息."
},
{
"role": "assistant",
"content": "水平长方形牌匾上面是绿色下面是白色。 写着几个黑色的汉字。 左边是第33卷。 右边是第6期。"
},
{
"role": "user",
"content": "Written all the texts of this picture."
},
{
"role": "assistant",
"content": "The horizontal rectangle plaque is green above and white below. The content is a few black Chinese words. On the left side is 第33卷. On the right side is 第6期."
}
]
},
{
"id": "1",
"image": "/home/wanglch/projects/MiniCPM-V/data/self_build/train_data/image_1.png",
"conversations": [
{
"role": "user",
"content": "<image>\n提取这张图中的文字信息."
},
{
"role": "assistant",
"content": "图片底部有橙色背景。上半部分显示了一个水平矩形图像。图像中有一盘绿色和白色的食物。在中间部分,棕色的文字在粉红色的背景下写着清炒大虾仁。"
},
{
"role": "user",
"content":"Written all the texts of this picture."
},
{
"role": "assistant",
"content": "The picture has an orange background at the bottom. The upper part shows a horizontal rectangle image. In the image, there is a plate of food with green and white colors. In the middle part, brown text reads Clear fried big shrimp against a pink background."
}
]
},
{
"id": "2",
"image": "/home/wanglch/projects/MiniCPM-V/data/self_build/train_data/image_2.png",
"conversations": [
{
"role": "user",
"content": "<image>\n提取这张图中的文字信息."
},
{
"role": "assistant",
"content": "金属背景上雕刻着两行黑色文字,文字内容是CALL IS BEING PLACED。在左下角,有一个金属头螺栓。在右边缘,有一盏红灯。"
},
{
"role": "user",
"content": "Written all the texts of this picture."
},
{
"role": "assistant",
"content": "Two lines of black text are carved on a metal background.Text reads CALL IS BEING PLACED.In the bottom left corner, there is a metal head bolt.In the right edge, there is a red light."
}
]
},
{
"id": "3",
"image": "/home/wanglch/projects/MiniCPM-V/data/self_build/train_data/image_3.png",
"conversations": [
{
"role": "user",
"content": "<image>\n提取这张图中的文字信息."
},
{
"role": "assistant",
"content": "图像中有一个矩形黄色标志。文本显示为黑色的HOTEL ENTRANCE。文本下方是一个指向左的箭头。路标贴在白色的墙上。"
},
{
"role": "user",
"content": "Written all the texts of this picture."
},
{
"role": "assistant",
"content": "There is a rectangular yellow sign in the image. The text reads HOTEL ENTRANCE written in black. Below the text is an arrow pointing left. The sign post is attached to a white wall."
}
]
},
{
"id": "4",
"image": "/home/wanglch/projects/MiniCPM-V/data/self_build/train_data/image_4.png",
"conversations": [
{
"role": "user",
"content": "<image>\n."
},
{
"role": "assistant",
"content": "窗口上写着两行文字。第一行以粗体和大号字体显示。第二行写着 Wardour Street. 所有文本均以白色书写。"
},
{
"role": "user",
"content": "Written all the texts of this picture."
},
{
"role": "assistant",
"content": "Two lines of text are written on the window. The first line reads 1 written in bold and large size. The second line reads Wardour Street. All texts are written in white"
}
]
}
]
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu22.04-dtk23.10.1-py310
ENV DEBIAN_FRONTEND=noninteractive
COPY requirements.txt requirements.txt
RUN pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
transformers
accelerate
tiktoken
einops
transformers_stream_generator
scipy
torchvision
pillow
tensorboard
matplotlib
tk
shutilwhich
deepspeed
## Phi-3-vision-128K-Instruct vs MiniCPM-Llama3-V 2.5
Below are comparison results for Phi-3-vision-128K-Instruct and MiniCPM-Llama3-V 2.5 in terms of model size, hardware requirements, and performance.
With int4 quantization, MiniCPM-Llama3-V 2.5 delivers **smooth inference with only 8GB of GPU memory**. On most benchmarks, MiniCPM-Llama3-V 2.5 achieves **better performance** than Phi-3-vision-128K-Instruct. Moreover, MiniCPM-Llama3-V 2.5 also exhibits **lower latency and better throughput even without quantization**.
我们提供了从模型参数、硬件需求、性能指标等方面对比 Phi-3-vision-128K-Instruct 和 MiniCPM-Llama3-V 2.5 的结果。通过 int4 量化,MiniCPM-Llama3-V 2.5 **仅需 8GB 显存即可推理**。在大多数评测集上,MiniCPM-Llama3-V 2.5 相比于 Phi-3-vision-128K-Instruct 都展现出了**更优的性能表现**。即使未经量化,MiniCPM-Llama3-V 2.5 的**推理延迟和吞吐率也都更具优势**。
<div align="center">
<img src="../assets/phi3_vision_comparison.jpg" width="85%" />
</div>
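As a concrete illustration of the 8GB setting, the sketch below loads the released int4 checkpoint `openbmb/MiniCPM-Llama3-V-2_5-int4` with `transformers`. This is a minimal example rather than an official recipe; it assumes `bitsandbytes` is installed and a CUDA GPU is available, and the image path is hypothetical.

```python
# Minimal int4 inference sketch (assumes bitsandbytes and a CUDA GPU; image path is illustrative).
from PIL import Image
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5-int4', trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5-int4', trust_remote_code=True)
model.eval()

image = Image.open('example.jpg').convert('RGB')
msgs = [{"role": "user", "content": "Describe this image."}]
print(model.chat(image=image, msgs=msgs, tokenizer=tokenizer))
```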
### Multilingual Capabilities(多语言能力对比)
MiniCPM-Llama3-V 2.5 exhibits **stronger multilingual** capabilities than Phi-3-vision-128K-Instruct on LLaVA Bench.
MiniCPM-Llama3-V 2.5 在对话和推理评测榜单 LLaVA Bench 上展现出了比 Phi-3-vision-128K-Instruct **更强的多语言性能**。
<div align="center">
<img src="../assets/llavabench_compare_phi3.png" width="100%" />
<br>
Evaluation results of multilingual LLaVA Bench
<br>
多语言LLaVA Bench评测结果
</div>
## Using MiniCPM-Llama3-V-2_5 with Multiple GPUs
Due to the limited memory of a single GPU, it may be impossible to load the entire MiniCPM-Llama3-V 2.5 model (the model weights alone occupy about 18 GiB) onto one device for inference, for example when a GPU has only 12 GiB or 16 GiB of memory. To address this limitation, multi-GPU inference can be employed, where the model's layers are distributed across multiple GPUs.
This distribution can be achieved with minimal changes, assigning the layers to different GPUs while leaving the original model structure untouched.
To implement this, we use the features provided by the `accelerate` library.
Install all requirements of MiniCPM-Llama3-V-2_5; in addition, install `accelerate`:
```bash
pip install accelerate
```
<br/>
### Example Usage for `2x16GiB` GPUs
We now consider a demo with two GPUs, each with 16 GiB of memory.
1. Import necessary libraries.
```python
from PIL import Image
import torch
from transformers import AutoConfig, AutoModel, AutoTokenizer
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_in_model, dispatch_model
```
2. Download model weights.
```python
MODEL_PATH = '/local/path/to/MiniCPM-Llama3-V-2_5' # you can download in advance or use `openbmb/MiniCPM-Llama3-V-2_5`
```
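If you prefer to fetch the weights ahead of time, one option is `snapshot_download` from `huggingface_hub` (installed as a dependency of `transformers`). A minimal sketch, with a hypothetical local directory:

```python
# Optional: download the checkpoint in advance; local_dir is an illustrative path.
from huggingface_hub import snapshot_download

MODEL_PATH = snapshot_download(
    repo_id='openbmb/MiniCPM-Llama3-V-2_5',
    local_dir='/local/path/to/MiniCPM-Llama3-V-2_5',
)
```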
3. Determine the distribution of layers on multiple GPUs.
```python
max_memory_each_gpu = '10GiB'  # Maximum memory to use on each GPU. Use a balanced value: the weights are not everything, intermediate activations also consume GPU memory (10GiB < 16GiB).
gpu_device_ids = [0, 1]        # Which GPUs to use (here: two GPUs, each with 16GiB of memory)
no_split_module_classes = ["LlamaDecoderLayer"]

max_memory = {
    device_id: max_memory_each_gpu for device_id in gpu_device_ids
}

config = AutoConfig.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_PATH,
    trust_remote_code=True
)

with init_empty_weights():
    model = AutoModel.from_config(
        config,
        torch_dtype=torch.float16,
        trust_remote_code=True
    )

device_map = infer_auto_device_map(
    model,
    max_memory=max_memory,
    no_split_module_classes=no_split_module_classes
)
print("auto determined device_map", device_map)

# Make sure the input and output layers are all on the first GPU, so the original inference script needs no modification.
device_map["llm.model.embed_tokens"] = 0
device_map["llm.model.layers.0"] = 0
device_map["llm.lm_head"] = 0
device_map["vpm"] = 0
device_map["resampler"] = 0

print("modified device_map", device_map)
```
You may see this output:
```
modified device_map OrderedDict([('llm.model.embed_tokens', 0), ('llm.model.layers.0', 0), ('llm.model.layers.1', 0), ('llm.model.layers.2', 0), ('llm.model.layers.3', 0), ('llm.model.layers.4', 0), ('llm.model.layers.5', 0), ('llm.model.layers.6', 0), ('llm.model.layers.7', 0), ('llm.model.layers.8', 0), ('llm.model.layers.9', 0), ('llm.model.layers.10', 0), ('llm.model.layers.11', 0), ('llm.model.layers.12', 0), ('llm.model.layers.13', 0), ('llm.model.layers.14', 0), ('llm.model.layers.15', 0), ('llm.model.layers.16', 1), ('llm.model.layers.17', 1), ('llm.model.layers.18', 1), ('llm.model.layers.19', 1), ('llm.model.layers.20', 1), ('llm.model.layers.21', 1), ('llm.model.layers.22', 1), ('llm.model.layers.23', 1), ('llm.model.layers.24', 1), ('llm.model.layers.25', 1), ('llm.model.layers.26', 1), ('llm.model.layers.27', 1), ('llm.model.layers.28', 1), ('llm.model.layers.29', 1), ('llm.model.layers.30', 1), ('llm.model.layers.31', 1), ('llm.model.norm', 1), ('llm.lm_head', 0), ('vpm', 0), ('resampler', 0)])
```
4. Next, use the `device_map` to dispatch the model layers to the corresponding GPUs.
```python
load_checkpoint_in_model(
    model,
    MODEL_PATH,
    device_map=device_map)

model = dispatch_model(
    model,
    device_map=device_map
)

torch.set_grad_enabled(False)
model.eval()
```
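Optionally, you can sanity-check the placement before chatting. The sketch below simply continues from the variables defined above and prints where a few parameters ended up:

```python
# Print the devices of the first and last few parameters to confirm the dispatch worked.
params = list(model.named_parameters())
for name, param in params[:3] + params[-3:]:
    print(name, param.device)
```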
5. Chat!
```python
image_path = '/local/path/to/test.png'
response = model.chat(
    image=Image.open(image_path).convert("RGB"),
    msgs=[
        {
            "role": "user",
            "content": "guess what I am doing?"
        }
    ],
    tokenizer=tokenizer
)
print(response)
```
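For multi-turn conversations, you can append the previous answer as an assistant turn and ask a follow-up question. The sketch below continues from the variables defined in step 5 and assumes `msgs` accepts `assistant` turns in the same format:

```python
# Multi-turn sketch: feed the previous answer back, then ask a follow-up question.
image = Image.open(image_path).convert("RGB")
msgs = [{"role": "user", "content": "guess what I am doing?"}]
answer = model.chat(image=image, msgs=msgs, tokenizer=tokenizer)

msgs.append({"role": "assistant", "content": answer})
msgs.append({"role": "user", "content": "Describe the scene in one sentence."})
print(model.chat(image=image, msgs=msgs, tokenizer=tokenizer))
```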
With this setup, the OOM (CUDA out of memory) problem should be eliminated. We have tested that:
- it works well for `3000` text input tokens and `1000` text output tokens.
- it works well for a high-resolution input image.
<br/>
### Usage for general cases
It is similar to the previous example, but you may consider modifying these two variables.
```python
max_memory_each_gpu = '10GiB'  # Maximum memory to use on each GPU. Use a balanced value: the weights are not everything, intermediate activations also consume GPU memory.
gpu_device_ids = [0, 1, ...]   # Which GPUs to use (list the ids of all GPUs you want to employ)
```
You can use the following command to monitor memory usage during inference. If OOM occurs, try reducing `max_memory_each_gpu` to spread the memory pressure more evenly across all GPUs.
```bash
watch -n1 nvidia-smi
```
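If you prefer to check memory usage from inside the Python process instead, a small sketch using `torch.cuda`:

```python
# Print allocated and reserved memory for every visible GPU (values in GiB).
import torch

for i in range(torch.cuda.device_count()):
    allocated = torch.cuda.memory_allocated(i) / 2**30
    reserved = torch.cuda.memory_reserved(i) / 2**30
    print(f"cuda:{i}: {allocated:.2f} GiB allocated, {reserved:.2f} GiB reserved")
```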
<br/>
### References
[Ref 1](https://zhuanlan.zhihu.com/p/639850033)
<div align="center">
<img src="../assets/minicpm-v17.png" width="60%"/>
<p> 扫码加入「MiniCPM-V 交流群」 </p>
<p> Scan the QR code to join the "MiniCPM-V Discussion Group" </p>
</div>
# Evaluation
## opencompass
First, enter the `vlmevalkit` directory and install all dependencies:
```bash
cd vlmevalkit
pip install -r requirements.txt
```
<br />
Then, run `script/run_inference.sh`, which takes three positional parameters: `MODELNAME`, `DATALIST`, and `MODE`. `MODELNAME` is the name of the model, `DATALIST` the datasets used for inference, and `MODE` the evaluation mode:
```bash
chmod +x ./script/run_inference.sh
./script/run_inference.sh $MODELNAME $DATALIST $MODE
```
<br />
The three available choices for `MODELNAME` are listed in `vlmeval/config.py`:
```python
ungrouped = {
    'MiniCPM-V': partial(MiniCPM_V, model_path='openbmb/MiniCPM-V'),
    'MiniCPM-V-2': partial(MiniCPM_V, model_path='openbmb/MiniCPM-V-2'),
    'MiniCPM-Llama3-V-2_5': partial(MiniCPM_Llama3_V, model_path='openbmb/MiniCPM-Llama3-V-2_5'),
}
```
<br />
All available choices for `DATALIST` are listed in `vlmeval/utils/dataset_config.py`. When evaluating a single dataset, pass the dataset name directly without quotation marks; when evaluating multiple datasets, separate the names with spaces and wrap the whole list in quotation marks:
```bash
DATALIST="POPE ScienceQA_TEST ChartQA_TEST"
```
<br />
To score each benchmark directly, set `MODE=all`. If only inference results are required, set `MODE=infer`. To reproduce the results in the table displayed on the homepage (the columns from MME to RealWorldQA), run the script with the following settings:
```bash
# run on all 7 datasets
./script/run_inference.sh MiniCPM-Llama3-V-2_5 "MME MMBench_TEST_EN MMBench_TEST_CN MMMU_DEV_VAL MathVista_MINI LLaVABench RealWorldQA" all
# The following are instructions for running on a single dataset
# MME
./script/run_inference.sh MiniCPM-Llama3-V-2_5 MME all
# MMBench_TEST_EN
./script/run_inference.sh MiniCPM-Llama3-V-2_5 MMBench_TEST_EN all
# MMBench_TEST_CN
./script/run_inference.sh MiniCPM-Llama3-V-2_5 MMBench_TEST_CN all
# MMMU_DEV_VAL
./script/run_inference.sh MiniCPM-Llama3-V-2_5 MMMU_DEV_VAL all
# MathVista_MINI
./script/run_inference.sh MiniCPM-Llama3-V-2_5 MathVista_MINI all
# LLaVABench
./script/run_inference.sh MiniCPM-Llama3-V-2_5 LLaVABench all
# RealWorldQA
./script/run_inference.sh MiniCPM-Llama3-V-2_5 RealWorldQA all
```
<br />
## vqadataset
First, enter the `vqaeval` directory and install all dependencies. Then create a `downloads` subdirectory to store the downloaded datasets for all tasks:
```bash
cd vqaeval
pip install -r requirements.txt
mkdir downloads
```
<br />
Download the datasets from the following links and place them in the specified directories:
###### TextVQA
```bash
cd downloads
mkdir TextVQA && cd TextVQA
wget https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip
unzip train_val_images.zip && rm train_val_images.zip
mv train_val_images/train_images . && rm -rf train_val_images
wget https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_val.json
cd ../..
```
###### DocVQA / DocVQATest
```bash
cd downloads
mkdir DocVQA && cd DocVQA && mkdir spdocvqa_images
# Download Images and Annotations from Task 1 - Single Page Document Visual Question Answering at https://rrc.cvc.uab.es/?ch=17&com=downloads
# Move the spdocvqa_images.tar.gz and spdocvqa_qas.zip to DocVQA directory
tar -zxvf spdocvqa_images.tar.gz -C spdocvqa_images && rm spdocvqa_images.tar.gz
unzip spdocvqa_qas.zip && rm spdocvqa_qas.zip
cp spdocvqa_qas/val_v1.0_withQT.json . && cp spdocvqa_qas/test_v1.0.json . && rm -rf spdocvqa_qas
cd ../..
```
<br />
The `downloads` directory should be organized according to the following structure:
```bash
downloads
├── TextVQA
│ ├── train_images
│ │ ├── ...
│ ├── TextVQA_0.5.1_val.json
├── DocVQA
│ ├── spdocvqa_images
│ │ ├── ...
│ ├── val_v1.0_withQT.json
│ ├── test_v1.0.json
```
<br />
Modify the parameters in `shell/run_inference.sh` and run inference:
```bash
chmod +x ./shell/run_inference.sh
./shell/run_inference.sh
```
<br />
All optional parameters are listed in `eval_utils/getargs.py`. The main parameters are described below:
```bash
# path to images and their corresponding questions
# TextVQA
--textVQA_image_dir
--textVQA_ann_path
# DocVQA
--docVQA_image_dir
--docVQA_ann_path
# DocVQATest
--docVQATest_image_dir
--docVQATest_ann_path
# whether to eval on certain task
--eval_textVQA
--eval_docVQA
--eval_docVQATest
--eval_all
# model name and model path
--model_name
--model_path
# load model from ckpt
--ckpt
# the way the model processes input data, "interleave" represents interleaved image-text form, while "old" represents non-interleaved.
--generate_method
--batchsize
# path to save the outputs
--answer_path
```
<br />
When evaluating different tasks, set the parameters as follows:
###### TextVQA
```bash
--eval_textVQA
--textVQA_image_dir ./downloads/TextVQA/train_images
--textVQA_ann_path ./downloads/TextVQA/TextVQA_0.5.1_val.json
```
###### DocVQA
```bash
--eval_docVQA
--docVQA_image_dir ./downloads/DocVQA/spdocvqa_images
--docVQA_ann_path ./downloads/DocVQA/val_v1.0_withQT.json
```
###### DocVQATest
```bash
--eval_docVQATest
--docVQATest_image_dir ./downloads/DocVQA/spdocvqa_images
--docVQATest_ann_path ./downloads/DocVQA/test_v1.0.json
```
<br />
For the DocVQATest task, in order to upload the inference results to the [official website](https://rrc.cvc.uab.es/?ch=17) for evaluation, run `shell/run_transform.sh` after inference to convert the output format. `input_file_path` is the path to the original output json, and `output_file_path` is the path to the transformed json:
```bash
chmod +x ./shell/run_transform.sh
./shell/run_transform.sh
```
# Evaluation
## opencompass
首先,进入 `vlmevalkit` 目录下,安装必要的依赖:
```bash
cd vlmevalkit
pip install -r requirements.txt
```
<br />
然后,运行 `script/run_inference.sh`,该脚本依次接收三个输入参数:`MODELNAME`、`DATALIST` 和 `MODE`。其中,`MODELNAME` 为模型名称,`DATALIST` 为目标数据集,`MODE` 为评测模式:
```bash
chmod +x ./script/run_inference.sh
./script/run_inference.sh $MODELNAME $DATALIST $MODE
```
<br />
`MODELNAME` 有三种选择,位于 `vlmeval/config.py` 中:
```python
ungrouped = {
    'MiniCPM-V': partial(MiniCPM_V, model_path='openbmb/MiniCPM-V'),
    'MiniCPM-V-2': partial(MiniCPM_V, model_path='openbmb/MiniCPM-V-2'),
    'MiniCPM-Llama3-V-2_5': partial(MiniCPM_Llama3_V, model_path='openbmb/MiniCPM-Llama3-V-2_5'),
}
```
<br />
可选的所有 `DATALIST` 位于 `vlmeval/utils/dataset_config.py` 中,评测单个数据集时,直接调用数据集名称,不加引号;评测多个数据集时,将不同数据集名称以空格隔开,两端加引号:
```bash
DATALIST="POPE ScienceQA_TEST ChartQA_TEST"
```
<br />
直接对各 benchmark 进行评分时,设置 `MODE=all`;如果仅需要推理结果,则设置 `MODE=infer`。
为了复现出首页展示的表格中的各项结果(MME 到 RealWorldQA 之间的列),需要按照如下设置运行:
```bash
# 一次性运行 7 个数据集
./script/run_inference.sh MiniCPM-Llama3-V-2_5 "MME MMBench_TEST_EN MMBench_TEST_CN MMMU_DEV_VAL MathVista_MINI LLaVABench RealWorldQA" all
# 以下是单独运行 1 个数据集的指令
# MME
./script/run_inference.sh MiniCPM-Llama3-V-2_5 MME all
# MMBench_TEST_EN
./script/run_inference.sh MiniCPM-Llama3-V-2_5 MMBench_TEST_EN all
# MMBench_TEST_CN
./script/run_inference.sh MiniCPM-Llama3-V-2_5 MMBench_TEST_CN all
# MMMU_DEV_VAL
./script/run_inference.sh MiniCPM-Llama3-V-2_5 MMMU_DEV_VAL all
# MathVista_MINI
./script/run_inference.sh MiniCPM-Llama3-V-2_5 MathVista_MINI all
# LLaVABench
./script/run_inference.sh MiniCPM-Llama3-V-2_5 LLaVABench all
# RealWorldQA
./script/run_inference.sh MiniCPM-Llama3-V-2_5 RealWorldQA all
```
<br />
## vqadataset
首先,进入 `vqaeval` 目录下,安装必要的依赖,并创建 `downloads` 子目录,用于存储下载的数据集:
```bash
cd vqaeval
pip install -r requirements.txt
mkdir downloads
```
<br />
然后,从下列各地址下载数据集并置于指定目录下:
###### TextVQA
```bash
cd downloads
mkdir TextVQA && cd TextVQA
wget https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip
unzip train_val_images.zip && rm train_val_images.zip
mv train_val_images/train_images . && rm -rf train_val_images
wget https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_val.json
cd ../..
```
###### DocVQA / DocVQATest
```bash
cd downloads
mkdir DocVQA && cd DocVQA && mkdir spdocvqa_images
# 在 https://rrc.cvc.uab.es/?ch=17&com=downloads 下载 Task 1 - Single Page Document Visual Question Answering 下的 Images 和 Annotations
# 将下载得到的 spdocvqa_images.tar.gz 以及 spdocvqa_qas.zip 置于 DocVQA 目录下
tar -zxvf spdocvqa_images.tar.gz -C spdocvqa_images && rm spdocvqa_images.tar.gz
unzip spdocvqa_qas.zip && rm spdocvqa_qas.zip
cp spdocvqa_qas/val_v1.0_withQT.json . && cp spdocvqa_qas/test_v1.0.json . && rm -rf spdocvqa_qas
cd ../..
```
<br />
`downloads` 目录应当按照下列结构组织:
```bash
downloads
├── TextVQA
│ ├── train_images
│ │ ├── ...
│ ├── TextVQA_0.5.1_val.json
├── DocVQA
│ ├── spdocvqa_images
│ │ ├── ...
│ ├── val_v1.0_withQT.json
│ ├── test_v1.0.json
```
<br />
准备好相应的数据集之后,修改 `shell/run_inference.sh` 的参数,运行推理:
```bash
chmod +x ./shell/run_inference.sh
./shell/run_inference.sh
```
<br />
可以传入的参数位于 `eval_utils/getargs.py` 中,各主要参数的含义如下:
```bash
# 指定 TextVQA 评测所有图片和问题的路径
--textVQA_image_dir
--textVQA_ann_path
# 指定 DocVQA 评测所有图片和问题的路径
--docVQA_image_dir
--docVQA_ann_path
# 指定 DocVQATest 评测所有图片和问题的路径
--docVQATest_image_dir
--docVQATest_ann_path
# 决定是否评测某个任务,eval_all 设置为 True 表示所有任务都评测
--eval_textVQA
--eval_docVQA
--eval_docVQATest
--eval_all
# 模型名称、模型路径(从指定路径加载模型)
--model_name
--model_path
# 从 checkpoint 加载模型
--ckpt
# 模型处理输入数据的方式,interleave 表示图文交错式,old 表示非交错式
--generate_method
# 推理时的批处理规模,建议推理时设置为 1
--batchsize
# 输出内容保存的路径
--answer_path
```
<br />
评测三个任务需要设置的参数如下:
###### TextVQA
```bash
--eval_textVQA
--textVQA_image_dir ./downloads/TextVQA/train_images
--textVQA_ann_path ./downloads/TextVQA/TextVQA_0.5.1_val.json
```
###### DocVQA
```bash
--eval_docVQA
--docVQA_image_dir ./downloads/DocVQA/spdocvqa_images
--docVQA_ann_path ./downloads/DocVQA/val_v1.0_withQT.json
```
###### DocVQATest
```bash
--eval_docVQATest
--docVQATest_image_dir ./downloads/DocVQA/spdocvqa_images
--docVQATest_ann_path ./downloads/DocVQA/test_v1.0.json
```
<br />
对于 DocVQATest 任务,为了将推理结果上传到[官方网站](https://rrc.cvc.uab.es/?ch=17)进行评测,还需要运行 `shell/run_transform.sh` 进行格式转换。其中,`input_file_path` 对应原始输出的 json 的路径,`output_file_path` 为自定义的转换后的 json 的路径:
```bash
chmod +x ./shell/run_transform.sh
./shell/run_transform.sh
```
einops
gradio==4.15.0
huggingface_hub
matplotlib
numpy>=1.23.4
omegaconf
openai==1.3.5
opencv-python>=4.4.0.46
openpyxl
pandas>=1.5.3
pillow
portalocker
protobuf
pycocoevalcap
python-dotenv
requests
rich
seaborn
sentencepiece
sty
tabulate
tiktoken
timeout-decorator
tqdm
typing_extensions==4.7.1
validators
visual_genome
xlsxwriter
Pillow==10.1.0
sentencepiece==0.1.99
transformers==4.40.0
torch==1.13.1
torchvision
import torch
import torch.distributed as dist
from vlmeval.smp import *
from vlmeval.evaluate import *
from vlmeval.inference import infer_data_job
from vlmeval.config import supported_VLM
from vlmeval.utils import dataset_URLs, DATASET_TYPE, abbr2full, MMMU_result_transfer
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', type=str, nargs='+', required=True)
    parser.add_argument('--model', type=str, nargs='+', required=True)
    parser.add_argument('--work-dir', type=str, default='.', help='select the output directory')
    parser.add_argument('--mode', type=str, default='all', choices=['all', 'infer'])
    parser.add_argument('--nproc', type=int, default=4, help='Parallel API calling')
    parser.add_argument('--retry', type=int, default=None, help='retry numbers for API VLMs')
    parser.add_argument('--judge', type=str, default=None)
    parser.add_argument('--ignore', action='store_true', help='Ignore failed indices. ')
    parser.add_argument('--verbose', action='store_true')
    parser.add_argument('--rerun', action='store_true')
    args = parser.parse_args()
    return args


def main():
    logger = get_logger('RUN')

    args = parse_args()
    assert len(args.data), '--data should be a list of data files'

    # Propagate --retry / --verbose into the model constructors registered in supported_VLM.
    if args.retry is not None:
        for k, v in supported_VLM.items():
            if hasattr(v, 'keywords') and 'retry' in v.keywords:
                v.keywords['retry'] = args.retry
                supported_VLM[k] = v
            if hasattr(v, 'keywords') and 'verbose' in v.keywords:
                v.keywords['verbose'] = args.verbose
                supported_VLM[k] = v

    rank, world_size = get_rank_and_world_size()
    if world_size > 1:
        # Multi-GPU runs are launched with torchrun; bind each process to its local GPU.
        local_rank = os.environ.get('LOCAL_RANK', 0)
        torch.cuda.set_device(int(local_rank))
        dist.init_process_group(backend='nccl', timeout=datetime.timedelta(seconds=10800))

    for _, model_name in enumerate(args.model):
        model = None
        pred_root = osp.join(args.work_dir, model_name)
        os.makedirs(pred_root, exist_ok=True)

        for _, dataset_name in enumerate(args.data):
            custom_flag = False

            if dataset_name not in dataset_URLs:
                dataset_name = abbr2full(dataset_name)

            if dataset_name not in dataset_URLs:
                logger.warning(f'Dataset {dataset_name} is not officially supported. ')
                file_path = osp.join(LMUDataRoot(), f'{dataset_name}.tsv')
                if not osp.exists(file_path):
                    logger.error(f'Cannot find the local dataset {dataset_name}. ')
                    continue
                else:
                    custom_flag = True

            result_file = f'{pred_root}/{model_name}_{dataset_name}.xlsx'
            if osp.exists(result_file) and args.rerun:
                os.system(f'rm {pred_root}/{model_name}_{dataset_name}_*')

            if model is None:
                model = model_name  # which is only a name

            # Run (or resume) inference; the returned model instance is reused across datasets.
            model = infer_data_job(
                model,
                work_dir=pred_root,
                model_name=model_name,
                dataset_name=dataset_name,
                verbose=args.verbose,
                api_nproc=args.nproc,
                ignore_failed=args.ignore)

            if rank == 0:
                if dataset_name in ['MMMU_TEST']:
                    result_json = MMMU_result_transfer(result_file)
                    logger.info(f'Transfer MMMU_TEST result to json for official evaluation, json file saved in {result_json}')  # noqa: E501
                    continue

                if dataset_name in [
                    'MMBench_TEST_CN', 'MMBench_TEST_EN', 'MMBench', 'MMBench_CN',
                    'MMBench_TEST_CN_V11', 'MMBench_TEST_EN_V11', 'MMBench_V11', 'MMBench_CN_V11'
                ]:
                    if not MMBenchOfficialServer(dataset_name):
                        logger.error(
                            f'Can not evaluate {dataset_name} on non-official servers, '
                            'will skip the evaluation. '
                        )
                        continue

            judge_kwargs = {
                'nproc': args.nproc,
                'verbose': args.verbose,
            }
            if args.retry is not None:
                judge_kwargs['retry'] = args.retry
            if args.judge is not None:
                judge_kwargs['model'] = args.judge
            else:
                # Default judge models when --judge is not specified.
                if DATASET_TYPE(dataset_name) in ['multi-choice', 'Y/N']:
                    judge_kwargs['model'] = 'chatgpt-0613'
                elif listinstr(['MMVet', 'MathVista', 'LLaVABench'], dataset_name):
                    judge_kwargs['model'] = 'gpt-4-turbo'
            if 'OPENAI_API_KEY_JUDGE' in os.environ and len(os.environ['OPENAI_API_KEY_JUDGE']):
                judge_kwargs['key'] = os.environ['OPENAI_API_KEY_JUDGE']
            if 'OPENAI_API_BASE_JUDGE' in os.environ and len(os.environ['OPENAI_API_BASE_JUDGE']):
                judge_kwargs['api_base'] = os.environ['OPENAI_API_BASE_JUDGE']

            if rank == 0 and args.mode == 'all':
                if DATASET_TYPE(dataset_name) == 'multi-choice':
                    dataset_name = 'default' if custom_flag else dataset_name
                    multiple_choice_eval(
                        result_file,
                        dataset=dataset_name,
                        **judge_kwargs)
                elif DATASET_TYPE(dataset_name) == 'Y/N':
                    YOrN_eval(
                        result_file,
                        dataset=dataset_name,
                        **judge_kwargs)
                elif DATASET_TYPE(dataset_name) == 'Caption':
                    COCO_eval(result_file)
                elif dataset_name == 'MMVet':
                    MMVet_eval(result_file, **judge_kwargs)
                elif dataset_name == 'OCRBench':
                    OCRBench_eval(result_file)
                elif listinstr(['OCRVQA', 'TextVQA', 'ChartQA', 'DocVQA', 'InfoVQA'], dataset_name):
                    VQAEval(result_file, dataset_name)
                elif listinstr(['MathVista'], dataset_name):
                    MathVista_eval(result_file, **judge_kwargs)
                elif listinstr(['LLaVABench'], dataset_name):
                    LLaVABench_eval(result_file, **judge_kwargs)
                else:
                    logger.error(f'Dataset {dataset_name} is not handled by evaluator, will be skipped. ')


if __name__ == '__main__':
    load_env()
    main()
export PATH=/usr/local/cuda/bin:$PATH
export HF_ENDPOINT=https://hf-mirror.com
export OMP_NUM_THREADS=1
export timestamp=`date +"%Y%m%d%H%M%S"`
export OLD_VERSION='False'
# SELF_DIR is assumed to be the directory containing this script; define it so the PYTHONPATH export below resolves correctly.
SELF_DIR="$(cd "$(dirname "$0")" && pwd)"
export PYTHONPATH=$(dirname $SELF_DIR):$PYTHONPATH
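# Usage: ./script/run_inference.sh <MODELNAME> <DATALIST> <MODE>
# e.g.   ./script/run_inference.sh MiniCPM-Llama3-V-2_5 "MME RealWorldQA" all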
# gpu consumed
# fp16 17-18G
# int4 7-8G
# model to be used
# Example: MODELNAME=MiniCPM-Llama3-V-2_5
MODELNAME=$1
# datasets to be tested
# Example: DATALIST="POPE ScienceQA_TEST ChartQA_TEST"
DATALIST=$2
# test mode, all or infer
MODE=$3
echo "Starting inference with model $MODELNAME on datasets $DATALIST"
# run on multi gpus with torchrun command
# remember to run twice, the first run may fail
torchrun --nproc_per_node=8 run.py --data $DATALIST --model $MODELNAME --mode $MODE
torchrun --nproc_per_node=8 run.py --data $DATALIST --model $MODELNAME --mode $MODE
# run on single gpu with python command
# python run.py --data $DATALIST --model $MODELNAME --verbose --mode $MODE
# python run.py --data $DATALIST --model $MODELNAME --verbose --mode $MODE
try:
    import torch
except ImportError:
    pass
from .smp import *
from .api import *
from .evaluate import *
from .utils import *
from .vlm import *
from .config import *
load_env()