"sgl-kernel/vscode:/vscode.git/clone" did not exist on "1bd5316873ee0ce327a5e92c0dc6bc799ff0d59c"
Commit 5e887c2c authored by wanglch's avatar wanglch
Browse files

Initial commit

parents
Pipeline #1060 canceled with stages
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu22.04-dtk23.10.1-py310
ENV DEBIAN_FRONTEND=noninteractive
COPY requirements.txt requirements.txt
RUN pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
COPY requirements_web_demo.txt requirements_web_demo.txt
RUN pip install -r requirements_web_demo.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
transformers
accelerate
tiktoken
einops
transformers_stream_generator
scipy
torchvision
pillow
tensorboard
matplotlib
tk
shutilwhich
deepspeed
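The Dockerfile and requirements above define the evaluation environment. A minimal sketch of building and entering the image might look as follows; the image tag, mount path, and any DCU/GPU device flags are illustrative assumptions, not taken from this repository.
```bash
# build the image from the Dockerfile above (the tag "qwen-vl-eval:dev" is an assumption)
docker build -t qwen-vl-eval:dev .
# start an interactive container with the current checkout mounted
# (add whatever DCU/GPU device flags your container runtime requires)
docker run -it --rm -v "$(pwd)":/workspace -w /workspace qwen-vl-eval:dev bash
```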
# Evaluation
## Dependencies
```bash
pip install pycocoevalcap tqdm
```
## Image Caption
### [Flickr30K](https://bryanplummer.com/Flickr30kEntities/)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/flickr30k && cd data/flickr30k
# download images from https://bryanplummer.com/Flickr30kEntities/
# karpathy split annotations can be downloaded from https://cs.stanford.edu/people/karpathy/deepimagesent/
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/flickr30k/flickr30k_karpathy_test.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/flickr30k/flickr30k_karpathy_train.json
cd ../..
```
</details>
<details>
<summary>Evaluate</summary>
```bash
ds="flickr"
checkpoint=/PATH/TO/CHECKPOINT
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_caption.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
```
</details>
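The launch commands in this document read `NPROC_PER_NODE`, `WORLD_SIZE`, `RANK`, `MASTER_ADDR` and `MASTER_PORT` from the environment, falling back to the defaults shown above. A minimal single-GPU sketch of the same Flickr30K evaluation (assuming one visible GPU and the same placeholder checkpoint path):
```bash
python -m torch.distributed.launch --use-env --nproc_per_node 1 \
    evaluate_caption.py \
    --checkpoint /PATH/TO/CHECKPOINT \
    --dataset flickr \
    --batch-size 8 \
    --num-workers 2
```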
### [Nocaps](https://nocaps.org/)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/nocaps && cd data/nocaps
# download images from https://nocaps.org/download
# original annotations can be downloaded from https://nocaps.s3.amazonaws.com/nocaps_val_4500_captions.json
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/nocaps/nocaps_val.json
cd ../..
```
</details>
<details>
<summary>Evaluate</summary>
```bash
ds="nocaps"
checkpoint=/PATH/TO/CHECKPOINT
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_caption.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
```
</details>
## [COCO](https://cocodataset.org/)
> COCO images are used in VQAv2/OK-VQA/RefCOCO/RefCOCO+/RefCOCOg; make sure you have already downloaded the COCO images before evaluating on these benchmarks.
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/coco && cd data/coco
# download coco2014 images
wget http://images.cocodataset.org/zips/train2014.zip && unzip train2014.zip
wget http://images.cocodataset.org/zips/val2014.zip && unzip val2014.zip
wget http://images.cocodataset.org/zips/test2015.zip && unzip test2015.zip
cd ../..
```
</details>
## General VQA
### [VQAv2](https://visualqa.org/)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/vqav2 && cd data/vqav2
# make sure you have downloaded COCO images
# download questions and annotations
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Train_mscoco.zip && unzip v2_Annotations_Train_mscoco.zip
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Train_mscoco.zip && unzip v2_Questions_Train_mscoco.zip
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip && unzip v2_Annotations_Val_mscoco.zip
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Val_mscoco.zip && unzip v2_Questions_Val_mscoco.zip
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Test_mscoco.zip && unzip v2_Questions_Test_mscoco.zip
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vqav2/vqav2_train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vqav2/vqav2_val.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vqav2/vqav2_testdev.jsonl
```
</details>
<details>
<summary>Evaluate</summary>
```bash
checkpoint=/PATH/TO/CHECKPOINT
for ds in "vqav2_val" "vqav2_testdev"
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_vqa.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
done
```
</details>
### [OKVQA](https://okvqa.allenai.org/)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/okvqa && cd data/okvqa
# download annotations and questions
wget https://okvqa.allenai.org/static/data/mscoco_train2014_annotations.json.zip && unzip mscoco_train2014_annotations.json.zip
wget https://okvqa.allenai.org/static/data/OpenEnded_mscoco_train2014_questions.json.zip && unzip OpenEnded_mscoco_train2014_questions.json.zip
wget https://okvqa.allenai.org/static/data/mscoco_val2014_annotations.json.zip && unzip mscoco_val2014_annotations.json.zip
wget https://okvqa.allenai.org/static/data/OpenEnded_mscoco_val2014_questions.json.zip && unzip OpenEnded_mscoco_val2014_questions.json.zip
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/okvqa/okvqa_train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/okvqa/okvqa_val.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluate</summary>
```bash
ds="okvqa_val"
checkpoint=/PATH/TO/CHECKPOINT
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_vqa.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
```
</details>
### [TextVQA](https://textvqa.org/)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/textvqa && cd data/textvqa
# download images
wget https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip && unzip train_val_images.zip
# download annotations and questions
wget https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_train.json
wget https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_val.json
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train_annotations.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train_questions.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_val_annotations.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_val_questions.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_val.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluate</summary>
```bash
ds="textvqa_val"
checkpoint=/PATH/TO/CHECKPOINT
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_vqa.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
```
</details>
### [VizWiz](https://vizwiz.org/tasks-and-datasets/vqa/)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/vizwiz && cd data/vizwiz
# download images
wget https://vizwiz.cs.colorado.edu/VizWiz_final/images/train.zip && unzip train.zip
wget https://vizwiz.cs.colorado.edu/VizWiz_final/images/val.zip && unzip val.zip
wget https://vizwiz.cs.colorado.edu/VizWiz_final/images/test.zip && unzip test.zip
# download annotations
wget https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations.zip && unzip Annotations.zip
# download converted files
# train
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_train_annotations.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_train_questions.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_train.jsonl
# val
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_val_annotations.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_val_questions.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_val.jsonl
# test
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_test.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluation</summary>
```bash
# evaluate vqa score on vizwiz val split
ds="vizwiz_val"
checkpoint=/PATH/TO/CHECKPOINT
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_vqa.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
```
</details>
### [DocVQA](https://www.docvqa.org/datasets)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/docvqa && cd data/docvqa
# download images and annotations from https://www.docvqa.org/datasets
# download converted files
# train
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/docvqa/train.jsonl
# val
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/docvqa/val.jsonl
# test
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/docvqa/test.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluation</summary>
```bash
# evaluate vqa score on docvqa val split
ds="docvqa_val"
checkpoint=/PATH/TO/CHECKPOINT
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_vqa.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
```
</details>
### [ChartQA](https://aclanthology.org/2022.findings-acl.177/)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/chartqa && cd data/chartqa
# download images from https://drive.google.com/file/d/1Lm_w6zeET1Hyl_9ks6w5nEsgpoyPHalV/view
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/train_human.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/train_augmented.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/test_human.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/test_augmented.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluate</summary>
```bash
checkpoint=/PATH/TO/CHECKPOINT
for ds in "chartqa_test_human" "chartqa_test_augmented"
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_vqa.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
done
```
</details>
### [GQA](https://cs.stanford.edu/people/dorarad/gqa/about.html)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/gqa && cd data/gqa
# download images
wget https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip
unzip images.zip
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/gqa/testdev_balanced.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/gqa/train_balanced.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluate</summary>
```bash
checkpoint=/PATH/TO/CHECKPOINT
ds="gqa_testdev"
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_vqa.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
```
</details>
### [OCRVQA](https://ocr-vqa.github.io/)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/ocrvqa && cd data/ocrvqa
# download images by following instructions at https://ocr-vqa.github.io/kvqa_ProjectFiles/README.txt
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/ocrvqa/ocrvqa_train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/ocrvqa/ocrvqa_val.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/ocrvqa/ocrvqa_test.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluate</summary>
```bash
checkpoint=/PATH/TO/CHECKPOINT
ds="ocrvqa_test"
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_vqa.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
```
</details>
### [AI2Diagram](https://allenai.org/data/diagrams)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/ai2diagram && cd data/ai2diagram
# download images
wget https://ai2-public-datasets.s3.amazonaws.com/diagrams/ai2d-all.zip
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/ai2diagram/train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/ai2diagram/test.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluate</summary>
```bash
checkpoint=/PATH/TO/CHECKPOINT
ds="ai2diagram_test"
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_vqa.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
```
</details>
### [ScienceQA](https://github.com/lupantech/ScienceQA)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/scienceqa/images && cd data/scienceqa/images
# download images
wget https://scienceqa.s3.us-west-1.amazonaws.com/images/test.zip && unzip test.zip
cd ..
# download original questions (use the raw file, not the GitHub HTML page)
wget https://raw.githubusercontent.com/lupantech/ScienceQA/main/data/scienceqa/problems.json
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/scienceqa/scienceqa_test_img.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluate</summary>
```bash
ds="scienceqa_test_img"
checkpoint=/PATH/TO/CHECKPOINT
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_multiple_choice.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
```
</details>
## Refer Expression Comprehension
### RefCOCO
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/refcoco && cd data/refcoco
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/refcoco/refcoco_val.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/refcoco/refcoco_testA.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/refcoco/refcoco_testB.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluation</summary>
```bash
checkpoint=/PATH/TO/CHECKPOINT
for ds in "refcoco_val" "refcoco_testA" "refcoco_testB"
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_grounding.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
done
```
</details>
### RefCOCO+
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/refcoco+ && cd data/refcoco+
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/refcoco%2B/refcoco%2B_val.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/refcoco%2B/refcoco%2B_testA.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/refcoco%2B/refcoco%2B_testB.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluate</summary>
```bash
checkpoint=/PATH/TO/CHECKPOINT
for ds in "refcoco+_val" "refcoco+_testA" "refcoco+_testB"
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_grounding.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
done
```
</details>
### RefCOCOg
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/refcocog && cd data/refcocog
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/refcocog/refcocog_val.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/refcocog/refcocog_test.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluate</summary>
```bash
checkpoint=/PATH/TO/CHECKPOINT
for ds in "refcocog_val" "refcocog_test"
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_grounding.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
done
```
</details>
/cpfs01/shared/public/shusheng.yss/datasets/qwenvl_evaluation
import argparse
import itertools
import json
import os
import random
import time
from functools import partial
import torch
from pycocoevalcap.eval import COCOEvalCap
from pycocotools.coco import COCO
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
ds_collections = {
'flickr': {
        'train': 'data/flickr30k/flickr30k_karpathy_train.json',
'test': 'data/flickr30k/flickr30k_karpathy_test.json',
},
'nocaps': {
'train': '',
'test': 'data/nocaps/nocaps_val.json',
},
}
class CaptionDataset(torch.utils.data.Dataset):
def __init__(self, train, test, prompt, few_shot=0):
self.images = json.load(open(test))['images']
self.prompt = prompt
self.few_shot = few_shot
if few_shot > 0:
self.train = json.load(open(train))['annotations']
def __len__(self):
return len(self.images)
def __getitem__(self, idx):
image_id, image_path = self.images[idx]['id'], self.images[idx][
'image']
few_shot_prompt = ''
if self.few_shot > 0:
few_shot_samples = random.sample(self.train, self.few_shot)
for sample in few_shot_samples:
few_shot_prompt += self.prompt.format(
sample['image']) + f" {sample['caption']}"
return {
'image_id': image_id,
'input_text': few_shot_prompt + self.prompt.format(image_path)
}
def collate_fn(inputs, tokenizer):
image_ids = [_['image_id'] for _ in inputs]
input_texts = [_['input_text'] for _ in inputs]
input_tokens = tokenizer(input_texts,
return_tensors='pt',
padding='longest')
return image_ids, input_tokens.input_ids, input_tokens.attention_mask
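# InferenceSampler splits the dataset into contiguous, (nearly) equal-sized shards,
# one per distributed rank, so each process scores a disjoint slice of the data;
# the per-rank results are merged later with all_gather_object.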
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size,
self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--dataset', type=str, default='')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--few-shot', type=int, default=0)
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
prompt = '<img>{}</img>Describe the image in English:'
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint, device_map='cuda', trust_remote_code=True).eval()
tokenizer = AutoTokenizer.from_pretrained(args.checkpoint,
trust_remote_code=True)
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.eod_id
random.seed(args.seed)
dataset = CaptionDataset(
train=ds_collections[args.dataset]['train'],
test=ds_collections[args.dataset]['test'],
prompt=prompt,
few_shot=args.few_shot,
)
coco_karpathy_test_loader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
image_ids = []
captions = []
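    # Greedy decoding; generate() returns the prompt plus the new tokens, so everything
    # up to input_ids.size(1) is sliced off before decoding the caption text.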
for _, (ids, input_ids,
attention_mask) in tqdm(enumerate(coco_karpathy_test_loader)):
pred = model.generate(
input_ids=input_ids.cuda(),
attention_mask=attention_mask.cuda(),
do_sample=False,
num_beams=1,
max_new_tokens=30,
min_new_tokens=8,
length_penalty=0,
num_return_sequences=1,
use_cache=True,
pad_token_id=tokenizer.eod_id,
eos_token_id=tokenizer.eod_id,
)
image_ids.extend(ids)
captions.extend([
tokenizer.decode(_[input_ids.size(1):].cpu(),
skip_special_tokens=True).strip() for _ in pred
])
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_ids = [None for _ in range(world_size)]
merged_captions = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_ids, image_ids)
torch.distributed.all_gather_object(merged_captions, captions)
merged_ids = [_ for _ in itertools.chain.from_iterable(merged_ids)]
merged_captions = [
_ for _ in itertools.chain.from_iterable(merged_captions)
]
if torch.distributed.get_rank() == 0:
print(f"Evaluating {args.dataset} ...")
results = []
for image_id, caption in zip(merged_ids, merged_captions):
results.append({
'image_id': int(image_id),
'caption': caption,
})
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{args.dataset}_{time_prefix}.json'
json.dump(results, open(results_file, 'w'))
coco = COCO(ds_collections[args.dataset]['test'])
coco_result = coco.loadRes(results_file)
coco_eval = COCOEvalCap(coco, coco_result)
coco_eval.evaluate()
print(coco_eval.eval.items())
torch.distributed.barrier()
import argparse
import itertools
import json
import os
import re
from functools import partial
import torch
from torchvision.ops.boxes import box_area
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
ds_collections = {
'refcoco_val': 'data/refcoco/refcoco_val.jsonl',
'refcoco_testA': 'data/refcoco/refcoco_testA.jsonl',
'refcoco_testB': 'data/refcoco/refcoco_testB.jsonl',
'refcoco+_val': 'data/refcoco+/refcoco+_val.jsonl',
'refcoco+_testA': 'data/refcoco+/refcoco+_testA.jsonl',
'refcoco+_testB': 'data/refcoco+/refcoco+_testB.jsonl',
'refcocog_val': 'data/refcocog/refcocog_val.jsonl',
'refcocog_test': 'data/refcocog/refcocog_test.jsonl',
}
def box_iou(boxes1, boxes2):
area1 = box_area(boxes1)
area2 = box_area(boxes2)
lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
wh = (rb - lt).clamp(min=0) # [N,M,2]
inter = wh[:, :, 0] * wh[:, :, 1] # [N,M]
union = area1[:, None] + area2 - inter
iou = inter / union
return iou, union
def collate_fn(batches, tokenizer):
texts = [_['text'] for _ in batches]
bboxes = [_['bbox'] for _ in batches]
hws = [_['hw'] for _ in batches]
input_ids = tokenizer(texts, return_tensors='pt', padding='longest')
return input_ids.input_ids, input_ids.attention_mask, bboxes, hws
class RefCOCODataset(torch.utils.data.Dataset):
def __init__(self, test, tokenizer, prompt):
self.datas = open(test).readlines()
self.tokenizer = tokenizer
self.prompt = prompt
def __len__(self):
return len(self.datas)
def __getitem__(self, idx):
data = json.loads(self.datas[idx].strip())
image = data['image']
text = data['sent']
bbox = data['bbox']
w, h = data['width'], data['height']
return {
'text': self.prompt.format(image, text),
'bbox': bbox,
'hw': (h, w),
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size,
self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--dataset', type=str, default='')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
args = parser.parse_args()
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint, device_map='cuda', trust_remote_code=True).eval()
tokenizer = AutoTokenizer.from_pretrained(args.checkpoint,
trust_remote_code=True)
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.eod_id
prompt = '<img>{}</img><ref>{}</ref><box>'
dataset = RefCOCODataset(test=ds_collections[args.dataset],
tokenizer=tokenizer,
prompt=prompt)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
        drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
outputs = []
for _, (input_ids, attention_mask, bboxes,
hws) in tqdm(enumerate(dataloader)):
pred = model.generate(
input_ids=input_ids.cuda(),
attention_mask=attention_mask.cuda(),
do_sample=False,
num_beams=1,
max_new_tokens=28,
min_new_tokens=10,
length_penalty=1,
num_return_sequences=1,
use_cache=True,
pad_token_id=tokenizer.eod_id,
eos_token_id=tokenizer.eod_id,
)
answers = [
tokenizer.decode(_[input_ids.size(1):].cpu(),
skip_special_tokens=True) for _ in pred
]
for bbox, hw, answer in zip(bboxes, hws, answers):
outputs.append({
'answer': answer,
'gt_bbox': bbox,
'hw': hw,
})
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_outputs = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_outputs, outputs)
merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
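    # The model is expected to answer with a box of the form "(x1,y1),(x2,y2)" whose
    # coordinates lie on a 0-999 grid; they are rescaled below by the image width/height,
    # and a prediction counts as correct when its IoU with the ground-truth box is >= 0.5.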
PATTERN = re.compile(r'\((.*?)\),\((.*?)\)')
if torch.distributed.get_rank() == 0:
correct = total_cnt = 0
for i, output in enumerate(merged_outputs):
predict_bbox = re.findall(PATTERN, output['answer'])
try:
if ',' not in predict_bbox[0][0] or ',' not in predict_bbox[0][
1]:
predict_bbox = (0., 0., 0., 0.)
else:
x1, y1 = [
float(tmp) for tmp in predict_bbox[0][0].split(',')
]
x2, y2 = [
float(tmp) for tmp in predict_bbox[0][1].split(',')
]
predict_bbox = (x1, y1, x2, y2)
except:
predict_bbox = (0., 0., 0., 0.)
target_bbox = torch.tensor(output['gt_bbox'],
dtype=torch.float32).view(-1, 4)
predict_bbox = torch.tensor(predict_bbox,
dtype=torch.float32).view(-1, 4) / 999
predict_bbox[:, 0::2] *= output['hw'][1]
predict_bbox[:, 1::2] *= output['hw'][0]
iou, _ = box_iou(predict_bbox, target_bbox)
iou = iou.item()
total_cnt += 1
if iou >= 0.5:
correct += 1
print(f"Evaluating {args.dataset} ...")
print(f'Precision @ 1: {correct / total_cnt} \n')
torch.distributed.barrier()
import argparse
import itertools
import json
import os
from functools import partial
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
multiple_choices = ['A', 'B', 'C', 'D', 'E']
ds_collections = {
'scienceqa_test_img': {
'test': 'data/scienceqa/scienceqa_test_img.jsonl',
}
}
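# Each sample contributes one token sequence per answer option (prompt + option).
# collate_fn flattens these across the batch, left-pads them to a common length with
# the pad token, and returns chunk_sizes so the per-option losses can later be split
# back per sample.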
def collate_fn(batches, pad_token_id):
input_tokens = [_['input_tokens'] for _ in batches]
target_lengths = [_['target_lengths'] for _ in batches]
answers = [_['answer'] for _ in batches]
chunk_sizes = [len(_) for _ in input_tokens]
input_tokens = [_ for _ in itertools.chain.from_iterable(input_tokens)]
max_lengths = max([len(_) for _ in input_tokens])
input_tokens = [[pad_token_id] * (max_lengths - len(_)) + _
for _ in input_tokens]
input_tokens = torch.LongTensor(input_tokens)
attention_mask = 1 - input_tokens.eq(pad_token_id).float()
return input_tokens, attention_mask, target_lengths, answers, chunk_sizes
class MultipleChoiceDataset(torch.utils.data.Dataset):
def __init__(self, test, prompt, tokenizer):
self.datas = open(test).readlines()
self.prompt = prompt
self.tokenizer = tokenizer
def __len__(self):
return len(self.datas)
def __getitem__(self, idx):
data = json.loads(self.datas[idx].strip())
image = data['image']
hint = data['hint'] if data['hint'] else 'N/A'
question = data['question']
choices = data['choices']
choice_list = []
for i, c in enumerate(choices):
choice_list.append('{}. {}'.format(multiple_choices[i], c))
choice_txt = '\n'.join(choice_list)
prompt = self.prompt.format(image, hint, question, choice_txt)
prompt_tokens = self.tokenizer(prompt).input_ids
target_tokens = [
self.tokenizer(' ' + _).input_ids
for _ in multiple_choices[:len(choices)]
]
return {
'input_tokens': [prompt_tokens + _ for _ in target_tokens],
'target_lengths': [len(_) for _ in target_tokens],
'answer': data['answer'],
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size,
self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--dataset', type=str, default='')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
args = parser.parse_args()
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint, device_map='cuda', trust_remote_code=True).eval()
tokenizer = AutoTokenizer.from_pretrained(args.checkpoint,
trust_remote_code=True)
prompt = '<img>{}</img>Context: {}\nQuestion: {}\nOptions: {}\nAnswer:'
    dataset = MultipleChoiceDataset(test=ds_collections[args.dataset]['test'],
prompt=prompt,
tokenizer=tokenizer)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, pad_token_id=tokenizer.eod_id),
)
results = []
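    # Scoring: one forward pass over each prompt+option sequence, cross-entropy restricted
    # to the option tokens (the last target_length positions), and the option with the
    # lowest average loss is taken as the prediction.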
with torch.no_grad():
for _, (input_tokens, attention_mask, target_lengths, answer,
chunk_sizes) in tqdm(enumerate(dataloader)):
outputs = model(
input_ids=input_tokens[:, :-1].cuda(),
attention_mask=attention_mask[:, :-1].cuda(),
return_dict=True,
)
losses = torch.nn.functional.cross_entropy(outputs.logits.permute(
0, 2, 1),
input_tokens[:,
1:].cuda(),
reduction='none')
losses = losses.split(chunk_sizes, dim=0)
for loss, target_length, answer in zip(losses, target_lengths,
answer):
target_loss = loss.mean(-1)
for _ in range(len(target_length)):
target_loss[_] = loss[_, -target_length[_]:].mean()
pred = target_loss.argmin().item()
if pred == answer:
results.append(1)
else:
results.append(0)
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_results = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_results, results)
merged_results = [_ for _ in itertools.chain.from_iterable(merged_results)]
if torch.distributed.get_rank() == 0:
print(f"Evaluating {args.dataset} ...")
print(f'Acc@1: {sum(merged_results) / len(merged_results)}')
torch.distributed.barrier()
import argparse
import itertools
import json
import os
import random
import time
from functools import partial
from typing import Optional
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from vqa import VQA
from vqa_eval import VQAEval
ds_collections = {
'vqav2_val': {
'train': 'data/vqav2/vqav2_train.jsonl',
'test': 'data/vqav2/vqav2_val.jsonl',
'question': 'data/vqav2/v2_OpenEnded_mscoco_val2014_questions.json',
'annotation': 'data/vqav2/v2_mscoco_val2014_annotations.json',
'metric': 'vqa_score',
'max_new_tokens': 10,
},
'vqav2_testdev': {
'train': 'data/vqav2/vqav2_train.jsonl',
'test': 'data/vqav2/vqav2_testdev.jsonl',
'metric': None,
'max_new_tokens': 10,
},
'okvqa_val': {
'train': 'data/okvqa/okvqa_train.jsonl',
'test': 'data/okvqa/okvqa_val.jsonl',
'question': 'data/okvqa/OpenEnded_mscoco_val2014_questions.json',
'annotation': 'data/okvqa/mscoco_val2014_annotations.json',
'metric': 'vqa_score',
'max_new_tokens': 10,
},
'textvqa_val': {
'train': 'data/textvqa/textvqa_train.jsonl',
'test': 'data/textvqa/textvqa_val.jsonl',
'question': 'data/textvqa/textvqa_val_questions.json',
'annotation': 'data/textvqa/textvqa_val_annotations.json',
'metric': 'vqa_score',
'max_new_tokens': 10,
},
'vizwiz_val': {
'train': 'data/vizwiz/vizwiz_train.jsonl',
'test': 'data/vizwiz/vizwiz_val.jsonl',
'question': 'data/vizwiz/vizwiz_val_questions.json',
'annotation': 'data/vizwiz/vizwiz_val_annotations.json',
'metric': 'vqa_score',
'max_new_tokens': 10,
},
'vizwiz_test': {
'train': 'data/vizwiz/vizwiz_train.jsonl',
'test': 'data/vizwiz/vizwiz_test.jsonl',
'metric': None,
'max_new_tokens': 10,
},
'docvqa_val': {
'train': 'data/docvqa/train.jsonl',
'test': 'data/docvqa/val.jsonl',
'annotation': 'data/docvqa/val/val_v1.0.json',
'metric': 'anls',
'max_new_tokens': 100,
},
'docvqa_test': {
'train': 'data/docvqa/train.jsonl',
'test': 'data/docvqa/test.jsonl',
'metric': None,
'max_new_tokens': 100,
},
'chartqa_test_human': {
'train': 'data/chartqa/train_human.jsonl',
'test': 'data/chartqa/test_human.jsonl',
'metric': 'relaxed_accuracy',
'max_new_tokens': 100,
},
'chartqa_test_augmented': {
'train': 'data/chartqa/train_augmented.jsonl',
'test': 'data/chartqa/test_augmented.jsonl',
'metric': 'relaxed_accuracy',
'max_new_tokens': 100,
},
'gqa_testdev': {
        'train': 'data/gqa/train_balanced.jsonl',
'test': 'data/gqa/testdev_balanced.jsonl',
'metric': 'accuracy',
'max_new_tokens': 10,
},
'ocrvqa_val': {
'train': 'data/ocrvqa/ocrvqa_train.jsonl',
'test': 'data/ocrvqa/ocrvqa_val.jsonl',
'metric': 'accuracy',
'max_new_tokens': 100,
},
'ocrvqa_test': {
'train': 'data/ocrvqa/ocrvqa_train.jsonl',
'test': 'data/ocrvqa/ocrvqa_test.jsonl',
'metric': 'accuracy',
'max_new_tokens': 100,
},
'ai2diagram_test': {
'train': 'data/ai2diagram/train.jsonl',
'test': 'data/ai2diagram/test.jsonl',
'metric': 'accuracy',
'max_new_tokens': 10,
}
}
# https://github.com/google-research/pix2struct/blob/main/pix2struct/metrics.py#L81
def relaxed_correctness(target: str,
prediction: str,
max_relative_change: float = 0.05) -> bool:
"""Calculates relaxed correctness.
The correctness tolerates certain error ratio defined by max_relative_change.
See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
“Following Methani et al. (2020), we use a relaxed accuracy measure for the
numeric answers to allow a minor inaccuracy that may result from the automatic
data extraction process. We consider an answer to be correct if it is within
5% of the gold answer. For non-numeric answers, we still need an exact match
to consider an answer to be correct.”
Args:
target: Target string.
prediction: Predicted string.
max_relative_change: Maximum relative change.
Returns:
Whether the prediction was correct given the specified tolerance.
"""
def _to_float(text: str) -> Optional[float]:
try:
if text.endswith('%'):
# Convert percentages to floats.
return float(text.rstrip('%')) / 100.0
else:
return float(text)
except ValueError:
return None
prediction_float = _to_float(prediction)
target_float = _to_float(target)
if prediction_float is not None and target_float:
relative_change = abs(prediction_float -
target_float) / abs(target_float)
return relative_change <= max_relative_change
else:
return prediction.lower() == target.lower()
def evaluate_relaxed_accuracy(entries):
scores = []
for elem in entries:
if isinstance(elem['annotation'], str):
elem['annotation'] = [elem['annotation']]
score = max([
relaxed_correctness(elem['answer'].strip(), ann)
for ann in elem['annotation']
])
scores.append(score)
return sum(scores) / len(scores)
def evaluate_exact_match_accuracy(entries):
scores = []
for elem in entries:
if isinstance(elem['annotation'], str):
elem['annotation'] = [elem['annotation']]
score = max([
(1.0 if
(elem['answer'].strip().lower() == ann.strip().lower()) else 0.0)
for ann in elem['annotation']
])
scores.append(score)
return sum(scores) / len(scores)
def collate_fn(batches, tokenizer):
questions = [_['question'] for _ in batches]
question_ids = [_['question_id'] for _ in batches]
annotations = [_['annotation'] for _ in batches]
input_ids = tokenizer(questions, return_tensors='pt', padding='longest')
return question_ids, input_ids.input_ids, input_ids.attention_mask, annotations
class VQADataset(torch.utils.data.Dataset):
def __init__(self, train, test, prompt, few_shot):
self.test = open(test).readlines()
self.prompt = prompt
self.few_shot = few_shot
if few_shot > 0:
self.train = open(train).readlines()
def __len__(self):
return len(self.test)
def __getitem__(self, idx):
data = json.loads(self.test[idx].strip())
image, question, question_id, annotation = data['image'], data[
'question'], data['question_id'], data.get('answer', None)
few_shot_prompt = ''
if self.few_shot > 0:
few_shot_samples = random.sample(self.train, self.few_shot)
for sample in few_shot_samples:
sample = json.loads(sample.strip())
few_shot_prompt += self.prompt.format(
sample['image'],
sample['question']) + f" {sample['answer']}"
return {
'question': few_shot_prompt + self.prompt.format(image, question),
'question_id': question_id,
'annotation': annotation
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size,
self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--dataset', type=str, default='')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--few-shot', type=int, default=0)
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint, device_map='cuda', trust_remote_code=True).eval()
tokenizer = AutoTokenizer.from_pretrained(args.checkpoint,
trust_remote_code=True)
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.eod_id
prompt = '<img>{}</img>{} Answer:'
random.seed(args.seed)
dataset = VQADataset(
train=ds_collections[args.dataset]['train'],
test=ds_collections[args.dataset]['test'],
prompt=prompt,
few_shot=args.few_shot,
)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
outputs = []
for _, (question_ids, input_ids, attention_mask,
annotations) in tqdm(enumerate(dataloader)):
pred = model.generate(
input_ids=input_ids.cuda(),
attention_mask=attention_mask.cuda(),
do_sample=False,
num_beams=1,
max_new_tokens=ds_collections[args.dataset]['max_new_tokens'],
min_new_tokens=1,
length_penalty=1,
num_return_sequences=1,
output_hidden_states=True,
use_cache=True,
pad_token_id=tokenizer.eod_id,
eos_token_id=tokenizer.eod_id,
)
answers = [
tokenizer.decode(_[input_ids.size(1):].cpu(),
skip_special_tokens=True).strip() for _ in pred
]
for question_id, answer, annotation in zip(question_ids, answers,
annotations):
if args.dataset in ['vqav2_val', 'vqav2_testdev', 'okvqa_val', 'textvqa_val', 'vizwiz_val']:
outputs.append({
'question_id': question_id,
'answer': answer,
})
elif args.dataset in ['docvqa_val', 'infographicsvqa', 'gqa_testdev', 'ocrvqa_val', 'ocrvqa_test']:
outputs.append({
'questionId': question_id,
'answer': answer,
'annotation': annotation,
})
elif args.dataset in ['ai2diagram_test']:
outputs.append({
'image': question_id,
'answer': answer,
'annotation': annotation,
})
elif args.dataset in ['chartqa_test_human', 'chartqa_test_augmented']:
outputs.append({
'answer': answer,
'annotation': annotation,
})
elif args.dataset in ['docvqa_test']:
outputs.append({
'questionId': question_id,
'answer': answer,
})
elif args.dataset in ['vizwiz_test']:
outputs.append({
'image': question_id,
'answer': answer,
})
else:
raise NotImplementedError
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_outputs = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
merged_outputs = [json.loads(_) for _ in merged_outputs]
merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
if torch.distributed.get_rank() == 0:
print(f"Evaluating {args.dataset} ...")
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{args.dataset}_{time_prefix}_fs{args.few_shot}_s{args.seed}.json'
json.dump(merged_outputs, open(results_file, 'w'), ensure_ascii=False)
if ds_collections[args.dataset]['metric'] == 'vqa_score':
vqa = VQA(ds_collections[args.dataset]['annotation'],
ds_collections[args.dataset]['question'])
results = vqa.loadRes(
resFile=results_file,
quesFile=ds_collections[args.dataset]['question'])
vqa_scorer = VQAEval(vqa, results, n=2)
vqa_scorer.evaluate()
print(vqa_scorer.accuracy)
elif ds_collections[args.dataset]['metric'] == 'anls':
json.dump(merged_outputs,
open(results_file, 'w'),
ensure_ascii=False)
print('python infographicsvqa_eval.py -g ' +
ds_collections[args.dataset]['annotation'] + ' -s ' +
results_file)
os.system('python infographicsvqa_eval.py -g ' +
ds_collections[args.dataset]['annotation'] + ' -s ' +
results_file)
elif ds_collections[args.dataset]['metric'] == 'relaxed_accuracy':
print({
'relaxed_accuracy': evaluate_relaxed_accuracy(merged_outputs)
})
elif ds_collections[args.dataset]['metric'] == 'accuracy':
if 'gqa' in args.dataset:
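                # Heuristic GQA answer normalization: keep only the first clause (split on
                # '.', ',', '!'), drop leading "is/are/a/an/the ..." prefixes and any trailing
                # " of ..." part, then compare by exact match below.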
for entry in merged_outputs:
response = entry['answer']
response = response.strip().split('.')[0].split(
',')[0].split('!')[0].lower()
if 'is ' in response:
response = response.split('is ')[1]
if 'are ' in response:
response = response.split('are ')[1]
if 'a ' in response:
response = response.split('a ')[1]
if 'an ' in response:
response = response.split('an ')[1]
if 'the ' in response:
response = response.split('the ')[1]
if ' of' in response:
response = response.split(' of')[0]
response = response.strip()
entry['answer'] = response
print({'accuracy': evaluate_exact_match_accuracy(merged_outputs)})
torch.distributed.barrier()
# This file can be downloaded from: https://www.docvqa.org/datasets/infographicvqa and https://rrc.cvc.uab.es/?ch=17&com=introduction
import os, json
import argparse
question_ids_to_exclude = []
# answer_types = {'image span': 'Image-Span', 'question span': 'Question-Span', 'multiple spans': 'Multi-Span', 'non span': 'None span', 'list': 'List'}
answer_types = {'image span': 'Image-Span', 'question span': 'Question-Span', 'multiple spans': 'Multi-Span', 'non span': 'None span'}
evidence_types = {'table/list': 'Table/list', 'textual': 'Text', 'photo/pciture/visual_objects': 'Visual/Layout', 'figure': 'Figure', 'map': 'Map'}
reasoning_requirements = {'comparison': 'Sorting', 'arithmetic': 'Arithmetic', 'counting':'Counting'}
def save_json(file_path, data):
with open(file_path, 'w+') as json_file:
json.dump(data, json_file)
def levenshtein_distance(s1, s2):
if len(s1) > len(s2):
s1, s2 = s2, s1
distances = range(len(s1) + 1)
for i2, c2 in enumerate(s2):
distances_ = [i2+1]
for i1, c1 in enumerate(s1):
if c1 == c2:
distances_.append(distances[i1])
else:
distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
distances = distances_
return distances[-1]
def validate_data(gtFilePath, submFilePath):
"""
Method validate_data: validates that all files in the results folder are correct (have the correct name contents).
Validates also that there are no missing files in the folder.
If some error detected, the method raises the error
"""
gtJson = json.load(open(gtFilePath,'rb'));
submJson = json.load(open(submFilePath,'rb'));
if not 'data' in gtJson:
raise Exception("The GT file is not valid (no data key)")
if not 'dataset_name' in gtJson:
raise Exception("The GT file is not valid (no dataset_name key)")
if isinstance(submJson, list) == False :
raise Exception("The Det file is not valid (root item must be an array)")
if len(submJson) != len(gtJson['data']) :
raise Exception("The Det file is not valid (invalid number of answers. Expected:" + str(len(gtJson['data'])) + " Found:" + str(len(submJson)) + ")")
gtQuestions = sorted([r['questionId'] for r in gtJson['data']])
res_id_to_index = {int(r['questionId']): ix for ix, r in enumerate(submJson)}
detQuestions = sorted([r['questionId'] for r in submJson])
if( (gtQuestions == detQuestions) == False ):
raise Exception("The Det file is not valid. Question IDs must much GT")
for gtObject in gtJson['data']:
try:
q_id = int(gtObject['questionId']);
res_ix = res_id_to_index[q_id];
except:
raise Exception("The Det file is not valid. Question " + str(gtObject['questionId']) + " not present")
else:
detObject = submJson[res_ix];
# if detObject['questionId'] != gtObject['questionId'] :
# raise Exception("Answer #" + str(i) + " not valid (invalid question ID. Expected:" + str(gtObject['questionId']) + "Found:" + detObject['questionId'] + ")")
if not 'answer' in detObject:
raise Exception("Question " + str(gtObject['questionId']) + " not valid (no answer key)")
if isinstance(detObject['answer'], list) == True :
raise Exception("Question " + str(gtObject['questionId']) + " not valid (answer key has to be a single string)")
def evaluate_method(gtFilePath, submFilePath, evaluationParams):
"""
Method evaluate_method: evaluate method and returns the results
Results. Dictionary with the following values:
- method (required) Global method metrics. Ex: { 'Precision':0.8,'Recall':0.9 }
- samples (optional) Per sample metrics. Ex: {'sample1' : { 'Precision':0.8,'Recall':0.9 } , 'sample2' : { 'Precision':0.8,'Recall':0.9 }
"""
show_scores_per_answer_type = evaluationParams.answer_types
gtJson = json.load(open(gtFilePath,'rb'));
submJson = json.load(open(submFilePath,'rb'));
res_id_to_index = {int(r['questionId']): ix for ix, r in enumerate(submJson)}
perSampleMetrics = {}
totalScore = 0
row = 0
if show_scores_per_answer_type:
answerTypeTotalScore = {x:0 for x in answer_types.keys()}
answerTypeNumQuestions = {x:0 for x in answer_types.keys()}
evidenceTypeTotalScore = {x:0 for x in evidence_types.keys()}
evidenceTypeNumQuestions = {x:0 for x in evidence_types.keys()}
reasoningTypeTotalScore = {x:0 for x in reasoning_requirements.keys()}
reasoningTypeNumQuestions = {x:0 for x in reasoning_requirements.keys()}
for gtObject in gtJson['data']:
q_id = int(gtObject['questionId']);
res_ix = res_id_to_index[q_id];
detObject = submJson[res_ix];
if q_id in question_ids_to_exclude:
question_result = 0
info = 'Question EXCLUDED from the result'
else:
info = ''
values = []
for answer in gtObject['answers']:
# preprocess both the answers - gt and prediction
gt_answer = ' '.join(answer.strip().lower().split())
det_answer = ' '.join(detObject['answer'].strip().lower().split())
#dist = levenshtein_distance(answer.lower(), detObject['answer'].lower())
dist = levenshtein_distance(gt_answer,det_answer)
length = max( len(answer.upper()), len(detObject['answer'].upper()) )
values.append( 0.0 if length == 0 else float(dist) / float(length) )
question_result = 1 - min(values)
if (question_result < evaluationParams.anls_threshold) :
question_result = 0
totalScore += question_result
if show_scores_per_answer_type:
for q_type in gtObject["answer_type"]:
answerTypeTotalScore[q_type] += question_result
answerTypeNumQuestions[q_type] += 1
for q_type in gtObject["evidence"]:
evidenceTypeTotalScore[q_type] += question_result
evidenceTypeNumQuestions[q_type] += 1
for q_type in gtObject["operation/reasoning"]:
reasoningTypeTotalScore[q_type] += question_result
reasoningTypeNumQuestions[q_type] += 1
perSampleMetrics[str(gtObject['questionId'])] = {
'score':question_result,
'question':gtObject['question'],
'gt':gtObject['answers'],
'det':detObject['answer'],
'info': info
}
row = row + 1
methodMetrics = {
'score': 0 if len(gtJson['data']) == 0 else totalScore/ (len(gtJson['data']) - len(question_ids_to_exclude) )
}
answer_types_scores = {}
evidence_types_scores = {}
operation_types_scores = {}
if show_scores_per_answer_type:
for a_type, ref in answer_types.items():
answer_types_scores[ref] = 0 if len(gtJson['data']) == 0 else answerTypeTotalScore[a_type] / (answerTypeNumQuestions[a_type] )
for e_type, ref in evidence_types.items():
evidence_types_scores[ref] = 0 if len(gtJson['data']) == 0 else evidenceTypeTotalScore[e_type] / (evidenceTypeNumQuestions[e_type] )
for r_type, ref in reasoning_requirements.items():
operation_types_scores[ref] = 0 if len(gtJson['data']) == 0 else reasoningTypeTotalScore[r_type] / (reasoningTypeNumQuestions[r_type] )
resDict = {
'result': methodMetrics,
'scores_by_types': {'answer_types': answer_types_scores, 'evidence_types': evidence_types_scores, 'operation_types': operation_types_scores},
'per_sample_result':perSampleMetrics
}
return resDict;
def display_results(results, show_answer_types):
print("\nOverall ANLS: {:2.4f}".format(results['result']['score']))
if show_answer_types:
print("\nAnswer types:")
for a_type in answer_types.values():
print("\t{:12s} {:2.4f}".format(a_type, results['scores_by_types']['answer_types'][a_type]))
print("\nEvidence types:")
for e_type in evidence_types.values():
print("\t{:12s} {:2.4f}".format(e_type, results['scores_by_types']['evidence_types'][e_type]))
print("\nOperation required:")
for r_type in reasoning_requirements.values():
print("\t{:12s} {:2.4f}".format(r_type, results['scores_by_types']['operation_types'][r_type]))
if __name__=='__main__':
parser = argparse.ArgumentParser(description="InfographVQA evaluation script.")
parser.add_argument('-g', '--ground_truth', type=str, help="Path of the Ground Truth file.", required=True)
parser.add_argument('-s', '--submission_file', type=str, help="Path of your method's results file.", required=True)
parser.add_argument('-t', '--anls_threshold', type=float, default=0.5, help="ANLS threshold to use (See Scene-Text VQA paper for more info.).", required=False)
parser.add_argument('-a', '--answer_types', type=bool, default=False, help="Score break down by answer types (special gt file required).", required=False)
parser.add_argument('-o', '--output', type=str, help="Path to a directory where to copy the file 'results.json' that contains per-sample results.", required=False)
args = parser.parse_args()
# Validate the format of ground truth and submission files.
validate_data(args.ground_truth, args.submission_file)
# Evaluate method
results = evaluate_method(args.ground_truth, args.submission_file, args)
display_results(results, args.answer_types)
if args.output:
output_dir = args.output
if not os.path.exists(output_dir):
os.makedirs(output_dir)
resultsOutputname = os.path.join(output_dir, 'results.json')
save_json(resultsOutputname, results)
print("All results including per-sample result has been correctly saved!")
# MMBench Evaluation
## Data
```bash
/cpfs01/shared/public/shusheng.yss/workspace/23082502_qwenvl_eval_test/eval_mm/data/mmbench
```
## Dev
```bash
checkpoint=/PATH/TO/CHECKPOINT
ds=mmbench_dev_20230712
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_multiple_choice_mmbench.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 2 \
--num-workers 2
# the results will be saved to mmbench_dev_20230712.json
# without consistency constraint
python mmbench_evaluation.py
# with consistency constraint
python mmbench_evaluation_tricky.py
```
## Test
```bash
checkpoint=/PATH/TO/CHECKPOINT
ds=mmbench_test_20230712
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_multiple_choice_mmbench.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 2 \
--num-workers 2
# the results will be saved to mmbench_test_20230712.json
# convert to submission format with consistency constraint
python mmbench_predict_to_submission.py
```
import argparse
import itertools
import json
import os
from functools import partial
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
multiple_choices = ['A', 'B', 'C', 'D', 'E']
ds_collections = {
'mmbench_dev_20230712': {
'test': 'data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.jsonl',
},
'mmbench_test_20230712': {
'test': 'data/mmbench/mmbench_test_20230712/mmbench_test_20230712.jsonl',
}
}
def collate_fn(batches, pad_token_id):
indexes = [_['index'] for _ in batches]
input_tokens = [_['input_tokens'] for _ in batches]
target_lengths = [_['target_lengths'] for _ in batches]
chunk_sizes = [len(_) for _ in input_tokens]
input_tokens = [_ for _ in itertools.chain.from_iterable(input_tokens)]
max_lengths = max([len(_) for _ in input_tokens])
input_tokens = [[pad_token_id] * (max_lengths - len(_)) + _
for _ in input_tokens]
input_tokens = torch.LongTensor(input_tokens)
attention_mask = 1 - input_tokens.eq(pad_token_id).float()
return input_tokens, attention_mask, target_lengths, chunk_sizes, indexes
class MultipleChoiceDataset(torch.utils.data.Dataset):
def __init__(self, test, prompt, tokenizer):
self.datas = open(test).readlines()
self.prompt = prompt
self.tokenizer = tokenizer
def __len__(self):
return len(self.datas)
def __getitem__(self, idx):
data = json.loads(self.datas[idx].strip())
index = data['index']
image = data['image']
hint = data['hint'] if data['hint'] else 'N/A'
question = data['question']
choices = data['choices']
choice_list = []
for i, c in enumerate(choices):
choice_list.append('{}. {}'.format(multiple_choices[i], c))
choice_txt = '\n'.join(choice_list)
prompt = self.prompt.format(image, hint, question, choice_txt)
prompt_tokens = self.tokenizer(prompt).input_ids
target_tokens = [
self.tokenizer(' ' + _).input_ids
for _ in multiple_choices[:len(choices)]
]
return {
'index': index,
'input_tokens': [prompt_tokens + _ for _ in target_tokens],
'target_lengths': [len(_) for _ in target_tokens],
# 'answer': data['answer'],
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size,
self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--dataset', type=str, default='')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
args = parser.parse_args()
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint, device_map='cuda', trust_remote_code=True).eval()
tokenizer = AutoTokenizer.from_pretrained(args.checkpoint,
trust_remote_code=True)
prompt = '<img>{}</img>Context: {}\nQuestion: {}\nOptions: {}\nAnswer:'
    dataset = MultipleChoiceDataset(test=ds_collections[args.dataset]['test'],
prompt=prompt,
tokenizer=tokenizer)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, pad_token_id=tokenizer.eod_id),
)
results = []
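    # Same per-option likelihood scoring as evaluate_multiple_choice.py, except that the
    # ground-truth answer is not available here: only {index, prediction} pairs are
    # collected and dumped to {args.dataset}.json for the MMBench post-processing scripts.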
with torch.no_grad():
for _, (input_tokens, attention_mask, target_lengths,
chunk_sizes, indexes) in tqdm(enumerate(dataloader)):
outputs = model(
input_ids=input_tokens[:, :-1].cuda(),
attention_mask=attention_mask[:, :-1].cuda(),
return_dict=True,
)
losses = torch.nn.functional.cross_entropy(outputs.logits.permute(
0, 2, 1),
input_tokens[:,
1:].cuda(),
reduction='none')
losses = losses.split(chunk_sizes, dim=0)
for loss, target_length, index in zip(losses, target_lengths, indexes):
target_loss = loss.mean(-1)
for _ in range(len(target_length)):
target_loss[_] = loss[_, -target_length[_]:].mean()
pred = target_loss.argmin().item()
results.append({
"index": index,
"prediction": pred,
})
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_results = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_results, results)
merged_results = [_ for _ in itertools.chain.from_iterable(merged_results)]
if torch.distributed.get_rank() == 0:
json.dump(merged_results, open(f"{args.dataset}.json", "w"))
torch.distributed.barrier()