"sgl-kernel/vscode:/vscode.git/clone" did not exist on "1bd5316873ee0ce327a5e92c0dc6bc799ff0d59c"
Commit 5e887c2c authored by wanglch's avatar wanglch
Browse files

Initial commit

parents
Pipeline #1060 canceled with stages
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-ubuntu22.04-dtk23.10.1-py310
ENV DEBIAN_FRONTEND=noninteractive
COPY requirements.txt requirements.txt
RUN pip install -r requirements.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
COPY requirements_web_demo.txt requirements_web_demo.txt
RUN pip install -r requirements_web_demo.txt -i http://mirrors.aliyun.com/pypi/simple/ --trusted-host mirrors.aliyun.com
transformers
accelerate
tiktoken
einops
transformers_stream_generator
scipy
torchvision
pillow
tensorboard
matplotlib
tk
shutilwhich
deepspeed
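The Dockerfile and requirements above define the evaluation environment. A minimal sketch of building and entering the image might look as follows; the image tag, mount path, and any DCU/GPU device flags are illustrative assumptions, not taken from this repository.
```bash
# build the image from the Dockerfile above (the tag "qwen-vl-eval:dev" is an assumption)
docker build -t qwen-vl-eval:dev .
# start an interactive container with the current checkout mounted
# (add whatever DCU/GPU device flags your container runtime requires)
docker run -it --rm -v "$(pwd)":/workspace -w /workspace qwen-vl-eval:dev bash
```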
# Evaluation
## Dependencies
```bash
pip install pycocoevalcap tqdm
```
## Image Caption
### [Flickr30K](https://bryanplummer.com/Flickr30kEntities/)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/flickr30k && cd data/flickr30k
# download images from https://bryanplummer.com/Flickr30kEntities/
# karpathy split annotations can be downloaded from https://cs.stanford.edu/people/karpathy/deepimagesent/
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/flickr30k/flickr30k_karpathy_test.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/flickr30k/flickr30k_karpathy_train.json
cd ../..
```
</details>
<details>
<summary>Evaluate</summary>
```bash
ds="flickr"
checkpoint=/PATH/TO/CHECKPOINT
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_caption.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
```
</details>
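The launch commands in this document read `NPROC_PER_NODE`, `WORLD_SIZE`, `RANK`, `MASTER_ADDR` and `MASTER_PORT` from the environment, falling back to the defaults shown above. A minimal single-GPU sketch of the same Flickr30K evaluation (assuming one visible GPU and the same placeholder checkpoint path):
```bash
python -m torch.distributed.launch --use-env --nproc_per_node 1 \
    evaluate_caption.py \
    --checkpoint /PATH/TO/CHECKPOINT \
    --dataset flickr \
    --batch-size 8 \
    --num-workers 2
```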
### [Nocaps](https://nocaps.org/)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/nocaps && cd data/nocaps
# download images from https://nocaps.org/download
# original annotations can be downloaded from https://nocaps.s3.amazonaws.com/nocaps_val_4500_captions.json
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/nocaps/nocaps_val.json
cd ../..
```
</details>
<details>
<summary>Evaluate</summary>
```bash
ds="nocaps"
checkpoint=/PATH/TO/CHECKPOINT
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_caption.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
```
</details>
## [COCO](https://cocodataset.org/)
> COCO images are used in VQAv2/OK-VQA/RefCOCO/RefCOCO+/RefCOCOg; make sure you have already downloaded the COCO images before evaluating on these benchmarks.
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/coco && cd data/coco
# download coco2014 images
wget http://images.cocodataset.org/zips/train2014.zip && unzip train2014.zip
wget http://images.cocodataset.org/zips/val2014.zip && unzip val2014.zip
wget http://images.cocodataset.org/zips/test2015.zip && unzip test2015.zip
cd ../..
```
</details>
## General VQA
### [VQAv2](https://visualqa.org/)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/vqav2 && cd data/vqav2
# make sure you have downloaded COCO images
# download questions and annotations
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Train_mscoco.zip && unzip v2_Annotations_Train_mscoco.zip
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Train_mscoco.zip && unzip v2_Questions_Train_mscoco.zip
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip && unzip v2_Annotations_Val_mscoco.zip
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Val_mscoco.zip && unzip v2_Questions_Val_mscoco.zip
wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Test_mscoco.zip && unzip v2_Questions_Test_mscoco.zip
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vqav2/vqav2_train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vqav2/vqav2_val.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vqav2/vqav2_testdev.jsonl
```
</details>
<details>
<summary>Evaluate</summary>
```bash
checkpoint=/PATH/TO/CHECKPOINT
for ds in "vqav2_val" "vqav2_testdev"
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_vqa.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
done
```
</details>
### [OKVQA](https://okvqa.allenai.org/)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/okvqa && cd data/okvqa
# download annotations and questions
wget https://okvqa.allenai.org/static/data/mscoco_train2014_annotations.json.zip && unzip mscoco_train2014_annotations.json.zip
wget https://okvqa.allenai.org/static/data/OpenEnded_mscoco_train2014_questions.json.zip && unzip OpenEnded_mscoco_train2014_questions.json.zip
wget https://okvqa.allenai.org/static/data/mscoco_val2014_annotations.json.zip && unzip mscoco_val2014_annotations.json.zip
wget https://okvqa.allenai.org/static/data/OpenEnded_mscoco_val2014_questions.json.zip && unzip OpenEnded_mscoco_val2014_questions.json.zip
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/okvqa/okvqa_train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/okvqa/okvqa_val.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluate</summary>
```bash
ds="okvqa_val"
checkpoint=/PATH/TO/CHECKPOINT
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_vqa.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
```
</details>
### [TextVQA](https://textvqa.org/)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/textvqa && cd data/textvqa
# download images
wget https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip && unzip train_val_images.zip
# download annotations and questions
wget https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_train.json
wget https://dl.fbaipublicfiles.com/textvqa/data/TextVQA_0.5.1_val.json
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train_annotations.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train_questions.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_val_annotations.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_val_questions.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/textvqa/textvqa_val.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluate</summary>
```bash
ds="textvqa_val"
checkpoint=/PATH/TO/CHECKPOINT
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_vqa.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
```
</details>
### [VizWiz](https://vizwiz.org/tasks-and-datasets/vqa/)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/vizwiz && cd data/vizwiz
# download images
wget https://vizwiz.cs.colorado.edu/VizWiz_final/images/train.zip && unzip train.zip
wget https://vizwiz.cs.colorado.edu/VizWiz_final/images/val.zip && unzip val.zip
wget https://vizwiz.cs.colorado.edu/VizWiz_final/images/test.zip && unzip test.zip
# download annotations
wget https://vizwiz.cs.colorado.edu/VizWiz_final/vqa_data/Annotations.zip && unzip Annotations.zip
# download converted files
# train
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_train_annotations.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_train_questions.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_train.jsonl
# val
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_val_annotations.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_val_questions.json
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_val.jsonl
# test
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/vizwiz/vizwiz_test.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluation</summary>
```bash
# evaluate vqa score on vizwiz val split
ds="vizwiz_val"
checkpoint=/PATH/TO/CHECKPOINT
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_vqa.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
```
</details>
### [DocVQA](https://www.docvqa.org/datasets)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/docvqa && cd data/docvqa
# download images and annotations from https://www.docvqa.org/datasets
# download converted files
# train
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/docvqa/train.jsonl
# val
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/docvqa/val.jsonl
# test
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/docvqa/test.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluation</summary>
```bash
# evaluate vqa score on docvqa val split
ds="docvqa_val"
checkpoint=/PATH/TO/CHECKPOINT
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_vqa.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
```
</details>
### [ChartQA](https://aclanthology.org/2022.findings-acl.177/)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/chartqa && cd data/chartqa
# download images from https://drive.google.com/file/d/1Lm_w6zeET1Hyl_9ks6w5nEsgpoyPHalV/view
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/train_human.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/train_augmented.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/test_human.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/chartqa/test_augmented.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluate</summary>
```bash
checkpoint=/PATH/TO/CHECKPOINT
for ds in "chartqa_test_human" "chartqa_test_augmented"
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_vqa.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
done
```
</details>
### [GQA](https://cs.stanford.edu/people/dorarad/gqa/about.html)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/gqa && cd data/gqa
# download images
wget https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip
unzip images.zip
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/gqa/testdev_balanced.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/gqa/train_balanced.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluate</summary>
```bash
checkpoint=/PATH/TO/CHECKPOINT
ds="gqa_testdev"
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_vqa.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
```
</details>
### [OCRVQA](https://ocr-vqa.github.io/)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/ocrvqa && cd data/ocrvqa
# download images by following instructions at https://ocr-vqa.github.io/kvqa_ProjectFiles/README.txt
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/ocrvqa/ocrvqa_train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/ocrvqa/ocrvqa_val.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/ocrvqa/ocrvqa_test.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluate</summary>
```bash
checkpoint=/PATH/TO/CHECKPOINT
ds="ocrvqa_test"
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_vqa.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
```
</details>
### [AI2Diagram](https://allenai.org/data/diagrams)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/ai2diagram && cd data/ai2diagram
# download images
wget https://ai2-public-datasets.s3.amazonaws.com/diagrams/ai2d-all.zip
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/ai2diagram/train.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/ai2diagram/test.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluate</summary>
```bash
checkpoint=/PATH/TO/CHECKPOINT
ds="ai2diagram_test"
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_vqa.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
```
</details>
### [ScienceQA](https://github.com/lupantech/ScienceQA)
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/scienceqa/images && cd data/scienceqa/images
# download images
wget https://scienceqa.s3.us-west-1.amazonaws.com/images/test.zip && unzip test.zip
cd ..
# download original questions (use the raw file, not the GitHub HTML page)
wget https://raw.githubusercontent.com/lupantech/ScienceQA/main/data/scienceqa/problems.json
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/scienceqa/scienceqa_test_img.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluate</summary>
```bash
ds="scienceqa_test_img"
checkpoint=/PATH/TO/CHECKPOINT
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_multiple_choice.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
```
</details>
## Refer Expression Comprehension
### RefCOCO
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/refcoco && cd data/refcoco
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/refcoco/refcoco_val.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/refcoco/refcoco_testA.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/refcoco/refcoco_testB.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluation</summary>
```bash
checkpoint=/PATH/TO/CHECKPOINT
for ds in "refcoco_val" "refcoco_testA" "refcoco_testB"
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_grounding.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
done
```
</details>
### RefCOCO+
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/refcoco+ && cd data/refcoco+
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/refcoco%2B/refcoco%2B_val.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/refcoco%2B/refcoco%2B_testA.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/refcoco%2B/refcoco%2B_testB.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluate</summary>
```bash
checkpoint=/PATH/TO/CHECKPOINT
for ds in "refcoco+_val" "refcoco+_testA" "refcoco+_testB"
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_grounding.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
done
```
</details>
### RefCOCOg
<details>
<summary>Data Preparation</summary>
```bash
mkdir -p data/refcocog && cd data/refcocog
# download converted files
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/refcocog/refcocog_val.jsonl
wget https://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/refcocog/refcocog_test.jsonl
cd ../..
```
</details>
<details>
<summary>Evaluate</summary>
```bash
checkpoint=/PATH/TO/CHECKPOINT
for ds in "refcocog_val" "refcocog_test"
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_grounding.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 8 \
--num-workers 2
done
```
</details>
/cpfs01/shared/public/shusheng.yss/datasets/qwenvl_evaluation
import argparse
import itertools
import json
import os
import random
import time
from functools import partial
import torch
from pycocoevalcap.eval import COCOEvalCap
from pycocotools.coco import COCO
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
ds_collections = {
'flickr': {
        'train': 'data/flickr30k/flickr30k_karpathy_train.json',
'test': 'data/flickr30k/flickr30k_karpathy_test.json',
},
'nocaps': {
'train': '',
'test': 'data/nocaps/nocaps_val.json',
},
}
class CaptionDataset(torch.utils.data.Dataset):
def __init__(self, train, test, prompt, few_shot=0):
self.images = json.load(open(test))['images']
self.prompt = prompt
self.few_shot = few_shot
if few_shot > 0:
self.train = json.load(open(train))['annotations']
def __len__(self):
return len(self.images)
def __getitem__(self, idx):
image_id, image_path = self.images[idx]['id'], self.images[idx][
'image']
few_shot_prompt = ''
if self.few_shot > 0:
few_shot_samples = random.sample(self.train, self.few_shot)
for sample in few_shot_samples:
few_shot_prompt += self.prompt.format(
sample['image']) + f" {sample['caption']}"
return {
'image_id': image_id,
'input_text': few_shot_prompt + self.prompt.format(image_path)
}
def collate_fn(inputs, tokenizer):
image_ids = [_['image_id'] for _ in inputs]
input_texts = [_['input_text'] for _ in inputs]
input_tokens = tokenizer(input_texts,
return_tensors='pt',
padding='longest')
return image_ids, input_tokens.input_ids, input_tokens.attention_mask
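# InferenceSampler splits the dataset into contiguous, (nearly) equal-sized shards,
# one per distributed rank, so each process scores a disjoint slice of the data;
# the per-rank results are merged later with all_gather_object.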
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size,
self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--dataset', type=str, default='')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--few-shot', type=int, default=0)
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
prompt = '<img>{}</img>Describe the image in English:'
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint, device_map='cuda', trust_remote_code=True).eval()
tokenizer = AutoTokenizer.from_pretrained(args.checkpoint,
trust_remote_code=True)
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.eod_id
random.seed(args.seed)
dataset = CaptionDataset(
train=ds_collections[args.dataset]['train'],
test=ds_collections[args.dataset]['test'],
prompt=prompt,
few_shot=args.few_shot,
)
coco_karpathy_test_loader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
image_ids = []
captions = []
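    # Greedy decoding; generate() returns the prompt plus the new tokens, so everything
    # up to input_ids.size(1) is sliced off before decoding the caption text.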
for _, (ids, input_ids,
attention_mask) in tqdm(enumerate(coco_karpathy_test_loader)):
pred = model.generate(
input_ids=input_ids.cuda(),
attention_mask=attention_mask.cuda(),
do_sample=False,
num_beams=1,
max_new_tokens=30,
min_new_tokens=8,
length_penalty=0,
num_return_sequences=1,
use_cache=True,
pad_token_id=tokenizer.eod_id,
eos_token_id=tokenizer.eod_id,
)
image_ids.extend(ids)
captions.extend([
tokenizer.decode(_[input_ids.size(1):].cpu(),
skip_special_tokens=True).strip() for _ in pred
])
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_ids = [None for _ in range(world_size)]
merged_captions = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_ids, image_ids)
torch.distributed.all_gather_object(merged_captions, captions)
merged_ids = [_ for _ in itertools.chain.from_iterable(merged_ids)]
merged_captions = [
_ for _ in itertools.chain.from_iterable(merged_captions)
]
if torch.distributed.get_rank() == 0:
print(f"Evaluating {args.dataset} ...")
results = []
for image_id, caption in zip(merged_ids, merged_captions):
results.append({
'image_id': int(image_id),
'caption': caption,
})
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{args.dataset}_{time_prefix}.json'
json.dump(results, open(results_file, 'w'))
coco = COCO(ds_collections[args.dataset]['test'])
coco_result = coco.loadRes(results_file)
coco_eval = COCOEvalCap(coco, coco_result)
coco_eval.evaluate()
print(coco_eval.eval.items())
torch.distributed.barrier()
import argparse
import itertools
import json
import os
import re
from functools import partial
import torch
from torchvision.ops.boxes import box_area
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
ds_collections = {
'refcoco_val': 'data/refcoco/refcoco_val.jsonl',
'refcoco_testA': 'data/refcoco/refcoco_testA.jsonl',
'refcoco_testB': 'data/refcoco/refcoco_testB.jsonl',
'refcoco+_val': 'data/refcoco+/refcoco+_val.jsonl',
'refcoco+_testA': 'data/refcoco+/refcoco+_testA.jsonl',
'refcoco+_testB': 'data/refcoco+/refcoco+_testB.jsonl',
'refcocog_val': 'data/refcocog/refcocog_val.jsonl',
'refcocog_test': 'data/refcocog/refcocog_test.jsonl',
}
def box_iou(boxes1, boxes2):
area1 = box_area(boxes1)
area2 = box_area(boxes2)
lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
wh = (rb - lt).clamp(min=0) # [N,M,2]
inter = wh[:, :, 0] * wh[:, :, 1] # [N,M]
union = area1[:, None] + area2 - inter
iou = inter / union
return iou, union
def collate_fn(batches, tokenizer):
texts = [_['text'] for _ in batches]
bboxes = [_['bbox'] for _ in batches]
hws = [_['hw'] for _ in batches]
input_ids = tokenizer(texts, return_tensors='pt', padding='longest')
return input_ids.input_ids, input_ids.attention_mask, bboxes, hws
class RefCOCODataset(torch.utils.data.Dataset):
def __init__(self, test, tokenizer, prompt):
self.datas = open(test).readlines()
self.tokenizer = tokenizer
self.prompt = prompt
def __len__(self):
return len(self.datas)
def __getitem__(self, idx):
data = json.loads(self.datas[idx].strip())
image = data['image']
text = data['sent']
bbox = data['bbox']
w, h = data['width'], data['height']
return {
'text': self.prompt.format(image, text),
'bbox': bbox,
'hw': (h, w),
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size,
self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--dataset', type=str, default='')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
args = parser.parse_args()
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint, device_map='cuda', trust_remote_code=True).eval()
tokenizer = AutoTokenizer.from_pretrained(args.checkpoint,
trust_remote_code=True)
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.eod_id
prompt = '<img>{}</img><ref>{}</ref><box>'
dataset = RefCOCODataset(test=ds_collections[args.dataset],
tokenizer=tokenizer,
prompt=prompt)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
        drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
outputs = []
for _, (input_ids, attention_mask, bboxes,
hws) in tqdm(enumerate(dataloader)):
pred = model.generate(
input_ids=input_ids.cuda(),
attention_mask=attention_mask.cuda(),
do_sample=False,
num_beams=1,
max_new_tokens=28,
min_new_tokens=10,
length_penalty=1,
num_return_sequences=1,
use_cache=True,
pad_token_id=tokenizer.eod_id,
eos_token_id=tokenizer.eod_id,
)
answers = [
tokenizer.decode(_[input_ids.size(1):].cpu(),
skip_special_tokens=True) for _ in pred
]
for bbox, hw, answer in zip(bboxes, hws, answers):
outputs.append({
'answer': answer,
'gt_bbox': bbox,
'hw': hw,
})
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_outputs = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_outputs, outputs)
merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
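    # The model is expected to answer with a box of the form "(x1,y1),(x2,y2)" whose
    # coordinates lie on a 0-999 grid; they are rescaled below by the image width/height,
    # and a prediction counts as correct when its IoU with the ground-truth box is >= 0.5.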
PATTERN = re.compile(r'\((.*?)\),\((.*?)\)')
if torch.distributed.get_rank() == 0:
correct = total_cnt = 0
for i, output in enumerate(merged_outputs):
predict_bbox = re.findall(PATTERN, output['answer'])
try:
if ',' not in predict_bbox[0][0] or ',' not in predict_bbox[0][
1]:
predict_bbox = (0., 0., 0., 0.)
else:
x1, y1 = [
float(tmp) for tmp in predict_bbox[0][0].split(',')
]
x2, y2 = [
float(tmp) for tmp in predict_bbox[0][1].split(',')
]
predict_bbox = (x1, y1, x2, y2)
except:
predict_bbox = (0., 0., 0., 0.)
target_bbox = torch.tensor(output['gt_bbox'],
dtype=torch.float32).view(-1, 4)
predict_bbox = torch.tensor(predict_bbox,
dtype=torch.float32).view(-1, 4) / 999
predict_bbox[:, 0::2] *= output['hw'][1]
predict_bbox[:, 1::2] *= output['hw'][0]
iou, _ = box_iou(predict_bbox, target_bbox)
iou = iou.item()
total_cnt += 1
if iou >= 0.5:
correct += 1
print(f"Evaluating {args.dataset} ...")
print(f'Precision @ 1: {correct / total_cnt} \n')
torch.distributed.barrier()
import argparse
import itertools
import json
import os
from functools import partial
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
multiple_choices = ['A', 'B', 'C', 'D', 'E']
ds_collections = {
'scienceqa_test_img': {
'test': 'data/scienceqa/scienceqa_test_img.jsonl',
}
}
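# Each sample contributes one token sequence per answer option (prompt + option).
# collate_fn flattens these across the batch, left-pads them to a common length with
# the pad token, and returns chunk_sizes so the per-option losses can later be split
# back per sample.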
def collate_fn(batches, pad_token_id):
input_tokens = [_['input_tokens'] for _ in batches]
target_lengths = [_['target_lengths'] for _ in batches]
answers = [_['answer'] for _ in batches]
chunk_sizes = [len(_) for _ in input_tokens]
input_tokens = [_ for _ in itertools.chain.from_iterable(input_tokens)]
max_lengths = max([len(_) for _ in input_tokens])
input_tokens = [[pad_token_id] * (max_lengths - len(_)) + _
for _ in input_tokens]
input_tokens = torch.LongTensor(input_tokens)
attention_mask = 1 - input_tokens.eq(pad_token_id).float()
return input_tokens, attention_mask, target_lengths, answers, chunk_sizes
class MultipleChoiceDataset(torch.utils.data.Dataset):
def __init__(self, test, prompt, tokenizer):
self.datas = open(test).readlines()
self.prompt = prompt
self.tokenizer = tokenizer
def __len__(self):
return len(self.datas)
def __getitem__(self, idx):
data = json.loads(self.datas[idx].strip())
image = data['image']
hint = data['hint'] if data['hint'] else 'N/A'
question = data['question']
choices = data['choices']
choice_list = []
for i, c in enumerate(choices):
choice_list.append('{}. {}'.format(multiple_choices[i], c))
choice_txt = '\n'.join(choice_list)
prompt = self.prompt.format(image, hint, question, choice_txt)
prompt_tokens = self.tokenizer(prompt).input_ids
target_tokens = [
self.tokenizer(' ' + _).input_ids
for _ in multiple_choices[:len(choices)]
]
return {
'input_tokens': [prompt_tokens + _ for _ in target_tokens],
'target_lengths': [len(_) for _ in target_tokens],
'answer': data['answer'],
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size,
self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--dataset', type=str, default='')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
args = parser.parse_args()
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint, device_map='cuda', trust_remote_code=True).eval()
tokenizer = AutoTokenizer.from_pretrained(args.checkpoint,
trust_remote_code=True)
prompt = '<img>{}</img>Context: {}\nQuestion: {}\nOptions: {}\nAnswer:'
    dataset = MultipleChoiceDataset(test=ds_collections[args.dataset]['test'],
prompt=prompt,
tokenizer=tokenizer)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, pad_token_id=tokenizer.eod_id),
)
results = []
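    # Scoring: one forward pass over each prompt+option sequence, cross-entropy restricted
    # to the option tokens (the last target_length positions), and the option with the
    # lowest average loss is taken as the prediction.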
with torch.no_grad():
for _, (input_tokens, attention_mask, target_lengths, answer,
chunk_sizes) in tqdm(enumerate(dataloader)):
outputs = model(
input_ids=input_tokens[:, :-1].cuda(),
attention_mask=attention_mask[:, :-1].cuda(),
return_dict=True,
)
losses = torch.nn.functional.cross_entropy(outputs.logits.permute(
0, 2, 1),
input_tokens[:,
1:].cuda(),
reduction='none')
losses = losses.split(chunk_sizes, dim=0)
for loss, target_length, answer in zip(losses, target_lengths,
answer):
target_loss = loss.mean(-1)
for _ in range(len(target_length)):
target_loss[_] = loss[_, -target_length[_]:].mean()
pred = target_loss.argmin().item()
if pred == answer:
results.append(1)
else:
results.append(0)
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_results = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_results, results)
merged_results = [_ for _ in itertools.chain.from_iterable(merged_results)]
if torch.distributed.get_rank() == 0:
print(f"Evaluating {args.dataset} ...")
print(f'Acc@1: {sum(merged_results) / len(merged_results)}')
torch.distributed.barrier()
import argparse
import itertools
import json
import os
import random
import time
from functools import partial
from typing import Optional
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from vqa import VQA
from vqa_eval import VQAEval
ds_collections = {
'vqav2_val': {
'train': 'data/vqav2/vqav2_train.jsonl',
'test': 'data/vqav2/vqav2_val.jsonl',
'question': 'data/vqav2/v2_OpenEnded_mscoco_val2014_questions.json',
'annotation': 'data/vqav2/v2_mscoco_val2014_annotations.json',
'metric': 'vqa_score',
'max_new_tokens': 10,
},
'vqav2_testdev': {
'train': 'data/vqav2/vqav2_train.jsonl',
'test': 'data/vqav2/vqav2_testdev.jsonl',
'metric': None,
'max_new_tokens': 10,
},
'okvqa_val': {
'train': 'data/okvqa/okvqa_train.jsonl',
'test': 'data/okvqa/okvqa_val.jsonl',
'question': 'data/okvqa/OpenEnded_mscoco_val2014_questions.json',
'annotation': 'data/okvqa/mscoco_val2014_annotations.json',
'metric': 'vqa_score',
'max_new_tokens': 10,
},
'textvqa_val': {
'train': 'data/textvqa/textvqa_train.jsonl',
'test': 'data/textvqa/textvqa_val.jsonl',
'question': 'data/textvqa/textvqa_val_questions.json',
'annotation': 'data/textvqa/textvqa_val_annotations.json',
'metric': 'vqa_score',
'max_new_tokens': 10,
},
'vizwiz_val': {
'train': 'data/vizwiz/vizwiz_train.jsonl',
'test': 'data/vizwiz/vizwiz_val.jsonl',
'question': 'data/vizwiz/vizwiz_val_questions.json',
'annotation': 'data/vizwiz/vizwiz_val_annotations.json',
'metric': 'vqa_score',
'max_new_tokens': 10,
},
'vizwiz_test': {
'train': 'data/vizwiz/vizwiz_train.jsonl',
'test': 'data/vizwiz/vizwiz_test.jsonl',
'metric': None,
'max_new_tokens': 10,
},
'docvqa_val': {
'train': 'data/docvqa/train.jsonl',
'test': 'data/docvqa/val.jsonl',
'annotation': 'data/docvqa/val/val_v1.0.json',
'metric': 'anls',
'max_new_tokens': 100,
},
'docvqa_test': {
'train': 'data/docvqa/train.jsonl',
'test': 'data/docvqa/test.jsonl',
'metric': None,
'max_new_tokens': 100,
},
'chartqa_test_human': {
'train': 'data/chartqa/train_human.jsonl',
'test': 'data/chartqa/test_human.jsonl',
'metric': 'relaxed_accuracy',
'max_new_tokens': 100,
},
'chartqa_test_augmented': {
'train': 'data/chartqa/train_augmented.jsonl',
'test': 'data/chartqa/test_augmented.jsonl',
'metric': 'relaxed_accuracy',
'max_new_tokens': 100,
},
'gqa_testdev': {
        'train': 'data/gqa/train_balanced.jsonl',
'test': 'data/gqa/testdev_balanced.jsonl',
'metric': 'accuracy',
'max_new_tokens': 10,
},
'ocrvqa_val': {
'train': 'data/ocrvqa/ocrvqa_train.jsonl',
'test': 'data/ocrvqa/ocrvqa_val.jsonl',
'metric': 'accuracy',
'max_new_tokens': 100,
},
'ocrvqa_test': {
'train': 'data/ocrvqa/ocrvqa_train.jsonl',
'test': 'data/ocrvqa/ocrvqa_test.jsonl',
'metric': 'accuracy',
'max_new_tokens': 100,
},
'ai2diagram_test': {
'train': 'data/ai2diagram/train.jsonl',
'test': 'data/ai2diagram/test.jsonl',
'metric': 'accuracy',
'max_new_tokens': 10,
}
}
# https://github.com/google-research/pix2struct/blob/main/pix2struct/metrics.py#L81
def relaxed_correctness(target: str,
prediction: str,
max_relative_change: float = 0.05) -> bool:
"""Calculates relaxed correctness.
The correctness tolerates certain error ratio defined by max_relative_change.
See https://arxiv.org/pdf/2203.10244.pdf, end of section 5.1:
“Following Methani et al. (2020), we use a relaxed accuracy measure for the
numeric answers to allow a minor inaccuracy that may result from the automatic
data extraction process. We consider an answer to be correct if it is within
5% of the gold answer. For non-numeric answers, we still need an exact match
to consider an answer to be correct.”
Args:
target: Target string.
prediction: Predicted string.
max_relative_change: Maximum relative change.
Returns:
Whether the prediction was correct given the specified tolerance.
"""
def _to_float(text: str) -> Optional[float]:
try:
if text.endswith('%'):
# Convert percentages to floats.
return float(text.rstrip('%')) / 100.0
else:
return float(text)
except ValueError:
return None
prediction_float = _to_float(prediction)
target_float = _to_float(target)
if prediction_float is not None and target_float:
relative_change = abs(prediction_float -
target_float) / abs(target_float)
return relative_change <= max_relative_change
else:
return prediction.lower() == target.lower()
def evaluate_relaxed_accuracy(entries):
scores = []
for elem in entries:
if isinstance(elem['annotation'], str):
elem['annotation'] = [elem['annotation']]
score = max([
relaxed_correctness(elem['answer'].strip(), ann)
for ann in elem['annotation']
])
scores.append(score)
return sum(scores) / len(scores)
def evaluate_exact_match_accuracy(entries):
scores = []
for elem in entries:
if isinstance(elem['annotation'], str):
elem['annotation'] = [elem['annotation']]
score = max([
(1.0 if
(elem['answer'].strip().lower() == ann.strip().lower()) else 0.0)
for ann in elem['annotation']
])
scores.append(score)
return sum(scores) / len(scores)
def collate_fn(batches, tokenizer):
questions = [_['question'] for _ in batches]
question_ids = [_['question_id'] for _ in batches]
annotations = [_['annotation'] for _ in batches]
input_ids = tokenizer(questions, return_tensors='pt', padding='longest')
return question_ids, input_ids.input_ids, input_ids.attention_mask, annotations
class VQADataset(torch.utils.data.Dataset):
def __init__(self, train, test, prompt, few_shot):
self.test = open(test).readlines()
self.prompt = prompt
self.few_shot = few_shot
if few_shot > 0:
self.train = open(train).readlines()
def __len__(self):
return len(self.test)
def __getitem__(self, idx):
data = json.loads(self.test[idx].strip())
image, question, question_id, annotation = data['image'], data[
'question'], data['question_id'], data.get('answer', None)
few_shot_prompt = ''
if self.few_shot > 0:
few_shot_samples = random.sample(self.train, self.few_shot)
for sample in few_shot_samples:
sample = json.loads(sample.strip())
few_shot_prompt += self.prompt.format(
sample['image'],
sample['question']) + f" {sample['answer']}"
return {
'question': few_shot_prompt + self.prompt.format(image, question),
'question_id': question_id,
'annotation': annotation
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size,
self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--dataset', type=str, default='')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
parser.add_argument('--few-shot', type=int, default=0)
parser.add_argument('--seed', type=int, default=0)
args = parser.parse_args()
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint, device_map='cuda', trust_remote_code=True).eval()
tokenizer = AutoTokenizer.from_pretrained(args.checkpoint,
trust_remote_code=True)
tokenizer.padding_side = 'left'
tokenizer.pad_token_id = tokenizer.eod_id
prompt = '<img>{}</img>{} Answer:'
random.seed(args.seed)
dataset = VQADataset(
train=ds_collections[args.dataset]['train'],
test=ds_collections[args.dataset]['test'],
prompt=prompt,
few_shot=args.few_shot,
)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, tokenizer=tokenizer),
)
outputs = []
for _, (question_ids, input_ids, attention_mask,
annotations) in tqdm(enumerate(dataloader)):
pred = model.generate(
input_ids=input_ids.cuda(),
attention_mask=attention_mask.cuda(),
do_sample=False,
num_beams=1,
max_new_tokens=ds_collections[args.dataset]['max_new_tokens'],
min_new_tokens=1,
length_penalty=1,
num_return_sequences=1,
output_hidden_states=True,
use_cache=True,
pad_token_id=tokenizer.eod_id,
eos_token_id=tokenizer.eod_id,
)
answers = [
tokenizer.decode(_[input_ids.size(1):].cpu(),
skip_special_tokens=True).strip() for _ in pred
]
for question_id, answer, annotation in zip(question_ids, answers,
annotations):
if args.dataset in ['vqav2_val', 'vqav2_testdev', 'okvqa_val', 'textvqa_val', 'vizwiz_val']:
outputs.append({
'question_id': question_id,
'answer': answer,
})
elif args.dataset in ['docvqa_val', 'infographicsvqa', 'gqa_testdev', 'ocrvqa_val', 'ocrvqa_test']:
outputs.append({
'questionId': question_id,
'answer': answer,
'annotation': annotation,
})
elif args.dataset in ['ai2diagram_test']:
outputs.append({
'image': question_id,
'answer': answer,
'annotation': annotation,
})
elif args.dataset in ['chartqa_test_human', 'chartqa_test_augmented']:
outputs.append({
'answer': answer,
'annotation': annotation,
})
elif args.dataset in ['docvqa_test']:
outputs.append({
'questionId': question_id,
'answer': answer,
})
elif args.dataset in ['vizwiz_test']:
outputs.append({
'image': question_id,
'answer': answer,
})
else:
raise NotImplementedError
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_outputs = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_outputs, json.dumps(outputs))
merged_outputs = [json.loads(_) for _ in merged_outputs]
merged_outputs = [_ for _ in itertools.chain.from_iterable(merged_outputs)]
if torch.distributed.get_rank() == 0:
print(f"Evaluating {args.dataset} ...")
time_prefix = time.strftime('%y%m%d%H%M%S', time.localtime())
results_file = f'{args.dataset}_{time_prefix}_fs{args.few_shot}_s{args.seed}.json'
json.dump(merged_outputs, open(results_file, 'w'), ensure_ascii=False)
if ds_collections[args.dataset]['metric'] == 'vqa_score':
vqa = VQA(ds_collections[args.dataset]['annotation'],
ds_collections[args.dataset]['question'])
results = vqa.loadRes(
resFile=results_file,
quesFile=ds_collections[args.dataset]['question'])
vqa_scorer = VQAEval(vqa, results, n=2)
vqa_scorer.evaluate()
print(vqa_scorer.accuracy)
elif ds_collections[args.dataset]['metric'] == 'anls':
json.dump(merged_outputs,
open(results_file, 'w'),
ensure_ascii=False)
print('python infographicsvqa_eval.py -g ' +
ds_collections[args.dataset]['annotation'] + ' -s ' +
results_file)
os.system('python infographicsvqa_eval.py -g ' +
ds_collections[args.dataset]['annotation'] + ' -s ' +
results_file)
elif ds_collections[args.dataset]['metric'] == 'relaxed_accuracy':
print({
'relaxed_accuracy': evaluate_relaxed_accuracy(merged_outputs)
})
elif ds_collections[args.dataset]['metric'] == 'accuracy':
if 'gqa' in args.dataset:
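                # Heuristic GQA answer normalization: keep only the first clause (split on
                # '.', ',', '!'), drop leading "is/are/a/an/the ..." prefixes and any trailing
                # " of ..." part, then compare by exact match below.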
for entry in merged_outputs:
response = entry['answer']
response = response.strip().split('.')[0].split(
',')[0].split('!')[0].lower()
if 'is ' in response:
response = response.split('is ')[1]
if 'are ' in response:
response = response.split('are ')[1]
if 'a ' in response:
response = response.split('a ')[1]
if 'an ' in response:
response = response.split('an ')[1]
if 'the ' in response:
response = response.split('the ')[1]
if ' of' in response:
response = response.split(' of')[0]
response = response.strip()
entry['answer'] = response
print({'accuracy': evaluate_exact_match_accuracy(merged_outputs)})
torch.distributed.barrier()
# This file can be downloaded from: https://www.docvqa.org/datasets/infographicvqa and https://rrc.cvc.uab.es/?ch=17&com=introduction
import os, json
import argparse
question_ids_to_exclude = []
# answer_types = {'image span': 'Image-Span', 'question span': 'Question-Span', 'multiple spans': 'Multi-Span', 'non span': 'None span', 'list': 'List'}
answer_types = {'image span': 'Image-Span', 'question span': 'Question-Span', 'multiple spans': 'Multi-Span', 'non span': 'None span'}
evidence_types = {'table/list': 'Table/list', 'textual': 'Text', 'photo/pciture/visual_objects': 'Visual/Layout', 'figure': 'Figure', 'map': 'Map'}
reasoning_requirements = {'comparison': 'Sorting', 'arithmetic': 'Arithmetic', 'counting':'Counting'}
def save_json(file_path, data):
with open(file_path, 'w+') as json_file:
json.dump(data, json_file)
def levenshtein_distance(s1, s2):
if len(s1) > len(s2):
s1, s2 = s2, s1
distances = range(len(s1) + 1)
for i2, c2 in enumerate(s2):
distances_ = [i2+1]
for i1, c1 in enumerate(s1):
if c1 == c2:
distances_.append(distances[i1])
else:
distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
distances = distances_
return distances[-1]
def validate_data(gtFilePath, submFilePath):
"""
Method validate_data: validates that all files in the results folder are correct (have the correct name contents).
Validates also that there are no missing files in the folder.
If some error detected, the method raises the error
"""
gtJson = json.load(open(gtFilePath,'rb'));
submJson = json.load(open(submFilePath,'rb'));
if not 'data' in gtJson:
raise Exception("The GT file is not valid (no data key)")
if not 'dataset_name' in gtJson:
raise Exception("The GT file is not valid (no dataset_name key)")
if isinstance(submJson, list) == False :
raise Exception("The Det file is not valid (root item must be an array)")
if len(submJson) != len(gtJson['data']) :
raise Exception("The Det file is not valid (invalid number of answers. Expected:" + str(len(gtJson['data'])) + " Found:" + str(len(submJson)) + ")")
gtQuestions = sorted([r['questionId'] for r in gtJson['data']])
res_id_to_index = {int(r['questionId']): ix for ix, r in enumerate(submJson)}
detQuestions = sorted([r['questionId'] for r in submJson])
if( (gtQuestions == detQuestions) == False ):
raise Exception("The Det file is not valid. Question IDs must much GT")
for gtObject in gtJson['data']:
try:
q_id = int(gtObject['questionId']);
res_ix = res_id_to_index[q_id];
except:
raise Exception("The Det file is not valid. Question " + str(gtObject['questionId']) + " not present")
else:
detObject = submJson[res_ix];
# if detObject['questionId'] != gtObject['questionId'] :
# raise Exception("Answer #" + str(i) + " not valid (invalid question ID. Expected:" + str(gtObject['questionId']) + "Found:" + detObject['questionId'] + ")")
if not 'answer' in detObject:
raise Exception("Question " + str(gtObject['questionId']) + " not valid (no answer key)")
if isinstance(detObject['answer'], list) == True :
raise Exception("Question " + str(gtObject['questionId']) + " not valid (answer key has to be a single string)")
def evaluate_method(gtFilePath, submFilePath, evaluationParams):
"""
Method evaluate_method: evaluate method and returns the results
Results. Dictionary with the following values:
- method (required) Global method metrics. Ex: { 'Precision':0.8,'Recall':0.9 }
- samples (optional) Per sample metrics. Ex: {'sample1' : { 'Precision':0.8,'Recall':0.9 } , 'sample2' : { 'Precision':0.8,'Recall':0.9 }
"""
show_scores_per_answer_type = evaluationParams.answer_types
gtJson = json.load(open(gtFilePath,'rb'));
submJson = json.load(open(submFilePath,'rb'));
res_id_to_index = {int(r['questionId']): ix for ix, r in enumerate(submJson)}
perSampleMetrics = {}
totalScore = 0
row = 0
if show_scores_per_answer_type:
answerTypeTotalScore = {x:0 for x in answer_types.keys()}
answerTypeNumQuestions = {x:0 for x in answer_types.keys()}
evidenceTypeTotalScore = {x:0 for x in evidence_types.keys()}
evidenceTypeNumQuestions = {x:0 for x in evidence_types.keys()}
reasoningTypeTotalScore = {x:0 for x in reasoning_requirements.keys()}
reasoningTypeNumQuestions = {x:0 for x in reasoning_requirements.keys()}
for gtObject in gtJson['data']:
q_id = int(gtObject['questionId']);
res_ix = res_id_to_index[q_id];
detObject = submJson[res_ix];
if q_id in question_ids_to_exclude:
question_result = 0
info = 'Question EXCLUDED from the result'
else:
info = ''
values = []
for answer in gtObject['answers']:
# preprocess both the answers - gt and prediction
gt_answer = ' '.join(answer.strip().lower().split())
det_answer = ' '.join(detObject['answer'].strip().lower().split())
#dist = levenshtein_distance(answer.lower(), detObject['answer'].lower())
dist = levenshtein_distance(gt_answer,det_answer)
length = max( len(answer.upper()), len(detObject['answer'].upper()) )
values.append( 0.0 if length == 0 else float(dist) / float(length) )
question_result = 1 - min(values)
if (question_result < evaluationParams.anls_threshold) :
question_result = 0
totalScore += question_result
if show_scores_per_answer_type:
for q_type in gtObject["answer_type"]:
answerTypeTotalScore[q_type] += question_result
answerTypeNumQuestions[q_type] += 1
for q_type in gtObject["evidence"]:
evidenceTypeTotalScore[q_type] += question_result
evidenceTypeNumQuestions[q_type] += 1
for q_type in gtObject["operation/reasoning"]:
reasoningTypeTotalScore[q_type] += question_result
reasoningTypeNumQuestions[q_type] += 1
perSampleMetrics[str(gtObject['questionId'])] = {
'score':question_result,
'question':gtObject['question'],
'gt':gtObject['answers'],
'det':detObject['answer'],
'info': info
}
row = row + 1
methodMetrics = {
'score': 0 if len(gtJson['data']) == 0 else totalScore/ (len(gtJson['data']) - len(question_ids_to_exclude) )
}
answer_types_scores = {}
evidence_types_scores = {}
operation_types_scores = {}
if show_scores_per_answer_type:
for a_type, ref in answer_types.items():
answer_types_scores[ref] = 0 if len(gtJson['data']) == 0 else answerTypeTotalScore[a_type] / (answerTypeNumQuestions[a_type] )
for e_type, ref in evidence_types.items():
evidence_types_scores[ref] = 0 if len(gtJson['data']) == 0 else evidenceTypeTotalScore[e_type] / (evidenceTypeNumQuestions[e_type] )
for r_type, ref in reasoning_requirements.items():
operation_types_scores[ref] = 0 if len(gtJson['data']) == 0 else reasoningTypeTotalScore[r_type] / (reasoningTypeNumQuestions[r_type] )
resDict = {
'result': methodMetrics,
'scores_by_types': {'answer_types': answer_types_scores, 'evidence_types': evidence_types_scores, 'operation_types': operation_types_scores},
'per_sample_result':perSampleMetrics
}
return resDict;
def display_results(results, show_answer_types):
print("\nOverall ANLS: {:2.4f}".format(results['result']['score']))
if show_answer_types:
print("\nAnswer types:")
for a_type in answer_types.values():
print("\t{:12s} {:2.4f}".format(a_type, results['scores_by_types']['answer_types'][a_type]))
print("\nEvidence types:")
for e_type in evidence_types.values():
print("\t{:12s} {:2.4f}".format(e_type, results['scores_by_types']['evidence_types'][e_type]))
print("\nOperation required:")
for r_type in reasoning_requirements.values():
print("\t{:12s} {:2.4f}".format(r_type, results['scores_by_types']['operation_types'][r_type]))
if __name__=='__main__':
parser = argparse.ArgumentParser(description="InfographVQA evaluation script.")
parser.add_argument('-g', '--ground_truth', type=str, help="Path of the Ground Truth file.", required=True)
parser.add_argument('-s', '--submission_file', type=str, help="Path of your method's results file.", required=True)
parser.add_argument('-t', '--anls_threshold', type=float, default=0.5, help="ANLS threshold to use (See Scene-Text VQA paper for more info.).", required=False)
parser.add_argument('-a', '--answer_types', type=bool, default=False, help="Score break down by answer types (special gt file required).", required=False)
parser.add_argument('-o', '--output', type=str, help="Path to a directory where to copy the file 'results.json' that contains per-sample results.", required=False)
args = parser.parse_args()
# Validate the format of ground truth and submission files.
validate_data(args.ground_truth, args.submission_file)
# Evaluate method
results = evaluate_method(args.ground_truth, args.submission_file, args)
display_results(results, args.answer_types)
if args.output:
output_dir = args.output
if not os.path.exists(output_dir):
os.makedirs(output_dir)
resultsOutputname = os.path.join(output_dir, 'results.json')
save_json(resultsOutputname, results)
print("All results including per-sample result has been correctly saved!")
# MMBench Evaluation
## Data
```bash
/cpfs01/shared/public/shusheng.yss/workspace/23082502_qwenvl_eval_test/eval_mm/data/mmbench
```
## Dev
```bash
checkpoint=/PATH/TO/CHECKPOINT
ds=mmbench_dev_20230712
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_multiple_choice_mmbench.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 2 \
--num-workers 2
# the results will be saved to mmbench_dev_20230712.json
# without consistency constraint
python mmbench_evaluation.py
# with consistency constraint
python mmbench_evaluation_tricky.py
```
## Test
```bash
checkpoint=/PATH/TO/CHECKPOINT
ds=mmbench_test_20230712
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
evaluate_multiple_choice_mmbench.py \
--checkpoint $checkpoint \
--dataset $ds \
--batch-size 2 \
--num-workers 2
# the results will be saved to mmbench_test_20230712.json
# convert to submission format with consistency constraint
python mmbench_predict_to_submission.py
```
import argparse
import itertools
import json
import os
from functools import partial
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
multiple_choices = ['A', 'B', 'C', 'D', 'E']
ds_collections = {
'mmbench_dev_20230712': {
'test': 'data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.jsonl',
},
'mmbench_test_20230712': {
'test': 'data/mmbench/mmbench_test_20230712/mmbench_test_20230712.jsonl',
}
}
def collate_fn(batches, pad_token_id):
indexes = [_['index'] for _ in batches]
input_tokens = [_['input_tokens'] for _ in batches]
target_lengths = [_['target_lengths'] for _ in batches]
chunk_sizes = [len(_) for _ in input_tokens]
input_tokens = [_ for _ in itertools.chain.from_iterable(input_tokens)]
max_lengths = max([len(_) for _ in input_tokens])
input_tokens = [[pad_token_id] * (max_lengths - len(_)) + _
for _ in input_tokens]
input_tokens = torch.LongTensor(input_tokens)
attention_mask = 1 - input_tokens.eq(pad_token_id).float()
return input_tokens, attention_mask, target_lengths, chunk_sizes, indexes
class MultipleChoiceDataset(torch.utils.data.Dataset):
def __init__(self, test, prompt, tokenizer):
self.datas = open(test).readlines()
self.prompt = prompt
self.tokenizer = tokenizer
def __len__(self):
return len(self.datas)
def __getitem__(self, idx):
data = json.loads(self.datas[idx].strip())
index = data['index']
image = data['image']
hint = data['hint'] if data['hint'] else 'N/A'
question = data['question']
choices = data['choices']
choice_list = []
for i, c in enumerate(choices):
choice_list.append('{}. {}'.format(multiple_choices[i], c))
choice_txt = '\n'.join(choice_list)
prompt = self.prompt.format(image, hint, question, choice_txt)
prompt_tokens = self.tokenizer(prompt).input_ids
target_tokens = [
self.tokenizer(' ' + _).input_ids
for _ in multiple_choices[:len(choices)]
]
return {
'index': index,
'input_tokens': [prompt_tokens + _ for _ in target_tokens],
'target_lengths': [len(_) for _ in target_tokens],
# 'answer': data['answer'],
}
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size,
self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--dataset', type=str, default='')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
args = parser.parse_args()
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint, device_map='cuda', trust_remote_code=True).eval()
tokenizer = AutoTokenizer.from_pretrained(args.checkpoint,
trust_remote_code=True)
prompt = '<img>{}</img>Context: {}\nQuestion: {}\nOptions: {}\nAnswer:'
    dataset = MultipleChoiceDataset(test=ds_collections[args.dataset]['test'],
prompt=prompt,
tokenizer=tokenizer)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, pad_token_id=tokenizer.eod_id),
)
results = []
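    # Same per-option likelihood scoring as evaluate_multiple_choice.py, except that the
    # ground-truth answer is not available here: only {index, prediction} pairs are
    # collected and dumped to {args.dataset}.json for the MMBench post-processing scripts.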
with torch.no_grad():
for _, (input_tokens, attention_mask, target_lengths,
chunk_sizes, indexes) in tqdm(enumerate(dataloader)):
outputs = model(
input_ids=input_tokens[:, :-1].cuda(),
attention_mask=attention_mask[:, :-1].cuda(),
return_dict=True,
)
losses = torch.nn.functional.cross_entropy(outputs.logits.permute(
0, 2, 1),
input_tokens[:,
1:].cuda(),
reduction='none')
losses = losses.split(chunk_sizes, dim=0)
for loss, target_length, index in zip(losses, target_lengths, indexes):
target_loss = loss.mean(-1)
for _ in range(len(target_length)):
target_loss[_] = loss[_, -target_length[_]:].mean()
pred = target_loss.argmin().item()
results.append({
"index": index,
"prediction": pred,
})
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_results = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_results, results)
merged_results = [_ for _ in itertools.chain.from_iterable(merged_results)]
if torch.distributed.get_rank() == 0:
json.dump(merged_results, open(f"{args.dataset}.json", "w"))
torch.distributed.barrier()