Commit 5e887c2c authored by wanglch

Initial commit
import pandas as pd
import io
import base64
import json
from PIL import Image
'''
This script converts the mmbench_dev TSV file to JSONL.
'''
datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t')
global_choices = ['A', 'B', 'C', 'D']
def decode_base64_to_image(base64_string):
image_data = base64.b64decode(base64_string)
image = Image.open(io.BytesIO(image_data))
return image
with open('./data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.jsonl', 'w') as f:
for idx in range(len(datas)):
data = datas.iloc[idx]
index = int(data['index'])
question = data['question']
hint = data['hint'] if not pd.isna(data['hint']) else 'N/A'
choices = []
for opt in global_choices:
if pd.isna(data[opt]):
continue
choices.append(data[opt])
answer = global_choices.index(data['answer'])
image = decode_base64_to_image(data['image'])
image.save("data/mmbench/mmbench_dev_20230712/images/%d.jpg" % index)
f.write(json.dumps({
"index": index,
"image": "data/mmbench/mmbench_dev_20230712/images/%d.jpg" % index,
"hint": hint,
"question": question,
"choices": choices,
"answer": answer,
}) + "\n")
import pandas as pd
import io
import base64
import json
from PIL import Image
'''
This script converts the mmbench_test TSV file to JSONL.
It is very similar to mmbench_converter_dev, except that there is no answer column for accuracy calculation.
'''
datas = pd.read_csv("data/mmbench/mmbench_test_20230712/mmbench_test_20230712.tsv", sep='\t')
global_choices = ['A', 'B', 'C', 'D']
def decode_base64_to_image(base64_string):
image_data = base64.b64decode(base64_string)
image = Image.open(io.BytesIO(image_data))
return image
with open('./data/mmbench/mmbench_test_20230712/mmbench_test_20230712.jsonl', 'w') as f:
for idx in range(len(datas)):
data = datas.iloc[idx]
index = int(data['index'])
question = data['question']
hint = data['hint'] if not pd.isna(data['hint']) else 'N/A'
choices = []
for opt in global_choices:
if pd.isna(data[opt]):
continue
choices.append(data[opt])
# answer = global_choices.index(data['answer'])
image = decode_base64_to_image(data['image'])
image.save("data/mmbench/mmbench_test_20230712/images/%d.jpg" % index)
f.write(json.dumps({
"index": index,
"image": "data/mmbench/mmbench_test_20230712/images/%d.jpg" % index,
"hint": hint,
"question": question,
"choices": choices,
# "answer": answer,
}) + "\n")
import pandas as pd
import json
'''
This script computes the `global top-1 accuracy` metric for mmbench_dev.
'''
predictions = json.load(open('mmbench_dev_20230712.json'))
index2predictions = {}
for pred in predictions:
index2predictions[pred['index']] = pred['prediction']
datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t')
glb_opts = ['A', 'B', 'C', 'D']
index2answer = {}
for idx in range(len(datas)):
data = datas.iloc[idx]
index2answer[data['index']] = glb_opts.index(data['answer'])
identity_indexes = list(set([int(_ % 1e6) for _ in index2predictions.keys()]))
correct = 0
total = 0
for index in identity_indexes:
for _ in range(4):
cycle_index = int(_ * 1e6 + index)
if index2predictions.get(cycle_index, None) is not None:
if index2predictions[cycle_index] == index2answer[cycle_index]:
continue
else:
print(cycle_index)
break
else:
correct += 1
total += 1
print(correct, total)
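# Note on the index encoding above (derived from the loop): MMBench circular
# evaluation stores the k-th rotation of base question `index` as k * 1e6 + index,
# e.g. base index 42 appears as 42, 1000042, 2000042 and 3000042. A question only
# counts as correct when every available rotation is answered correctly.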
import pandas as pd
import json
import random
'''
This script computes the mmbench_dev metric with the same accuracy algorithm as the OpenCompass server.
'''
predictions = json.load(open('mmbench_dev_20230712.json'))
index2predictions = {}
for pred in predictions:
index2predictions[pred['index']] = pred['prediction']
from collections import Counter
def most_common_elements(lst):
counter = Counter(lst)
max_count = max(counter.values())
most_common = [element for element, count in counter.items() if count == max_count]
    return random.choice(most_common)  # break ties by picking one of the most frequent answers at random
datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t')
glb_opts = ['A', 'B', 'C', 'D']
index2answer = {}
index2choices = {}
index2rawanswer = {}
for idx in range(len(datas)):
data = datas.iloc[idx]
choices = []
for opt in glb_opts:
if not pd.isna(data[opt]):
choices.append(data[opt])
index2choices[data['index']] = choices
index2answer[data['index']] = glb_opts.index(data['answer'])
index2rawanswer[data['index']] = choices[glb_opts.index(data['answer'])]
identity_indexes = list(set([int(_ % 1e6) for _ in index2predictions.keys()]))
correct = 0
total = 0
for index in identity_indexes:
raw_preds = []
raw_answer = []
for _ in range(4):
cycle_index = int(_ * 1e6 + index)
if index2predictions.get(cycle_index, None) is not None:
raw_answer = index2rawanswer[cycle_index]
raw_pred = index2choices[cycle_index][index2predictions[cycle_index]]
raw_preds.append(raw_pred)
if len(set(raw_preds)) == 1:
if raw_preds[0] == raw_answer:
correct += 1
else:
result = most_common_elements(raw_preds)
if result == raw_answer:
correct += 1
total += 1
print(correct, total, correct / total * 100.)
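# Illustrative behaviour of most_common_elements (tie-breaking is random):
#   most_common_elements(['A', 'B', 'B', 'C'])  -> 'B'
#   most_common_elements(['A', 'A', 'B', 'B'])  -> 'A' or 'B', chosen at random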
import pandas as pd
import json
import random
'''
This script converts the output file of our inference processor into the target format of the OpenCompass evaluator server.
'''
predictions = json.load(open('mmbench_test_20230712.json'))
index2predictions = {}
for pred in predictions:
index2predictions[pred['index']] = pred['prediction']
from collections import Counter
def most_common_elements(lst):
counter = Counter(lst)
max_count = max(counter.values())
most_common = [element for element, count in counter.items() if count == max_count]
print(most_common)
return random.choice(most_common)
# return most_common
datas = pd.read_csv("data/mmbench/mmbench_test_20230712/mmbench_test_20230712.tsv", sep='\t')
datas = datas.drop('image', axis=1)
glb_opts = ['A', 'B', 'C', 'D']
index2choices = {}
for idx in range(len(datas)):
data = datas.iloc[idx]
choices = []
for opt in glb_opts:
if not pd.isna(data[opt]):
choices.append(data[opt])
index2choices[data['index']] = choices
identity_indexes = list(set([int(_ % 1e6) for _ in index2predictions.keys()]))
processed_index2predictions = {}
for index in identity_indexes:
raw_preds = []
for _ in range(4):
cycle_index = int(_ * 1e6 + index)
if index2predictions.get(cycle_index, None) is not None:
raw_pred = index2choices[cycle_index][index2predictions[cycle_index]]
raw_preds.append(raw_pred)
if len(set(raw_preds)) == 1:
pred_answer = raw_preds[0]
else:
pred_answer = most_common_elements(raw_preds)
print(index, pred_answer)
for _ in range(4):
cycle_index = int(_ * 1e6 + index)
if index2predictions.get(cycle_index, None) is not None:
processed_index2predictions[cycle_index] = index2choices[cycle_index].index(pred_answer)
predictions = []
for idx in range(len(datas)):
data = datas.iloc[idx]
index = data['index']
prediction = glb_opts[processed_index2predictions[index]]
predictions.append(prediction)
datas['prediction'] = predictions
datas.to_excel("mmbench_test_20230712_230831_constrained.xlsx", index=False)
# "constrained" means we force the model to predict the same answer when a question is tested multiple times
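# The resulting xlsx keeps the original TSV columns (minus `image`) plus a
# `prediction` column holding one letter (A-D) per row; based on the script above,
# this is the format expected by the OpenCompass submission server.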
# MME Benchmark
[MME](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation) is a comprehensive evaluation benchmark for multimodal large language models. It measures both perception and cognition abilities on a total of 14 subtasks, including existence, count, position, color, poster, celebrity, scene, landmark, artwork, OCR, commonsense reasoning, numerical calculation, text translation, and code reasoning.
Qwen-VL-Chat achieves SOTA results on both the perception and cognition evaluations.
## Perception Evaluation
| Rank | Model | Version | Score |
|:----:|:---------------:|:------------------------:|:-------:|
| 1 | **[Qwen-VL-Chat](https://github.com/QwenLM/Qwen-VL/)**| **[Qwen-7B](https://github.com/QwenLM/Qwen-7B)** | **1487.57** |
| 2 | Skywork-MM | Skywork-MM-13B | 1419.08 |
| 3 | MMICL | FlanT5xxl | 1376.00 |
| 4 | Lynx | vicuna-7b | 1373.23 |
| 5 | BLIVA | FlanT5xxl | 1337.73 |
## Cognition Evaluation
| Rank | Model | Version | Score |
|:----:|:----------------:|:--------------:|:----------:|
| 1 | **[Qwen-VL-Chat](https://github.com/QwenLM/Qwen-VL/)** | **[Qwen-7B](https://github.com/QwenLM/Qwen-7B)** | **360.71** |
| 2 | MMICL | FlanT5xxl | 360.36 |
| 3 | Skywork-MM | Skywork-MM-13B | 356.43 |
| 4 | BLIVA | FlanT5xxl | 331.43 |
| 5 | LRV-Instruction | LRV-7B | 328.21 |
## Full Metrics
```
=========== Perception ===========
total score: 1487.576330532213
existence score: 158.33333333333331
count score: 150.0
position score: 128.33333333333334
color score: 170.0
posters score: 178.57142857142856
celebrity score: 120.58823529411764
scene score: 152.25
landmark score: 164.0
artwork score: 125.5
OCR score: 140.0
=========== Cognition ===========
total score: 360.71428571428567
commonsense_reasoning score: 130.7142857142857
numerical_calculation score: 40.0
text_translation score: 147.5
code_reasoning score: 42.5
```
## How To Reproduce Results of MME Benchmark
1. Download the MME images and eval_tool from the [MME repo](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/blob/Evaluation/README.md).
2. Rearrange the images by executing `python get_images.py`.
3. Produce the Qwen-VL-Chat results by executing `python eval.py`.
4. Calculate the MME results by executing `python calculation.py --results_dir Qwen-VL-Chat`, where `calculation.py` comes from the MME eval_tool. The collated command sequence is shown below.
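For convenience, the collated command sequence (assuming the image layout produced by `get_images.py` and the `Your_Results` question files from the MME eval_tool) is:
```
python get_images.py
python eval.py
python calculation.py --results_dir Qwen-VL-Chat
```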
import os
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
checkpoint = 'Qwen/Qwen-VL-Chat'
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
checkpoint, device_map='cuda', trust_remote_code=True).eval()
model.generation_config = GenerationConfig.from_pretrained(checkpoint, trust_remote_code=True)
model.generation_config.top_p = 0.01
root = 'Your_Results'
output = 'Qwen-VL-Chat'
os.makedirs(output, exist_ok=True)
for filename in os.listdir(root):
with open(os.path.join(root, filename), 'r') as fin, open(os.path.join(output, filename), 'w') as fout:
lines = fin.read().splitlines()
filename = filename.replace('.txt', '')
for line in tqdm(lines):
img, question, gt = line.strip().split('\t')
img_path = os.path.join('images', filename, img)
assert os.path.exists(img_path), img_path
query = f'<img>{img_path}</img>\n{question}'
response, _ = model.chat(tokenizer, query=query, history=None)
print(img, question, gt, response, sep='\t', file=fout)
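# Note: each line of the MME question files is expected to be
# "image<TAB>question<TAB>ground_truth"; the loop above appends the model response
# as a fourth tab-separated field, which is the layout consumed by calculation.py
# (an assumption based on the README steps above).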
import os
from tqdm import tqdm
os.system('rm -rf images')
os.system('mkdir images')
os.system('cp -r ../MME_Benchmark_release/OCR images/')
os.system('mkdir images/artwork')
os.system('cp ../MME_Benchmark_release/artwork/questions_answers_YN/* images/artwork/')
with open('LaVIN/artwork.txt') as fin:
paths = [ line.strip().split('\t', 1)[0] for line in fin ]
paths = list(set(paths))
for path in tqdm(paths):
os.system(f'cp ../MME_Benchmark_release/artwork/images/toy_dataset/{path} images/artwork/{path}')
os.system('mkdir images/celebrity')
os.system('cp ../MME_Benchmark_release/celebrity/images/* images/celebrity/')
os.system('cp ../MME_Benchmark_release/celebrity/questions_answers_YN/* images/celebrity/')
os.system('cp -r ../MME_Benchmark_release/code_reasoning images/')
os.system('cp -r ../MME_Benchmark_release/color images/')
os.system('cp -r ../MME_Benchmark_release/commonsense_reasoning images/')
os.system('cp -r ../MME_Benchmark_release/count images/')
os.system('cp -r ../MME_Benchmark_release/existence images/')
os.system('mkdir images/landmark')
os.system('cp ../MME_Benchmark_release/landmark/images/* images/landmark/')
os.system('cp ../MME_Benchmark_release/landmark/questions_answers_YN/* images/landmark/')
os.system('cp -r ../MME_Benchmark_release/numerical_calculation images/')
os.system('cp -r ../MME_Benchmark_release/position images/')
os.system('mkdir images/posters')
os.system('cp ../MME_Benchmark_release/posters/images/* images/posters/')
os.system('cp ../MME_Benchmark_release/posters/questions_answers_YN/* images/posters/')
os.system('mkdir images/scene')
os.system('cp ../MME_Benchmark_release/scene/images/* images/scene/')
os.system('cp ../MME_Benchmark_release/scene/questions_answers_YN/* images/scene/')
os.system('cp -r ../MME_Benchmark_release/text_translation images/')
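# After this script runs, images/ contains one sub-directory per MME subtask.
# Subtasks whose release keeps images and questions_answers_YN files in separate
# folders (artwork, celebrity, landmark, posters, scene) are flattened here so that
# eval.py can find both under images/<subtask>/.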
# Seed-Bench Evaluation
[SEED-Bench](https://huggingface.co/spaces/AILab-CVC/SEED-Bench_Leaderboard) is a multimodal benchmark of 19K multiple-choice questions with accurate human annotations for evaluating Multimodal LLMs, covering 12 evaluation dimensions including both **image** and **video** understanding.
Qwen-VL and Qwen-VL-Chat achieve SOTA results on this benchmark.
<p align="center">
<img src="leaderboard.jpg"/>
</p>
## How To Process Video by Qwen-VL
Qwen-VL and Qwen-VL-Chat were not trained on any video data or video tasks, but they can understand some videos in a zero-shot manner. For the video question-answering task, we uniformly sample four frames per video. These frames are treated as separate images and stitched into the context. For example:
```
{
"question_id": "v0",
"prompt": "<img>video_imgs_4/v0_0.jpg</img>\n<img>video_imgs_4/v0_1.jpg</img>\n<img>video_imgs_4/v0_2.jpg</img>\n<img>video_imgs_4/v0_3.jpg</img>\nQuestion: Can you identify the action taking place in the video?\nOptions: A. pretending to take something out of something\nB. pretending to take something from somewhere\nC. feigning to insert something into something\nD. simulating putting something onto something\nAnswer:"
}
```
The above JSON line can be used as input to `eval_mm/seed_bench/eval.py`, which outputs the following result:
```
{"question_id": "v0", "prediction": "B"}
```
Please see [eval_mm/seed_bench/eval.py](eval.py) for more inference details.
## How To Reproduce Results of Seed-Bench
1. Download all images and videos by following the [instructions](https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md). Then modify the root paths in `eval_mm/seed_bench/trans.py` to your customized paths.
```
# path of SEED-Bench.json, download from https://huggingface.co/datasets/AILab-CVC/SEED-Bench/blob/main/SEED-Bench.json
seed_bench_input_path = 'SEED-Bench.json'
# root directory of evaluation dimension 1-9, following https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md
cc3m_dir = "/YOUR_PATH_TO/seed_bench_image"
# root directory of evaluation dimension 10
dimension10_dir = "/YOUR_PATH_TO/SSV2/videos"
# root directory of evaluation dimension 11
dimension11_dir = "/YOUR_PATH_TO/EPIC-KITCHENS/3h91syskeag572hl6tvuovwv4d/videos/test"
# root directory of evaluation dimension 12
dimension12_dir = "/YOUR_PATH_TO/BreakfastII_15fps_qvga_sync"
```
2. Generate the Qwen-VL input files in JSONL format.
```
cd eval_mm/seed_bench/
python trans.py
```
This script outputs two JSONL files and one directory. `image_input.jsonl` is the input file for image evaluation, and `video_input_4.jsonl` is the input file for video evaluation with 4 frames. The directory `video_imgs_4` contains all frames extracted from the videos (4 per video). We provide our [image_input.jsonl](http://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/seed_bench/image_input.jsonl) and [video_input_4.jsonl](http://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/seed_bench/video_input_4.jsonl) for reference; a schematic record is shown below.
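For reference, each line of `image_input.jsonl` follows the prompt template used in `trans.py`; a schematic (hypothetical) record looks like:
```
{"question_id": "1", "prompt": "<img>/YOUR_PATH_TO/seed_bench_image/example.jpg</img>\nQuestion: ...\nOptions: A. ...\nB. ...\nC. ...\nD. ...\nAnswer:", "answer": "A"}
```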
3. Produce the results of Seed-Bench.
```
# The number of available GPUs
export NPROC_PER_NODE=8
# Produce the Qwen-VL-Chat results of image understanding
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
eval.py \
--checkpoint Qwen/Qwen-VL-Chat \
--dataset image_input.jsonl \
--batch-size 4 \
--num-workers 2
# Collect the result files
cat result_?.jsonl >results_chat_img.jsonl
rm result_?.jsonl
# Produce the results of video understanding
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
eval.py \
--checkpoint Qwen/Qwen-VL-Chat \
--dataset video_input_4.jsonl \
--batch-size 2 \
--num-workers 1
# Collect the result files
cat result_?.jsonl >results_chat_vid.jsonl
rm result_?.jsonl
# The file `results_chat.jsonl` can be submitted to the leaderboard
cat results_chat_img.jsonl results_chat_vid.jsonl >results_chat.jsonl
```
You can reproduce the Seed-Bench results of Qwen-VL by replacing `Qwen/Qwen-VL-Chat` with `Qwen/Qwen-VL` in the above script.
import argparse
import itertools
import json
import os
from functools import partial
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
def collate_fn(batches, pad_token_id):
input_tokens = [_['input_tokens'] for _ in batches]
target_lengths = [_['target_lengths'] for _ in batches]
answers = [_['answer'] for _ in batches]
question_id = [_['question_id'] for _ in batches]
chunk_sizes = [len(_) for _ in input_tokens]
input_tokens = [_ for _ in itertools.chain.from_iterable(input_tokens)]
max_lengths = max([len(_) for _ in input_tokens])
input_tokens = [[pad_token_id] * (max_lengths - len(_)) + _
for _ in input_tokens]
input_tokens = torch.LongTensor(input_tokens)
attention_mask = 1 - input_tokens.eq(pad_token_id).float()
return input_tokens, attention_mask, target_lengths, answers, chunk_sizes, question_id
class MultipleChoiceDataset(torch.utils.data.Dataset):
def __init__(self, test, tokenizer):
self.datas = []
with open(test) as fin:
for line in tqdm(fin):
self.datas.append(json.loads(line.strip()))
self.tokenizer = tokenizer
def __len__(self):
return len(self.datas)
def __getitem__(self, idx):
data = self.datas[idx]
prompt = data['prompt']
prompt_tokens = self.tokenizer(prompt).input_ids
target_tokens = [
self.tokenizer(' ' + _).input_ids
for _ in ['A', 'B', 'C', 'D']
]
return {
'input_tokens': [prompt_tokens + _ for _ in target_tokens],
'target_lengths': [len(_) for _ in target_tokens],
'answer': data['answer'],
'question_id': data['question_id'],
}
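# Note: __getitem__ expands each prompt into four candidate sequences, one per option
# letter (' A' .. ' D'); collate_fn above flattens them, and `chunk_sizes` records how
# many sequences belong to each question so their losses can be regrouped per sample.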
class InferenceSampler(torch.utils.data.sampler.Sampler):
def __init__(self, size):
self._size = int(size)
assert size > 0
self._rank = torch.distributed.get_rank()
self._world_size = torch.distributed.get_world_size()
self._local_indices = self._get_local_indices(size, self._world_size,
self._rank)
@staticmethod
def _get_local_indices(total_size, world_size, rank):
shard_size = total_size // world_size
left = total_size % world_size
shard_sizes = [shard_size + int(r < left) for r in range(world_size)]
begin = sum(shard_sizes[:rank])
end = min(sum(shard_sizes[:rank + 1]), total_size)
return range(begin, end)
def __iter__(self):
yield from self._local_indices
def __len__(self):
return len(self._local_indices)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str, default='')
parser.add_argument('--dataset', type=str, default='')
parser.add_argument('--batch-size', type=int, default=1)
parser.add_argument('--num-workers', type=int, default=1)
args = parser.parse_args()
torch.distributed.init_process_group(
backend='nccl',
world_size=int(os.getenv('WORLD_SIZE', '1')),
rank=int(os.getenv('RANK', '0')),
)
torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))
model = AutoModelForCausalLM.from_pretrained(
args.checkpoint, device_map='cuda', trust_remote_code=True).eval()
tokenizer = AutoTokenizer.from_pretrained(args.checkpoint,
trust_remote_code=True)
model.generation_config = GenerationConfig.from_pretrained(args.checkpoint, trust_remote_code=True)
model.generation_config.top_p = 0.01
    dataset = MultipleChoiceDataset(test=args.dataset, tokenizer=tokenizer)
dataloader = torch.utils.data.DataLoader(
dataset=dataset,
# sampler=InferenceSampler(1000),
sampler=InferenceSampler(len(dataset)),
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=True,
drop_last=False,
collate_fn=partial(collate_fn, pad_token_id=tokenizer.eod_id),
)
results = []
fout = open('result_{}.jsonl'.format(torch.distributed.get_rank()), 'w')
with torch.no_grad():
for _, (input_tokens, attention_mask, target_lengths, answers,
chunk_sizes, question_ids) in tqdm(enumerate(dataloader)):
outputs = model(
input_ids=input_tokens[:, :-1].cuda(),
attention_mask=attention_mask[:, :-1].cuda(),
return_dict=True,
)
losses = torch.nn.functional.cross_entropy(outputs.logits.permute(
0, 2, 1),
input_tokens[:,
1:].cuda(),
reduction='none')
losses = losses.split(chunk_sizes, dim=0)
for loss, target_length, answer, question_id in zip(losses, target_lengths,
answers, question_ids):
target_loss = loss.mean(-1)
for _ in range(len(target_length)):
target_loss[_] = loss[_, -target_length[_]:].mean()
pred = target_loss.argmin().item()
pred = chr(pred + 65)
if pred == answer:
results.append(1)
else:
results.append(0)
answer_record = {
'question_id': question_id,
'prediction': pred
}
print(json.dumps(answer_record), file=fout)
fout.close()
torch.distributed.barrier()
world_size = torch.distributed.get_world_size()
merged_results = [None for _ in range(world_size)]
torch.distributed.all_gather_object(merged_results, results)
merged_results = [_ for _ in itertools.chain.from_iterable(merged_results)]
if torch.distributed.get_rank() == 0:
print(f"Evaluating {args.dataset} ...")
print(f'Acc@1: {sum(merged_results) / len(merged_results)}')
torch.distributed.barrier()
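# Scoring rule used above: each of the four candidate continuations is ranked by the
# mean cross-entropy over its option tokens, the lowest-loss option is taken as the
# prediction, and its index is mapped to a letter via chr(pred + 65).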
import os
import av
import json
import torch
import numpy as np
from PIL import Image
from tqdm import tqdm
from decord import VideoReader, cpu
# path of SEED-Bench.json, download from https://huggingface.co/datasets/AILab-CVC/SEED-Bench/blob/main/SEED-Bench.json
seed_bench_input_path = 'SEED-Bench.json'
# root directory of evaluation dimension 1-9, following https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md
cc3m_dir = "/YOUR_PATH_TO/seed_bench_image"
# root directory of evaluation dimension 10
dimension10_dir = "/YOUR_PATH_TO/SSV2/videos"
# root directory of evaluation dimension 11
dimension11_dir = "/YOUR_PATH_TO/EPIC-KITCHENS/3h91syskeag572hl6tvuovwv4d/videos/test"
# root directory of evaluation dimension 12
dimension12_dir = "/YOUR_PATH_TO/BreakfastII_15fps_qvga_sync"
def is_integer_string(s):
try:
int(s)
return True
except ValueError:
return False
def filter_questions(data, task='all'):
if task == "image":
return [q for q in data if 1 <= q["question_type_id"] <= 9]
elif task == "video":
return [q for q in data if 10 <= q["question_type_id"] <= 12]
elif task == "all":
return data
elif is_integer_string(task):
return [q for q in data if q["question_type_id"] == int(task)]
else:
raise ValueError(f"Invalid task: {task}")
def get_index(num_frames, num_segments):
if num_segments > num_frames:
offsets = np.array([
idx for idx in range(num_frames)
])
else:
# uniform sampling
seg_size = float(num_frames - 1) / num_segments
start = int(seg_size / 2)
offsets = np.array([
start + int(np.round(seg_size * idx)) for idx in range(num_segments)
])
return offsets
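# Example: get_index(9, 4) returns array([1, 3, 5, 7]), i.e. 4 frame indices sampled
# uniformly from a 9-frame clip.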
with open(seed_bench_input_path) as fin:
qa_anno = json.load(fin)['questions']
fout = open('image_input.jsonl', 'w')
i_anno = filter_questions(qa_anno, 'image')
for qa_item in tqdm(i_anno):
data_path = cc3m_dir + qa_item['data_id']
choices = [qa_item['choice_a'], qa_item['choice_b'], qa_item['choice_c'], qa_item['choice_d']]
choice_list = []
for i, c in enumerate(choices):
choice_list.append('{}. {}'.format(chr(i + 65), c))
choice_txt = '\n'.join(choice_list)
prompt = '<img>{}</img>\nQuestion: {}\nOptions: {}\nAnswer:'.format(
data_path, qa_item['question'], choice_txt)
print(json.dumps({
'question_id': qa_item['question_id'],
'prompt': prompt,
'answer': qa_item['answer'],
}), file=fout)
fout.close()
n_frames = 8
os.system('rm -rf video_input_' + str(n_frames))
os.makedirs('video_imgs_' + str(n_frames), exist_ok=True)
fout = open('video_input_{}.jsonl'.format(n_frames), 'w')
v_anno = filter_questions(qa_anno, 'video')
for qa_item in tqdm(v_anno):
if qa_item['question_type_id'] == 12:
data_path = dimension12_dir + qa_item['data_id']
elif qa_item['question_type_id'] == 11:
data_path = dimension11_dir + qa_item['data_id'].split('/')[-1]
elif qa_item['question_type_id'] == 10:
data_path = dimension10_dir + qa_item['data_id']
else:
assert False, str(qa_item)
print(data_path)
use_pyav = False
if 'segment' in qa_item.keys():
segment = qa_item['segment']
if isinstance(segment[0], int):
# using pyav for decoding videos in evaluation dimension 12
use_pyav = True
start, end = segment[0], segment[1]
else:
start = 0.0
end = 0.0
if use_pyav:
# using pyav for decoding videos in evaluation dimension 12
reader = av.open(data_path)
frames = [torch.from_numpy(f.to_rgb().to_ndarray()) for f in reader.decode(video=0)]
video_len = len(frames)
start_frame, end_frame = start, end
end_frame = min(end_frame, video_len)
offset = get_index(end_frame - start_frame, n_frames)
frame_indices = offset + start_frame
images = torch.stack([frames[idx] for idx in frame_indices]).numpy()
else:
# using decord for decoding videos in evaluation dimension 10-11
try:
vr = VideoReader(data_path, num_threads=1, ctx=cpu(0))
video_len = len(vr)
fps = vr.get_avg_fps()
if 'segment' in qa_item.keys():
# obtain start and end frame for the video segment in evaluation dimension 11
start_frame = int(min(max(start * fps, 0), video_len - 1))
end_frame = int(min(max(end * fps, 0), video_len - 1))
tot_frames = int(end_frame - start_frame)
offset = get_index(tot_frames, n_frames)
frame_indices = offset + start_frame
else:
# sample frames of the video in evaluation dimension 10
frame_indices = get_index(video_len - 1, n_frames)
vr.seek(0)
images = vr.get_batch(frame_indices).asnumpy()
except Exception as e:
print(json.dumps({
'question_id': qa_item['question_id'],
'prompt': "Error" + str(e),
'answer': qa_item['answer'],
}), file=fout)
continue
prompt = ''
for i in range(images.shape[0]):
data = Image.fromarray(images[i])
img_path = 'video_imgs_{}/{}_{}.jpg'.format(n_frames, qa_item['question_id'], i)
data.save(img_path)
prompt += '<img>' + img_path + '</img>\n'
choices = [qa_item['choice_a'], qa_item['choice_b'], qa_item['choice_c'], qa_item['choice_d']]
choice_list = []
for i, c in enumerate(choices):
choice_list.append('{}. {}'.format(chr(i + 65), c))
choice_txt = '\n'.join(choice_list)
prompt += 'Question: {}\nOptions: {}\nAnswer:'.format(qa_item['question'], choice_txt)
print(json.dumps({
'question_id': qa_item['question_id'],
'prompt': prompt,
'answer': qa_item['answer'],
}), file=fout)
fout.close()
"""Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
__author__ = 'aagrawal'
__version__ = '0.9'
# Interface for accessing the VQA dataset.
# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link:
# (https://github.com/pdollar/coco/blob/master/PythonAPI/pycocotools/coco.py).
# The following functions are defined:
# VQA - VQA class that loads VQA annotation file and prepares data structures.
# getQuesIds - Get question ids that satisfy given filter conditions.
# getImgIds - Get image ids that satisfy given filter conditions.
# loadQA - Load questions and answers with the specified question ids.
# showQA - Display the specified questions and answers.
# loadRes - Load result file and create result object.
# Help on each function can be accessed by: "help(COCO.function)"
import copy
import datetime
import json
class VQA:
def __init__(self, annotation_file=None, question_file=None):
"""Constructor of VQA helper class for reading and visualizing
questions and answers.
:param annotation_file (str): location of VQA annotation file
:return:
"""
# load dataset
self.dataset = {}
self.questions = {}
self.qa = {}
self.qqa = {}
self.imgToQA = {}
if not annotation_file == None and not question_file == None:
print('loading VQA annotations and questions into memory...')
time_t = datetime.datetime.utcnow()
dataset = json.load(open(annotation_file, 'r'))
questions = json.load(open(question_file, 'r'))
self.dataset = dataset
self.questions = questions
self.createIndex()
def createIndex(self):
# create index
print('creating index...')
imgToQA = {ann['image_id']: [] for ann in self.dataset['annotations']}
qa = {ann['question_id']: [] for ann in self.dataset['annotations']}
qqa = {ann['question_id']: [] for ann in self.dataset['annotations']}
for ann in self.dataset['annotations']:
imgToQA[ann['image_id']] += [ann]
qa[ann['question_id']] = ann
for ques in self.questions['questions']:
qqa[ques['question_id']] = ques
print('index created!')
# create class members
self.qa = qa
self.qqa = qqa
self.imgToQA = imgToQA
def info(self):
"""Print information about the VQA annotation file.
:return:
"""
        for key, value in self.dataset['info'].items():
print('%s: %s' % (key, value))
def getQuesIds(self, imgIds=[], quesTypes=[], ansTypes=[]):
"""Get question ids that satisfy given filter conditions. default skips
that filter.
:param imgIds (int array) : get question ids for given imgs
quesTypes (str array) : get question ids for given question types
ansTypes (str array) : get question ids for given answer types
:return: ids (int array) : integer array of question ids
"""
imgIds = imgIds if type(imgIds) == list else [imgIds]
quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]
if len(imgIds) == len(quesTypes) == len(ansTypes) == 0:
anns = self.dataset['annotations']
else:
if not len(imgIds) == 0:
anns = sum(
[
self.imgToQA[imgId]
for imgId in imgIds if imgId in self.imgToQA
],
[],
)
else:
anns = self.dataset['annotations']
anns = (anns if len(quesTypes) == 0 else
[ann for ann in anns if ann['question_type'] in quesTypes])
anns = (anns if len(ansTypes) == 0 else
[ann for ann in anns if ann['answer_type'] in ansTypes])
ids = [ann['question_id'] for ann in anns]
return ids
def getImgIds(self, quesIds=[], quesTypes=[], ansTypes=[]):
"""Get image ids that satisfy given filter conditions. default skips
that filter.
:param quesIds (int array) : get image ids for given question ids
quesTypes (str array) : get image ids for given question types
ansTypes (str array) : get image ids for given answer types
:return: ids (int array) : integer array of image ids
"""
quesIds = quesIds if type(quesIds) == list else [quesIds]
quesTypes = quesTypes if type(quesTypes) == list else [quesTypes]
ansTypes = ansTypes if type(ansTypes) == list else [ansTypes]
if len(quesIds) == len(quesTypes) == len(ansTypes) == 0:
anns = self.dataset['annotations']
else:
if not len(quesIds) == 0:
anns = sum([
self.qa[quesId] for quesId in quesIds if quesId in self.qa
], [])
else:
anns = self.dataset['annotations']
anns = (anns if len(quesTypes) == 0 else
[ann for ann in anns if ann['question_type'] in quesTypes])
anns = (anns if len(ansTypes) == 0 else
[ann for ann in anns if ann['answer_type'] in ansTypes])
ids = [ann['image_id'] for ann in anns]
return ids
def loadQA(self, ids=[]):
"""Load questions and answers with the specified question ids.
:param ids (int array) : integer ids specifying question ids
:return: qa (object array) : loaded qa objects
"""
if type(ids) == list:
return [self.qa[id] for id in ids]
elif type(ids) == int:
return [self.qa[ids]]
def showQA(self, anns):
"""Display the specified annotations.
:param anns (array of object): annotations to display
:return: None
"""
if len(anns) == 0:
return 0
for ann in anns:
quesId = ann['question_id']
print('Question: %s' % (self.qqa[quesId]['question']))
for ans in ann['answers']:
print('Answer %d: %s' % (ans['answer_id'], ans['answer']))
def loadRes(self, resFile, quesFile):
"""Load result file and return a result object.
:param resFile (str) : file name of result file
:return: res (obj) : result api object
"""
res = VQA()
res.questions = json.load(open(quesFile))
res.dataset['info'] = copy.deepcopy(self.questions['info'])
res.dataset['task_type'] = copy.deepcopy(self.questions['task_type'])
res.dataset['data_type'] = copy.deepcopy(self.questions['data_type'])
res.dataset['data_subtype'] = copy.deepcopy(
self.questions['data_subtype'])
res.dataset['license'] = copy.deepcopy(self.questions['license'])
print('Loading and preparing results... ')
time_t = datetime.datetime.utcnow()
anns = json.load(open(resFile))
assert type(anns) == list, 'results is not an array of objects'
annsQuesIds = [ann['question_id'] for ann in anns]
assert set(annsQuesIds) == set(
self.getQuesIds()
        ), 'Results do not correspond to the current VQA set. Either the results do not have predictions for all question ids in the annotation file, or there is at least one question id that does not belong to the question ids in the annotation file.'
for ann in anns:
quesId = ann['question_id']
if res.dataset['task_type'] == 'Multiple Choice':
assert (
ann['answer'] in self.qqa[quesId]['multiple_choices']
), 'predicted answer is not one of the multiple choices'
qaAnn = self.qa[quesId]
ann['image_id'] = qaAnn['image_id']
ann['question_type'] = qaAnn['question_type']
ann['answer_type'] = qaAnn['answer_type']
print('DONE (t=%0.2fs)' %
((datetime.datetime.utcnow() - time_t).total_seconds()))
res.dataset['annotations'] = anns
res.createIndex()
return res
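# Typical usage (file names are placeholders for the standard VQA v2 release):
#   vqa = VQA(annotation_file='v2_mscoco_val2014_annotations.json',
#             question_file='v2_OpenEnded_mscoco_val2014_questions.json')
#   vqaRes = vqa.loadRes('results.json', 'v2_OpenEnded_mscoco_val2014_questions.json')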
"""Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
# coding=utf-8
__author__ = 'aagrawal'
import re
# This code is based on the code written by Tsung-Yi Lin for MSCOCO Python API available at the following link:
# (https://github.com/tylin/coco-caption/blob/master/pycocoevalcap/eval.py).
import sys
class VQAEval:
def __init__(self, vqa=None, vqaRes=None, n=2):
self.n = n
self.accuracy = {}
self.evalQA = {}
self.evalQuesType = {}
self.evalAnsType = {}
self.vqa = vqa
self.vqaRes = vqaRes
if vqa is not None:
self.params = {'question_id': vqa.getQuesIds()}
self.contractions = {
'aint': "ain't",
'arent': "aren't",
'cant': "can't",
'couldve': "could've",
'couldnt': "couldn't",
"couldn'tve": "couldn't've",
"couldnt've": "couldn't've",
'didnt': "didn't",
'doesnt': "doesn't",
'dont': "don't",
'hadnt': "hadn't",
"hadnt've": "hadn't've",
"hadn'tve": "hadn't've",
'hasnt': "hasn't",
'havent': "haven't",
'hed': "he'd",
"hed've": "he'd've",
"he'dve": "he'd've",
'hes': "he's",
'howd': "how'd",
'howll': "how'll",
'hows': "how's",
"Id've": "I'd've",
"I'dve": "I'd've",
'Im': "I'm",
'Ive': "I've",
'isnt': "isn't",
'itd': "it'd",
"itd've": "it'd've",
"it'dve": "it'd've",
'itll': "it'll",
"let's": "let's",
'maam': "ma'am",
'mightnt': "mightn't",
"mightnt've": "mightn't've",
"mightn'tve": "mightn't've",
'mightve': "might've",
'mustnt': "mustn't",
'mustve': "must've",
'neednt': "needn't",
'notve': "not've",
'oclock': "o'clock",
'oughtnt': "oughtn't",
"ow's'at": "'ow's'at",
"'ows'at": "'ow's'at",
"'ow'sat": "'ow's'at",
'shant': "shan't",
"shed've": "she'd've",
"she'dve": "she'd've",
"she's": "she's",
'shouldve': "should've",
'shouldnt': "shouldn't",
"shouldnt've": "shouldn't've",
"shouldn'tve": "shouldn't've",
"somebody'd": 'somebodyd',
"somebodyd've": "somebody'd've",
"somebody'dve": "somebody'd've",
'somebodyll': "somebody'll",
'somebodys': "somebody's",
'someoned': "someone'd",
"someoned've": "someone'd've",
"someone'dve": "someone'd've",
'someonell': "someone'll",
'someones': "someone's",
'somethingd': "something'd",
"somethingd've": "something'd've",
"something'dve": "something'd've",
'somethingll': "something'll",
'thats': "that's",
'thered': "there'd",
"thered've": "there'd've",
"there'dve": "there'd've",
'therere': "there're",
'theres': "there's",
'theyd': "they'd",
"theyd've": "they'd've",
"they'dve": "they'd've",
'theyll': "they'll",
'theyre': "they're",
'theyve': "they've",
'twas': "'twas",
'wasnt': "wasn't",
"wed've": "we'd've",
"we'dve": "we'd've",
'weve': "we've",
'werent': "weren't",
'whatll': "what'll",
'whatre': "what're",
'whats': "what's",
'whatve': "what've",
'whens': "when's",
'whered': "where'd",
'wheres': "where's",
'whereve': "where've",
'whod': "who'd",
"whod've": "who'd've",
"who'dve": "who'd've",
'wholl': "who'll",
'whos': "who's",
'whove': "who've",
'whyll': "why'll",
'whyre': "why're",
'whys': "why's",
'wont': "won't",
'wouldve': "would've",
'wouldnt': "wouldn't",
"wouldnt've": "wouldn't've",
"wouldn'tve": "wouldn't've",
'yall': "y'all",
"yall'll": "y'all'll",
"y'allll": "y'all'll",
"yall'd've": "y'all'd've",
"y'alld've": "y'all'd've",
"y'all'dve": "y'all'd've",
'youd': "you'd",
"youd've": "you'd've",
"you'dve": "you'd've",
'youll': "you'll",
'youre': "you're",
'youve': "you've",
}
self.manualMap = {
'none': '0',
'zero': '0',
'one': '1',
'two': '2',
'three': '3',
'four': '4',
'five': '5',
'six': '6',
'seven': '7',
'eight': '8',
'nine': '9',
'ten': '10',
}
self.articles = ['a', 'an', 'the']
self.periodStrip = re.compile('(?!<=\d)(\.)(?!\d)')
self.commaStrip = re.compile('(\d)(,)(\d)')
self.punct = [
';',
r'/',
'[',
']',
'"',
'{',
'}',
'(',
')',
'=',
'+',
'\\',
'_',
'-',
'>',
'<',
'@',
'`',
',',
'?',
'!',
]
def evaluate(self, quesIds=None):
if quesIds == None:
quesIds = [quesId for quesId in self.params['question_id']]
gts = {}
res = {}
for quesId in quesIds:
gts[quesId] = self.vqa.qa[quesId]
res[quesId] = self.vqaRes.qa[quesId]
# =================================================
# Compute accuracy
# =================================================
accQA = []
accQuesType = {}
accAnsType = {}
print('computing accuracy')
step = 0
for quesId in quesIds:
resAns = res[quesId]['answer']
resAns = resAns.replace('\n', ' ')
resAns = resAns.replace('\t', ' ')
resAns = resAns.strip()
resAns = self.processPunctuation(resAns)
resAns = self.processDigitArticle(resAns)
gtAcc = []
gtAnswers = [ans['answer'] for ans in gts[quesId]['answers']]
if len(set(gtAnswers)) > 1:
for ansDic in gts[quesId]['answers']:
ansDic['answer'] = self.processPunctuation(
ansDic['answer'])
for gtAnsDatum in gts[quesId]['answers']:
otherGTAns = [
item for item in gts[quesId]['answers']
if item != gtAnsDatum
]
matchingAns = [
item for item in otherGTAns if item['answer'] == resAns
]
acc = min(1, float(len(matchingAns)) / 3)
gtAcc.append(acc)
quesType = gts[quesId]['question_type']
ansType = gts[quesId]['answer_type']
avgGTAcc = float(sum(gtAcc)) / len(gtAcc)
accQA.append(avgGTAcc)
if quesType not in accQuesType:
accQuesType[quesType] = []
accQuesType[quesType].append(avgGTAcc)
if ansType not in accAnsType:
accAnsType[ansType] = []
accAnsType[ansType].append(avgGTAcc)
self.setEvalQA(quesId, avgGTAcc)
self.setEvalQuesType(quesId, quesType, avgGTAcc)
self.setEvalAnsType(quesId, ansType, avgGTAcc)
if step % 100 == 0:
self.updateProgress(step / float(len(quesIds)))
step = step + 1
self.setAccuracy(accQA, accQuesType, accAnsType)
print('Done computing accuracy')
def processPunctuation(self, inText):
outText = inText
for p in self.punct:
if (p + ' ' in inText or ' ' + p
in inText) or (re.search(self.commaStrip, inText) != None):
outText = outText.replace(p, '')
else:
outText = outText.replace(p, ' ')
outText = self.periodStrip.sub('', outText, re.UNICODE)
return outText
def processDigitArticle(self, inText):
outText = []
tempText = inText.lower().split()
for word in tempText:
word = self.manualMap.setdefault(word, word)
if word not in self.articles:
outText.append(word)
else:
pass
for wordId, word in enumerate(outText):
if word in self.contractions:
outText[wordId] = self.contractions[word]
outText = ' '.join(outText)
return outText
def setAccuracy(self, accQA, accQuesType, accAnsType):
self.accuracy['overall'] = round(100 * float(sum(accQA)) / len(accQA),
self.n)
self.accuracy['perQuestionType'] = {
quesType: round(
100 * float(sum(accQuesType[quesType])) /
len(accQuesType[quesType]),
self.n,
)
for quesType in accQuesType
}
self.accuracy['perAnswerType'] = {
ansType: round(
100 * float(sum(accAnsType[ansType])) /
len(accAnsType[ansType]), self.n)
for ansType in accAnsType
}
def setEvalQA(self, quesId, acc):
self.evalQA[quesId] = round(100 * acc, self.n)
def setEvalQuesType(self, quesId, quesType, acc):
if quesType not in self.evalQuesType:
self.evalQuesType[quesType] = {}
self.evalQuesType[quesType][quesId] = round(100 * acc, self.n)
def setEvalAnsType(self, quesId, ansType, acc):
if ansType not in self.evalAnsType:
self.evalAnsType[ansType] = {}
self.evalAnsType[ansType][quesId] = round(100 * acc, self.n)
def updateProgress(self, progress):
barLength = 20
status = ''
if isinstance(progress, int):
progress = float(progress)
if not isinstance(progress, float):
progress = 0
status = 'error: progress var must be float\r\n'
if progress < 0:
progress = 0
status = 'Halt...\r\n'
if progress >= 1:
progress = 1
status = 'Done...\r\n'
block = int(round(barLength * progress))
        text = '\rFinished Percent: [{0}] {1}% {2}'.format(
'#' * block + '-' * (barLength - block), int(progress * 100),
status)
sys.stdout.write(text)
sys.stdout.flush()
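# Typical usage together with the VQA class above:
#   vqaEval = VQAEval(vqa, vqaRes, n=2)  # n is the number of decimal places for accuracies
#   vqaEval.evaluate()
#   print(vqaEval.accuracy['overall'])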
# This code is based on the revised code from fastchat based on tatsu-lab/stanford_alpaca.
from dataclasses import dataclass, field
import json
import math
import logging
import os
from typing import Dict, Optional, List
import torch
from torch.utils.data import Dataset
from deepspeed import zero
from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
import transformers
from transformers import Trainer, GPTQConfig, deepspeed
from transformers.trainer_pt_utils import LabelSmoother
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from accelerate.utils import DistributedType
IGNORE_TOKEN_ID = LabelSmoother.ignore_index
@dataclass
class ModelArguments:
model_name_or_path: Optional[str] = field(default="Qwen/Qwen-7B")
@dataclass
class DataArguments:
data_path: str = field(
default=None, metadata={"help": "Path to the training data."}
)
eval_data_path: str = field(
default=None, metadata={"help": "Path to the evaluation data."}
)
lazy_preprocess: bool = False
@dataclass
class TrainingArguments(transformers.TrainingArguments):
cache_dir: Optional[str] = field(default=None)
optim: str = field(default="adamw_torch")
model_max_length: int = field(
default=8192,
metadata={
"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
},
)
use_lora: bool = False
fix_vit: bool = True
@dataclass
class LoraArguments:
lora_r: int = 64
lora_alpha: int = 16
lora_dropout: float = 0.05
lora_target_modules: List[str] = field(
default_factory=lambda: ["c_attn", "attn.c_proj", "w1", "w2"] ##["in_proj","out_proj","c_fc"]
)
lora_weight_path: str = ""
lora_bias: str = "none"
q_lora: bool = False
def maybe_zero_3(param):
if hasattr(param, "ds_id"):
assert param.ds_status == ZeroParamStatus.NOT_AVAILABLE
with zero.GatheredParameters([param]):
param = param.data.detach().cpu().clone()
else:
param = param.detach().cpu().clone()
return param
# Borrowed from peft.utils.get_peft_model_state_dict
def get_peft_state_maybe_zero_3(named_params, bias):
if bias == "none":
to_return = {k: t for k, t in named_params if "lora_" in k}
elif bias == "all":
to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
elif bias == "lora_only":
to_return = {}
maybe_lora_bias = {}
lora_bias_names = set()
for k, t in named_params:
if "lora_" in k:
to_return[k] = t
bias_name = k.split("lora_")[0] + "bias"
lora_bias_names.add(bias_name)
elif "bias" in k:
maybe_lora_bias[k] = t
        for k, t in maybe_lora_bias.items():
if bias_name in lora_bias_names:
to_return[bias_name] = t
else:
raise NotImplementedError
to_return = {k: maybe_zero_3(v) for k, v in to_return.items()}
return to_return
local_rank = None
def rank0_print(*args):
if local_rank == 0:
print(*args)
def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str, bias="none"):
"""Collects the state dict and dump to disk."""
# check if zero3 mode enabled
if deepspeed.is_deepspeed_zero3_enabled():
state_dict = trainer.model_wrapped._zero3_consolidated_16bit_state_dict()
else:
if trainer.args.use_lora:
state_dict = get_peft_state_maybe_zero_3(
trainer.model.named_parameters(), bias
)
else:
state_dict = trainer.model.state_dict()
if trainer.args.should_save and trainer.args.local_rank == 0:
trainer._save(output_dir, state_dict=state_dict)
def preprocess(
sources,
tokenizer: transformers.PreTrainedTokenizer,
max_len: int,
system_message: str = "You are a helpful assistant."
) -> Dict:
roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"}
im_start = tokenizer.im_start_id
im_end = tokenizer.im_end_id
nl_tokens = tokenizer('\n').input_ids
_system = tokenizer('system').input_ids + nl_tokens
_user = tokenizer('user').input_ids + nl_tokens
_assistant = tokenizer('assistant').input_ids + nl_tokens
# Apply prompt templates
input_ids, targets = [], []
for i, source in enumerate(sources):
if roles[source[0]["from"]] != roles["user"]:
source = source[1:]
input_id, target = [], []
system = [im_start] + _system + tokenizer(system_message).input_ids + [im_end] + nl_tokens
input_id += system
target += [im_start] + [IGNORE_TOKEN_ID] * (len(system)-3) + [im_end] + nl_tokens
assert len(input_id) == len(target)
for j, sentence in enumerate(source):
role = roles[sentence["from"]]
_input_id = tokenizer(role).input_ids + nl_tokens + \
tokenizer(sentence["value"]).input_ids + [im_end] + nl_tokens
input_id += _input_id
if role == '<|im_start|>user':
_target = [im_start] + [IGNORE_TOKEN_ID] * (len(_input_id)-3) + [im_end] + nl_tokens
elif role == '<|im_start|>assistant':
_target = [im_start] + [IGNORE_TOKEN_ID] * len(tokenizer(role).input_ids) + \
_input_id[len(tokenizer(role).input_ids)+1:-2] + [im_end] + nl_tokens
else:
raise NotImplementedError
target += _target
assert len(input_id) == len(target)
input_id += [tokenizer.pad_token_id] * (max_len - len(input_id))
target += [IGNORE_TOKEN_ID] * (max_len - len(target))
input_ids.append(input_id[:max_len])
targets.append(target[:max_len])
input_ids = torch.tensor(input_ids, dtype=torch.int)
targets = torch.tensor(targets, dtype=torch.int)
return dict(
input_ids=input_ids,
labels=targets,
attention_mask=input_ids.ne(tokenizer.pad_token_id),
)
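# Note: preprocess() packs each conversation into ChatML-style turns
# (<|im_start|>role\n ... <|im_end|>\n), pads/truncates every sequence to max_len, and
# masks everything except the assistant replies with IGNORE_TOKEN_ID, so the loss is
# computed only on the assistant responses.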
class SupervisedDataset(Dataset):
"""Dataset for supervised fine-tuning."""
def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int):
super(SupervisedDataset, self).__init__()
rank0_print("Formatting inputs...")
sources = [example["conversations"] for example in raw_data]
data_dict = preprocess(sources, tokenizer, max_len)
self.input_ids = data_dict["input_ids"]
self.labels = data_dict["labels"]
self.attention_mask = data_dict["attention_mask"]
def __len__(self):
return len(self.input_ids)
def __getitem__(self, i) -> Dict[str, torch.Tensor]:
return dict(
input_ids=self.input_ids[i],
labels=self.labels[i],
attention_mask=self.attention_mask[i],
)
class LazySupervisedDataset(Dataset):
"""Dataset for supervised fine-tuning."""
def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer, max_len: int):
super(LazySupervisedDataset, self).__init__()
self.tokenizer = tokenizer
self.max_len = max_len
rank0_print("Formatting inputs...Skip in lazy mode")
self.tokenizer = tokenizer
self.raw_data = raw_data
self.cached_data_dict = {}
def __len__(self):
return len(self.raw_data)
def __getitem__(self, i) -> Dict[str, torch.Tensor]:
if i in self.cached_data_dict:
return self.cached_data_dict[i]
ret = preprocess([self.raw_data[i]["conversations"]], self.tokenizer, self.max_len)
ret = dict(
input_ids=ret["input_ids"][0],
labels=ret["labels"][0],
attention_mask=ret["attention_mask"][0],
)
self.cached_data_dict[i] = ret
return ret
def make_supervised_data_module(
tokenizer: transformers.PreTrainedTokenizer, data_args, max_len,
) -> Dict:
"""Make dataset and collator for supervised fine-tuning."""
dataset_cls = (
LazySupervisedDataset if data_args.lazy_preprocess else SupervisedDataset
)
rank0_print("Loading data...")
train_json = json.load(open(data_args.data_path, "r"))
train_dataset = dataset_cls(train_json, tokenizer=tokenizer, max_len=max_len)
if data_args.eval_data_path:
eval_json = json.load(open(data_args.eval_data_path, "r"))
eval_dataset = dataset_cls(eval_json, tokenizer=tokenizer, max_len=max_len)
else:
eval_dataset = None
return dict(train_dataset=train_dataset, eval_dataset=eval_dataset)
def train():
global local_rank
parser = transformers.HfArgumentParser(
(ModelArguments, DataArguments, TrainingArguments, LoraArguments)
)
(
model_args,
data_args,
training_args,
lora_args,
) = parser.parse_args_into_dataclasses()
if getattr(training_args, 'deepspeed', None) and getattr(lora_args, 'q_lora', False):
training_args.distributed_state.distributed_type = DistributedType.DEEPSPEED
compute_dtype = (
torch.float16
if training_args.fp16
else (torch.bfloat16 if training_args.bf16 else torch.float32)
)
local_rank = training_args.local_rank
device_map = None
world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1
if lora_args.q_lora:
device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} if ddp else None
if len(training_args.fsdp) > 0 or deepspeed.is_deepspeed_zero3_enabled():
            logging.warning(
                "FSDP or ZeRO3 is incompatible with QLoRA."
            )
# Set RoPE scaling factor
config = transformers.AutoConfig.from_pretrained(
model_args.model_name_or_path,
cache_dir=training_args.cache_dir,
trust_remote_code=True,
)
config.use_cache = False
# Load model and tokenizer
model = transformers.AutoModelForCausalLM.from_pretrained(
model_args.model_name_or_path,
config=config,
cache_dir=training_args.cache_dir,
device_map=device_map,
trust_remote_code=True,
quantization_config=GPTQConfig(
bits=4, disable_exllama=True
)
if training_args.use_lora and lora_args.q_lora
else None,
)
if not training_args.use_lora:
if training_args.fix_vit and hasattr(model,'transformer') and hasattr(model.transformer,'visual'):
model.transformer.visual.requires_grad_(False)
if hasattr(model.transformer.visual,'attn_pool'):
model.transformer.visual.attn_pool.requires_grad_(True)
tokenizer = transformers.AutoTokenizer.from_pretrained(
model_args.model_name_or_path,
cache_dir=training_args.cache_dir,
model_max_length=training_args.model_max_length,
padding_side="right",
use_fast=False,
trust_remote_code=True,
)
tokenizer.pad_token_id = tokenizer.eod_id
if training_args.use_lora:
if lora_args.q_lora or "chat" in model_args.model_name_or_path.lower():
modules_to_save = None
else:
modules_to_save = ["wte", "lm_head"]
lora_config = LoraConfig(
r=lora_args.lora_r,
lora_alpha=lora_args.lora_alpha,
target_modules=lora_args.lora_target_modules,
lora_dropout=lora_args.lora_dropout,
bias=lora_args.lora_bias,
task_type="CAUSAL_LM",
modules_to_save=modules_to_save # This argument serves for adding new tokens.
)
if lora_args.q_lora:
model = prepare_model_for_kbit_training(
model, use_gradient_checkpointing=training_args.gradient_checkpointing
)
model = get_peft_model(model, lora_config)
if training_args.gradient_checkpointing:
model.enable_input_require_grads()
# Load data
data_module = make_supervised_data_module(
tokenizer=tokenizer, data_args=data_args, max_len=training_args.model_max_length
)
    # Start the trainer
trainer = Trainer(
model=model, tokenizer=tokenizer, args=training_args, **data_module
)
trainer.train()
trainer.save_state()
safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir, bias=lora_args.lora_bias)
if __name__ == "__main__":
train()
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "none",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 2e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 2e8,
"contiguous_gradients": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 100,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "none",
"pin_memory": true
},
"offload_param": {
"device": "none",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 100,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
#!/bin/bash
export CUDA_DEVICE_MAX_CONNECTIONS=1
DIR=`pwd`
GPUS_PER_NODE=8
NNODES=1
NODE_RANK=0
MASTER_ADDR=localhost
MASTER_PORT=6001
MODEL="Qwen/Qwen-VL-Chat" # "Qwen/Qwen-VL-Chat" or "Qwen/Qwen-VL"; set a local path if you do not want to load from Hugging Face directly
# ATTENTION: specify the path to your training data, which should be a JSON file consisting of a list of conversations.
# See the finetuning section in the README for more information.
DATA="path_to_data"
DISTRIBUTED_ARGS="
--nproc_per_node $GPUS_PER_NODE \
--nnodes $NNODES \
--node_rank $NODE_RANK \
--master_addr $MASTER_ADDR \
--master_port $MASTER_PORT
"
torchrun $DISTRIBUTED_ARGS finetune.py \
--model_name_or_path $MODEL \
--data_path $DATA \
--bf16 True \
--fix_vit True \
--output_dir output_qwen \
--num_train_epochs 5 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 16 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 1000 \
--save_total_limit 10 \
--learning_rate 1e-5 \
--weight_decay 0.1 \
--adam_beta2 0.95 \
--warmup_ratio 0.01 \
--lr_scheduler_type "cosine" \
--logging_steps 1 \
--report_to "none" \
--model_max_length 2048 \
--gradient_checkpointing True \
--lazy_preprocess True \
--deepspeed finetune/ds_config_zero3.json
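# Example training data layout (hypothetical values; the structure follows how
# preprocess() in finetune.py reads each example's "conversations" list):
# [
#   {
#     "conversations": [
#       {"from": "user", "value": "<img>path/to/image.jpg</img>\nWhat is shown in this picture?"},
#       {"from": "assistant", "value": "A short description of the picture."}
#     ]
#   }
# ]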