import pandas as pd
import io
import base64
import json
from PIL import Image
'''
This script converts the mmbench_dev tsv file to jsonl.
'''
datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t')
global_choices = ['A', 'B', 'C', 'D']
def decode_base64_to_image(base64_string):
    image_data = base64.b64decode(base64_string)
    image = Image.open(io.BytesIO(image_data))
    return image

with open('./data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.jsonl', 'w') as f:
    for idx in range(len(datas)):
        data = datas.iloc[idx]
        index = int(data['index'])
        question = data['question']
        hint = data['hint'] if not pd.isna(data['hint']) else 'N/A'

        choices = []
        for opt in global_choices:
            if pd.isna(data[opt]):
                continue
            choices.append(data[opt])
        answer = global_choices.index(data['answer'])

        image = decode_base64_to_image(data['image'])
        image.save("data/mmbench/mmbench_dev_20230712/images/%d.jpg" % index)

        f.write(json.dumps({
            "index": index,
            "image": "data/mmbench/mmbench_dev_20230712/images/%d.jpg" % index,
            "hint": hint,
            "question": question,
            "choices": choices,
            "answer": answer,
        }) + "\n")
import pandas as pd
import io
import base64
import json
from PIL import Image
'''
This script converts the mmbench_test tsv file to jsonl.
It is very similar to mmbench_converter_dev, except that there is no answer field for accuracy calculation.
'''
datas = pd.read_csv("data/mmbench/mmbench_test_20230712/mmbench_test_20230712.tsv", sep='\t')
global_choices = ['A', 'B', 'C', 'D']
def decode_base64_to_image(base64_string):
    image_data = base64.b64decode(base64_string)
    image = Image.open(io.BytesIO(image_data))
    return image

with open('./data/mmbench/mmbench_test_20230712/mmbench_test_20230712.jsonl', 'w') as f:
    for idx in range(len(datas)):
        data = datas.iloc[idx]
        index = int(data['index'])
        question = data['question']
        hint = data['hint'] if not pd.isna(data['hint']) else 'N/A'

        choices = []
        for opt in global_choices:
            if pd.isna(data[opt]):
                continue
            choices.append(data[opt])
        # answer = global_choices.index(data['answer'])

        image = decode_base64_to_image(data['image'])
        image.save("data/mmbench/mmbench_test_20230712/images/%d.jpg" % index)

        f.write(json.dumps({
            "index": index,
            "image": "data/mmbench/mmbench_test_20230712/images/%d.jpg" % index,
            "hint": hint,
            "question": question,
            "choices": choices,
            # "answer": answer,
        }) + "\n")
import pandas as pd
import json
'''
This script provides `global top-1 accuracy` metric calculation for mmbench_dev.
'''
predictions = json.load(open('mmbench_dev_20230712.json'))
index2predictions = {}
for pred in predictions:
    index2predictions[pred['index']] = pred['prediction']
datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t')
glb_opts = ['A', 'B', 'C', 'D']
index2answer = {}
for idx in range(len(datas)):
    data = datas.iloc[idx]
    index2answer[data['index']] = glb_opts.index(data['answer'])
# Each question is evaluated on up to 4 circular copies whose indices share the same value modulo 1e6.
identity_indexes = list(set([int(_ % 1e6) for _ in index2predictions.keys()]))
correct = 0
total = 0
for index in identity_indexes:
    for _ in range(4):
        cycle_index = int(_ * 1e6 + index)
        if index2predictions.get(cycle_index, None) is not None:
            if index2predictions[cycle_index] == index2answer[cycle_index]:
                continue
            else:
                print(cycle_index)
                break
    else:
        # the question counts as correct only if every available copy is answered correctly
        correct += 1
    total += 1
print(correct, total)
import pandas as pd
import json
import random
'''
This script provides metric calculation for mmbench_dev with the same accuracy algorithm as the OpenCompass server.
'''
predictions = json.load(open('mmbench_dev_20230712.json'))
index2predictions = {}
for pred in predictions:
    index2predictions[pred['index']] = pred['prediction']
from collections import Counter
def most_common_elements(lst):
    counter = Counter(lst)
    max_count = max(counter.values())
    most_common = [element for element, count in counter.items() if count == max_count]
    return random.choice(most_common)  # break ties randomly among the most frequent answers
datas = pd.read_csv("data/mmbench/mmbench_dev_20230712/mmbench_dev_20230712.tsv", sep='\t')
glb_opts = ['A', 'B', 'C', 'D']
index2answer = {}
index2choices = {}
index2rawanswer = {}
for idx in range(len(datas)):
    data = datas.iloc[idx]

    choices = []
    for opt in glb_opts:
        if not pd.isna(data[opt]):
            choices.append(data[opt])
    index2choices[data['index']] = choices

    index2answer[data['index']] = glb_opts.index(data['answer'])
    index2rawanswer[data['index']] = choices[glb_opts.index(data['answer'])]
identity_indexes = list(set([int(_ % 1e6) for _ in index2predictions.keys()]))
correct = 0
total = 0
for index in identity_indexes:
    raw_preds = []
    raw_answer = []
    for _ in range(4):
        cycle_index = int(_ * 1e6 + index)
        if index2predictions.get(cycle_index, None) is not None:
            raw_answer = index2rawanswer[cycle_index]
            raw_pred = index2choices[cycle_index][index2predictions[cycle_index]]
            raw_preds.append(raw_pred)

    if len(set(raw_preds)) == 1:
        if raw_preds[0] == raw_answer:
            correct += 1
    else:
        result = most_common_elements(raw_preds)
        if result == raw_answer:
            correct += 1
    total += 1
print(correct, total, correct / total * 100.)
import pandas as pd
import json
import random
'''
This script converts the output file of our inference processor to the target format of the OpenCompass evaluation server.
'''
predictions = json.load(open('mmbench_test_20230712.json'))
index2predictions = {}
for pred in predictions:
    index2predictions[pred['index']] = pred['prediction']
from collections import Counter
def most_common_elements(lst):
    counter = Counter(lst)
    max_count = max(counter.values())
    most_common = [element for element, count in counter.items() if count == max_count]
    print(most_common)
    return random.choice(most_common)
    # return most_common
datas = pd.read_csv("data/mmbench/mmbench_test_20230712/mmbench_test_20230712.tsv", sep='\t')
datas = datas.drop('image', axis=1)
glb_opts = ['A', 'B', 'C', 'D']
index2choices = {}
for idx in range(len(datas)):
    data = datas.iloc[idx]
    choices = []
    for opt in glb_opts:
        if not pd.isna(data[opt]):
            choices.append(data[opt])
    index2choices[data['index']] = choices
identity_indexes = list(set([int(_ % 1e6) for _ in index2predictions.keys()]))
processed_index2predictions = {}
for index in identity_indexes:
    raw_preds = []
    for _ in range(4):
        cycle_index = int(_ * 1e6 + index)
        if index2predictions.get(cycle_index, None) is not None:
            raw_pred = index2choices[cycle_index][index2predictions[cycle_index]]
            raw_preds.append(raw_pred)

    if len(set(raw_preds)) == 1:
        pred_answer = raw_preds[0]
    else:
        pred_answer = most_common_elements(raw_preds)
    print(index, pred_answer)

    for _ in range(4):
        cycle_index = int(_ * 1e6 + index)
        if index2predictions.get(cycle_index, None) is not None:
            processed_index2predictions[cycle_index] = index2choices[cycle_index].index(pred_answer)
predictions = []
for idx in range(len(datas)):
    data = datas.iloc[idx]
    index = data['index']
    prediction = glb_opts[processed_index2predictions[index]]
    predictions.append(prediction)
datas['prediction'] = predictions
datas.to_excel("mmbench_test_20230712_230831_constrained.xlsx", index=False)
# "constrained" means we force the model to predict the same answer when the same question is tested multiple times
# MME Benchmark
[MME](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation) is a comprehensive evaluation benchmark for multimodal large language models. It measures both perception and cognition abilities on a total of 14 subtasks, including existence, count, position, color, poster, celebrity, scene, landmark, artwork, OCR, commonsense reasoning, numerical calculation, text translation, and code reasoning.
Qwen-VL-Chat achieves SOTA results on both the perception and cognition evaluations.
### Perception Evaluation
| Rank | Model | Version | Score |
|:----:|:---------------:|:------------------------:|:-------:|
| 1 | **[Qwen-VL-Chat](https://github.com/QwenLM/Qwen-VL/)**| **[Qwen-7B](https://github.com/QwenLM/Qwen-7B)** | **1487.57** |
| 2 | Skywork-MM | Skywork-MM-13B | 1419.08 |
| 3 | MMICL | FlanT5xxl | 1376.00 |
| 4 | Lynx | vicuna-7b | 1373.23 |
| 5 | BLIVA | FlanT5xxl | 1337.73 |
### Cognition Evaluation
| Rank | Model | Version | Score |
|:----:|:----------------:|:--------------:|:----------:|
| 1 | **[Qwen-VL-Chat](https://github.com/QwenLM/Qwen-VL/)** | **[Qwen-7B](https://github.com/QwenLM/Qwen-7B)** | **360.71** |
| 2 | MMICL | FlanT5xxl | 360.36 |
| 3 | Skywork-MM | Skywork-MM-13B | 356.43 |
| 4 | BLIVA | FlanT5xxl | 331.43 |
| 5 | LRV-Instruction | LRV-7B | 328.21 |
### Full Metrics
```
=========== Perception ===========
total score: 1487.576330532213
existence score: 158.33333333333331
count score: 150.0
position score: 128.33333333333334
color score: 170.0
posters score: 178.57142857142856
celebrity score: 120.58823529411764
scene score: 152.25
landmark score: 164.0
artwork score: 125.5
OCR score: 140.0
=========== Cognition ===========
total score: 360.71428571428567
commonsense_reasoning score: 130.7142857142857
numerical_calculation score: 40.0
text_translation score: 147.5
code_reasoning score: 42.5
```
## How To Reproduce Results of MME Benchmark
1. Download MME images and eval_tool from the [MME repo](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/blob/Evaluation/README.md)
2. Rearrange images by executing `python get_images.py`
3. Evaluate Qwen-VL-Chat results by executing `python eval.py`
4. Calculate the MME results by executing `python calculation.py --results_dir Qwen-VL-Chat`; the calculation script comes from the MME eval_tool. A quick sanity check on the generated result files is sketched below.
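The result files written by `eval.py` (below) are plain tab-separated text with four fields per line: image name, question, ground-truth answer, and model response. Before running `calculation.py`, they can be spot-checked with a few lines of Python. This is only an illustrative sketch under that assumption (and assuming Yes/No ground truths, as in the `questions_answers_YN` files); the official scores come from the MME eval_tool.
```
import os

results_dir = 'Qwen-VL-Chat'  # same output directory name as used by eval.py
for filename in sorted(os.listdir(results_dir)):
    correct = total = 0
    with open(os.path.join(results_dir, filename)) as fin:
        for line in fin:
            # fields: image, question, ground truth, response (response kept as a single field)
            img, question, gt, response = line.rstrip('\n').split('\t', 3)
            # crude check: does the response start with the ground-truth Yes/No?
            correct += int(response.strip().lower().startswith(gt.strip().lower()))
            total += 1
    print(f'{filename}: {correct}/{total} naive yes/no matches')
```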
import os
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
checkpoint = 'Qwen/Qwen-VL-Chat'
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint, device_map='cuda', trust_remote_code=True).eval()
model.generation_config = GenerationConfig.from_pretrained(checkpoint, trust_remote_code=True)
model.generation_config.top_p = 0.01
root = 'Your_Results'
output = 'Qwen-VL-Chat'
os.makedirs(output, exist_ok=True)
for filename in os.listdir(root):
    with open(os.path.join(root, filename), 'r') as fin, open(os.path.join(output, filename), 'w') as fout:
        lines = fin.read().splitlines()
        filename = filename.replace('.txt', '')
        for line in tqdm(lines):
            img, question, gt = line.strip().split('\t')
            img_path = os.path.join('images', filename, img)
            assert os.path.exists(img_path), img_path
            query = f'<img>{img_path}</img>\n{question}'
            response, _ = model.chat(tokenizer, query=query, history=None)
            print(img, question, gt, response, sep='\t', file=fout)
import os
from tqdm import tqdm
os.system('rm -rf images')
os.system('mkdir images')
os.system('cp -r ../MME_Benchmark_release/OCR images/')
os.system('mkdir images/artwork')
os.system('cp ../MME_Benchmark_release/artwork/questions_answers_YN/* images/artwork/')
with open('LaVIN/artwork.txt') as fin:
    paths = [line.strip().split('\t', 1)[0] for line in fin]
paths = list(set(paths))
for path in tqdm(paths):
    os.system(f'cp ../MME_Benchmark_release/artwork/images/toy_dataset/{path} images/artwork/{path}')
os.system('mkdir images/celebrity')
os.system('cp ../MME_Benchmark_release/celebrity/images/* images/celebrity/')
os.system('cp ../MME_Benchmark_release/celebrity/questions_answers_YN/* images/celebrity/')
os.system('cp -r ../MME_Benchmark_release/code_reasoning images/')
os.system('cp -r ../MME_Benchmark_release/color images/')
os.system('cp -r ../MME_Benchmark_release/commonsense_reasoning images/')
os.system('cp -r ../MME_Benchmark_release/count images/')
os.system('cp -r ../MME_Benchmark_release/existence images/')
os.system('mkdir images/landmark')
os.system('cp ../MME_Benchmark_release/landmark/images/* images/landmark/')
os.system('cp ../MME_Benchmark_release/landmark/questions_answers_YN/* images/landmark/')
os.system('cp -r ../MME_Benchmark_release/numerical_calculation images/')
os.system('cp -r ../MME_Benchmark_release/position images/')
os.system('mkdir images/posters')
os.system('cp ../MME_Benchmark_release/posters/images/* images/posters/')
os.system('cp ../MME_Benchmark_release/posters/questions_answers_YN/* images/posters/')
os.system('mkdir images/scene')
os.system('cp ../MME_Benchmark_release/scene/images/* images/scene/')
os.system('cp ../MME_Benchmark_release/scene/questions_answers_YN/* images/scene/')
os.system('cp -r ../MME_Benchmark_release/text_translation images/')
# Seed-Bench Evaluation
[SEED-Bench](https://huggingface.co/spaces/AILab-CVC/SEED-Bench_Leaderboard) is a multimodal benchmark of 19K multiple-choice questions with accurate human annotations for evaluating Multimodal LLMs, covering 12 evaluation dimensions including both **image** and **video** understanding.
Qwen-VL and Qwen-VL-Chat achieve SOTAs on this benchmark.
<p align="center">
<img src="leaderboard.jpg"/>
</p>
## How To Process Video by Qwen-VL
Qwen-VL and Qwen-VL-Chat were not trained on any video data or tasks, but they can understand some videos in a zero-shot manner. For the video question-answering task, we uniformly sample four frames per video. These frames are treated as separate images and are stitched into the context. For example:
```
{
"question_id": "v0",
"prompt": "<img>video_imgs_4/v0_0.jpg</img>\n<img>video_imgs_4/v0_1.jpg</img>\n<img>video_imgs_4/v0_2.jpg</img>\n<img>video_imgs_4/v0_3.jpg</img>\nQuestion: Can you identify the action taking place in the video?\nOptions: A. pretending to take something out of something\nB. pretending to take something from somewhere\nC. feigning to insert something into something\nD. simulating putting something onto something\nAnswer:"
}
```
The above JSON line can be used as input to `eval_mm/seed_bench/eval.py`, which outputs the following result:
```
{"question_id": "v0", "prediction": "B"}
```
Please see [eval_mm/seed_bench/eval.py](eval.py) for more inference details.
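To make the frame stitching concrete, below is a minimal, self-contained sketch of how four roughly evenly spaced frame indices can be chosen and formatted into the `<img>` prompt shown above. It mirrors the `get_index` helper in `eval_mm/seed_bench/trans.py`; the frame count, file names, and question id here are hypothetical, and the real pipeline extracts the frames with decord/pyav.
```
import numpy as np

def uniform_frame_indices(num_frames, num_segments=4):
    # pick `num_segments` roughly evenly spaced frame indices (same scheme as trans.py)
    if num_segments > num_frames:
        return np.arange(num_frames)
    seg_size = float(num_frames - 1) / num_segments
    start = int(seg_size / 2)
    return np.array([start + int(np.round(seg_size * i)) for i in range(num_segments)])

# hypothetical 120-frame clip for question id "v0"
indices = uniform_frame_indices(120, 4)
prompt = ''.join(f'<img>video_imgs_4/v0_{i}.jpg</img>\n' for i in range(len(indices)))
prompt += 'Question: ...\nOptions: A. ...\nB. ...\nC. ...\nD. ...\nAnswer:'
print(indices)  # four roughly evenly spaced frame indices
print(prompt)
```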
## How To Reproduce Results of Seed-Bench
1. Download all images and videos by following the [instruction](https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md). Then modify the root path in `eval_mm/seed_bench/trans.py` with your customized path.
```
# path of SEED-Bench.json, download from https://huggingface.co/datasets/AILab-CVC/SEED-Bench/blob/main/SEED-Bench.json
seed_bench_input_path = 'SEED-Bench.json'
# root directory of evaluation dimension 1-9, following https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md
cc3m_dir = "/YOUR_PATH_TO/seed_bench_image"
# root directory of evaluation dimension 10
dimension10_dir = "/YOUR_PATH_TO/SSV2/videos"
# root directory of evaluation dimension 11
dimension11_dir = "/YOUR_PATH_TO/EPIC-KITCHENS/3h91syskeag572hl6tvuovwv4d/videos/test"
# root directory of evaluation dimension 12
dimension12_dir = "/YOUR_PATH_TO/BreakfastII_15fps_qvga_sync"
```
2. Generate the Qwen-VL input files in JSONL format.
```
cd eval_mm/seed_bench/
python trans.py
```
This script outputs two JSONL files and one directory. `image_input.jsonl` is the input file for image evaluation, and `video_input_4.jsonl` is the input file for video evaluation with 4 frames per video. The directory `video_imgs_4` contains the frames extracted from the videos. We provide our [image_input.jsonl](http://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/seed_bench/image_input.jsonl) and [video_input_4.jsonl](http://ofasys-wlcb.oss-cn-wulanchabu.aliyuncs.com/Qwen-VL/evaluation/seed_bench/video_input_4.jsonl) for reference. A small sanity check on the generated files is sketched below.
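If you want to verify the generated files before launching the evaluation, a check like the following can be used. It is only an illustrative sketch and assumes nothing beyond the JSONL layout produced by `trans.py`: one object per line with `question_id`, `prompt`, and `answer` fields.
```
import json

# assumes trans.py has been run in the current directory
with open('image_input.jsonl') as fin:
    records = [json.loads(line) for line in fin]

print(len(records), 'image questions')
print(records[0]['prompt'])            # should contain one <img>...</img> tag plus the options
print('answer:', records[0]['answer'])
```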
3. Produce the results of Seed-Bench.
```
# The number of available GPUs
export NPROC_PER_NODE=8
# Produce the Qwen-VL-Chat results of image understanding
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
eval.py \
--checkpoint Qwen/Qwen-VL-Chat \
--dataset image_input.jsonl \
--batch-size 4 \
--num-workers 2
# Collect the result files
cat result_?.jsonl >results_chat_img.jsonl
rm result_?.jsonl
# Produce the results of video understanding
python -m torch.distributed.launch --use-env \
--nproc_per_node ${NPROC_PER_NODE:-8} \
--nnodes ${WORLD_SIZE:-1} \
--node_rank ${RANK:-0} \
--master_addr ${MASTER_ADDR:-127.0.0.1} \
--master_port ${MASTER_PORT:-12345} \
eval.py \
--checkpoint Qwen/Qwen-VL-Chat \
--dataset video_input_4.jsonl \
--batch-size 2 \
--num-workers 1
# Collect the result files
cat result_?.jsonl >results_chat_vid.jsonl
rm result_?.jsonl
# The file `results_chat.jsonl` can be submitted to the leaderboard
cat results_chat_img.jsonl results_chat_vid.jsonl >results_chat.jsonl
```
You can reproduce the Seed-Bench results of Qwen-VL by replacing `Qwen/Qwen-VL-Chat` with `Qwen/Qwen-VL` in the script above.
import argparse
import itertools
import json
import os
from functools import partial
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
def collate_fn(batches, pad_token_id):
    input_tokens = [_['input_tokens'] for _ in batches]
    target_lengths = [_['target_lengths'] for _ in batches]
    answers = [_['answer'] for _ in batches]
    question_id = [_['question_id'] for _ in batches]

    chunk_sizes = [len(_) for _ in input_tokens]

    # flatten the per-sample candidate sequences into one batch
    input_tokens = [_ for _ in itertools.chain.from_iterable(input_tokens)]

    # left-pad every sequence to the longest one in the batch
    max_lengths = max([len(_) for _ in input_tokens])
    input_tokens = [[pad_token_id] * (max_lengths - len(_)) + _
                    for _ in input_tokens]
    input_tokens = torch.LongTensor(input_tokens)

    attention_mask = 1 - input_tokens.eq(pad_token_id).float()

    return input_tokens, attention_mask, target_lengths, answers, chunk_sizes, question_id
class MultipleChoiceDataset(torch.utils.data.Dataset):

    def __init__(self, test, tokenizer):
        self.datas = []
        with open(test) as fin:
            for line in tqdm(fin):
                self.datas.append(json.loads(line.strip()))
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.datas)

    def __getitem__(self, idx):
        data = self.datas[idx]
        prompt = data['prompt']

        prompt_tokens = self.tokenizer(prompt).input_ids
        # one candidate sequence per option letter, appended to the shared prompt
        target_tokens = [
            self.tokenizer(' ' + _).input_ids
            for _ in ['A', 'B', 'C', 'D']
        ]

        return {
            'input_tokens': [prompt_tokens + _ for _ in target_tokens],
            'target_lengths': [len(_) for _ in target_tokens],
            'answer': data['answer'],
            'question_id': data['question_id'],
        }
class InferenceSampler(torch.utils.data.sampler.Sampler):

    def __init__(self, size):
        self._size = int(size)
        assert size > 0
        self._rank = torch.distributed.get_rank()
        self._world_size = torch.distributed.get_world_size()
        self._local_indices = self._get_local_indices(size, self._world_size,
                                                      self._rank)

    @staticmethod
    def _get_local_indices(total_size, world_size, rank):
        shard_size = total_size // world_size
        left = total_size % world_size
        shard_sizes = [shard_size + int(r < left) for r in range(world_size)]

        begin = sum(shard_sizes[:rank])
        end = min(sum(shard_sizes[:rank + 1]), total_size)
        return range(begin, end)

    def __iter__(self):
        yield from self._local_indices

    def __len__(self):
        return len(self._local_indices)
if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--checkpoint', type=str, default='')
    parser.add_argument('--dataset', type=str, default='')
    parser.add_argument('--batch-size', type=int, default=1)
    parser.add_argument('--num-workers', type=int, default=1)
    args = parser.parse_args()

    torch.distributed.init_process_group(
        backend='nccl',
        world_size=int(os.getenv('WORLD_SIZE', '1')),
        rank=int(os.getenv('RANK', '0')),
    )
    torch.cuda.set_device(int(os.getenv('LOCAL_RANK', 0)))

    model = AutoModelForCausalLM.from_pretrained(
        args.checkpoint, device_map='cuda', trust_remote_code=True).eval()
    tokenizer = AutoTokenizer.from_pretrained(args.checkpoint,
                                              trust_remote_code=True)
    model.generation_config = GenerationConfig.from_pretrained(args.checkpoint, trust_remote_code=True)
    model.generation_config.top_p = 0.01

    dataset = MultipleChoiceDataset(test=args.dataset, tokenizer=tokenizer)
    dataloader = torch.utils.data.DataLoader(
        dataset=dataset,
        # sampler=InferenceSampler(1000),
        sampler=InferenceSampler(len(dataset)),
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=False,
        collate_fn=partial(collate_fn, pad_token_id=tokenizer.eod_id),
    )

    results = []
    fout = open('result_{}.jsonl'.format(torch.distributed.get_rank()), 'w')
    with torch.no_grad():
        for _, (input_tokens, attention_mask, target_lengths, answers,
                chunk_sizes, question_ids) in tqdm(enumerate(dataloader)):
            outputs = model(
                input_ids=input_tokens[:, :-1].cuda(),
                attention_mask=attention_mask[:, :-1].cuda(),
                return_dict=True,
            )
            # per-token loss for every candidate sequence (labels shifted by one for next-token prediction)
            losses = torch.nn.functional.cross_entropy(
                outputs.logits.permute(0, 2, 1),
                input_tokens[:, 1:].cuda(),
                reduction='none')
            losses = losses.split(chunk_sizes, dim=0)

            for loss, target_length, answer, question_id in zip(losses, target_lengths,
                                                                answers, question_ids):
                # score each option by the mean loss over its own target tokens; the lowest loss wins
                target_loss = loss.mean(-1)
                for _ in range(len(target_length)):
                    target_loss[_] = loss[_, -target_length[_]:].mean()
                pred = target_loss.argmin().item()
                pred = chr(pred + 65)
                if pred == answer:
                    results.append(1)
                else:
                    results.append(0)
                answer_record = {
                    'question_id': question_id,
                    'prediction': pred
                }
                print(json.dumps(answer_record), file=fout)
    fout.close()

    torch.distributed.barrier()

    world_size = torch.distributed.get_world_size()
    merged_results = [None for _ in range(world_size)]
    torch.distributed.all_gather_object(merged_results, results)
    merged_results = [_ for _ in itertools.chain.from_iterable(merged_results)]

    if torch.distributed.get_rank() == 0:
        print(f"Evaluating {args.dataset} ...")
        print(f'Acc@1: {sum(merged_results) / len(merged_results)}')

    torch.distributed.barrier()
import os
import av
import json
import torch
import numpy as np
from PIL import Image
from tqdm import tqdm
from decord import VideoReader, cpu
# path of SEED-Bench.json, download from https://huggingface.co/datasets/AILab-CVC/SEED-Bench/blob/main/SEED-Bench.json
seed_bench_input_path = 'SEED-Bench.json'
# root directory of evaluation dimension 1-9, following https://github.com/AILab-CVC/SEED-Bench/blob/main/DATASET.md
cc3m_dir = "/YOUR_PATH_TO/seed_bench_image"
# root directory of evaluation dimension 10
dimension10_dir = "/YOUR_PATH_TO/SSV2/videos"
# root directory of evaluation dimension 11
dimension11_dir = "/YOUR_PATH_TO/EPIC-KITCHENS/3h91syskeag572hl6tvuovwv4d/videos/test"
# root directory of evaluation dimension 12
dimension12_dir = "/YOUR_PATH_TO/BreakfastII_15fps_qvga_sync"
def is_integer_string(s):
    try:
        int(s)
        return True
    except ValueError:
        return False


def filter_questions(data, task='all'):
    if task == "image":
        return [q for q in data if 1 <= q["question_type_id"] <= 9]
    elif task == "video":
        return [q for q in data if 10 <= q["question_type_id"] <= 12]
    elif task == "all":
        return data
    elif is_integer_string(task):
        return [q for q in data if q["question_type_id"] == int(task)]
    else:
        raise ValueError(f"Invalid task: {task}")


def get_index(num_frames, num_segments):
    if num_segments > num_frames:
        offsets = np.array([
            idx for idx in range(num_frames)
        ])
    else:
        # uniform sampling
        seg_size = float(num_frames - 1) / num_segments
        start = int(seg_size / 2)
        offsets = np.array([
            start + int(np.round(seg_size * idx)) for idx in range(num_segments)
        ])
    return offsets
with open(seed_bench_input_path) as fin:
    qa_anno = json.load(fin)['questions']

fout = open('image_input.jsonl', 'w')
i_anno = filter_questions(qa_anno, 'image')
for qa_item in tqdm(i_anno):
    data_path = cc3m_dir + qa_item['data_id']

    choices = [qa_item['choice_a'], qa_item['choice_b'], qa_item['choice_c'], qa_item['choice_d']]
    choice_list = []
    for i, c in enumerate(choices):
        choice_list.append('{}. {}'.format(chr(i + 65), c))
    choice_txt = '\n'.join(choice_list)

    prompt = '<img>{}</img>\nQuestion: {}\nOptions: {}\nAnswer:'.format(
        data_path, qa_item['question'], choice_txt)

    print(json.dumps({
        'question_id': qa_item['question_id'],
        'prompt': prompt,
        'answer': qa_item['answer'],
    }), file=fout)
fout.close()
n_frames = 8
os.system('rm -rf video_input_' + str(n_frames))
os.makedirs('video_imgs_' + str(n_frames), exist_ok=True)
fout = open('video_input_{}.jsonl'.format(n_frames), 'w')
v_anno = filter_questions(qa_anno, 'video')
for qa_item in tqdm(v_anno):
    if qa_item['question_type_id'] == 12:
        data_path = dimension12_dir + qa_item['data_id']
    elif qa_item['question_type_id'] == 11:
        data_path = dimension11_dir + qa_item['data_id'].split('/')[-1]
    elif qa_item['question_type_id'] == 10:
        data_path = dimension10_dir + qa_item['data_id']
    else:
        assert False, str(qa_item)
    print(data_path)

    use_pyav = False
    if 'segment' in qa_item.keys():
        segment = qa_item['segment']
        if isinstance(segment[0], int):
            # using pyav for decoding videos in evaluation dimension 12
            use_pyav = True
        start, end = segment[0], segment[1]
    else:
        start = 0.0
        end = 0.0

    if use_pyav:
        # using pyav for decoding videos in evaluation dimension 12
        reader = av.open(data_path)
        frames = [torch.from_numpy(f.to_rgb().to_ndarray()) for f in reader.decode(video=0)]
        video_len = len(frames)
        start_frame, end_frame = start, end
        end_frame = min(end_frame, video_len)
        offset = get_index(end_frame - start_frame, n_frames)
        frame_indices = offset + start_frame
        images = torch.stack([frames[idx] for idx in frame_indices]).numpy()
    else:
        # using decord for decoding videos in evaluation dimension 10-11
        try:
            vr = VideoReader(data_path, num_threads=1, ctx=cpu(0))
            video_len = len(vr)
            fps = vr.get_avg_fps()
            if 'segment' in qa_item.keys():
                # obtain start and end frame for the video segment in evaluation dimension 11
                start_frame = int(min(max(start * fps, 0), video_len - 1))
                end_frame = int(min(max(end * fps, 0), video_len - 1))
                tot_frames = int(end_frame - start_frame)
                offset = get_index(tot_frames, n_frames)
                frame_indices = offset + start_frame
            else:
                # sample frames of the video in evaluation dimension 10
                frame_indices = get_index(video_len - 1, n_frames)
            vr.seek(0)
            images = vr.get_batch(frame_indices).asnumpy()
        except Exception as e:
            print(json.dumps({
                'question_id': qa_item['question_id'],
                'prompt': "Error" + str(e),
                'answer': qa_item['answer'],
            }), file=fout)
            continue

    prompt = ''
    for i in range(images.shape[0]):
        data = Image.fromarray(images[i])
        img_path = 'video_imgs_{}/{}_{}.jpg'.format(n_frames, qa_item['question_id'], i)
        data.save(img_path)
        prompt += '<img>' + img_path + '</img>\n'

    choices = [qa_item['choice_a'], qa_item['choice_b'], qa_item['choice_c'], qa_item['choice_d']]
    choice_list = []
    for i, c in enumerate(choices):
        choice_list.append('{}. {}'.format(chr(i + 65), c))
    choice_txt = '\n'.join(choice_list)
    prompt += 'Question: {}\nOptions: {}\nAnswer:'.format(qa_item['question'], choice_txt)

    print(json.dumps({
        'question_id': qa_item['question_id'],
        'prompt': prompt,
        'answer': qa_item['answer'],
    }), file=fout)
fout.close()