"docs/git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "dd3cae33279076d813000f03c63e4ee84ce0fa77"
Unverified commit dc6e54f6 authored by Yuan Liu, committed by GitHub

[Feature]: Verify the accuracy of these public datasets (#269)

* [Feature]: Refactor public dataset eval

* [Feature]: Verify public dataset accuracy
parent 3f37c40a
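
For orientation: each touched config defines a per-dataset (model, dataloader, evaluator) triplet. A minimal sketch of how such a triplet might be pulled into a run config via mmengine-style config inheritance; the config module name and the top-level variable names below are assumptions for illustration, not part of this commit.

# Sketch only: wiring one per-dataset triplet into a run config.
# The module name and top-level variable names are assumptions.
from mmengine.config import read_base

with read_base():
    from .minigpt_4_flickr30k import (minigpt_4_flickr30k_dataloader,
                                      minigpt_4_flickr30k_evaluator,
                                      minigpt_4_flickr30k_model)

models = [minigpt_4_flickr30k_model]
datasets = [minigpt_4_flickr30k_dataloader]
evaluators = [minigpt_4_flickr30k_evaluator]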
@@ -35,7 +35,8 @@ minigpt_4_coco_caption_model = dict(
     type='minigpt-4',
     low_resource=False,
     img_size=384,
-    llama_model='/path/to/vicuna-7b/',
+    llama_model='/path/to/vicuna_weights_7b/',
+    is_caption_task=True,
     prompt_constructor=dict(type=MiniGPT4COCOCaotionPromptConstructor,
                             image_prompt='###Human: <Img><ImageHere></Img>',
                             reply_prompt='###Assistant:'),
......
@@ -24,19 +24,20 @@ dataset = dict(type='mmpretrain.Flickr30kCaption',
                split='val',
                pipeline=val_pipeline)
-minigpt_4_flickr30k_dataloader = dict(
-    batch_size=1,
-    num_workers=4,
-    dataset=dataset,
-    collate_fn=dict(type='pseudo_collate'),
-    sampler=dict(type='DefaultSampler', shuffle=False))
+minigpt_4_flickr30k_dataloader = dict(batch_size=1,
+                                      num_workers=4,
+                                      dataset=dataset,
+                                      collate_fn=dict(type='pseudo_collate'),
+                                      sampler=dict(type='DefaultSampler',
+                                                   shuffle=False))
 
 # model settings
 minigpt_4_flickr30k_model = dict(
     type='minigpt-4',
     low_resource=False,
     img_size=384,
-    llama_model='/path/to/vicuna-7b/',
+    llama_model='/path/to/vicuna_weights_7b/',
+    is_caption_task=True,
     prompt_constructor=dict(type=MiniGPT4COCOCaotionPromptConstructor,
                             image_prompt='###Human: <Img><ImageHere></Img>',
                             reply_prompt='###Assistant:'),
@@ -46,7 +47,7 @@ minigpt_4_flickr30k_model = dict(
 minigpt_4_flickr30k_evaluator = [
     dict(
         type='mmpretrain.COCOCaption',
-        ann_file='data/coco/annotations/coco_karpathy_val_gt.json',
+        ann_file='data/flickr30k/annotations/flickr30k_val_gt.json',
     ) # noqa
 ]
......
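
The evaluator change above points mmpretrain.COCOCaption at a Flickr30k ground-truth file instead of the COCO one. That evaluator consumes COCO-caption-style GT JSON (an `images` list plus `annotations` entries keyed by `image_id`); a hedged sketch of producing such a file, where the input `samples` iterable is hypothetical:

# Sketch (assumption): converting Flickr30k captions into the
# COCO-caption-style GT file that mmpretrain.COCOCaption reads.
import json

def dump_flickr30k_gt(samples, out_path):
    """samples: iterable of (image_id, [caption, ...]) pairs (hypothetical input)."""
    gt = {'images': [], 'annotations': []}
    ann_id = 0
    for image_id, captions in samples:
        gt['images'].append({'id': image_id})
        for cap in captions:
            gt['annotations'].append({
                'image_id': image_id,
                'id': ann_id,
                'caption': cap,
            })
            ann_id += 1
    with open(out_path, 'w') as f:
        json.dump(gt, f)

# e.g. dump_flickr30k_gt(samples, 'data/flickr30k/annotations/flickr30k_val_gt.json')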
@@ -39,7 +39,7 @@ minigpt_4_gqa_model = dict(type='minigpt-4',
                            low_resource=False,
                            img_size=224,
                            max_length=10,
-                           llama_model='/path/to/vicuna-7b/',
+                           llama_model='/path/to/vicuna_weights_7b/',
                            prompt_constructor=dict(
                                type=MiniGPT4VQAPromptConstructor,
                                image_prompt='###Human: <Img><ImageHere></Img>',
......
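
Each VQA config hands `image_prompt` and `reply_prompt` to MiniGPT4VQAPromptConstructor, which stitches them around the question. The exact template is not shown in this diff, so the assembly below is an assumption for illustration:

# Illustration only: how the prompt pieces plausibly combine.
# The real template lives in MiniGPT4VQAPromptConstructor and may differ.
image_prompt = '###Human: <Img><ImageHere></Img>'
reply_prompt = '###Assistant:'
question = 'What color is the bus?'  # example VQA question

prompt = f'{image_prompt} {question} {reply_prompt}'
print(prompt)
# ###Human: <Img><ImageHere></Img> What color is the bus? ###Assistant:
# <ImageHere> marks where the projected image embeddings are spliced
# into the LLaMA input sequence.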
@@ -41,7 +41,7 @@ minigpt_4_ocr_vqa_model = dict(
     low_resource=False,
     img_size=224,
     max_length=10,
-    llama_model='/path/to/vicuna-7b/',
+    llama_model='/path/to/vicuna_weights_7b/',
     prompt_constructor=dict(type=MiniGPT4VQAPromptConstructor,
                             image_prompt='###Human: <Img><ImageHere></Img>',
                             reply_prompt='###Assistant:'),
......
@@ -43,7 +43,7 @@ minigpt_4_ok_vqa_model = dict(
     low_resource=False,
     img_size=224,
     max_length=10,
-    llama_model='/path/to/vicuna-7b/',
+    llama_model='/path/to/vicuna_weights_7b/',
     prompt_constructor=dict(type=MiniGPT4VQAPromptConstructor,
                             image_prompt='###Human: <Img><ImageHere></Img>',
                             reply_prompt='###Assistant:'),
......
@@ -40,7 +40,7 @@ minigpt_4_scienceqa_model = dict(
     low_resource=False,
     img_size=224,
     max_length=10,
-    llama_model='/path/to/vicuna-7b/',
+    llama_model='/path/to/vicuna_weights_7b/',
     prompt_constructor=dict(type=MiniGPT4ScienceQAPromptConstructor,
                             image_prompt='###Human: <Img><ImageHere></Img>',
                             reply_prompt='###Assistant:'),
......
@@ -43,7 +43,7 @@ minigpt_4_textvqa_model = dict(
     low_resource=False,
     img_size=224,
     max_length=10,
-    llama_model='/path/to/vicuna-7b/',
+    llama_model='/path/to/vicuna_weights_7b/',
     prompt_constructor=dict(type=MiniGPT4VQAPromptConstructor,
                             image_prompt='###Human: <Img><ImageHere></Img>',
                             reply_prompt='###Assistant:'),
......
@@ -40,7 +40,7 @@ minigpt_4_vizwiz_model = dict(
     low_resource=False,
     img_size=224,
     max_length=10,
-    llama_model='/path/to/vicuna-7b/',
+    llama_model='/path/to/vicuna_weights_7b/',
     prompt_constructor=dict(type=MiniGPT4VQAPromptConstructor,
                             image_prompt='###Human: <Img><ImageHere></Img>',
                             reply_prompt='###Assistant:'),
......
@@ -43,7 +43,7 @@ minigpt_4_vqav2_model = dict(
     low_resource=False,
     img_size=224,
     max_length=10,
-    llama_model='/path/to/vicuna-7b/',
+    llama_model='/path/to/vicuna_weights_7b/',
     prompt_constructor=dict(type=MiniGPT4VQAPromptConstructor,
                             image_prompt='###Human: <Img><ImageHere></Img>',
                             reply_prompt='###Assistant:'),
......
@@ -37,10 +37,10 @@ minigpt_4_vsr_dataloader = dict(batch_size=1,
 # model settings
 minigpt_4_vsr_model = dict(
     type='minigpt-4',
-    low_resource=True,
+    low_resource=False,
     img_size=224,
     max_length=10,
-    llama_model='/path/to/vicuna-7b/',
+    llama_model='/path/to/vicuna_weights_7b/',
     prompt_constructor=dict(type=MiniGPT4VSRPromptConstructor,
                             image_prompt='###Human: <Img><ImageHere></Img>',
                             reply_prompt='###Assistant:'),
......
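
Besides the weights path, the VSR config flips `low_resource` from True to False, so the accuracy run loads the model in full/half precision rather than the memory-saving mode described in the docstring below. A rough sketch of what such a flag typically toggles; the repo's actual loading code is not shown in this diff and may differ:

# Sketch (assumption): typical effect of a low_resource flag when
# loading the LLaMA weights; the repo's actual loading code may differ.
import torch
from transformers import AutoModelForCausalLM

def load_llama(llama_model: str, low_resource: bool):
    if low_resource:
        # 8-bit weights to fit on smaller GPUs, at some accuracy risk.
        return AutoModelForCausalLM.from_pretrained(
            llama_model, load_in_8bit=True, device_map='auto')
    # Half precision for accuracy verification runs.
    return AutoModelForCausalLM.from_pretrained(
        llama_model, torch_dtype=torch.float16)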
@@ -50,6 +50,8 @@ class MiniGPT4Inferencer(MiniGPT4):
         img_size (int): The size of image. Defaults to 224.
         low_resource (bool): Whether loaded in low precision.
             Defaults to False.
+        is_caption_task (bool): Whether the task is a caption task.
+            Defaults to False.
     """
 
     def __init__(self,
@@ -60,6 +62,7 @@ class MiniGPT4Inferencer(MiniGPT4):
                  max_length: int = 30,
                  img_size: int = 224,
                  low_resource: bool = False,
+                 is_caption_task: bool = False,
                  mode: str = 'generation',
                  n_segments: int = 1) -> None:
         super().__init__(llama_model=llama_model,
@@ -83,6 +86,7 @@ class MiniGPT4Inferencer(MiniGPT4):
                                                    post_processor, MM_MODELS)
         self.do_sample = do_sample
         self.max_length = max_length
+        self.is_caption_task = is_caption_task
 
     def forward(self, batch):
         if self.mode == 'generation':
@@ -193,7 +197,10 @@ class MiniGPT4Inferencer(MiniGPT4):
             output_token = outputs[i]
             output_text = self.post_processor(output_token,
                                               self.llama_tokenizer)
-            data_sample.pred_answer = output_text
+            if self.is_caption_task:
+                data_sample.pred_caption = output_text
+            else:
+                data_sample.pred_answer = output_text
             data_samples[i] = data_sample
         return data_samples
......
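
Taken together with the config changes, the new `is_caption_task` flag just routes the generated text to the field the matching evaluator reads: caption configs (COCO, Flickr30k) set it to True and are scored via `pred_caption`, while the VQA/VSR configs keep the default and keep writing `pred_answer`. A self-contained sketch of that routing, using a SimpleNamespace stand-in for the framework's data sample type:

# Self-contained sketch of the new routing; SimpleNamespace is a
# hypothetical stand-in for the framework's data sample type.
from types import SimpleNamespace

def attach_prediction(data_sample, output_text, is_caption_task):
    if is_caption_task:
        data_sample.pred_caption = output_text  # read by caption evaluators
    else:
        data_sample.pred_answer = output_text   # read by VQA evaluators
    return data_sample

sample = attach_prediction(SimpleNamespace(), 'a man riding a horse', True)
print(sample.pred_caption)  # a man riding a horse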