OpenDAS / opencompass

Commit 53fe788d (unverified), authored Feb 23, 2024 by Jingming, committed by GitHub, Feb 23, 2024

[Fix] fix ifeval (#909)

parent 45c606bc
Changes: 2 changed files with 35 additions and 17 deletions (+35 −17)

- configs/datasets/IFEval/IFEval.md (+6 −4)
- opencompass/datasets/IFEval/ifeval.py (+29 −13)
configs/datasets/IFEval/IFEval.md

````diff
@@ -36,10 +36,12 @@ Hark! Hearken to the tale of thy journey to the land of the rising sun, Japan. \
 ## Evaluation results

 ```
-dataset    version    metric      mode    qwen-72b-chat-hf    mistral-7b-instruct-v0.2-hf    mixtral-8x7b-instruct-v0.1    chatglm3-6b-hf
----------  ---------  ----------  ------  ------------------  -----------------------------  ----------------------------  ----------------
-IFEval     27a9cc     strict_acc  gen     43.62               49.17                          48.98                         29.76
-IFEval     27a9cc     loose_acc   gen     45.47               53.97                          54.71                         32.16
+dataset    version    metric                        mode    baichuan2-7b-chat-hf    baichuan2-13b-chat-hf    internlm2-chat-7b-hf    internlm2-chat-20b-hf    llama-2-7b-chat-hf    llama-2-13b-chat-hf
+---------  ---------  ----------------------------  ------  ----------------------  -----------------------  ----------------------  -----------------------  --------------------  ---------------------
+IFEval     3321a3     Prompt-level-strict-accuracy  gen     36.04                   35.49                    38.26                   33.09                    33.46                 33.64
+IFEval     3321a3     Inst-level-strict-accuracy    gen     46.76                   46.76                    49.16                   45.32                    45.68                 45.44
+IFEval     3321a3     Prompt-level-loose-accuracy   gen     37.52                   37.71                    42.51                   39.37                    43.81                 47.32
+IFEval     3321a3     Inst-level-loose-accuracy     gen     48.44                   49.16                    53.72                   51.08                    55.64                 58.03
 ```

 ## Reference
````
opencompass/datasets/IFEval/ifeval.py

```diff
@@ -27,7 +27,9 @@ class IFEvalDataset(BaseDataset):
 class IFEvaluator(BaseEvaluator):

     def score(self, predictions, references):
-        results = []
+        results = dict()
+        for metric in ('strict', 'loose'):
+            results[metric] = []
         for pred, refer in zip(predictions, references):
             input = InputExample(
                 key=refer['key'],
@@ -38,15 +40,29 @@ class IFEvaluator(BaseEvaluator):
             for k in list(kwarg.keys()):
                 if kwarg[k] is None:
                     kwarg.pop(k, None)
-            result = dict(
-                strict=test_instruction_following_strict(input, pred),
-                loose=test_instruction_following_loose(input, pred),
-            )
-            results.append(result)
-        strict = sum(
-            [result['strict'].follow_all_instructions
-             for result in results]) / len(results)
-        loose = sum(
-            [result['loose'].follow_all_instructions
-             for result in results]) / len(results)
-        return dict(strict_acc=strict * 100, loose_acc=loose * 100)
+            results['strict'].append(
+                test_instruction_following_strict(input, pred))
+            results['loose'].append(
+                test_instruction_following_loose(input, pred))
+        final_scores = dict()
+        for metric in ('strict', 'loose'):
+            prompt_total = 0
+            prompt_correct = 0
+            inst_total = 0
+            inst_correct = 0
+            for example in results[metric]:
+                follow_instruction_list = example.follow_instruction_list
+                instruction_id_list = example.instruction_id_list
+                prompt_total += 1
+                if all(follow_instruction_list):
+                    prompt_correct += 1
+                inst_total += len(instruction_id_list)
+                inst_correct += sum(follow_instruction_list)
+            prompt_score = f'Prompt-level-{metric}-accuracy'
+            inst_score = f'Inst-level-{metric}-accuracy'
+            final_scores[prompt_score] = prompt_correct / prompt_total * 100
+            final_scores[inst_score] = inst_correct / inst_total * 100
+        return final_scores
```
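The fix replaces the old per-prompt `strict_acc`/`loose_acc` averages with the two metric granularities: prompt-level accuracy counts a prompt as correct only if every instruction in it is followed, while inst-level accuracy counts each instruction individually. A minimal standalone sketch of that computation, using a hypothetical `Example` record as a stand-in for the objects returned by `test_instruction_following_strict`/`test_instruction_following_loose` (which expose `follow_instruction_list` and `instruction_id_list`):

```python
# Sketch of the prompt-level vs. inst-level accuracy logic from the new
# IFEvaluator.score. `Example` is a hypothetical stand-in for the result
# objects produced by the instruction-following checkers.
from dataclasses import dataclass
from typing import List


@dataclass
class Example:
    instruction_id_list: List[str]
    follow_instruction_list: List[bool]


def accuracy(examples):
    prompt_total = prompt_correct = 0
    inst_total = inst_correct = 0
    for ex in examples:
        prompt_total += 1
        # A prompt counts as correct only if all its instructions are followed.
        if all(ex.follow_instruction_list):
            prompt_correct += 1
        # Instruction-level accuracy counts each instruction separately.
        inst_total += len(ex.instruction_id_list)
        inst_correct += sum(ex.follow_instruction_list)
    return {
        'prompt_acc': prompt_correct / prompt_total * 100,
        'inst_acc': inst_correct / inst_total * 100,
    }


examples = [
    Example(['length', 'keywords'], [True, True]),  # both instructions followed
    Example(['length', 'format'], [True, False]),   # one instruction missed
]
scores = accuracy(examples)
# prompt-level: 1 of 2 prompts fully followed -> 50.0
# inst-level: 3 of 4 instructions followed -> 75.0
```

This also explains why the dataset version hash changed in IFEval.md: the reported keys are now `Prompt-level-{strict,loose}-accuracy` and `Inst-level-{strict,loose}-accuracy` rather than a single pass-rate per mode.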
zhuwenwen (@zhuwenwen) mentioned in commit f2af49337d6b2912d045b98d641dc22fda4b3470 · Jun 20, 2024