"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "9b6cd10635f6662d8ce4afaa29adb63fadc2972b"
Unverified Commit 0221d308 authored by Connor-Shen's avatar Connor-Shen Committed by GitHub
Browse files

[Fix] Update APPS/TACO (#988)

* [Feature] update apps/taco

* [Feature] update apps/taco
parent 8a3c6e51
...@@ -26,9 +26,9 @@ print(next(iter(ds))["question"]) ...@@ -26,9 +26,9 @@ print(next(iter(ds))["question"])
## Evaluation results

| dataset | metric | CodeLlama-7b-Python | internlm2-chat-1.8b-sft-hf | internlm2-chat-7b-sft-hf | internlm2-chat-20b-sft-hf |
|-----------------------|----------|-------------|-------------|-------------|-------------|
| apps_mini | pass@1 | 1.3 | 0.7 | 7.1 | 9.3 |

Please refer to Table 3 of [code llama](https://scontent-nrt1-2.xx.fbcdn.net/v/t39.2365-6/369856151_1754812304950972_1159666448927483931_n.pdf?_nc_cat=107&ccb=1-7&_nc_sid=3c67a6&_nc_ohc=TxT1PKkNBZoAX8zMHbm&_nc_ht=scontent-nrt1-2.xx&oh=00_AfDmmQAPzqX1-QOKIDUV5lGKzaZqt0CZUVtxFjHtnh6ycQ&oe=65F5AF8F) for original results if needed.
......
# Entry config: re-export the concrete APPS-mini dataset definition.
# mmengine's read_base() context makes the relative import below behave as
# a config include rather than a regular Python import.
from mmengine.config import read_base

with read_base():
    # NOTE(review): the base config appears to define `APPS_mini_datasets`;
    # confirm that the imported name `APPS_datasets` actually exists there.
    from .apps_mini_gen_c7893a import APPS_datasets  # noqa: F401, F403
\ No newline at end of file
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import APPS_miniDataset, APPSEvaluator
# Reader config: expose 'question' and 'starter' to the prompt template and
# use 'problem_id' as the reference key for evaluation. Only the 'test'
# split is evaluated, hence train_split='test'.
APPS_reader_cfg = dict(input_columns=["question", "starter"], output_column="problem_id", train_split='test')

# Inference config: zero-shot generation (ZeroRetriever, no in-context
# examples); the prompt asks for the answer inside a fenced ```python block
# so the evaluator can extract it, capped at 512 generated tokens.
APPS_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template="Please write a python program to address the following QUESTION. Your ANSWER should be in a code block format like this: ```python # Write your code here ```. \nQUESTION:\n{question} {starter}\nANSWER:\n"),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

# Evaluation config: APPSEvaluator checks the generated program against the
# dataset's test cases; predictions are taken from the BOT role.
APPS_eval_cfg = dict(evaluator=dict(type=APPSEvaluator), pred_role="BOT")

APPS_mini_datasets = [
    dict(
        type=APPS_miniDataset,
        abbr="apps_mini",
        path="codeparrot_mini/apps",
        num_repeats=1,  # copies of each sample; raise for sampling-based pass@k
        reader_cfg=APPS_reader_cfg,
        infer_cfg=APPS_infer_cfg,
        eval_cfg=APPS_eval_cfg,
    )
]
...@@ -32,9 +32,9 @@ taco_skills = load_dataset('BAAI/TACO', skills=['Sorting', 'Range queries'], tok ...@@ -32,9 +32,9 @@ taco_skills = load_dataset('BAAI/TACO', skills=['Sorting', 'Range queries'], tok
``` ```
## Evaluation results

| dataset | metric | CodeLlama-7b-Python | internlm2-chat-1.8b-sft-hf | internlm2-chat-7b-sft-hf | internlm2-chat-20b-sft-hf |
|-----------------------|----------|-------------|-------------|-------------|-------------|
| TACO | pass@1 | 0.7 | 0.7 | 1.7 | 2.7 |

Please refer to [repo](https://github.com/FlagOpen/TACO/tree/main?tab=readme-ov-file) for original results if needed.
......
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
from .taco_gen_d82929 import TACO_datasets # noqa: F401, F403 from .taco_gen_c7893a import TACO_datasets # noqa: F401, F403
\ No newline at end of file \ No newline at end of file
...@@ -8,7 +8,7 @@ TACO_reader_cfg = dict(input_columns=["question", "starter"], output_column="pro ...@@ -8,7 +8,7 @@ TACO_reader_cfg = dict(input_columns=["question", "starter"], output_column="pro
TACO_infer_cfg = dict( TACO_infer_cfg = dict(
prompt_template=dict( prompt_template=dict(
type=PromptTemplate, type=PromptTemplate,
template="\nQUESTION:\n{question} {starter}\nANSWER:\n"), template="Please write a python program to address the following QUESTION. Your ANSWER should be in a code block format like this: ```python # Write your code here ```. \nQUESTION:\n{question} {starter}\nANSWER:\n"),
retriever=dict(type=ZeroRetriever), retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512), inferencer=dict(type=GenInferencer, max_out_len=512),
) )
......
...@@ -18,7 +18,7 @@ from io import StringIO ...@@ -18,7 +18,7 @@ from io import StringIO
from unittest.mock import mock_open, patch from unittest.mock import mock_open, patch
import numpy as np import numpy as np
from datasets import Dataset, DatasetDict, load_dataset from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
from pyext import RuntimeModule from pyext import RuntimeModule
from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.openicl.icl_evaluator import BaseEvaluator
...@@ -83,6 +83,60 @@ class APPSDataset(BaseDataset): ...@@ -83,6 +83,60 @@ class APPSDataset(BaseDataset):
}) })
@LOAD_DATASET.register_module()
class APPS_miniDataset(BaseDataset):
    """APPS-mini dataset loader.

    Reads a dataset previously saved with ``Dataset.save_to_disk`` and adds
    a ``starter`` column carrying the sample's starter code (if any) plus an
    instruction telling the model whether to answer in Standard-Input or
    Call-Based format.
    """

    @staticmethod
    def load(path: str, num_repeats: int = 1):
        """Load APPS-mini from ``path``.

        Args:
            path (str): Directory written by ``Dataset.save_to_disk``.
            num_repeats (int): Number of copies of every sample in the
                returned splits. Values > 1 are useful for sampling-based
                pass@k evaluation; keep 1 for greedy decoding.

        Returns:
            DatasetDict: 'train' and 'test' splits with the extra
            ``starter`` column, each sample repeated ``num_repeats`` times.
        """
        dataset = load_from_disk(path)
        new_dataset = DatasetDict()
        # Derive the "starter" column for every split.
        for split in dataset.keys():
            new_samples = []
            for sample in dataset[split]:
                starter_code = sample['starter_code'] or None
                try:
                    input_output = json.loads(sample['input_output'])
                    # json.JSONDecodeError subclasses ValueError, so a
                    # malformed 'input_output' falls through to fn_name=None.
                    fn_name = input_output.get('fn_name') or None
                except ValueError:
                    fn_name = None
                starter = starter_code or ''
                # NOTE(review): the double backslash reproduces the original
                # prompt bytes (a literal "\n" sequence, not a newline) —
                # confirm this is intentional before changing it.
                if not fn_name and not starter_code:
                    starter += '\\nUse Standard Input format'
                else:
                    starter += '\\nUse Call-Based format'
                sample['starter'] = starter
                new_samples.append(sample)
            # Re-assemble column-wise; tolerate an empty split instead of
            # crashing on new_samples[0].
            new_data = {
                key: [s[key] for s in new_samples]
                for key in (new_samples[0].keys() if new_samples else [])
            }
            new_dataset[split] = Dataset.from_dict(new_data)
        # Duplicate every sample num_repeats times. Dataset.from_list is a
        # classmethod, so call it on the class rather than on a split
        # instance as the previous copy-pasted train/test loops did.
        return DatasetDict({
            split: Dataset.from_list(
                [s for s in new_dataset[split] for _ in range(num_repeats)])
            for split in ('train', 'test')
        })
EOF_STRINGS = ['\nQUESTION', '\n---', '\nANSWER', '<|endoftext|>'] EOF_STRINGS = ['\nQUESTION', '\n---', '\nANSWER', '<|endoftext|>']
......
...@@ -91,13 +91,15 @@ EOF_STRINGS = ['\nQUESTION', '\n---', '\nANSWER', '<|endoftext|>'] ...@@ -91,13 +91,15 @@ EOF_STRINGS = ['\nQUESTION', '\n---', '\nANSWER', '<|endoftext|>']
@ICL_EVALUATORS.register_module() @ICL_EVALUATORS.register_module()
class TACOEvaluator(BaseEvaluator): class TACOEvaluator(BaseEvaluator):
def truncate_after_eof_strings(self, text): def post_process(self, text):
pattern = '|'.join(re.escape(s) for s in EOF_STRINGS) if '```' in text:
match = re.search(pattern, text) blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
if len(blocks) == 0:
if match: text = text.split('```')[1] # fall back to default strategy
return text[:match.start()]
else: else:
text = blocks[0] # fetch the first code block
if not text.startswith('\n'): # starting with ```python
text = text[max(text.find('\n') + 1, 0):]
return text return text
TIMEOUT = 10 TIMEOUT = 10
...@@ -228,7 +230,7 @@ class TACOEvaluator(BaseEvaluator): ...@@ -228,7 +230,7 @@ class TACOEvaluator(BaseEvaluator):
assert len(predictions) == len(references) assert len(predictions) == len(references)
generations = defaultdict(list) generations = defaultdict(list)
for refer, pred in zip(references, predictions): for refer, pred in zip(references, predictions):
pred = self.truncate_after_eof_strings(pred) pred = self.post_process(pred)
generations[refer].append(pred) generations[refer].append(pred)
# convert to non-duplicated version # convert to non-duplicated version
test_set = test_set.to_pandas() test_set = test_set.to_pandas()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment