"git@developer.sourcefind.cn:modelzoo/resnet50_tensorflow.git" did not exist on "9b6cd10635f6662d8ce4afaa29adb63fadc2972b"
Unverified Commit 0221d308 authored by Connor-Shen's avatar Connor-Shen Committed by GitHub
Browse files

[Fix] Update APPS/TACO (#988)

* [Feature] update apps/taco

* [Feature] update apps/taco
parent 8a3c6e51
...@@ -26,9 +26,9 @@ print(next(iter(ds))["question"]) ...@@ -26,9 +26,9 @@ print(next(iter(ds))["question"])
## Evaluation results

| dataset | metric | CodeLlama-7b-Python | internlm2-chat-1.8b-sft-hf | internlm2-chat-7b-sft-hf | internlm2-chat-20b-sft-hf |
|-----------------------|----------|-------------|-------------|-------------|-------------|
| apps_mini | pass@1 | 1.3 | 0.7 | 7.1 | 9.3 |

Please refer to Table 3 of [code llama](https://scontent-nrt1-2.xx.fbcdn.net/v/t39.2365-6/369856151_1754812304950972_1159666448927483931_n.pdf?_nc_cat=107&ccb=1-7&_nc_sid=3c67a6&_nc_ohc=TxT1PKkNBZoAX8zMHbm&_nc_ht=scontent-nrt1-2.xx&oh=00_AfDmmQAPzqX1-QOKIDUV5lGKzaZqt0CZUVtxFjHtnh6ycQ&oe=65F5AF8F) for original results if needed.
......
# Entry config: re-export the concrete APPS-mini dataset definition.
# mmengine's read_base() context makes the relative import below behave as
# a config include rather than a regular Python import.
from mmengine.config import read_base

with read_base():
    # NOTE(review): the base config appears to define `APPS_mini_datasets`;
    # confirm that the imported name `APPS_datasets` actually exists there.
    from .apps_mini_gen_c7893a import APPS_datasets  # noqa: F401, F403
\ No newline at end of file
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import APPS_miniDataset, APPSEvaluator
# Reader config: expose 'question' and 'starter' to the prompt template and
# use 'problem_id' as the reference key for evaluation. Only the 'test'
# split is evaluated, hence train_split='test'.
APPS_reader_cfg = dict(input_columns=["question", "starter"], output_column="problem_id", train_split='test')

# Inference config: zero-shot generation (ZeroRetriever, no in-context
# examples); the prompt asks for the answer inside a fenced ```python block
# so the evaluator can extract it, capped at 512 generated tokens.
APPS_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template="Please write a python program to address the following QUESTION. Your ANSWER should be in a code block format like this: ```python # Write your code here ```. \nQUESTION:\n{question} {starter}\nANSWER:\n"),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

# Evaluation config: APPSEvaluator checks the generated program against the
# dataset's test cases; predictions are taken from the BOT role.
APPS_eval_cfg = dict(evaluator=dict(type=APPSEvaluator), pred_role="BOT")

APPS_mini_datasets = [
    dict(
        type=APPS_miniDataset,
        abbr="apps_mini",
        path="codeparrot_mini/apps",
        num_repeats=1,  # copies of each sample; raise for sampling-based pass@k
        reader_cfg=APPS_reader_cfg,
        infer_cfg=APPS_infer_cfg,
        eval_cfg=APPS_eval_cfg,
    )
]
...@@ -32,9 +32,9 @@ taco_skills = load_dataset('BAAI/TACO', skills=['Sorting', 'Range queries'], tok ...@@ -32,9 +32,9 @@ taco_skills = load_dataset('BAAI/TACO', skills=['Sorting', 'Range queries'], tok
``` ```
## Evaluation results

| dataset | metric | CodeLlama-7b-Python | internlm2-chat-1.8b-sft-hf | internlm2-chat-7b-sft-hf | internlm2-chat-20b-sft-hf |
|-----------------------|----------|-------------|-------------|-------------|-------------|
| TACO | pass@1 | 0.7 | 0.7 | 1.7 | 2.7 |

Please refer to [repo](https://github.com/FlagOpen/TACO/tree/main?tab=readme-ov-file) for original results if needed.
......
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
from .taco_gen_d82929 import TACO_datasets # noqa: F401, F403 from .taco_gen_c7893a import TACO_datasets # noqa: F401, F403
\ No newline at end of file \ No newline at end of file
...@@ -8,7 +8,7 @@ TACO_reader_cfg = dict(input_columns=["question", "starter"], output_column="pro ...@@ -8,7 +8,7 @@ TACO_reader_cfg = dict(input_columns=["question", "starter"], output_column="pro
TACO_infer_cfg = dict( TACO_infer_cfg = dict(
prompt_template=dict( prompt_template=dict(
type=PromptTemplate, type=PromptTemplate,
template="\nQUESTION:\n{question} {starter}\nANSWER:\n"), template="Please write a python program to address the following QUESTION. Your ANSWER should be in a code block format like this: ```python # Write your code here ```. \nQUESTION:\n{question} {starter}\nANSWER:\n"),
retriever=dict(type=ZeroRetriever), retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512), inferencer=dict(type=GenInferencer, max_out_len=512),
) )
......
...@@ -18,7 +18,7 @@ from io import StringIO ...@@ -18,7 +18,7 @@ from io import StringIO
from unittest.mock import mock_open, patch from unittest.mock import mock_open, patch
import numpy as np import numpy as np
from datasets import Dataset, DatasetDict, load_dataset from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
from pyext import RuntimeModule from pyext import RuntimeModule
from opencompass.openicl.icl_evaluator import BaseEvaluator from opencompass.openicl.icl_evaluator import BaseEvaluator
...@@ -83,6 +83,60 @@ class APPSDataset(BaseDataset): ...@@ -83,6 +83,60 @@ class APPSDataset(BaseDataset):
}) })
@LOAD_DATASET.register_module()
class APPS_miniDataset(BaseDataset):
    """APPS-mini dataset loader.

    Reads a dataset previously saved with ``Dataset.save_to_disk`` and adds
    a ``starter`` column carrying the sample's starter code (if any) plus an
    instruction telling the model whether to answer in Standard-Input or
    Call-Based format.
    """

    @staticmethod
    def load(path: str, num_repeats: int = 1):
        """Load APPS-mini from ``path``.

        Args:
            path (str): Directory written by ``Dataset.save_to_disk``.
            num_repeats (int): Number of copies of every sample in the
                returned splits. Values > 1 are useful for sampling-based
                pass@k evaluation; keep 1 for greedy decoding.

        Returns:
            DatasetDict: 'train' and 'test' splits with the extra
            ``starter`` column, each sample repeated ``num_repeats`` times.
        """
        dataset = load_from_disk(path)
        new_dataset = DatasetDict()
        # Derive the "starter" column for every split.
        for split in dataset.keys():
            new_samples = []
            for sample in dataset[split]:
                starter_code = sample['starter_code'] or None
                try:
                    input_output = json.loads(sample['input_output'])
                    # json.JSONDecodeError subclasses ValueError, so a
                    # malformed 'input_output' falls through to fn_name=None.
                    fn_name = input_output.get('fn_name') or None
                except ValueError:
                    fn_name = None
                starter = starter_code or ''
                # NOTE(review): the double backslash reproduces the original
                # prompt bytes (a literal "\n" sequence, not a newline) —
                # confirm this is intentional before changing it.
                if not fn_name and not starter_code:
                    starter += '\\nUse Standard Input format'
                else:
                    starter += '\\nUse Call-Based format'
                sample['starter'] = starter
                new_samples.append(sample)
            # Re-assemble column-wise; tolerate an empty split instead of
            # crashing on new_samples[0].
            new_data = {
                key: [s[key] for s in new_samples]
                for key in (new_samples[0].keys() if new_samples else [])
            }
            new_dataset[split] = Dataset.from_dict(new_data)
        # Duplicate every sample num_repeats times. Dataset.from_list is a
        # classmethod, so call it on the class rather than on a split
        # instance as the previous copy-pasted train/test loops did.
        return DatasetDict({
            split: Dataset.from_list(
                [s for s in new_dataset[split] for _ in range(num_repeats)])
            for split in ('train', 'test')
        })
EOF_STRINGS = ['\nQUESTION', '\n---', '\nANSWER', '<|endoftext|>'] EOF_STRINGS = ['\nQUESTION', '\n---', '\nANSWER', '<|endoftext|>']
......
...@@ -91,13 +91,15 @@ EOF_STRINGS = ['\nQUESTION', '\n---', '\nANSWER', '<|endoftext|>'] ...@@ -91,13 +91,15 @@ EOF_STRINGS = ['\nQUESTION', '\n---', '\nANSWER', '<|endoftext|>']
@ICL_EVALUATORS.register_module() @ICL_EVALUATORS.register_module()
class TACOEvaluator(BaseEvaluator): class TACOEvaluator(BaseEvaluator):
def truncate_after_eof_strings(self, text): def post_process(self, text):
pattern = '|'.join(re.escape(s) for s in EOF_STRINGS) if '```' in text:
match = re.search(pattern, text) blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
if len(blocks) == 0:
if match: text = text.split('```')[1] # fall back to default strategy
return text[:match.start()]
else: else:
text = blocks[0] # fetch the first code block
if not text.startswith('\n'): # starting with ```python
text = text[max(text.find('\n') + 1, 0):]
return text return text
TIMEOUT = 10 TIMEOUT = 10
...@@ -228,7 +230,7 @@ class TACOEvaluator(BaseEvaluator): ...@@ -228,7 +230,7 @@ class TACOEvaluator(BaseEvaluator):
assert len(predictions) == len(references) assert len(predictions) == len(references)
generations = defaultdict(list) generations = defaultdict(list)
for refer, pred in zip(references, predictions): for refer, pred in zip(references, predictions):
pred = self.truncate_after_eof_strings(pred) pred = self.post_process(pred)
generations[refer].append(pred) generations[refer].append(pred)
# convert to non-duplicated version # convert to non-duplicated version
test_set = test_set.to_pandas() test_set = test_set.to_pandas()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment