".github/git@developer.sourcefind.cn:OpenDAS/apex.git" did not exist on "a56e88dc7ae83b7771f5fe4bb06bc626a23ceec7"
Unverified commit 8a3c6e51, authored by Connor-Shen and committed by GitHub

[Feature] Update APPS (#985)

* update post process

* update post process
parent d92595b6
@@ -15,6 +15,8 @@ DatasetDict({
 })
 })
 ```
+We also offer an apps_mini subset, which contains 1,500 questions drawn in equal proportion from the introductory, interview, and competition categories (a 1:1:1 ratio, 500 questions each).
 ## How to Use
 You can also filter the dataset based on difficulty level: introductory, interview, and competition. Just pass a list of difficulty levels to the filter. For example, if you want the most challenging questions, select the competition level:
 ```python
......
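The README's filter example itself is collapsed in this diff view. For reference, a minimal sketch of difficulty-based filtering with the Hugging Face `datasets` library; the repository id `codeparrot/apps` and the `difficulty` column are assumptions for illustration, not part of this change:

```python
from datasets import load_dataset

# Load the APPS test split (repository id assumed; point this at wherever the dataset is hosted).
ds = load_dataset("codeparrot/apps", split="test")

# Keep only the most challenging problems by filtering on the difficulty column.
competition_only = ds.filter(lambda example: example["difficulty"] == "competition")
print(len(competition_only))
```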
 from mmengine.config import read_base
 with read_base():
-    from .apps_gen_d82929 import APPS_datasets  # noqa: F401, F403
+    from .apps_gen_c7893a import APPS_datasets  # noqa: F401, F403
@@ -8,7 +8,7 @@ APPS_reader_cfg = dict(input_columns=["question", "starter"], output_column="pro
 APPS_infer_cfg = dict(
     prompt_template=dict(
         type=PromptTemplate,
-        template="\nQUESTION:\n{question} {starter}\nANSWER:\n"),
+        template="Please write a python program to address the following QUESTION. Your ANSWER should be in a code block format like this: ```python # Write your code here ```. \nQUESTION:\n{question} {starter}\nANSWER:\n"),
     retriever=dict(type=ZeroRetriever),
     inferencer=dict(type=GenInferencer, max_out_len=512),
 )
......
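The new template wraps each APPS problem in an explicit instruction to answer inside a fenced code block, which is what the updated post-processing below relies on. A rough, hypothetical rendering using plain `str.format`; OpenCompass's `PromptTemplate` performs its own placeholder substitution, so this is only an approximation (the fence is built indirectly so the example nests cleanly in this page):

```python
FENCE = "`" * 3  # literal triple backtick

template = (
    "Please write a python program to address the following QUESTION. "
    f"Your ANSWER should be in a code block format like this: {FENCE}python # Write your code here {FENCE}. "
    "\nQUESTION:\n{question} {starter}\nANSWER:\n"
)

# Hypothetical question/starter values, just to show the rendered prompt.
print(template.format(
    question="Given a list of integers, print their sum.",
    starter="",
))
```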
@@ -89,14 +89,16 @@ EOF_STRINGS = ['\nQUESTION', '\n---', '\nANSWER', '<|endoftext|>']
 @ICL_EVALUATORS.register_module()
 class APPSEvaluator(BaseEvaluator):
-    def truncate_after_eof_strings(self, text):
-        pattern = '|'.join(re.escape(s) for s in EOF_STRINGS)
-        match = re.search(pattern, text)
-        if match:
-            return text[:match.start()]
-        else:
-            return text
+    def post_process(self, text):
+        if '```' in text:
+            blocks = re.findall(r'```(.*?)```', text, re.DOTALL)
+            if len(blocks) == 0:
+                text = text.split('```')[1]  # fall back to default strategy
+            else:
+                text = blocks[0]  # fetch the first code block
+                if not text.startswith('\n'):  # starting with ```python
+                    text = text[max(text.find('\n') + 1, 0):]
+        return text
 TIMEOUT = 10
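A quick, self-contained illustration of how the new post-processing behaves on typical completions: the code inside the first fenced block is extracted (with a leading language-tag line such as `python` stripped), a lone opening fence falls back to everything after it, and text with no fences passes through unchanged. The sample strings are hypothetical, and the fence is built indirectly so the example nests cleanly in this page:

```python
import re

FENCE = '`' * 3  # literal triple backtick


def post_process(text):
    # Same logic as the method added above, reproduced standalone for illustration.
    if FENCE in text:
        blocks = re.findall(FENCE + r'(.*?)' + FENCE, text, re.DOTALL)
        if len(blocks) == 0:
            text = text.split(FENCE)[1]  # fall back: take everything after the lone fence
        else:
            text = blocks[0]  # fetch the first code block
            if not text.startswith('\n'):  # block opened with a language tag, drop that first line
                text = text[max(text.find('\n') + 1, 0):]
    return text


fenced = ("Here is my solution:\n" + FENCE + "python\n"
          "print(sum(map(int, input().split())))\n" + FENCE + "\nHope this helps.")
plain = "print('no fences here')"

print(repr(post_process(fenced)))  # 'print(sum(map(int, input().split())))\n' -> only the code body survives
print(repr(post_process(plain)))   # unchanged, since there is no fence in the text
```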
@@ -226,7 +228,7 @@ class APPSEvaluator(BaseEvaluator):
         assert len(predictions) == len(references)
         generations = defaultdict(list)
         for refer, pred in zip(references, predictions):
-            pred = self.truncate_after_eof_strings(pred)
+            pred = self.post_process(pred)
             generations[refer].append(pred)
         # convert to non-duplicated version
         test_set = test_set.to_pandas()
......