Unverified Commit 8c85edd1 authored by Fengzhe Zhou's avatar Fengzhe Zhou Committed by GitHub
Browse files

[Sync] deprecate old mbpps (#1064)

parent c1724013
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MBPPDataset, MBPPEvaluator2
mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_list_2")
# This prompt is used for WizardLMCode series
# You can use other config file for basic 3-shot generation
mbpp_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role="HUMAN",
prompt="""Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Create a Python script for this problem:
{text}
Test examples:
{test_list}
### Response:""",
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator2), pred_role="BOT")
mbpp_datasets = [
dict(
type=MBPPDataset,
abbr="mbpp",
path="./data/mbpp/mbpp.jsonl",
reader_cfg=mbpp_reader_cfg,
infer_cfg=mbpp_infer_cfg,
eval_cfg=mbpp_eval_cfg,
)
]
...@@ -10,13 +10,13 @@ mbpp_infer_cfg = dict( ...@@ -10,13 +10,13 @@ mbpp_infer_cfg = dict(
type=PromptTemplate, type=PromptTemplate,
template=dict( template=dict(
round=[ round=[
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\n assert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \n assert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"), dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"),
dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "), dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \n assert is_not_prime(10) == True \n assert is_not_prime(35) == True \n"), dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"),
dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "), dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"), dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"),
dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "), dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"), dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"),
...@@ -25,7 +25,7 @@ mbpp_infer_cfg = dict( ...@@ -25,7 +25,7 @@ mbpp_infer_cfg = dict(
), ),
), ),
retriever=dict(type=ZeroRetriever), retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer), inferencer=dict(type=GenInferencer, max_out_len=512),
) )
mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT") mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT")
......
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_column")
mbpp_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"),
dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"),
dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"),
dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"),
dict(role="BOT", prompt="[BEGIN]\n"),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT")
mbpp_datasets = [
dict(
type=MBPPDataset_V2,
abbr="mbpp_passk",
path="./data/mbpp/mbpp.jsonl",
reader_cfg=mbpp_reader_cfg,
infer_cfg=mbpp_infer_cfg,
eval_cfg=mbpp_eval_cfg,
)
]
# This config is used for pass@k evaluation with dataset repetition
# That model cannot generate multiple response for single input
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MBPPDataset_V2, MBPPPassKEvaluator
mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_column")
mbpp_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"),
dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"),
dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"),
dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"),
dict(role="BOT", prompt="[BEGIN]\n"),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT")
mbpp_datasets = [
dict(
type=MBPPDataset_V2,
abbr="mbpp_repeat10",
path="./data/mbpp/mbpp.jsonl",
num_repeats=10,
reader_cfg=mbpp_reader_cfg,
infer_cfg=mbpp_infer_cfg,
eval_cfg=mbpp_eval_cfg,
)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import SanitizedMBPPDataset, MBPPEvaluator
sanitized_mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_list_2")
sanitized_mbpp_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n",),
dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n ",),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n",),
dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n ",),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n",),
dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n ",),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n",),
dict(role="BOT", prompt="[BEGIN]\n"),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT")
sanitized_mbpp_datasets = [
dict(
type=SanitizedMBPPDataset,
abbr="sanitized_mbpp",
path="./data/mbpp/sanitized-mbpp.jsonl",
reader_cfg=sanitized_mbpp_reader_cfg,
infer_cfg=sanitized_mbpp_infer_cfg,
eval_cfg=sanitized_mbpp_eval_cfg,
)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import SanitizedMBPPDataset, MBPPPassKEvaluator
sanitized_mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_column")
sanitized_mbpp_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"),
dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"),
dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"),
dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"),
dict(role="BOT", prompt="[BEGIN]\n"),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT")
sanitized_mbpp_datasets = [
dict(
type=SanitizedMBPPDataset,
abbr="sanitized_mbpp_passk",
path="./data/mbpp/sanitized-mbpp.jsonl",
reader_cfg=sanitized_mbpp_reader_cfg,
infer_cfg=sanitized_mbpp_infer_cfg,
eval_cfg=sanitized_mbpp_eval_cfg,
)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import SanitizedMBPPDataset, MBPPPassKEvaluator
sanitized_mbpp_reader_cfg = dict(input_columns=["text", "test_list"], output_column="test_column")
sanitized_mbpp_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the similar elements from the given two tuple lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"),
dict(role="BOT", prompt="[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"),
dict(role="BOT", prompt="[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: Write a function to find the largest integers from a given list of numbers using heap queue algorithm. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"),
dict(role="BOT", prompt="[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "),
dict(role="HUMAN", prompt="You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"),
dict(role="BOT", prompt="[BEGIN]\n"),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
sanitized_mbpp_eval_cfg = dict(evaluator=dict(type=MBPPPassKEvaluator), pred_role="BOT")
sanitized_mbpp_datasets = [
dict(
type=SanitizedMBPPDataset,
abbr="sanitized_mbpp_repeat10",
path="./data/mbpp/sanitized-mbpp.jsonl",
num_repeats=10,
reader_cfg=sanitized_mbpp_reader_cfg,
infer_cfg=sanitized_mbpp_infer_cfg,
eval_cfg=sanitized_mbpp_eval_cfg,
)
]
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
from .mbpp_cn_gen_1d1481 import mbpp_cn_datasets # noqa: F401, F403 from .mbpp_cn_gen_9114d5 import mbpp_cn_datasets # noqa: F401, F403
\ No newline at end of file \ No newline at end of file
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MBPPDataset, MBPPEvaluator
mbpp_reader_cfg = dict(
input_columns=['text', 'test_list'], output_column='test_list_2')
mbpp_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role="HUMAN",
prompt=
"你是一名专业的 Python 程序员,你的任务是:编写一个函数,从给定的两个元组列表中查找相似的元素。 你的代码应该通过这些测试:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"
),
dict(
role="BOT",
prompt=
"[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\r\n res = tuple(set(test_tup1) & set(test_tup2))\r\n return (res)' \n[DONE] \n\n "
),
dict(
role="HUMAN",
prompt=
"你是一名专业的 Python 程序员,你的任务是:编写一个 Python 函数来识别一个整数是否不是素数。 你的代码应该通过这些测试:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"
),
dict(
role="BOT",
prompt=
"[BEGIN]\n 'import math\r\ndef is_not_prime(n):\r\n result = False\r\n for i in range(2,int(math.sqrt(n)) + 1):\r\n if n % i == 0:\r\n result = True\r\n return result' \n[DONE] \n\n "
),
dict(
role="HUMAN",
prompt=
"你是一名专业的 Python 程序员,你的任务是:编写一个函数,使用堆队列算法从给定的数字列表中查找最大整数。 你的代码应该通过这些测试:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"
),
dict(
role="BOT",
prompt=
"[BEGIN]\n 'import heapq as hq\r\ndef heap_queue_largest(nums,n):\r\n largest_nums = hq.nlargest(n, nums)\r\n return largest_nums' \n[DONE] \n\n "
),
dict(
role="HUMAN",
prompt=
"你是一名专业的 Python 程序员,你的任务是: {text} 你的代码应该通过这些测试:\n\n {test_list} \n"
),
dict(role="BOT", prompt="[BEGIN]\n"),
], )),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
mbpp_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator), pred_role="BOT")
mbpp_cn_datasets = [
dict(
type=MBPPDataset,
abbr='mbpp_cn',
path='./data/mbpp_cn/mbpp_cn.jsonl',
reader_cfg=mbpp_reader_cfg,
infer_cfg=mbpp_infer_cfg,
eval_cfg=mbpp_eval_cfg)
]
from mmengine.config import read_base from mmengine.config import read_base
with read_base(): with read_base():
from.mbpp_plus_gen_94815c import mbpp_plus_datasets # noqa: F401, F403 from.mbpp_plus_gen_0b836a import mbpp_plus_datasets # noqa: F401, F403
\ No newline at end of file \ No newline at end of file
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MBPPEvaluator, MBPPPlusDataset
mbpp_plus_reader_cfg = dict(
input_columns=['text', 'test_list'], output_column='task_id')
mbpp_plus_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role="HUMAN",
prompt=
"You are an expert Python programmer, and here is your task: Write a function to find the shared elements from the given two lists. Your code should pass these tests:\n\n assert similar_elements((3, 4, 5, 6),(5, 7, 4, 10)) == (4, 5)\nassert similar_elements((1, 2, 3, 4),(5, 4, 3, 7)) == (3, 4) \nassert similar_elements((11, 12, 14, 13),(17, 15, 14, 13)) == (13, 14) \n"
),
dict(
role="BOT",
prompt=
"[BEGIN]\n 'def similar_elements(test_tup1, test_tup2):\n return tuple(set(test_tup1) & set(test_tup2))' \n[DONE] \n\n "
),
dict(
role="HUMAN",
prompt=
"You are an expert Python programmer, and here is your task: Write a python function to identify non-prime numbers. Your code should pass these tests:\n\n assert is_not_prime(2) == False \nassert is_not_prime(10) == True \nassert is_not_prime(35) == True \n"
),
dict(
role="BOT",
prompt=
"[BEGIN]\n 'import math\ndef is_not_prime(n):\n if n == 1:\n return True\n for i in range(2, int(math.sqrt(n))+1):\n if n % i == 0:\n return True\n return False' \n[DONE] \n\n "
),
dict(
role="HUMAN",
prompt=
"You are an expert Python programmer, and here is your task: Write a function to find the n largest integers from a given list of numbers, returned in descending order. Your code should pass these tests:\n\n assert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],3)==[85, 75, 65] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],2)==[85, 75] \nassert heap_queue_largest( [25, 35, 22, 85, 14, 65, 75, 22, 58],5)==[85, 75, 65, 58, 35] \n"
),
dict(
role="BOT",
prompt=
"[BEGIN]\n 'import heapq as hq\ndef heap_queue_largest(nums: list,n: int) -> list:\n largest_nums = hq.nlargest(n, nums)\n return largest_nums' \n[DONE] \n\n "
),
dict(
role="HUMAN",
prompt=
"You are an expert Python programmer, and here is your task: {text} Your code should pass these tests:\n\n {test_list} \n"
),
dict(role="BOT", prompt="[BEGIN]\n"),
], )),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512))
mbpp_plus_eval_cfg = dict(evaluator=dict(type=MBPPEvaluator, metric='MBPPPlus'), pred_role="BOT")
mbpp_plus_datasets = [
dict(
type=MBPPPlusDataset,
abbr='mbpp_plus',
path='./data/mbpp_plus/mbpp_plus.jsonl',
reader_cfg=mbpp_plus_reader_cfg,
infer_cfg=mbpp_plus_infer_cfg,
eval_cfg=mbpp_plus_eval_cfg)
]
from mmengine.config import read_base
with read_base():
from .needlebench_multi_reasoning_256k import needlebench_2needle_en_datasets as needlebench_multi_2needle_en_datasets
from .needlebench_multi_reasoning_256k import needlebench_3needle_en_datasets as needlebench_multi_3needle_en_datasets
from .needlebench_multi_reasoning_256k import needlebench_4needle_en_datasets as needlebench_multi_4needle_en_datasets
from .needlebench_multi_reasoning_256k import needlebench_5needle_en_datasets as needlebench_multi_5needle_en_datasets
from .needlebench_multi_reasoning_256k import needlebench_2needle_zh_datasets as needlebench_multi_2needle_zh_datasets
from .needlebench_multi_reasoning_256k import needlebench_3needle_zh_datasets as needlebench_multi_3needle_zh_datasets
from .needlebench_multi_reasoning_256k import needlebench_4needle_zh_datasets as needlebench_multi_4needle_zh_datasets
from .needlebench_multi_reasoning_256k import needlebench_5needle_zh_datasets as needlebench_multi_5needle_zh_datasets
from .needlebench_single_256k import needlebench_en_datasets as needlebench_origin_en_datasets
from .needlebench_single_256k import needlebench_zh_datasets as needlebench_origin_zh_datasets
from .needlebench_multi_retrieval_256k import needlebench_en_datasets as needlebench_parallel_en_datasets
from .needlebench_multi_retrieval_256k import needlebench_zh_datasets as needlebench_parallel_zh_datasets
needlebench_datasets = sum((v for k, v in locals().items() if k.endswith('_datasets')), [])
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.multi import NeedleBenchMultiDataset
from opencompass.datasets.needlebench.multi import NeedleBenchMultiEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
def logistic(x, L=100, x0=50, k=0.1):
return round(L / (1 + math.exp(-k * (x - x0))), 3)
def generate_linear_space(start, end, num):
if num == 1:
return [start]
elif num < 1:
raise ValueError("num must be at least 1.")
step = (end - start) / (num - 1)
return [start + step * i for i in range(num)]
def generate_depth_percents(intervals, interval_type):
if interval_type == 'linear':
return generate_linear_space(0, 100, intervals)
elif interval_type == 'sigmoid':
linear_space = generate_linear_space(0, 100, intervals)
return [logistic(x) for x in linear_space]
else:
raise ValueError('Unsupported interval type')
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
needlebench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
needlebench_eval_cfg = dict(
evaluator=dict(type=NeedleBenchMultiEvaluator),
pred_postprocessor=dict(type=needlebench_postprocess),
dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
pred_role='BOT')
# context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
context_lengths = [32000, 128000, 256000]
depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
# ----------English Version----------
base_path = './data/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needle_file_name = 'multi_needle_reasoning_en.json'
diff = 10
num_needles = 2
needlebench_2needle_en_datasets = []
language = 'English'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_2needle_en_datasets.append(dataset_dict)
num_needles = 3
needlebench_3needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_3needle_en_datasets.append(dataset_dict)
num_needles = 4
needlebench_4needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_4needle_en_datasets.append(dataset_dict)
num_needles = 5
needlebench_5needle_en_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_en_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_5needle_en_datasets.append(dataset_dict)
# ----------Chinese Version----------
base_path = './data/needlebench'
file_list = ['zh_finance.jsonl']
needle_file_name = 'multi_needle_reasoning_zh.json'
diff = 10
num_needles = 2
needlebench_2needle_zh_datasets = []
language = 'Chinese'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_2needle_zh_datasets.append(dataset_dict)
num_needles = 3
needlebench_3needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_3needle_zh_datasets.append(dataset_dict)
num_needles = 4
needlebench_4needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_4needle_zh_datasets.append(dataset_dict)
num_needles = 5
needlebench_5needle_zh_datasets = []
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_{num_needles}needle_zh_256k',
'type': NeedleBenchMultiDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': language,
'needle_file_name': needle_file_name,
'num_needles': num_needles,
'diff': diff,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_5needle_zh_datasets.append(dataset_dict)
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelDataset
from opencompass.datasets.needlebench.parallel import NeedleBenchParallelEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
def logistic(x, L=100, x0=50, k=0.1):
return round(L / (1 + math.exp(-k * (x - x0))), 3)
def generate_linear_space(start, end, num):
if num == 1:
return [start]
elif num < 1:
raise ValueError("num must be at least 1.")
step = (end - start) / (num - 1)
return [start + step * i for i in range(num)]
def generate_depth_percents(intervals, interval_type):
if interval_type == 'linear':
return generate_linear_space(0, 100, intervals)
elif interval_type == 'sigmoid':
linear_space = generate_linear_space(0, 100, intervals)
return [logistic(x) for x in linear_space]
else:
raise ValueError('Unsupported interval type')
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
needlebench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
needlebench_eval_cfg = dict(
evaluator=dict(type=NeedleBenchParallelEvaluator),
pred_postprocessor=dict(type=needlebench_postprocess),
dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
pred_role='BOT')
# context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
context_lengths = [32000, 128000, 256000]
document_depth_percent_intervals = 20
document_depth_percent_interval_type = "linear"
base_path = './data/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needlebench_en_datasets = []
needle_file_name = 'needles.jsonl'
depths = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'_parallel_en_256k',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 25,
'length_buffer': 3000,
'guide': True,
'language': 'English',
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_en_datasets.append(dataset_dict)
file_list = ['zh_finance.jsonl']
needlebench_zh_datasets = []
for original_context_length in context_lengths:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'_parallel_zh_256k',
'type': NeedleBenchParallelDataset,
'path': base_path,
'needle_file_name': needle_file_name,
'length': original_context_length,
'depths': depths,
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 25,
'length_buffer': 200,
'guide': True,
'language': 'Chinese',
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_zh_datasets.append(dataset_dict)
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets.needlebench.origin import NeedleBenchOriginDataset
from opencompass.datasets.needlebench.origin import NeedleBenchOriginEvaluator
from opencompass.datasets.needlebench.origin import needlebench_postprocess
from opencompass.datasets.needlebench.origin import needlebench_dataset_postprocess
import math
def logistic(x, L=100, x0=50, k=0.1):
return round(L / (1 + math.exp(-k * (x - x0))), 3)
def generate_linear_space(start, end, num):
if num == 1:
return [start]
elif num < 1:
raise ValueError("num must be at least 1.")
step = (end - start) / (num - 1)
return [start + step * i for i in range(num)]
def generate_depth_percents(intervals, interval_type):
if interval_type == 'linear':
return generate_linear_space(0, 100, intervals)
elif interval_type == 'sigmoid':
linear_space = generate_linear_space(0, 100, intervals)
return [logistic(x) for x in linear_space]
else:
raise ValueError('Unsupported interval type')
needlebench_reader_cfg = dict(input_columns=['prompt'], output_column='answer')
needlebench_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(role='HUMAN', prompt='{prompt}'),
dict(role='BOT', prompt='{answer}\n'),
]
)
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer))
needlebench_eval_cfg = dict(
evaluator=dict(type=NeedleBenchOriginEvaluator),
pred_postprocessor=dict(type=needlebench_postprocess),
dataset_postprocessor=dict(type=needlebench_dataset_postprocess),
pred_role='BOT')
# context_lengths = list([16000, 32000, 48000, 64000, 80000, 96000, 112000, 128000, 144000, 160000, 176000, 192000, 200000])
context_lengths = [32000, 128000, 256000]
depths_list = [0, 10, 21, 31, 42, 52, 63, 73, 84, 94, 100]
base_path = './data/needlebench'
file_list = ['PaulGrahamEssays.jsonl']
needlebench_en_datasets = []
needle_file_name = 'needles.jsonl'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_en_256k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 600,
'guide': True,
'language': 'English',
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_en_datasets.append(dataset_dict)
file_list = ['zh_finance.jsonl']
needlebench_zh_datasets = []
needle_file_name = 'needles.jsonl'
for original_context_length in context_lengths:
for depth_percent in depths_list:
dataset_dict = {
'abbr': f'Length{original_context_length}'
f'Depth{int(depth_percent)}_origin_zh_256k',
'type': NeedleBenchOriginDataset,
'path': base_path,
'length': original_context_length,
'depth': int(depth_percent),
'tokenizer_model': 'gpt-4',
'file_list': file_list,
'num_repeats_per_file': 10,
'length_buffer': 200,
'guide': True,
'language': 'Chinese',
'needle_file_name': needle_file_name,
'reader_cfg': needlebench_reader_cfg,
'infer_cfg': needlebench_infer_cfg,
'eval_cfg': needlebench_eval_cfg
}
needlebench_zh_datasets.append(dataset_dict)
...@@ -9,8 +9,8 @@ from opencompass.tasks import OpenICLInferTask ...@@ -9,8 +9,8 @@ from opencompass.tasks import OpenICLInferTask
with read_base(): with read_base():
from .datasets.humaneval.humaneval_passk_gen_8e312c import humaneval_datasets from .datasets.humaneval.humaneval_passk_gen_8e312c import humaneval_datasets
from .datasets.mbpp.mbpp_passk_gen_1e1056 import mbpp_datasets from .datasets.mbpp.deprecated_mbpp_passk_gen_1e1056 import mbpp_datasets
from .datasets.mbpp.sanitized_mbpp_passk_gen_1e1056 import sanitized_mbpp_datasets from .datasets.mbpp.deprecated_sanitized_mbpp_passk_gen_1e1056 import sanitized_mbpp_datasets
datasets = [] datasets = []
datasets += humaneval_datasets datasets += humaneval_datasets
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment