Unverified commit 32f40a8f, authored by Fengzhe Zhou, committed by GitHub
[Sync] Sync with internal codes 2023.01.08 (#777)

parent 8194199d
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import AgentInferencer
from opencompass.datasets import (
    MATHDataset, MATHAgentEvaluator, math_postprocess
)

# Uses the PAL prompt format, but does not perform well with it.
math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                # ################################### NEW SHOT ###################################
                dict(role='HUMAN', prompt='Find the coefficient of $x^3$ when $3(x^2 - x^3+x) +3(x +2x^3- 3x^2 + 3x^5+x^3) -5(1+x-4x^3 - x^2)$ is simplified.'),
                dict(role='BOT', prompt="""Tool:PythonInterpreter
Tool Input:```python
from sympy import symbols, simplify

def solution():
    x = symbols('x')
    expr = 3*(x**2 - x**3 + x) + 3*(x + 2*x**3 - 3*x**2 + 3*x**5 + x**3) - 5*(1 + x - 4*x**3 - x**2)
    simplified_expr = simplify(expr)
    x3_coefficient = simplified_expr.as_coefficients_dict()[x**3]
    result = x3_coefficient
    return result
```"""),
                dict(role='SYSTEM', prompt='Response:26'),
                dict(role='BOT', prompt='FinalAnswer: The final answer is $26$. I hope it is correct.'),
                dict(role='HUMAN', prompt='The surface area of a sphere with radius $r$ is $4\pi r^2$. Including the area of its circular base, what is the total surface area of a hemisphere with radius 6 cm? Express your answer in terms of $\pi$.'),
                dict(role='BOT', prompt="""Tool:PythonInterpreter
Tool Input:```python
import math

def solution():
    radius = 6
    # Surface area of the curved part of the hemisphere
    hemisphere_area = 2 * math.pi * radius**2
    # Area of the circular base
    base_area = math.pi * radius**2
    # Total surface area
    total_surface_area = hemisphere_area + base_area
    # Formatting the result in LaTeX
    result = r'{}\pi'.format(total_surface_area / math.pi)
    return result
```"""),
                dict(role='SYSTEM', prompt='Response:108.0\\pi'),
                dict(role='BOT', prompt='FinalAnswer: The final answer is $108.0\pi$. I hope it is correct.'),
                dict(role='HUMAN', prompt='Monica tosses a fair 6-sided die. If the roll is a prime number, then she wins that amount of dollars (so that, for example, if she rolls 3, then she wins 3 dollars). If the roll is composite, she wins nothing. Otherwise, she loses 3 dollars. What is the expected value of her winnings on one die toss? Express your answer as a dollar value to the nearest cent.'),
                dict(role='BOT', prompt="""Tool:PythonInterpreter
Tool Input:```python
def solution():
    # Per-face probability contributions on a fair d6
    prime_prob = 1 / 6
    composite_prob = 1 / 3
    otherwise_prob = 1 / 6
    # Expected value of each outcome
    prime_expected_value = (2 * prime_prob) + (3 * prime_prob) + (5 * prime_prob)
    composite_expected_value = 0 * composite_prob
    otherwise_expected_value = -3 * otherwise_prob
    # Total expected value
    total_expected_value = prime_expected_value + composite_expected_value + otherwise_expected_value
    # Dollar value to the nearest cent
    result = "{:.2f}".format(total_expected_value)
    return result
```"""),
                dict(role='SYSTEM', prompt='Response:1.17'),
                dict(role='BOT', prompt='FinalAnswer: The final answer is $1.17$. I hope it is correct.'),
                dict(role='HUMAN', prompt='{problem}'),
            ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=AgentInferencer),
)

math_eval_cfg = dict(
    evaluator=dict(type=MATHAgentEvaluator),
    pred_postprocessor=dict(type=math_postprocess),
)

math_datasets = [
    dict(
        abbr='math-agent',
        type=MATHDataset,
        path='./data/math/math.json',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg,
    )
]
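# Illustration (not part of the config): the PromptTemplate above fills the
# '{problem}' placeholder in the final HUMAN turn from each dataset row. A
# minimal sketch of that substitution, assuming a hypothetical helper named
# render_round; the actual rendering lives inside OpenCompass:
few_shot_round = [
    dict(role='HUMAN', prompt='{problem}'),  # final turn from the template above
]

def render_round(round_spec, **fields):
    # Substitute dataset fields (e.g. problem=...) into each turn's prompt.
    return [dict(turn, prompt=turn['prompt'].format(**fields))
            for turn in round_spec]

print(render_round(few_shot_round, problem='Compute $1+1$.')[-1]['prompt'])
# -> Compute $1+1$.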
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import MATHDataset, MATHEvaluator, math_postprocess_v2

math_reader_cfg = dict(input_columns=['problem'], output_column='solution')

math_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(round=[
            dict(
                role="HUMAN",
                prompt=
                "Problem:\nFind the domain of the expression $\\frac{{\sqrt{{x-2}}}}{{\sqrt{{5-x}}}}$.\nSolution:"
            ),
            dict(
                role="BOT",
                prompt=
                "The expressions inside each square root must be non-negative. Therefore, $x-2 \ge 0$, so $x\ge2$, and $5 - x \ge 0$, so $x \le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{{[2,5)}}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.\n"
            ),
            dict(
                role="HUMAN",
                prompt=
                "Problem:\nIf $\det \mathbf{{A}} = 2$ and $\det \mathbf{{B}} = 12,$ then find $\det (\mathbf{{A}} \mathbf{{B}}).$\nSolution:"
            ),
            dict(
                role="BOT",
                prompt=
                "We have that $\det (\mathbf{{A}} \mathbf{{B}}) = (\det \mathbf{{A}})(\det \mathbf{{B}}) = (2)(12) = \\boxed{{24}}.$\nFinal Answer: The final answer is $24$. I hope it is correct.\n"
            ),
            dict(
                role="HUMAN",
                prompt=
                "Problem:\nTerrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?\nSolution:"
            ),
            dict(
                role="BOT",
                prompt=
                "If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\cdot 12\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\cdot15\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$: \\begin{{align*}} 30n&=480\\\\ \Rightarrow\qquad n&=480/30=\\boxed{{16}} \end{{align*}}\nFinal Answer: The final answer is $16$. I hope it is correct.\n"
            ),
            dict(
                role="HUMAN",
                prompt=
                "Problem:\nIf the system of equations: \\begin{{align*}} 6x-4y&=a,\\\\ 6y-9x &=b. \end{{align*}}has a solution $(x, y)$ where $x$ and $y$ are both nonzero, find $\\frac{{a}}{{b}},$ assuming $b$ is nonzero.\nSolution:"
            ),
            dict(
                role="BOT",
                prompt=
                "If we multiply the first equation by $-\\frac{{3}}{{2}}$, we obtain $$6y-9x=-\\frac{{3}}{{2}}a.$$Since we also know that $6y-9x=b$, we have $$-\\frac{{3}}{{2}}a=b\Rightarrow\\frac{{a}}{{b}}=\\boxed{{-\\frac{{2}}{{3}}}}.$$\nFinal Answer: The final answer is $-\\frac{{2}}{{3}}$. I hope it is correct.\n"
            ),
            dict(role="HUMAN", prompt="Problem:\n{problem}\nSolution:\n"),
        ])),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512))

# Evaluate with the v2 answer postprocessor.
math_eval_cfg = dict(
    evaluator=dict(
        type=MATHEvaluator,
        version='v2'),
    pred_postprocessor=dict(type=math_postprocess_v2))

math_datasets = [
    dict(
        type=MATHDataset,
        abbr='math',
        path='./data/math/math.json',
        reader_cfg=math_reader_cfg,
        infer_cfg=math_infer_cfg,
        eval_cfg=math_eval_cfg)
]
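# Illustration (not part of the config): every few-shot answer above ends with
# "Final Answer: The final answer is $X$. I hope it is correct.", which is the
# pattern the answer postprocessor keys on. A rough, self-contained sketch of
# that extraction; the real math_postprocess_v2 in opencompass.datasets
# handles more edge cases:
import re

def extract_final_answer(pred: str) -> str:
    # Pull the expression out of "Final Answer: The final answer is $...$."
    match = re.search(r'Final Answer: The final answer is \$(.*?)\$\.', pred)
    return match.group(1).strip() if match else ''

assert extract_final_answer(
    'Final Answer: The final answer is $[2,5)$. I hope it is correct.'
) == '[2,5)'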
from mmengine.config import read_base

with read_base():
    from .CIBench_gen_8ab0dc import ci_datasets  # noqa: F401, F403

from mmengine.config import read_base

with read_base():
    from .math401_gen_ab5f39 import math401_datasets  # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import MathBenchDataset, Math401Evaluator, mathbench_postprocess

cloze_prompt = [
    dict(role='HUMAN', prompt='Q: Calculate 2.9-0.11.'),
    dict(role='BOT', prompt='A: Let\'s think step by step, 2.9 - 0.11 equals 2.7900. The answer is 2.7900.\n'),
    dict(role='HUMAN', prompt='Q: Calculate 0.15-0.032.'),
    dict(role='BOT', prompt='A: Let\'s think step by step, 0.15 - 0.032 equals 0.1180. The answer is 0.1180.\n'),
    dict(role='HUMAN', prompt='Q: Calculate 78*64.'),
    dict(role='BOT', prompt='A: Let\'s think step by step, 78 multiplied by 64 equals 4992. The answer is 4992.\n'),
    dict(role='HUMAN', prompt='Q: Calculate 62×42.'),
    dict(role='BOT', prompt='A: Let\'s think step by step, 62 multiplied by 42 equals 2604. The answer is 2604.\n'),
    dict(role='HUMAN', prompt='Q: Calculate {question}'),
    dict(role='BOT', prompt='A: {answer}\n'),
]

math401_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=cloze_prompt,
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer, max_out_len=512),
)

math401_eval_cfg = dict(
    evaluator=dict(type=Math401Evaluator),
    pred_postprocessor=dict(type=mathbench_postprocess, name='en'))

math401_datasets = [
    dict(
        abbr="math401",
        type=MathBenchDataset,
        path="./data/math401/",
        with_circular=False,
        name="cloze_en",
        reader_cfg=dict(
            input_columns=["question"],
            output_column="answer"
        ),
        infer_cfg=math401_infer_cfg,
        eval_cfg=math401_eval_cfg,
    )]
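# Sanity check (illustration only, not part of the config): the in-context
# answers above match plain Python arithmetic when formatted to the same four
# decimal places:
print(f"{2.9 - 0.11:.4f}")    # 2.7900
print(f"{0.15 - 0.032:.4f}")  # 0.1180
print(78 * 64)                # 4992
print(62 * 42)                # 2604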
@@ -57,7 +57,7 @@ sanitized_mbpp_datasets = [
     dict(
         type=SanitizedMBPPDataset,
         abbr='sanitized_mbpp',
-        path='./sanitized-mbpp.jsonl',
+        path='./data/mbpp/sanitized-mbpp.jsonl',
         reader_cfg=sanitized_mbpp_reader_cfg,
         infer_cfg=sanitized_mbpp_infer_cfg,
         eval_cfg=sanitized_mbpp_eval_cfg)
@@ -57,7 +57,7 @@ sanitized_mbpp_datasets = [
     dict(
         type=SanitizedMBPPDataset,
         abbr='sanitized_mbpp_passk',
-        path='./sanitized-mbpp.jsonl',
+        path='./data/mbpp/sanitized-mbpp.jsonl',
         reader_cfg=sanitized_mbpp_reader_cfg,
         infer_cfg=sanitized_mbpp_infer_cfg,
         eval_cfg=sanitized_mbpp_eval_cfg)
@@ -57,7 +57,7 @@ sanitized_mbpp_datasets = [
     dict(
         type=SanitizedMBPPDataset,
         abbr='sanitized_mbpp_repeat10',
-        path='./sanitized-mbpp.jsonl',
+        path='./data/mbpp/sanitized-mbpp.jsonl',
         num_repeats=10,
         reader_cfg=sanitized_mbpp_reader_cfg,
         infer_cfg=sanitized_mbpp_infer_cfg,
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever, RandomRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import NQOpenDataset, NQEvaluator

nq_datasets = []
for k in [0, 1, 5, 25]:
    nq_reader_cfg = dict(
        input_columns=['question'], output_column='answer', train_split='train', test_split='validation')

    if k == 0:
        nq_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=[
                        dict(role='HUMAN', prompt='Q: {question}?'),
                        dict(role='BOT', prompt='A:'),
                    ]
                )
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=50)
        )
    else:
        nq_infer_cfg = dict(
            ice_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=[
                        dict(role='HUMAN', prompt='Q: {question}?'),
                        dict(role='BOT', prompt='A: {answer}.\n'),
                    ]
                ),
            ),
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin="</E>",
                    round=[
                        dict(role='HUMAN', prompt='Q: {question}?'),
                        dict(role='BOT', prompt='A:'),
                    ]
                ),
                ice_token="</E>",
            ),
            retriever=dict(type=FixKRetriever, fix_id_list=list(range(k))),
            inferencer=dict(type=GenInferencer, max_out_len=50, stopping_criteria=["Q:", "\n"]),
        )

    nq_eval_cfg = dict(evaluator=dict(type=NQEvaluator), pred_role="BOT")

    nq_datasets.append(
        dict(
            type=NQOpenDataset,
            abbr=f'nq_open_{k}shot',
            path='./data/nq-open/',
            reader_cfg=nq_reader_cfg,
            infer_cfg=nq_infer_cfg,
            eval_cfg=nq_eval_cfg)
    )
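# Illustration (not part of the config): in the k-shot branch, FixKRetriever
# with fix_id_list=range(k) selects the first k training rows as in-context
# examples, renders them with ice_template, and splices them in where the
# '</E>' ice token sits in prompt_template. A self-contained sketch with a
# hypothetical helper:
def build_kshot_prompt(train_rows, test_question, k):
    # FixKRetriever: always the first k train rows, in order.
    ice = ''.join(f"Q: {row['question']}?\nA: {row['answer']}.\n"
                  for row in train_rows[:k])
    # The rendered examples replace the '</E>' ice token at the prompt start.
    return f"{ice}Q: {test_question}?\nA:"

print(build_kshot_prompt(
    [dict(question='Who wrote Hamlet', answer='William Shakespeare')],
    'Who wrote Macbeth', k=1))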
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever, FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import TriviaQADataset_V2, TriviaQAEvaluator

triviaqa_datasets = []
for k in [0, 1, 5, 25]:
    triviaqa_reader_cfg = dict(
        input_columns=['question'], output_column='answer', train_split='train', test_split='validation')

    if k == 0:
        triviaqa_infer_cfg = dict(
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=[
                        dict(role='HUMAN', prompt='Q: {question}'),
                        dict(role='BOT', prompt='A:'),
                    ]
                )
            ),
            retriever=dict(type=ZeroRetriever),
            inferencer=dict(type=GenInferencer, max_out_len=50)
        )
    else:
        triviaqa_infer_cfg = dict(
            ice_template=dict(
                type=PromptTemplate,
                template=dict(
                    round=[
                        dict(role='HUMAN', prompt='Q: {question}'),
                        dict(role='BOT', prompt='A: {answer}.\n'),
                    ]
                ),
            ),
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(
                    begin="</E>",
                    round=[
                        dict(role='HUMAN', prompt='Q: {question}'),
                        dict(role='BOT', prompt='A:'),
                    ]
                ),
                ice_token="</E>",
            ),
            retriever=dict(type=FixKRetriever, fix_id_list=list(range(k))),
            inferencer=dict(type=GenInferencer, max_out_len=50, stopping_criteria=["Q:", "\n"]),
        )

    triviaqa_eval_cfg = dict(evaluator=dict(type=TriviaQAEvaluator), pred_role="BOT")

    triviaqa_datasets.append(
        dict(
            type=TriviaQADataset_V2,
            abbr=f'triviaqa_wiki_{k}shot',
            path='./data/triviaqa',
            reader_cfg=triviaqa_reader_cfg,
            infer_cfg=triviaqa_infer_cfg,
            eval_cfg=triviaqa_eval_cfg)
    )
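# Illustration (not part of the config): stopping_criteria=["Q:", "\n"] keeps
# the model from continuing into a self-generated next Q/A pair. Conceptually
# the generation is cut at the earliest stop string, roughly like this sketch
# (not the inferencer's actual code):
def truncate_at_stops(text, stops=("Q:", "\n")):
    cut = len(text)
    for stop in stops:
        idx = text.find(stop)
        if idx != -1:
            cut = min(cut, idx)
    return text[:cut]

print(truncate_at_stops("Paris\nQ: What is ..."))  # -> 'Paris'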
from mmengine.config import read_base

with read_base():
    from .mathbench_gen_ad37c1 import mathbench_datasets  # noqa: F401, F403

from mmengine.config import read_base

with read_base():
    from .winogrande_ll_c5cf57 import winogrande_datasets  # noqa: F401, F403

 from opencompass.openicl.icl_prompt_template import PromptTemplate
 from opencompass.openicl.icl_retriever import ZeroRetriever
-from opencompass.openicl.icl_inferencer import LoglikelihoodInferencer
+from opencompass.openicl.icl_inferencer import LLInferencer
 from opencompass.openicl.icl_evaluator import AccEvaluator
 from opencompass.datasets import winograndeDataset
@@ -18,7 +18,7 @@ winogrande_infer_cfg = dict(
         }
     ),
     retriever=dict(type=ZeroRetriever),
-    inferencer=dict(type=LoglikelihoodInferencer))
+    inferencer=dict(type=LLInferencer))
 winogrande_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
from mmengine.config import read_base

with read_base():
    from .winogrande_ppl_8be6c3 import winogrande_datasets  # noqa: F401, F403
@@ -6,7 +6,7 @@ from opencompass.datasets import winograndeDataset
 # WARNING: This config cannot reproduce results in the paper.
 # e.g. LLAMA2-7B Winogrande 69.2 (paper) -> 62.27 (this config)
-# Please try winogrande_ppl_8be6c3
+# Please try winogrande_ll_c5cf57
 winogrande_reader_cfg = dict(
     input_columns=['opt1', 'opt2'],
@@ -6,7 +6,7 @@ from opencompass.datasets import winograndeDataset
 # WARNING: This config cannot reproduce results in the paper.
 # e.g. LLAMA2-7B Winogrande 69.2 (paper) -> 62.27 (this config)
-# Please try winogrande_ppl_8be6c3
+# Please try winogrande_ll_c5cf57
 winogrande_reader_cfg = dict(
     input_columns=['opt1', 'opt2'],
from mmengine.config import read_base

with read_base():
    from .datasets.mmlu.mmlu_ppl_ac766d import mmlu_datasets
    from .datasets.triviaqa.triviaqa_wiki_gen_d18bf4 import triviaqa_datasets
    from .datasets.nq.nq_open_gen_e93f8a import nq_datasets
    from .datasets.gsm8k.gsm8k_gen_3309bd import gsm8k_datasets
    from .datasets.humaneval.humaneval_gen_a82cae import humaneval_datasets
    from .datasets.agieval.agieval_mixed_2f14ad import agieval_datasets
    from .datasets.SuperGLUE_BoolQ.SuperGLUE_BoolQ_ppl_314797 import BoolQ_datasets
    from .datasets.hellaswag.hellaswag_ppl_a6e128 import hellaswag_datasets
    from .datasets.obqa.obqa_ppl_6aac9e import obqa_datasets
    from .datasets.winogrande.winogrande_ll_c5cf57 import winogrande_datasets
    from .models.hf_llama.hf_llama2_7b import models
    from .summarizers.example import summarizer

datasets = sum([v for k, v in locals().items() if k.endswith("_datasets") or k == 'datasets'], [])

work_dir = './outputs/llama2/'
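# Illustration (standalone, not part of the config): the sum(..., []) idiom
# above flattens every imported *_datasets list into one list of dataset dicts.
nq_datasets = [dict(abbr='nq_open_0shot')]
gsm8k_datasets = [dict(abbr='gsm8k')]
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
print([d['abbr'] for d in datasets])  # ['nq_open_0shot', 'gsm8k']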
from opencompass.models import HuggingFaceChatGLM3

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)

models = [
    dict(
        type=HuggingFaceChatGLM3,
        abbr='chatglm3-6b-32k-hf',
        path='THUDM/chatglm3-6b-32k',
        tokenizer_path='THUDM/chatglm3-6b-32k',
        model_kwargs=dict(
            device_map='auto',
            trust_remote_code=True,
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True,
        ),
        meta_template=api_meta_template,
        max_out_len=100,
        max_seq_len=4096,
        batch_size=1,
        run_cfg=dict(num_gpus=1, num_procs=1)
    )
]
from opencompass.models import VLLM

models = [
    dict(
        type=VLLM,
        abbr='chatglm3-6b-32k-vllm',
        path='THUDM/chatglm3-6b-32k',
        max_out_len=100,
        max_seq_len=4096,
        batch_size=32,
        generation_kwargs=dict(temperature=0),
        run_cfg=dict(num_gpus=1, num_procs=1),
    )
]
@@ -28,5 +28,6 @@ models = [
         max_seq_len=2048,
         batch_size=8,
         run_cfg=dict(num_gpus=4, num_procs=1),
+        end_str='<|end▁of▁sentence|>',
     )
 ]
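# Illustration (not the actual implementation): end_str marks where the
# model's turn ends, so the raw generation is conceptually clipped at that
# marker before evaluation:
def clip_at_end_str(generation, end_str='<|end▁of▁sentence|>'):
    # Keep only the text before the model's end-of-sentence token.
    return generation.split(end_str)[0]

print(clip_at_end_str('The answer is 42.<|end▁of▁sentence|>...'))
# -> 'The answer is 42.'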
from opencompass.models import HuggingFaceCausalLM

_meta_template = dict(
    round=[
        dict(role="HUMAN", begin=' [INST] ', end=' [/INST] '),
        dict(role="BOT", begin='', end='', generate=True),
    ],
)

models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='llama-2-13b-chat-hf',
        path="meta-llama/Llama-2-13b-chat-hf",
        tokenizer_path='meta-llama/Llama-2-13b-chat-hf',
        model_kwargs=dict(
            device_map='auto'
        ),
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            use_fast=False,
        ),
        meta_template=_meta_template,
        max_out_len=100,
        max_seq_len=2048,
        batch_size=8,
        run_cfg=dict(num_gpus=2, num_procs=1),
        end_str='[INST]',
    )
]
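# Illustration (not part of the config): how the _meta_template above renders
# one dialogue round into Llama-2 chat format by concatenating each role's
# begin/end markers; generation then continues after the BOT 'begin':
def render_chat_round(meta_template, human_msg):
    human, bot = meta_template['round']
    return human['begin'] + human_msg + human['end'] + bot['begin']

demo_template = dict(round=[
    dict(role='HUMAN', begin=' [INST] ', end=' [/INST] '),
    dict(role='BOT', begin='', end='', generate=True),
])
print(repr(render_chat_round(demo_template, 'Hello!')))  # ' [INST] Hello! [/INST] '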