Unverified Commit 0ef7548d authored by Rima Shahbazyan, committed by GitHub

Score tasks (#2452)

* score readme added

* fixed the default value of the generate_until task's "until" parameter

* score mmlu-pro and agieval added

* changed macro accuracy to micro for agieval

* "Always E" option removed from AGIEval

* redundancies removed

* MATH added

* minor cosmetic changes for math

* Licenses added, README updated

* changes for flake8 + license header on math

* Score added to readme and precommit was run.

* Score added to readme and precommit was run.

* Import error fixed

* math task bugfix
postprocess minor fix

* CR for math added

* math CR

* math task bugfix
postprocess minor fix

CR for math added

* Math cr fixed

* reverting the default "until" parameter change and adjusting score task configs
parent 9d36354e
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
from functools import partial
from typing import Any, Dict, List
import numpy as np
from datasets import Dataset
from lm_eval.tasks.score import utils
from lm_eval.tasks.score.utils import prompt_consistency_rate, robustness_doc_to_text
from lm_eval.utils import eval_logger
TEMPLATE_FILE_PATH = os.path.join(os.path.dirname(__file__), "prompt_templates.json")
PROMPT_ROBUSTNESS_TEMPLATE_KEY = "prompt_robustness"
OPTION_ORDER_ROBUSTNESS_TEMPLATE_KEY = "option_order_robustness"
QUESTION_KEY = "query"
ANSWER_INDEX_KEY = "gold"
OPTIONS_KEY = "choices"
LABELS = ["A", "B", "C", "D", "E"]
agi_eval_prompt_consistency_rate = prompt_consistency_rate
agi_eval_robustness_doc_to_text = robustness_doc_to_text
def initial_process_docs(doc: Dataset) -> Dataset:
"""
Normalize the docs: extract the bare question text, strip option prefixes, and add question_id, answer_index, answer and options fields.
"""
bracket_pattern = r"^\([A-E]\)"
letter_space = r"^[A-E] "
letter_question_space = r"^[A-E]\? "
def __process(_doc, idx):
if "question" not in _doc:
question = _doc[QUESTION_KEY].split(" Answer Choices:")[0]
if question.startswith("Q: "):
question = question[3:]
_doc["question"] = question
if "question_id" not in _doc:
_doc["question_id"] = idx
if "answer_index" not in _doc:
_doc["answer_index"] = _doc[ANSWER_INDEX_KEY][0]
if "answer" not in _doc:
_doc["answer"] = LABELS[_doc["answer_index"]]
if "options" not in _doc:
prepared_options = []
for option in _doc[OPTIONS_KEY]:
if re.match(bracket_pattern, option):
prepared_options.append(option[3:])
elif re.match(letter_space, option):
prepared_options.append(option[2:])
elif re.match(letter_question_space, option):
prepared_options.append(option[3:])
else:
prepared_options.append(option)
_doc["options"] = prepared_options
return _doc
return doc.map(__process, with_indices=True)
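# Illustrative example of the preprocessing above (comment only, with assumed
# AGIEval-style field values):
#   input row:  query="Q: What is 2 + 2? Answer Choices: (A)3 (B)4 (C)5",
#               choices=["(A)3", "(B)4", "(C)5"], gold=[1]
#   output row: question="What is 2 + 2?", options=["3", "4", "5"],
#               question_id=<row index>, answer_index=1, answer="B"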
prompt_robustness_process_docs = partial(
utils.process_docs_add_prompts,
templates_key=PROMPT_ROBUSTNESS_TEMPLATE_KEY,
template_file_path=TEMPLATE_FILE_PATH,
dataset_specific_preprocess=initial_process_docs,
)
option_order_robustness_process_docs = partial(
utils.option_order_robustness_process_docs,
template_file_path=TEMPLATE_FILE_PATH,
templates_key=OPTION_ORDER_ROBUSTNESS_TEMPLATE_KEY,
labels=LABELS[:-1],
dataset_specific_preprocess=initial_process_docs,
)
def prompt_robustness_process_results(doc, results) -> Dict[str, float]:
final_answer = utils.__postprocess_pred(results[0])
final_answer = utils.translate_model_answer_to_labels(
final_answer, option_format=doc["options_format"], labels=LABELS
)
gt = LABELS[doc["answer_index"]]
prompt_id = doc["prompt_id"]
question_id = doc["question_id"]
return {
f"{prompt_id}_accuracy": (question_id, prompt_id, final_answer, gt),
"consistency_rate": (question_id, prompt_id, final_answer, gt),
}
def option_order_robustness_process_results(doc, results) -> Dict[str, float]:
final_answer = utils.__postprocess_pred(results[0])
final_answer = utils.translate_model_answer_to_labels(
final_answer, option_format=doc["options_format"], labels=LABELS
)
gt = LABELS[doc["answer_index"]]
always_same_option = doc["always_same_option"]
question_id = doc["question_id"]
original_answer_index = doc["original_answer_index"]
answer_index = doc["answer_index"]
return {
f"per_option_accuracy_{always_same_option}": (
question_id,
always_same_option,
final_answer,
gt,
),
"options_consistency_rate": (
question_id,
always_same_option,
final_answer,
original_answer_index,
answer_index,
),
}
def per_prompt_accuracy(results: List[Dict[str, Any]], p_id=0) -> float:
accuracies = []
for result in results:
question_id, prompt_id, final_answer, gt = result
if prompt_id != p_id:
continue
accuracies.append(final_answer == gt)
accuracy = sum(accuracies) / len(accuracies)
eval_logger.info(f"Prompt - {p_id} accuracy: {accuracy}")
return np.round(accuracy, 4)
per_prompt_accuracy_0 = partial(per_prompt_accuracy, p_id=0)
per_prompt_accuracy_1 = partial(per_prompt_accuracy, p_id=1)
per_prompt_accuracy_2 = partial(per_prompt_accuracy, p_id=2)
per_prompt_accuracy_3 = partial(per_prompt_accuracy, p_id=3)
per_prompt_accuracy_4 = partial(per_prompt_accuracy, p_id=4)
per_prompt_accuracy_5 = partial(per_prompt_accuracy, p_id=5)
per_prompt_accuracy_6 = partial(per_prompt_accuracy, p_id=6)
per_prompt_accuracy_7 = partial(per_prompt_accuracy, p_id=7)
per_prompt_accuracy_8 = partial(per_prompt_accuracy, p_id=8)
per_prompt_accuracy_9 = partial(per_prompt_accuracy, p_id=9)
def per_option_accuracy(results: List[Dict[str, Any]], always_opt="a") -> float:
accuracies = []
for result in results:
question_id, always_same_option, final_answer, gt = result
if always_opt != always_same_option:
continue
accuracies.append(int(final_answer == gt))
accuracy = sum(accuracies) / len(accuracies)
eval_logger.info(f"Prompt - {always_opt.upper()} accuracy: {accuracy}")
return np.round(accuracy, 4)
per_option_accuracy_a = partial(per_option_accuracy, always_opt="A")
per_option_accuracy_b = partial(per_option_accuracy, always_opt="B")
per_option_accuracy_c = partial(per_option_accuracy, always_opt="C")
per_option_accuracy_d = partial(per_option_accuracy, always_opt="D")
options_consistency_rate = partial(utils.options_consistency_rate, labels=LABELS)
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
task: prompt_robustness_math_algebra
dataset_path: EleutherAI/hendrycks_math
process_docs: !function utils_math.prompt_robustness_process_docs
dataset_name: algebra
output_type: generate_until
test_split: test
doc_to_text: !function utils_math.math_robustness_doc_to_text
process_results: !function utils_math.process_results
doc_to_target: answer
generation_kwargs:
until: []
do_sample: false
temperature: 0
max_gen_toks: 1024
metric_list:
- metric: 0_accuracy
aggregation: !function utils_math.per_prompt_accuracy_0
higher_is_better: true
- metric: 1_accuracy
aggregation: !function utils_math.per_prompt_accuracy_1
higher_is_better: true
- metric: 2_accuracy
aggregation: !function utils_math.per_prompt_accuracy_2
higher_is_better: true
- metric: 3_accuracy
aggregation: !function utils_math.per_prompt_accuracy_3
higher_is_better: true
- metric: 4_accuracy
aggregation: !function utils_math.per_prompt_accuracy_4
higher_is_better: true
- metric: 5_accuracy
aggregation: !function utils_math.per_prompt_accuracy_5
higher_is_better: true
- metric: 6_accuracy
aggregation: !function utils_math.per_prompt_accuracy_6
higher_is_better: true
- metric: 7_accuracy
aggregation: !function utils_math.per_prompt_accuracy_7
higher_is_better: true
- metric: 8_accuracy
aggregation: !function utils_math.per_prompt_accuracy_8
higher_is_better: true
- metric: 9_accuracy
aggregation: !function utils_math.per_prompt_accuracy_9
higher_is_better: true
- metric: consistency_rate
aggregation: !function utils_math.math_prompt_consistency_rate
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include: prompt_robustness_math_algebra.yaml
dataset_name: counting_and_probability
task: prompt_robustness_math_counting_and_prob
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include: prompt_robustness_math_algebra.yaml
dataset_name: geometry
task: prompt_robustness_math_geometry
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include: prompt_robustness_math_algebra.yaml
dataset_name: intermediate_algebra
task: prompt_robustness_math_intermediate_algebra
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include: prompt_robustness_math_algebra.yaml
dataset_name: number_theory
task: prompt_robustness_math_num_theory
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include: prompt_robustness_math_algebra.yaml
dataset_name: prealgebra
task: prompt_robustness_math_prealgebra
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include: prompt_robustness_math_algebra.yaml
dataset_name: precalculus
task: prompt_robustness_math_precalc
{
"prompt_robustness": [
{
"prompt": "Efficiently solve the following math challenge. Explain your approach step-by-step\nThe answer should end with: The final answer is: $\\boxed{{answer}}$\nwhere [answer] is just the final number or expression that solves the problem\nProblem: {question}\nLets think step by step"
},
{
"prompt": "You should solve this math problem.\nIf the problem is easy, provide a brief solution with little explanation.\nFor more difficult problems, follow this structured format\n## Step 1: [Brief description]\n[Simple explanation and calculations]\n\n## Step 2: [Brief description]\n[Simple explanation and calculations]\n\nRepeat steps until your reach a solution\n\nProblem: {question}\nEnd with:\nThe final answer is: $\\boxed{{answer}}$\nwhere [answer] is just the final number or expression that solves the problem."
},
{
"prompt": "Solve this math problem. Your anwer should end with 'The final answer is: $\\boxed{{answer}}$' where [answer] is just the final number or expression that solves the problem\nProblem: {question}"
},
{
"prompt": "Analyze and solve the math task.\nProblem: {question}\nEnd the answer with:\nThe final answer is: $\\boxed{{answer}}$ where [answer] is just the final number or expression that solves the problem."
},
{
"prompt": "{question}\nFind the solution to this math problem. Your answer should end with - The final answer is: $\\boxed{{answer}}$\nwhere [answer] is just the final number or expression that solves the problem."
},
{
"prompt": "Calculate the answer to this math problem\nProblem: {question}\nConclude your answer with:\nThe final answer is: $\\boxed{{answer}}$\nwhere [answer] is just the final number or expression that solves the problem."
},
{
"prompt": "{question}\nPlease solve this math problem efficiently. Finish with: The final answer is: $\\boxed{{answer}}$ where [answer] is just the final number or expression that solves the problem."
},
{
"prompt": "{question}\nSolve the following math problem\nShow each step of your solution\nConclude with:\nThe final answer is: $\\boxed{{answer}}$\nwhere [answer] is just the final number or expression that solves the problem\nLets think step by step"
},
{
"prompt": "Find the answer to the following math question. Conclude with: 'The final answer is: $\\boxed{{answer}}$'\nwhere [answer] is just the final number or expression that solves the problem\nProblem: {question}"
},
{
"prompt": "Please solve the math problem. For simple problems offer a quick solution with minimal details. For more challenging problems, explain your approach step-by-step. Finish with\nThe final answer is: $\\boxed{{answer}}$.\nwhere [answer] is just the final number or expression that solves the problem.\nProblem: {question}\nLets think step by step."
}
]
}
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
group: score_prompt_robustness_math
task:
- prompt_robustness_math_algebra
- prompt_robustness_math_counting_and_prob
- prompt_robustness_math_geometry
- prompt_robustness_math_intermediate_algebra
- prompt_robustness_math_num_theory
- prompt_robustness_math_prealgebra
- prompt_robustness_math_precalc
aggregate_metric_list:
- metric: 0_accuracy
aggregation: mean
weight_by_size: true
- metric: 1_accuracy
aggregation: mean
weight_by_size: true
- metric: 2_accuracy
aggregation: mean
weight_by_size: true
- metric: 3_accuracy
aggregation: mean
weight_by_size: true
- metric: 4_accuracy
aggregation: mean
weight_by_size: true
- metric: 5_accuracy
aggregation: mean
weight_by_size: true
- metric: 6_accuracy
aggregation: mean
weight_by_size: true
- metric: 7_accuracy
aggregation: mean
weight_by_size: true
- metric: 8_accuracy
aggregation: mean
weight_by_size: true
- metric: 9_accuracy
aggregation: mean
weight_by_size: true
- metric: consistency_rate
aggregation: mean
weight_by_size: true
metadata:
version: 1.0
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
group: score_robustness_math
task:
- score_prompt_robustness_math
metadata:
version: 1.0
[
{
"id": "test/prealgebra/1088.json",
"problem": "Simplify $(5x+3) - 2(2x-4)$.",
"answer": "x+11"
},
{
"id": "test/algebra/1197.json",
"problem": "Two positive numbers $p$ and $q$ have the property that their sum is equal to their product. If their difference is $7$, what is $\\frac{1}{\\frac{1}{p^2}+\\frac{1}{q^2}}$? Your answer will be of the form $\\frac{a+b\\sqrt{c}}{d}$, where $a$ and $b$ don't both share the same common factor with $d$ and $c$ has no square as a factor. Find $a+b+c+d$.",
"answer": "161"
},
{
"id": "test/geometry/66.json",
"problem": "Square $ABCD$ has side lengths of 13 units. Point $E$ lies in the interior of the square such that $AE = 5$ units and $BE = 12$ units. What is the distance from $E$ to side $AD$?",
"answer": "\\frac{25}{13}"
},
{
"id": "test/geometry/1125.json",
"problem": "An aquarium has a rectangular base that measures 100 cm by 40 cm and has a height of 50 cm. The aquarium is filled with water to a depth of 37 cm. A rock with volume $1000 \\text{cm}^3$ is then placed in the aquarium and completely submerged. By how many centimeters does the water level rise? Express your answer as a decimal to the nearest 100th.",
"answer": "0.25\\text{ cm}"
},
{
"id": "test/prealgebra/1407.json",
"problem": "What number must be placed in the box in the equation below to produce an equation that has more than one solution: \\[4x + 6 + 7x - 9 = 12x - 7 - x + \\boxed{\\phantom{2}}?\\]",
"answer": "4"
},
{
"id": "test/prealgebra/224.json",
"problem": "I am going to buy exotic fruits. Dragonfruit costs $x-4$ dollars. Starfruit is five dollars less expensive than rambutan. Rambutan costs $2x$ dollars more than dragonfruit. How much does it cost to buy one rambutan, two starfruit, and three dragonfruit? Your answer will be an expression that depends on $x$.",
"answer": "-34 + 12x"
},
{
"id": "test/prealgebra/177.json",
"problem": "Let $\\boxed{N}$ mean the number of whole number divisors of $N$. For example, $\\boxed{3}=2$, because $3$ has two divisors, $1$ and $3.$ Find the value of \\[\\boxed{\\boxed{11}\\times\\boxed{20}}\\]",
"answer": "12"
},
{
"id": "test/number_theory/459.json",
"problem": "On a particular map, $3$ inches on the map equates to $10$ miles in real life. If you know that the real life distance between two buildings on the map is $53.25$ miles, what would the distance between the buildings be (in inches) on the map, expressed as a fraction?",
"answer": "\\frac{639}{40}"
},
{
"id": "test/intermediate_algebra/702.json",
"problem": "Find the coordinates of either of the vertices of the hyperbola \\[16x^2+16x-4y^2-20y-85=0.\\](Enter your answer as an ordered pair. Enter the coordinates of one of the vertices, not both.)",
"answer": "\\left(-\\tfrac52, -\\tfrac52\\right)"
},
{
"id": "test/intermediate_algebra/25.json",
"problem": "Find the coordinates of one of the foci of the hyperbola \\[x^2 - 10x = 4y^2 - 5.\\](Enter your answer as an ordered pair. Enter only one of the foci, not both.)",
"answer": "(0,0)"
},
{
"id": "test/intermediate_algebra/747.json",
"problem": "The graph of $y = f(x)$ passes through the point $(-3,5).$ If $f(x)$ is an odd function, then what other point must the graph pass through? Enter your answer as an ordered pair.",
"answer": "(0,0)"
}
]
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
from functools import partial
from itertools import combinations
from typing import Any, Dict, List
import datasets
import numpy as np
from lm_eval.tasks.score import utils
from lm_eval.tasks.score.math.math_grader import (
extract_answer,
math_equal,
normalize_answer_string,
)
from lm_eval.tasks.score.utils import robustness_doc_to_text
from lm_eval.utils import eval_logger
TEMPLATE_FILE_PATH = os.path.join(os.path.dirname(__file__), "prompt_templates.json")
PROMPT_ROBUSTNESS_TEMPLATE_KEY = "prompt_robustness"
math_robustness_doc_to_text = robustness_doc_to_text
def find_boxed_entries(answer_str):
stack = []
results = []
i = 0
while i < len(answer_str):
if answer_str[i : i + 7] == "\\boxed{":
stack.append(i + 7)
i += 7
elif answer_str[i] == "{":
if stack:
stack.append(i + 1)
i += 1
elif answer_str[i] == "}":
if stack:
start = stack.pop()
if not stack:
results.append(answer_str[start:i])
i += 1
else:
i += 1
if len(results) == 0:
raise ValueError("No boxed entries found in the solution")
else:
results = [normalize_answer_string(result) for result in results]
if len(results) == 1:
# Single boxed entry, trivial case
return results
else:
# Multiple boxed entries. There are two cases possible
# (a) The reference solution has the same question answered in multiple ways
# (b) The answer is split across multiple boxed entries and we need to merge
result_equal = True
for idx in range(len(results) - 1):
if not (results[idx] == results[idx + 1]):
result_equal = False
break
if result_equal:
# Same problem solved in multiple ways
return [results[0]]
else:
return results
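# Behaviour sketch for find_boxed_entries (comments only; assumes
# normalize_answer_string leaves these particular strings unchanged):
#   "so $\boxed{\frac{1}{2}}$"         -> ["\frac{1}{2}"]  (nested braces preserved)
#   "... $\boxed{5}$ ... $\boxed{5}$"  -> ["5"]            (same answer boxed twice -> single entry)
#   "... $\boxed{3}$ ... $\boxed{7}$"  -> ["3", "7"]       (answer split across boxes)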
def extract_answer_dataset(solution: str, problem: str, corrected_answers: list) -> str:
entries = find_boxed_entries(solution)
if len(entries) == 1:
parsed_answer = entries[0]
if len(entries) > 1:
for item in corrected_answers:
if item["problem"] == problem:
parsed_answer = item["answer"]
break
else:
parsed_answer = ", ".join(entries)
if not (
("Find the equation" in problem)
or ("Enter the equation" in problem)
or ("What is the equation" in problem)
or ("described by the equation" in problem)
or ("Find an equation" in problem)
) and ("=" in parsed_answer):
if parsed_answer.count("=") == 1:
# For greater count, it means we're just predicting values of multiple variables
parsed_answer = parsed_answer.split("=")[1]
return parsed_answer
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
def _process_doc(doc: dict, idx, corrected_answer) -> dict:
out_doc = {
"question": doc["problem"],
"question_id": idx,
"solution": doc["solution"],
"answer": extract_answer_dataset(
doc["solution"], doc["problem"], corrected_answer
),
}
return out_doc
corrected_answer_path = os.path.join(
os.path.dirname(__file__), "to_be_fixed_questions.json"
)
with open(corrected_answer_path, "r") as f:
corrected_answers = json.load(f)
return dataset.map(
partial(_process_doc, corrected_answer=corrected_answers), with_indices=True
)
def prompt_robustness_process_docs(doc: datasets.Dataset) -> datasets.Dataset:
doc = process_docs(doc)
return utils.process_docs_add_prompts(
doc,
PROMPT_ROBUSTNESS_TEMPLATE_KEY,
TEMPLATE_FILE_PATH,
)
def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
answer = extract_answer(results[0])
if math_equal(answer, doc["answer"]):
retval = 1
else:
retval = 0
prompt_id = doc["prompt_id"]
results = {
f"{prompt_id}_accuracy": (prompt_id, retval),
"consistency_rate": (doc["question_id"], answer),
}
return results
def per_prompt_accuracy(results: List[Dict[str, Any]], p_id=0) -> float:
accuracies = []
for result in results:
prompt_id, retval = result
if prompt_id != p_id:
continue
accuracies.append(retval)
accuracy = sum(accuracies) / len(accuracies)
eval_logger.info(f"Prompt - {prompt_id} accuracy: {accuracy}")
return np.round(accuracy, 4)
per_prompt_accuracy_0 = partial(per_prompt_accuracy, p_id=0)
per_prompt_accuracy_1 = partial(per_prompt_accuracy, p_id=1)
per_prompt_accuracy_2 = partial(per_prompt_accuracy, p_id=2)
per_prompt_accuracy_3 = partial(per_prompt_accuracy, p_id=3)
per_prompt_accuracy_4 = partial(per_prompt_accuracy, p_id=4)
per_prompt_accuracy_5 = partial(per_prompt_accuracy, p_id=5)
per_prompt_accuracy_6 = partial(per_prompt_accuracy, p_id=6)
per_prompt_accuracy_7 = partial(per_prompt_accuracy, p_id=7)
per_prompt_accuracy_8 = partial(per_prompt_accuracy, p_id=8)
per_prompt_accuracy_9 = partial(per_prompt_accuracy, p_id=9)
def calculate_consistency_rate(responses: List[List[str]]) -> float:
"""
Calculate the Consistency Rate (CR) for a given set of responses.
Args:
responses: List of lists, where each inner list contains responses to the same question.
Returns:
The consistency rate as a float.
"""
total_similarity = 0
total_combinations = 0
for response_set in responses:
pairs = combinations(response_set, 2)
num_pairs = len(response_set) * (len(response_set) - 1) / 2
total_combinations += num_pairs
for answer1, answer2 in pairs:
total_similarity += int(math_equal(answer1, answer2))
return total_similarity / total_combinations if total_combinations > 0 else 0.0
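# Worked example (comments only): for one question answered under three prompts
# with extracted answers ["4", "4", "5"], there are 3 unordered pairs; assuming
# math_equal counts only the ("4", "4") pair as equal, the consistency rate is
# 1 / 3 ≈ 0.333.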
def math_prompt_consistency_rate(results: List[Dict[str, Any]]) -> float:
"""
Calculate the Consistency Rate (CR) across prompts for each question.
Args:
results: list of (question_id, answer) tuples produced by process_results.
Returns:
The consistency rate as a float.
"""
question_answers_dict = {}
for result in results:
question_id, answer = result
if question_id not in question_answers_dict:
question_answers_dict[question_id] = []
question_answers_dict[question_id].append(answer)
question_answers_list = [answers for answers in question_answers_dict.values()]
return calculate_consistency_rate(question_answers_list)
{
"option_order_robustness":{
"prompt": "For the multiple-choice question related to {category}, which option (A-J) is correct?.\n\nQuestion: {question}{options}\n\nEnd the answer with the following:\nThe best answer is (the_answer_letter) where the (the_answer_letter) is one of 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I' or 'J'.",
"options_format": "\n{letter}: {option}"
},
"prompt_robustness": [
{
"prompt": "{question}{options}\nExamine the question and choose the correct answer from the options 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I' or 'J'. End your answer with:\nThe best answer is [the_answer_letter].\nwhere the [the_answer_letter] is a letter from A to J.",
"options_format": "\n{letter}: {option}"
},
{
"prompt": "{question}{options}\nAnswer the multiple-choice question about {category} by selecting the correct option from A to J. Always conclude with 'The best answer is (answer_letter)' where the (answer_letter) is one of A, B, C, D, E, F, G, H, I, J.",
"options_format": "\n{letter}: {option}"
},
{
"prompt": "You must reply with only a single letter from A, B, C, D, E, F, G, H, I or J to this question. Conclude with:\nThe best answer is answer_letter where the answer_letter is a single letter from A to J. \n{question}{options}",
"options_format": "\n{letter}: {option}"
},
{
"prompt": "From the options A-J, select the correct answer to the following question. End the answer with - The best answer is answer_letter, where answer_letter is one of A, B, C, D, E, F, G, H, I, or J.\nQuestion: {question}{options}",
"options_format": "\n{letter}: {option}"
},
{
"prompt": "For the multiple-choice question related to {category}, which option (A-J) is correct?.\n\nQuestion:{question}{options}\nEnd the answer with the following:\nThe best answer is (the_answer_letter) where the (the_answer_letter) is one of 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I' or 'J'.",
"options_format": "\n{letter}: {option}"
},
{
"prompt": "Evaluate the multiple-choice question and select the most fitting response from 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'. \nQuestion:{question}{options}\nAlways conclude with:\nThe best answer is [the_answer_letter].\nwhere the [the_answer_letter] is one of A, B, C, D, E, F, G, H, I or J.",
"options_format": "\n{letter}: {option}"
},
{
"prompt": "Answer to the following question about {category} by selecting the correct option A, B, C, D, E, F, G, H, I or J. {question}{options}\nThe answer should end with:\nThe best answer is [the_answer_letter] where [the_answer_letter] is one of the letters A to J. Let's think step by step.",
"options_format": "\n{letter}: {option}"
},
{
"prompt": "Select the correct answer from the options 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I','J' for the question provided below. Conclude by stating: The best answer is answer_letter where answer_letter is one of 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I' or 'J'. Let's think step by step.\nQuestion: {question}{options}",
"options_format": "\n{letter}: {option}"
},
{
"prompt": "{question}{options}\nFor this question about {category} with 10 possible answers A, B, C, D, E, F, G, H, I, J choose the one that answers the question. If the problem is simple or straightforward, just provide the answer. If the answer is more complex, use a step-by-step approach and for each step briefly explain your reasoning. Always conclude with 'The best answer is (answer_letter)' where the (answer_letter) is one of 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I','J'. Let's think step by step.",
"options_format": "\n{letter}: {option}"
},
{
"prompt": "Read the question and options below, then determine the correct answer choice (A-J)\nQuestion: {question}{options}\n\nFor simple questions, provide a quick answer. For complicated ones, think step by step, break down the question into smaller problems and reach to a conclusion\nEnd your answer by stating:\nThe best answer is [the_answer_letter].\nwhere [the_answer_letter] is one of A, B, C, D, E, F, G, H, I, or J.",
"options_format": "\n{letter}: {option}"
}
]
}
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
tag: score_robustness_mmlu_pro
task: score_option_order_robustness_mmlu_pro
dataset_path: TIGER-Lab/MMLU-Pro
dataset_name: default
output_type: generate_until
validation_split: validation
test_split: test
process_docs: !function utils_mmlu_pro.option_order_robustness_process_docs
doc_to_text: !function utils_mmlu_pro.mmlu_pro_robustness_doc_to_text
doc_to_target: answer
generation_kwargs:
until: []
max_gen_toks: 1024
do_sample: False
process_results: !function utils_mmlu_pro.option_order_robustness_process_results
metric_list:
- metric: per_option_macro_accuracy_A
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_a
higher_is_better: true
- metric: per_option_macro_accuracy_B
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_b
higher_is_better: true
- metric: per_option_macro_accuracy_C
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_c
higher_is_better: true
- metric: per_option_macro_accuracy_D
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_d
higher_is_better: true
- metric: per_option_macro_accuracy_E
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_e
higher_is_better: true
- metric: per_option_macro_accuracy_F
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_f
higher_is_better: true
- metric: per_option_macro_accuracy_G
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_g
higher_is_better: true
- metric: per_option_macro_accuracy_H
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_h
higher_is_better: true
- metric: per_option_macro_accuracy_I
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_i
higher_is_better: true
- metric: per_option_macro_accuracy_J
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_j
higher_is_better: true
- metric: options_consistency_rate
aggregation: !function utils_mmlu_pro.options_consistency_rate
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
tag: score_robustness_mmlu_pro
task: score_prompt_robustness_mmlu_pro
dataset_path: TIGER-Lab/MMLU-Pro
dataset_name: default
output_type: generate_until
validation_split: validation
test_split: test
process_docs: !function utils_mmlu_pro.prompt_robustness_process_docs
doc_to_text: !function utils_mmlu_pro.mmlu_pro_robustness_doc_to_text
doc_to_target: answer
generation_kwargs:
until: []
max_gen_toks: 1024
do_sample: False
process_results: !function utils_mmlu_pro.prompt_robustness_process_results
metric_list:
- metric: 0_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_0
higher_is_better: true
- metric: 1_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_1
higher_is_better: true
- metric: 2_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_2
higher_is_better: true
- metric: 3_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_3
higher_is_better: true
- metric: 4_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_4
higher_is_better: true
- metric: 5_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_5
higher_is_better: true
- metric: 6_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_6
higher_is_better: true
- metric: 7_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_7
higher_is_better: true
- metric: 8_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_8
higher_is_better: true
- metric: 9_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_9
higher_is_better: true
- metric: consistency_rate
aggregation: !function utils_mmlu_pro.mmlu_pro_prompt_consistency_rate
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from functools import partial
from typing import Any, Dict, List
import numpy as np
from lm_eval.tasks.score import utils
from lm_eval.tasks.score.utils import prompt_consistency_rate, robustness_doc_to_text
from lm_eval.utils import eval_logger
TEMPLATE_FILE_PATH = os.path.join(os.path.dirname(__file__), "prompt_templates.json")
PROMPT_ROBUSTNESS_TEMPLATE_KEY = "prompt_robustness"
OPTION_ORDER_ROBUSTNESS_TEMPLATE_KEY = "option_order_robustness"
QUESTION_KEY = "question"
LABELS = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]
mmlu_pro_prompt_consistency_rate = prompt_consistency_rate
mmlu_pro_robustness_doc_to_text = robustness_doc_to_text
prompt_robustness_process_docs = partial(
utils.process_docs_add_prompts,
templates_key=PROMPT_ROBUSTNESS_TEMPLATE_KEY,
template_file_path=TEMPLATE_FILE_PATH,
)
option_order_robustness_process_docs = partial(
utils.option_order_robustness_process_docs,
template_file_path=TEMPLATE_FILE_PATH,
templates_key=OPTION_ORDER_ROBUSTNESS_TEMPLATE_KEY,
labels=LABELS,
)
def prompt_robustness_process_results(doc, results) -> Dict[str, float]:
final_answer = utils.__postprocess_pred(results[0])
final_answer = utils.translate_model_answer_to_labels(
final_answer, option_format=doc["options_format"], labels=LABELS
)
gt = LABELS[doc["answer_index"]]
prompt_id = doc["prompt_id"]
question_id = doc["question_id"]
category = doc["category"]
return {
f"{prompt_id}_macro_accuracy": (
question_id,
prompt_id,
final_answer,
gt,
category,
),
"consistency_rate": (question_id, prompt_id, final_answer, gt),
}
def option_order_robustness_process_results(doc, results) -> Dict[str, float]:
final_answer = utils.__postprocess_pred(results[0])
final_answer = utils.translate_model_answer_to_labels(
final_answer, option_format=doc["options_format"], labels=LABELS
)
gt = LABELS[doc["answer_index"]]
always_same_option = doc["always_same_option"]
question_id = doc["question_id"]
original_answer_index = doc["original_answer_index"]
answer_index = doc["answer_index"]
category = doc["category"]
return {
f"per_option_macro_accuracy_{always_same_option}": (
question_id,
always_same_option,
final_answer,
gt,
category,
),
"options_consistency_rate": (
question_id,
always_same_option,
final_answer,
original_answer_index,
answer_index,
),
}
def per_prompt_macro_accuracy(results: List[Dict[str, Any]], p_id=0) -> float:
accuracies = {}
for result in results:
question_id, prompt_id, final_answer, gt, category = result
if prompt_id != p_id:
continue
if category not in accuracies:
accuracies[category] = []
accuracies[category].append(final_answer == gt)
for key in accuracies:
accuracies[key] = sum(accuracies[key]) / len(accuracies[key])
eval_logger.info(
f"Prompt - {prompt_id}, category - {key} accuracy: {accuracies[key]}"
)
return np.round(np.mean([v for v in accuracies.values()]), 4)
per_prompt_accuracy_0 = partial(per_prompt_macro_accuracy, p_id=0)
per_prompt_accuracy_1 = partial(per_prompt_macro_accuracy, p_id=1)
per_prompt_accuracy_2 = partial(per_prompt_macro_accuracy, p_id=2)
per_prompt_accuracy_3 = partial(per_prompt_macro_accuracy, p_id=3)
per_prompt_accuracy_4 = partial(per_prompt_macro_accuracy, p_id=4)
per_prompt_accuracy_5 = partial(per_prompt_macro_accuracy, p_id=5)
per_prompt_accuracy_6 = partial(per_prompt_macro_accuracy, p_id=6)
per_prompt_accuracy_7 = partial(per_prompt_macro_accuracy, p_id=7)
per_prompt_accuracy_8 = partial(per_prompt_macro_accuracy, p_id=8)
per_prompt_accuracy_9 = partial(per_prompt_macro_accuracy, p_id=9)
def per_option_macro_accuracy(results: List[Dict[str, Any]], always_opt="a") -> float:
accuracies = {}
for result in results:
question_id, always_same_option, final_answer, gt, category = result
if always_opt != always_same_option:
continue
if category not in accuracies:
accuracies[category] = []
accuracies[category].append(int(final_answer == gt))
for key in accuracies:
accuracies[key] = sum(accuracies[key]) / len(accuracies[key])
eval_logger.info(
f"Prompt - {always_opt.upper()}, category - {key} accuracy: {accuracies[key]}"
)
return np.round(np.mean([v for v in accuracies.values()]), 4)
per_option_macro_accuracy_a = partial(per_option_macro_accuracy, always_opt="A")
per_option_macro_accuracy_b = partial(per_option_macro_accuracy, always_opt="B")
per_option_macro_accuracy_c = partial(per_option_macro_accuracy, always_opt="C")
per_option_macro_accuracy_d = partial(per_option_macro_accuracy, always_opt="D")
per_option_macro_accuracy_e = partial(per_option_macro_accuracy, always_opt="E")
per_option_macro_accuracy_f = partial(per_option_macro_accuracy, always_opt="F")
per_option_macro_accuracy_g = partial(per_option_macro_accuracy, always_opt="G")
per_option_macro_accuracy_h = partial(per_option_macro_accuracy, always_opt="H")
per_option_macro_accuracy_i = partial(per_option_macro_accuracy, always_opt="I")
per_option_macro_accuracy_j = partial(per_option_macro_accuracy, always_opt="J")
options_consistency_rate = partial(utils.options_consistency_rate, labels=LABELS)
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
group: score_robustness
task:
- score_robustness_agieval
- score_robustness_mmlu_pro
- score_robustness_math
metadata:
version: 1.0
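# Example invocation (illustrative assumption, not part of the original config):
#   lm_eval --model hf --model_args pretrained=<model_name> \
#       --tasks score_robustness --batch_size auto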
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import json
import re
import string
import sys
from functools import partial
from itertools import combinations
from typing import Any, Dict, List
import numpy as np
from datasets import Dataset
from lm_eval.utils import eval_logger
NUMERALS = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
ROMAN_NUMERALS = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X"]
def __repeat_elements(lst, n):
result = []
for element in lst:
result.extend([element] * n)
return result
def process_docs_add_prompts(
doc: Dataset,
templates_key: str,
template_file_path: str,
dataset_specific_preprocess: callable = None,
) -> Dataset:
try:
with open(template_file_path) as f:
prompt_templates = json.load(f)[templates_key]
except FileNotFoundError:
eval_logger.error("Prompt templates not found")
sys.exit()
if dataset_specific_preprocess is not None:
doc = dataset_specific_preprocess(doc)
def process_batch(batch):
n = len(prompt_templates)
initial_len = len(next(iter(batch.values())))
result = {key: __repeat_elements(values, n) for key, values in batch.items()}
result["prompt_id"] = list(range(n)) * initial_len
result["prompt"] = [prompt_templates[i]["prompt"] for i in result["prompt_id"]]
if "options_format" in prompt_templates[0]:
result["options_format"] = [
prompt_templates[i]["options_format"] for i in result["prompt_id"]
]
return result
return doc.map(process_batch, batched=True)
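# Illustrative example (comments only): with n prompt templates, a batch of k
# rows is expanded to n*k rows; the i-th copy of each row carries prompt_id=i
# and the matching "prompt" template text (plus "options_format", when the
# template defines one).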
def option_order_robustness_process_docs(
doc: Dataset,
template_file_path: str,
templates_key: str,
labels: list,
dataset_specific_preprocess: callable = None,
) -> Dataset:
try:
with open(template_file_path) as f:
prompt_template = json.load(f)[templates_key]
prompt = prompt_template["prompt"]
options_format = prompt_template["options_format"]
except FileNotFoundError:
eval_logger.error("Prompt templates not found")
sys.exit()
if dataset_specific_preprocess is not None:
doc = dataset_specific_preprocess(doc)
def repeat_doc_swap_correct_answer(batched_docs):
initial_len = len(next(iter(batched_docs.values())))
keys = list(batched_docs.keys())
new_batched_docs = {key: [] for key in keys}
new_batched_docs["always_same_option"] = []
new_batched_docs["prompt"] = []
new_batched_docs["options_format"] = []
new_batched_docs["original_answer_index"] = []
for doc_ind in range(initial_len):
for label_ind, label in enumerate(labels):
new_batched_docs["original_answer_index"].append(
batched_docs["answer_index"][doc_ind]
)
for key in keys:
new_batched_docs[key].append(
copy.deepcopy(batched_docs[key][doc_ind])
)
if label_ind < len(batched_docs["options"][doc_ind]):
if key == "options":
# Swap correct answer with label_ind option
new_batched_docs[key][-1][label_ind] = batched_docs[
"options"
][doc_ind][batched_docs["answer_index"][doc_ind]]
new_batched_docs[key][-1][
batched_docs["answer_index"][doc_ind]
] = batched_docs["options"][doc_ind][label_ind]
if key == "answer_index":
new_batched_docs[key][-1] = label_ind
if key == "answer":
new_batched_docs[key][-1] = label
new_batched_docs["always_same_option"].append(label)
new_batched_docs["prompt"].append(prompt)
new_batched_docs["options_format"].append(options_format)
return new_batched_docs
return doc.map(repeat_doc_swap_correct_answer, batched=True)
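# Illustrative example of the swap above (comments only, hypothetical row):
# a doc with options=["w", "x", "y", "z"] and answer_index=2 ("y" is correct),
# with labels A-D, is expanded into four copies; the copy for
# always_same_option="A" has options=["y", "x", "w", "z"], answer_index=0,
# answer="A", original_answer_index=2, so the correct content always sits at
# the probed letter.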
def robustness_doc_to_text(doc: Dataset) -> str:
upper_case = string.ascii_uppercase
lower_case = string.ascii_lowercase
prompt = doc["prompt"]
options_format = doc.get("options_format", "")
question = doc["question"]
category = doc.get("category", "")
options = None
if options_format:
options = "".join(
[
options_format.format(
letter=upper_case[i],
option=doc["options"][i],
numeral=NUMERALS[i],
roman_numeral=ROMAN_NUMERALS[i],
lower_case_letter=lower_case[i],
)
for i in range(len(doc["options"]))
]
)
return prompt.format(question=question, options=options, category=category)
def __postprocess_pred(pred):
if "the best answer is" not in pred.lower():
return pred
pred_proc = (
pred.lower().split("the best answer is ")[-1].split("\n")[0].split(" ")[0]
)
pred_proc = re.sub(r"[^a-zA-Z0-9]", "", pred_proc).strip()
return pred_proc.upper()
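# Example of the extraction above (comments only):
#   "...reasoning...\nThe best answer is (B).\n"  -> "B"
#   a completion that never contains "the best answer is" is returned unchanged.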
def translate_model_answer_to_labels(answer, labels, option_format=None):
answer = answer.upper()
if option_format is None:
return answer
elif "numeral" in option_format:
if "roman" in option_format:
if answer not in ROMAN_NUMERALS:
return answer
else:
return labels[ROMAN_NUMERALS.index(answer)]
if answer not in NUMERALS:
return answer
else:
return labels[NUMERALS.index(answer)]
return answer
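# Example (comments only): with an options_format containing "numeral", a model
# answer of "3" maps to labels[2] == "C"; with a roman-numeral format, "IV" maps
# to labels[3] == "D". Letter-style formats (as used by the MMLU-Pro and AGIEval
# templates here) return the upper-cased answer unchanged.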
def calculate_consistency_rate(responses: List[List[str]]) -> float:
"""
Calculate the Consistency Rate (CR) for a given set of responses.
Args:
responses: List of lists, where each inner list contains responses to the same question.
Returns:
The consistency rate as a float.
"""
total_similarity = 0
total_combinations = 0
for response_set in responses:
pairs = combinations(response_set, 2)
num_pairs = len(response_set) * (len(response_set) - 1) / 2
total_combinations += num_pairs
for answer1, answer2 in pairs:
total_similarity += int(answer1 == answer2)
return total_similarity / total_combinations if total_combinations > 0 else 0.0
def prompt_consistency_rate(results: List[Dict[str, Any]]) -> float:
"""
Calculate the Consistency Rate (CR) across prompts for each question.
Args:
results: list of (question_id, prompt_id, final_answer, gt) tuples.
Returns:
The consistency rate as a float.
"""
question_answers_dict = {}
for result in results:
question_id, prompt_id, final_answer, gt = result
if question_id not in question_answers_dict:
question_answers_dict[question_id] = []
question_answers_dict[question_id].append(final_answer)
question_answers_list = [answers for answers in question_answers_dict.values()]
return calculate_consistency_rate(question_answers_list)
def options_consistency_rate(results: List[Dict[str, Any]], labels) -> float:
"""
Calculate the Consistency Rate (CR) across answer-option orderings for each question.
Args:
results: list of (question_id, always_same_option, final_answer, original_answer_index, answer_index) tuples.
Returns:
The consistency rate as a float.
"""
question_answers_dict = {}
for result in results:
(
question_id,
always_same_option,
final_answer,
original_answer_index,
answer_index,
) = result
if final_answer == labels[original_answer_index]:
final_answer = always_same_option
elif final_answer == always_same_option:
final_answer = labels[original_answer_index]
if question_id not in question_answers_dict:
question_answers_dict[question_id] = []
question_answers_dict[question_id].append(final_answer)
question_answers_list = [answers for answers in question_answers_dict.values()]
return calculate_consistency_rate(question_answers_list)
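# Note on the mapping above (comments only): the correct option's content was
# moved to the probed position (always_same_option) during preprocessing, so the
# swap here is intended to re-express answers in the original option ordering
# before pairs are compared, making the rate measure agreement on option content
# rather than on letter position.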