Unverified Commit 0ef7548d authored by Rima Shahbazyan, committed by GitHub

Score tasks (#2452)

* score readme added

* fixed the default value of the generate_until task's "until" parameter

* score mmlu-pro and agieval added

* changed macro accuracy to micro for agieval

* "Always E" option removed from AGIEval

* redundancies removed

* MATH added

* minor cosmetic changes for math

* Licenses added, README updated

* changes for flake8 + license header on math

* Score added to readme and precommit was run.

* Score added to readme and precommit was run.

* Import error fixed

* math task bugfix
postprocess minor fix

* CR for math added

* math CR

* math task bugfix
postprocess minor fix

CR for math added

* Math cr fixed

* reverting the default "until" parameter change and adjusting score task configs
parent 9d36354e
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
from functools import partial
from typing import Any, Dict, List
import numpy as np
from datasets import Dataset
from lm_eval.tasks.score import utils
from lm_eval.tasks.score.utils import prompt_consistency_rate, robustness_doc_to_text
from lm_eval.utils import eval_logger
TEMPLATE_FILE_PATH = os.path.join(os.path.dirname(__file__), "prompt_templates.json")
PROMPT_ROBUSTNESS_TEMPLATE_KEY = "prompt_robustness"
OPTION_ORDER_ROBUSTNESS_TEMPLATE_KEY = "option_order_robustness"
QUESTION_KEY = "query"
ANSWER_INDEX_KEY = "gold"
OPTIONS_KEY = "choices"
LABELS = ["A", "B", "C", "D", "E"]
agi_eval_prompt_consistency_rate = prompt_consistency_rate
agi_eval_robustness_doc_to_text = robustness_doc_to_text
def initial_process_docs(doc: Dataset) -> Dataset:
"""
Normalize the docs: extract the bare question text, strip option prefixes, and add question_id, answer_index, answer and options fields.
"""
bracket_pattern = r"^\([A-E]\)"
letter_space = r"^[A-E] "
letter_question_space = r"^[A-E]\? "
def __process(_doc, idx):
if "question" not in _doc:
question = _doc[QUESTION_KEY].split(" Answer Choices:")[0]
if question.startswith("Q: "):
question = question[3:]
_doc["question"] = question
if "question_id" not in _doc:
_doc["question_id"] = idx
if "answer_index" not in _doc:
_doc["answer_index"] = _doc[ANSWER_INDEX_KEY][0]
if "answer" not in _doc:
_doc["answer"] = LABELS[_doc["answer_index"]]
if "options" not in _doc:
prepared_options = []
for option in _doc[OPTIONS_KEY]:
if re.match(bracket_pattern, option):
prepared_options.append(option[3:])
elif re.match(letter_space, option):
prepared_options.append(option[2:])
elif re.match(letter_question_space, option):
prepared_options.append(option[3:])
else:
prepared_options.append(option)
_doc["options"] = prepared_options
return _doc
return doc.map(__process, with_indices=True)
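# Illustrative example of the preprocessing above (comment only, with assumed
# AGIEval-style field values):
#   input row:  query="Q: What is 2 + 2? Answer Choices: (A)3 (B)4 (C)5",
#               choices=["(A)3", "(B)4", "(C)5"], gold=[1]
#   output row: question="What is 2 + 2?", options=["3", "4", "5"],
#               question_id=<row index>, answer_index=1, answer="B"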
prompt_robustness_process_docs = partial(
utils.process_docs_add_prompts,
templates_key=PROMPT_ROBUSTNESS_TEMPLATE_KEY,
template_file_path=TEMPLATE_FILE_PATH,
dataset_specific_preprocess=initial_process_docs,
)
option_order_robustness_process_docs = partial(
utils.option_order_robustness_process_docs,
template_file_path=TEMPLATE_FILE_PATH,
templates_key=OPTION_ORDER_ROBUSTNESS_TEMPLATE_KEY,
labels=LABELS[:-1],
dataset_specific_preprocess=initial_process_docs,
)
def prompt_robustness_process_results(doc, results) -> Dict[str, float]:
final_answer = utils.__postprocess_pred(results[0])
final_answer = utils.translate_model_answer_to_labels(
final_answer, option_format=doc["options_format"], labels=LABELS
)
gt = LABELS[doc["answer_index"]]
prompt_id = doc["prompt_id"]
question_id = doc["question_id"]
return {
f"{prompt_id}_accuracy": (question_id, prompt_id, final_answer, gt),
"consistency_rate": (question_id, prompt_id, final_answer, gt),
}
def option_order_robustness_process_results(doc, results) -> Dict[str, float]:
final_answer = utils.__postprocess_pred(results[0])
final_answer = utils.translate_model_answer_to_labels(
final_answer, option_format=doc["options_format"], labels=LABELS
)
gt = LABELS[doc["answer_index"]]
always_same_option = doc["always_same_option"]
question_id = doc["question_id"]
original_answer_index = doc["original_answer_index"]
answer_index = doc["answer_index"]
return {
f"per_option_accuracy_{always_same_option}": (
question_id,
always_same_option,
final_answer,
gt,
),
"options_consistency_rate": (
question_id,
always_same_option,
final_answer,
original_answer_index,
answer_index,
),
}
def per_prompt_accuracy(results: List[Dict[str, Any]], p_id=0) -> float:
accuracies = []
for result in results:
question_id, prompt_id, final_answer, gt = result
if prompt_id != p_id:
continue
accuracies.append(final_answer == gt)
accuracy = sum(accuracies) / len(accuracies)
eval_logger.info(f"Prompt - {p_id} accuracy: {accuracy}")
return np.round(accuracy, 4)
per_prompt_accuracy_0 = partial(per_prompt_accuracy, p_id=0)
per_prompt_accuracy_1 = partial(per_prompt_accuracy, p_id=1)
per_prompt_accuracy_2 = partial(per_prompt_accuracy, p_id=2)
per_prompt_accuracy_3 = partial(per_prompt_accuracy, p_id=3)
per_prompt_accuracy_4 = partial(per_prompt_accuracy, p_id=4)
per_prompt_accuracy_5 = partial(per_prompt_accuracy, p_id=5)
per_prompt_accuracy_6 = partial(per_prompt_accuracy, p_id=6)
per_prompt_accuracy_7 = partial(per_prompt_accuracy, p_id=7)
per_prompt_accuracy_8 = partial(per_prompt_accuracy, p_id=8)
per_prompt_accuracy_9 = partial(per_prompt_accuracy, p_id=9)
def per_option_accuracy(results: List[Dict[str, Any]], always_opt="a") -> float:
accuracies = []
for result in results:
question_id, always_same_option, final_answer, gt = result
if always_opt != always_same_option:
continue
accuracies.append(int(final_answer == gt))
accuracy = sum(accuracies) / len(accuracies)
eval_logger.info(f"Prompt - {always_opt.upper()} accuracy: {accuracy}")
return np.round(accuracy, 4)
per_option_accuracy_a = partial(per_option_accuracy, always_opt="A")
per_option_accuracy_b = partial(per_option_accuracy, always_opt="B")
per_option_accuracy_c = partial(per_option_accuracy, always_opt="C")
per_option_accuracy_d = partial(per_option_accuracy, always_opt="D")
options_consistency_rate = partial(utils.options_consistency_rate, labels=LABELS)
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
task: prompt_robustness_math_algebra
dataset_path: EleutherAI/hendrycks_math
process_docs: !function utils_math.prompt_robustness_process_docs
dataset_name: algebra
output_type: generate_until
test_split: test
doc_to_text: !function utils_math.math_robustness_doc_to_text
process_results: !function utils_math.process_results
doc_to_target: answer
generation_kwargs:
until: []
do_sample: false
temperature: 0
max_gen_toks: 1024
metric_list:
- metric: 0_accuracy
aggregation: !function utils_math.per_prompt_accuracy_0
higher_is_better: true
- metric: 1_accuracy
aggregation: !function utils_math.per_prompt_accuracy_1
higher_is_better: true
- metric: 2_accuracy
aggregation: !function utils_math.per_prompt_accuracy_2
higher_is_better: true
- metric: 3_accuracy
aggregation: !function utils_math.per_prompt_accuracy_3
higher_is_better: true
- metric: 4_accuracy
aggregation: !function utils_math.per_prompt_accuracy_4
higher_is_better: true
- metric: 5_accuracy
aggregation: !function utils_math.per_prompt_accuracy_5
higher_is_better: true
- metric: 6_accuracy
aggregation: !function utils_math.per_prompt_accuracy_6
higher_is_better: true
- metric: 7_accuracy
aggregation: !function utils_math.per_prompt_accuracy_7
higher_is_better: true
- metric: 8_accuracy
aggregation: !function utils_math.per_prompt_accuracy_8
higher_is_better: true
- metric: 9_accuracy
aggregation: !function utils_math.per_prompt_accuracy_9
higher_is_better: true
- metric: consistency_rate
aggregation: !function utils_math.math_prompt_consistency_rate
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include: prompt_robustness_math_algebra.yaml
dataset_name: counting_and_probability
task: prompt_robustness_math_counting_and_prob
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include: prompt_robustness_math_algebra.yaml
dataset_name: geometry
task: prompt_robustness_math_geometry
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include: prompt_robustness_math_algebra.yaml
dataset_name: intermediate_algebra
task: prompt_robustness_math_intermediate_algebra
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include: prompt_robustness_math_algebra.yaml
dataset_name: number_theory
task: prompt_robustness_math_num_theory
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include: prompt_robustness_math_algebra.yaml
dataset_name: prealgebra
task: prompt_robustness_math_prealgebra
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include: prompt_robustness_math_algebra.yaml
dataset_name: precalculus
task: prompt_robustness_math_precalc
{
"prompt_robustness": [
{
"prompt": "Efficiently solve the following math challenge. Explain your approach step-by-step\nThe answer should end with: The final answer is: $\\boxed{{answer}}$\nwhere [answer] is just the final number or expression that solves the problem\nProblem: {question}\nLets think step by step"
},
{
"prompt": "You should solve this math problem.\nIf the problem is easy, provide a brief solution with little explanation.\nFor more difficult problems, follow this structured format\n## Step 1: [Brief description]\n[Simple explanation and calculations]\n\n## Step 2: [Brief description]\n[Simple explanation and calculations]\n\nRepeat steps until your reach a solution\n\nProblem: {question}\nEnd with:\nThe final answer is: $\\boxed{{answer}}$\nwhere [answer] is just the final number or expression that solves the problem."
},
{
"prompt": "Solve this math problem. Your anwer should end with 'The final answer is: $\\boxed{{answer}}$' where [answer] is just the final number or expression that solves the problem\nProblem: {question}"
},
{
"prompt": "Analyze and solve the math task.\nProblem: {question}\nEnd the answer with:\nThe final answer is: $\\boxed{{answer}}$ where [answer] is just the final number or expression that solves the problem."
},
{
"prompt": "{question}\nFind the solution to this math problem. Your answer should end with - The final answer is: $\\boxed{{answer}}$\nwhere [answer] is just the final number or expression that solves the problem."
},
{
"prompt": "Calculate the answer to this math problem\nProblem: {question}\nConclude your answer with:\nThe final answer is: $\\boxed{{answer}}$\nwhere [answer] is just the final number or expression that solves the problem."
},
{
"prompt": "{question}\nPlease solve this math problem efficiently. Finish with: The final answer is: $\\boxed{{answer}}$ where [answer] is just the final number or expression that solves the problem."
},
{
"prompt": "{question}\nSolve the following math problem\nShow each step of your solution\nConclude with:\nThe final answer is: $\\boxed{{answer}}$\nwhere [answer] is just the final number or expression that solves the problem\nLets think step by step"
},
{
"prompt": "Find the answer to the following math question. Conclude with: 'The final answer is: $\\boxed{{answer}}$'\nwhere [answer] is just the final number or expression that solves the problem\nProblem: {question}"
},
{
"prompt": "Please solve the math problem. For simple problems offer a quick solution with minimal details. For more challenging problems, explain your approach step-by-step. Finish with\nThe final answer is: $\\boxed{{answer}}$.\nwhere [answer] is just the final number or expression that solves the problem.\nProblem: {question}\nLets think step by step."
}
]
}
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
group: score_prompt_robustness_math
task:
- prompt_robustness_math_algebra
- prompt_robustness_math_counting_and_prob
- prompt_robustness_math_geometry
- prompt_robustness_math_intermediate_algebra
- prompt_robustness_math_num_theory
- prompt_robustness_math_prealgebra
- prompt_robustness_math_precalc
aggregate_metric_list:
- metric: 0_accuracy
aggregation: mean
weight_by_size: true
- metric: 1_accuracy
aggregation: mean
weight_by_size: true
- metric: 2_accuracy
aggregation: mean
weight_by_size: true
- metric: 3_accuracy
aggregation: mean
weight_by_size: true
- metric: 4_accuracy
aggregation: mean
weight_by_size: true
- metric: 5_accuracy
aggregation: mean
weight_by_size: true
- metric: 6_accuracy
aggregation: mean
weight_by_size: true
- metric: 7_accuracy
aggregation: mean
weight_by_size: true
- metric: 8_accuracy
aggregation: mean
weight_by_size: true
- metric: 9_accuracy
aggregation: mean
weight_by_size: true
- metric: consistency_rate
aggregation: mean
weight_by_size: true
metadata:
version: 1.0
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
group: score_robustness_math
task:
- score_prompt_robustness_math
metadata:
version: 1.0
[
{
"id": "test/prealgebra/1088.json",
"problem": "Simplify $(5x+3) - 2(2x-4)$.",
"answer": "x+11"
},
{
"id": "test/algebra/1197.json",
"problem": "Two positive numbers $p$ and $q$ have the property that their sum is equal to their product. If their difference is $7$, what is $\\frac{1}{\\frac{1}{p^2}+\\frac{1}{q^2}}$? Your answer will be of the form $\\frac{a+b\\sqrt{c}}{d}$, where $a$ and $b$ don't both share the same common factor with $d$ and $c$ has no square as a factor. Find $a+b+c+d$.",
"answer": "161"
},
{
"id": "test/geometry/66.json",
"problem": "Square $ABCD$ has side lengths of 13 units. Point $E$ lies in the interior of the square such that $AE = 5$ units and $BE = 12$ units. What is the distance from $E$ to side $AD$?",
"answer": "\\frac{25}{13}"
},
{
"id": "test/geometry/1125.json",
"problem": "An aquarium has a rectangular base that measures 100 cm by 40 cm and has a height of 50 cm. The aquarium is filled with water to a depth of 37 cm. A rock with volume $1000 \\text{cm}^3$ is then placed in the aquarium and completely submerged. By how many centimeters does the water level rise? Express your answer as a decimal to the nearest 100th.",
"answer": "0.25\\text{ cm}"
},
{
"id": "test/prealgebra/1407.json",
"problem": "What number must be placed in the box in the equation below to produce an equation that has more than one solution: \\[4x + 6 + 7x - 9 = 12x - 7 - x + \\boxed{\\phantom{2}}?\\]",
"answer": "4"
},
{
"id": "test/prealgebra/224.json",
"problem": "I am going to buy exotic fruits. Dragonfruit costs $x-4$ dollars. Starfruit is five dollars less expensive than rambutan. Rambutan costs $2x$ dollars more than dragonfruit. How much does it cost to buy one rambutan, two starfruit, and three dragonfruit? Your answer will be an expression that depends on $x$.",
"answer": "-34 + 12x"
},
{
"id": "test/prealgebra/177.json",
"problem": "Let $\\boxed{N}$ mean the number of whole number divisors of $N$. For example, $\\boxed{3}=2$, because $3$ has two divisors, $1$ and $3.$ Find the value of \\[\\boxed{\\boxed{11}\\times\\boxed{20}}\\]",
"answer": "12"
},
{
"id": "test/number_theory/459.json",
"problem": "On a particular map, $3$ inches on the map equates to $10$ miles in real life. If you know that the real life distance between two buildings on the map is $53.25$ miles, what would the distance between the buildings be (in inches) on the map, expressed as a fraction?",
"answer": "\\frac{639}{40}"
},
{
"id": "test/intermediate_algebra/702.json",
"problem": "Find the coordinates of either of the vertices of the hyperbola \\[16x^2+16x-4y^2-20y-85=0.\\](Enter your answer as an ordered pair. Enter the coordinates of one of the vertices, not both.)",
"answer": "\\left(-\\tfrac52, -\\tfrac52\\right)"
},
{
"id": "test/intermediate_algebra/25.json",
"problem": "Find the coordinates of one of the foci of the hyperbola \\[x^2 - 10x = 4y^2 - 5.\\](Enter your answer as an ordered pair. Enter only one of the foci, not both.)",
"answer": "(0,0)"
},
{
"id": "test/intermediate_algebra/747.json",
"problem": "The graph of $y = f(x)$ passes through the point $(-3,5).$ If $f(x)$ is an odd function, then what other point must the graph pass through? Enter your answer as an ordered pair.",
"answer": "(0,0)"
}
]
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
from functools import partial
from itertools import combinations
from typing import Any, Dict, List
import datasets
import numpy as np
from lm_eval.tasks.score import utils
from lm_eval.tasks.score.math.math_grader import (
extract_answer,
math_equal,
normalize_answer_string,
)
from lm_eval.tasks.score.utils import robustness_doc_to_text
from lm_eval.utils import eval_logger
TEMPLATE_FILE_PATH = os.path.join(os.path.dirname(__file__), "prompt_templates.json")
PROMPT_ROBUSTNESS_TEMPLATE_KEY = "prompt_robustness"
math_robustness_doc_to_text = robustness_doc_to_text
def find_boxed_entries(answer_str):
stack = []
results = []
i = 0
while i < len(answer_str):
if answer_str[i : i + 7] == "\\boxed{":
stack.append(i + 7)
i += 7
elif answer_str[i] == "{":
if stack:
stack.append(i + 1)
i += 1
elif answer_str[i] == "}":
if stack:
start = stack.pop()
if not stack:
results.append(answer_str[start:i])
i += 1
else:
i += 1
if len(results) == 0:
raise ValueError("No boxed entries found in the solution")
else:
results = [normalize_answer_string(result) for result in results]
if len(results) == 1:
# Single boxed entry, trivial case
return results
else:
# Multiple boxed entries. There are two cases possible
# (a) The reference solution has the same question answered in multiple ways
# (b) The answer is split across multiple boxed entries and we need to merge
result_equal = True
for idx in range(len(results) - 1):
if not (results[idx] == results[idx + 1]):
result_equal = False
break
if result_equal:
# Same problem solved in multiple ways
return [results[0]]
else:
return results
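# Behaviour sketch for find_boxed_entries (comments only; assumes
# normalize_answer_string leaves these particular strings unchanged):
#   "so $\boxed{\frac{1}{2}}$"         -> ["\frac{1}{2}"]  (nested braces preserved)
#   "... $\boxed{5}$ ... $\boxed{5}$"  -> ["5"]            (same answer boxed twice -> single entry)
#   "... $\boxed{3}$ ... $\boxed{7}$"  -> ["3", "7"]       (answer split across boxes)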
def extract_answer_dataset(solution: str, problem: str, corrected_answers: list) -> str:
entries = find_boxed_entries(solution)
if len(entries) == 1:
parsed_answer = entries[0]
if len(entries) > 1:
for item in corrected_answers:
if item["problem"] == problem:
parsed_answer = item["answer"]
break
else:
parsed_answer = ", ".join(entries)
if not (
("Find the equation" in problem)
or ("Enter the equation" in problem)
or ("What is the equation" in problem)
or ("described by the equation" in problem)
or ("Find an equation" in problem)
) and ("=" in parsed_answer):
if parsed_answer.count("=") == 1:
# For greater count, it means we're just predicting values of multiple variables
parsed_answer = parsed_answer.split("=")[1]
return parsed_answer
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
def _process_doc(doc: dict, idx, corrected_answer) -> dict:
out_doc = {
"question": doc["problem"],
"question_id": idx,
"solution": doc["solution"],
"answer": extract_answer_dataset(
doc["solution"], doc["problem"], corrected_answer
),
}
return out_doc
corrected_answer_path = os.path.join(
os.path.dirname(__file__), "to_be_fixed_questions.json"
)
with open(corrected_answer_path, "r") as f:
corrected_answers = json.load(f)
return dataset.map(
partial(_process_doc, corrected_answer=corrected_answers), with_indices=True
)
def prompt_robustness_process_docs(doc: datasets.Dataset) -> datasets.Dataset:
doc = process_docs(doc)
return utils.process_docs_add_prompts(
doc,
PROMPT_ROBUSTNESS_TEMPLATE_KEY,
TEMPLATE_FILE_PATH,
)
def process_results(doc: dict, results: List[str]) -> Dict[str, int]:
answer = extract_answer(results[0])
if math_equal(answer, doc["answer"]):
retval = 1
else:
retval = 0
prompt_id = doc["prompt_id"]
results = {
f"{prompt_id}_accuracy": (prompt_id, retval),
"consistency_rate": (doc["question_id"], answer),
}
return results
def per_prompt_accuracy(results: List[Dict[str, Any]], p_id=0) -> float:
accuracies = []
for result in results:
prompt_id, retval = result
if prompt_id != p_id:
continue
accuracies.append(retval)
accuracy = sum(accuracies) / len(accuracies)
eval_logger.info(f"Prompt - {prompt_id} accuracy: {accuracy}")
return np.round(accuracy, 4)
per_prompt_accuracy_0 = partial(per_prompt_accuracy, p_id=0)
per_prompt_accuracy_1 = partial(per_prompt_accuracy, p_id=1)
per_prompt_accuracy_2 = partial(per_prompt_accuracy, p_id=2)
per_prompt_accuracy_3 = partial(per_prompt_accuracy, p_id=3)
per_prompt_accuracy_4 = partial(per_prompt_accuracy, p_id=4)
per_prompt_accuracy_5 = partial(per_prompt_accuracy, p_id=5)
per_prompt_accuracy_6 = partial(per_prompt_accuracy, p_id=6)
per_prompt_accuracy_7 = partial(per_prompt_accuracy, p_id=7)
per_prompt_accuracy_8 = partial(per_prompt_accuracy, p_id=8)
per_prompt_accuracy_9 = partial(per_prompt_accuracy, p_id=9)
def calculate_consistency_rate(responses: List[List[str]]) -> float:
"""
Calculate the Consistency Rate (CR) for a given set of responses.
Args:
responses: List of lists, where each inner list contains responses to the same question.
Returns:
The consistency rate as a float.
"""
total_similarity = 0
total_combinations = 0
for response_set in responses:
pairs = combinations(response_set, 2)
num_pairs = len(response_set) * (len(response_set) - 1) / 2
total_combinations += num_pairs
for answer1, answer2 in pairs:
total_similarity += int(math_equal(answer1, answer2))
return total_similarity / total_combinations if total_combinations > 0 else 0.0
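# Worked example (comments only): for one question answered under three prompts
# with extracted answers ["4", "4", "5"], there are 3 unordered pairs; assuming
# math_equal counts only the ("4", "4") pair as equal, the consistency rate is
# 1 / 3 ≈ 0.333.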
def math_prompt_consistency_rate(results: List[Dict[str, Any]]) -> float:
"""
Calculate the Consistency Rate (CR) across prompts for each question.
Args:
results: list of (question_id, answer) tuples produced by process_results.
Returns:
The consistency rate as a float.
"""
question_answers_dict = {}
for result in results:
question_id, answer = result
if question_id not in question_answers_dict:
question_answers_dict[question_id] = []
question_answers_dict[question_id].append(answer)
question_answers_list = [answers for answers in question_answers_dict.values()]
return calculate_consistency_rate(question_answers_list)
{
"option_order_robustness":{
"prompt": "For the multiple-choice question related to {category}, which option (A-J) is correct?.\n\nQuestion: {question}{options}\n\nEnd the answer with the following:\nThe best answer is (the_answer_letter) where the (the_answer_letter) is one of 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I' or 'J'.",
"options_format": "\n{letter}: {option}"
},
"prompt_robustness": [
{
"prompt": "{question}{options}\nExamine the question and choose the correct answer from the options 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I' or 'J'. End your answer with:\nThe best answer is [the_answer_letter].\nwhere the [the_answer_letter] is a letter from A to J.",
"options_format": "\n{letter}: {option}"
},
{
"prompt": "{question}{options}\nAnswer the multiple-choice question about {category} by selecting the correct option from A to J. Always conclude with 'The best answer is (answer_letter)' where the (answer_letter) is one of A, B, C, D, E, F, G, H, I, J.",
"options_format": "\n{letter}: {option}"
},
{
"prompt": "You must reply with only a single letter from A, B, C, D, E, F, G, H, I or J to this question. Conclude with:\nThe best answer is answer_letter where the answer_letter is a single letter from A to J. \n{question}{options}",
"options_format": "\n{letter}: {option}"
},
{
"prompt": "From the options A-J, select the correct answer to the following question. End the answer with - The best answer is answer_letter, where answer_letter is one of A, B, C, D, E, F, G, H, I, or J.\nQuestion: {question}{options}",
"options_format": "\n{letter}: {option}"
},
{
"prompt": "For the multiple-choice question related to {category}, which option (A-J) is correct?.\n\nQuestion:{question}{options}\nEnd the answer with the following:\nThe best answer is (the_answer_letter) where the (the_answer_letter) is one of 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I' or 'J'.",
"options_format": "\n{letter}: {option}"
},
{
"prompt": "Evaluate the multiple-choice question and select the most fitting response from 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'. \nQuestion:{question}{options}\nAlways conclude with:\nThe best answer is [the_answer_letter].\nwhere the [the_answer_letter] is one of A, B, C, D, E, F, G, H, I or J.",
"options_format": "\n{letter}: {option}"
},
{
"prompt": "Answer to the following question about {category} by selecting the correct option A, B, C, D, E, F, G, H, I or J. {question}{options}\nThe answer should end with:\nThe best answer is [the_answer_letter] where [the_answer_letter] is one of the letters A to J. Let's think step by step.",
"options_format": "\n{letter}: {option}"
},
{
"prompt": "Select the correct answer from the options 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I','J' for the question provided below. Conclude by stating: The best answer is answer_letter where answer_letter is one of 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I' or 'J'. Let's think step by step.\nQuestion: {question}{options}",
"options_format": "\n{letter}: {option}"
},
{
"prompt": "{question}{options}\nFor this question about {category} with 10 possible answers A, B, C, D, E, F, G, H, I, J choose the one that answers the question. If the problem is simple or straightforward, just provide the answer. If the answer is more complex, use a step-by-step approach and for each step briefly explain your reasoning. Always conclude with 'The best answer is (answer_letter)' where the (answer_letter) is one of 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I','J'. Let's think step by step.",
"options_format": "\n{letter}: {option}"
},
{
"prompt": "Read the question and options below, then determine the correct answer choice (A-J)\nQuestion: {question}{options}\n\nFor simple questions, provide a quick answer. For complicated ones, think step by step, break down the question into smaller problems and reach to a conclusion\nEnd your answer by stating:\nThe best answer is [the_answer_letter].\nwhere [the_answer_letter] is one of A, B, C, D, E, F, G, H, I, or J.",
"options_format": "\n{letter}: {option}"
}
]
}
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
tag: score_robustness_mmlu_pro
task: score_option_order_robustness_mmlu_pro
dataset_path: TIGER-Lab/MMLU-Pro
dataset_name: default
output_type: generate_until
validation_split: validation
test_split: test
process_docs: !function utils_mmlu_pro.option_order_robustness_process_docs
doc_to_text: !function utils_mmlu_pro.mmlu_pro_robustness_doc_to_text
doc_to_target: answer
generation_kwargs:
until: []
max_gen_toks: 1024
do_sample: False
process_results: !function utils_mmlu_pro.option_order_robustness_process_results
metric_list:
- metric: per_option_macro_accuracy_A
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_a
higher_is_better: true
- metric: per_option_macro_accuracy_B
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_b
higher_is_better: true
- metric: per_option_macro_accuracy_C
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_c
higher_is_better: true
- metric: per_option_macro_accuracy_D
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_d
higher_is_better: true
- metric: per_option_macro_accuracy_E
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_e
higher_is_better: true
- metric: per_option_macro_accuracy_F
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_f
higher_is_better: true
- metric: per_option_macro_accuracy_G
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_g
higher_is_better: true
- metric: per_option_macro_accuracy_H
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_h
higher_is_better: true
- metric: per_option_macro_accuracy_I
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_i
higher_is_better: true
- metric: per_option_macro_accuracy_J
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_j
higher_is_better: true
- metric: options_consistency_rate
aggregation: !function utils_mmlu_pro.options_consistency_rate
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
tag: score_robustness_mmlu_pro
task: score_prompt_robustness_mmlu_pro
dataset_path: TIGER-Lab/MMLU-Pro
dataset_name: default
output_type: generate_until
validation_split: validation
test_split: test
process_docs: !function utils_mmlu_pro.prompt_robustness_process_docs
doc_to_text: !function utils_mmlu_pro.mmlu_pro_robustness_doc_to_text
doc_to_target: answer
generation_kwargs:
until: []
max_gen_toks: 1024
do_sample: False
process_results: !function utils_mmlu_pro.prompt_robustness_process_results
metric_list:
- metric: 0_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_0
higher_is_better: true
- metric: 1_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_1
higher_is_better: true
- metric: 2_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_2
higher_is_better: true
- metric: 3_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_3
higher_is_better: true
- metric: 4_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_4
higher_is_better: true
- metric: 5_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_5
higher_is_better: true
- metric: 6_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_6
higher_is_better: true
- metric: 7_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_7
higher_is_better: true
- metric: 8_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_8
higher_is_better: true
- metric: 9_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_9
higher_is_better: true
- metric: consistency_rate
aggregation: !function utils_mmlu_pro.mmlu_pro_prompt_consistency_rate
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from functools import partial
from typing import Any, Dict, List
import numpy as np
from lm_eval.tasks.score import utils
from lm_eval.tasks.score.utils import prompt_consistency_rate, robustness_doc_to_text
from lm_eval.utils import eval_logger
TEMPLATE_FILE_PATH = os.path.join(os.path.dirname(__file__), "prompt_templates.json")
PROMPT_ROBUSTNESS_TEMPLATE_KEY = "prompt_robustness"
OPTION_ORDER_ROBUSTNESS_TEMPLATE_KEY = "option_order_robustness"
QUESTION_KEY = "question"
LABELS = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"]
mmlu_pro_prompt_consistency_rate = prompt_consistency_rate
mmlu_pro_robustness_doc_to_text = robustness_doc_to_text
prompt_robustness_process_docs = partial(
utils.process_docs_add_prompts,
templates_key=PROMPT_ROBUSTNESS_TEMPLATE_KEY,
template_file_path=TEMPLATE_FILE_PATH,
)
option_order_robustness_process_docs = partial(
utils.option_order_robustness_process_docs,
template_file_path=TEMPLATE_FILE_PATH,
templates_key=OPTION_ORDER_ROBUSTNESS_TEMPLATE_KEY,
labels=LABELS,
)
def prompt_robustness_process_results(doc, results) -> Dict[str, float]:
final_answer = utils.__postprocess_pred(results[0])
final_answer = utils.translate_model_answer_to_labels(
final_answer, option_format=doc["options_format"], labels=LABELS
)
gt = LABELS[doc["answer_index"]]
prompt_id = doc["prompt_id"]
question_id = doc["question_id"]
category = doc["category"]
return {
f"{prompt_id}_macro_accuracy": (
question_id,
prompt_id,
final_answer,
gt,
category,
),
"consistency_rate": (question_id, prompt_id, final_answer, gt),
}
def option_order_robustness_process_results(doc, results) -> Dict[str, float]:
final_answer = utils.__postprocess_pred(results[0])
final_answer = utils.translate_model_answer_to_labels(
final_answer, option_format=doc["options_format"], labels=LABELS
)
gt = LABELS[doc["answer_index"]]
always_same_option = doc["always_same_option"]
question_id = doc["question_id"]
original_answer_index = doc["original_answer_index"]
answer_index = doc["answer_index"]
category = doc["category"]
return {
f"per_option_macro_accuracy_{always_same_option}": (
question_id,
always_same_option,
final_answer,
gt,
category,
),
"options_consistency_rate": (
question_id,
always_same_option,
final_answer,
original_answer_index,
answer_index,
),
}
def per_prompt_macro_accuracy(results: List[Dict[str, Any]], p_id=0) -> float:
accuracies = {}
for result in results:
question_id, prompt_id, final_answer, gt, category = result
if prompt_id != p_id:
continue
if category not in accuracies:
accuracies[category] = []
accuracies[category].append(final_answer == gt)
for key in accuracies:
accuracies[key] = sum(accuracies[key]) / len(accuracies[key])
eval_logger.info(
f"Prompt - {prompt_id}, category - {key} accuracy: {accuracies[key]}"
)
return np.round(np.mean([v for v in accuracies.values()]), 4)
per_prompt_accuracy_0 = partial(per_prompt_macro_accuracy, p_id=0)
per_prompt_accuracy_1 = partial(per_prompt_macro_accuracy, p_id=1)
per_prompt_accuracy_2 = partial(per_prompt_macro_accuracy, p_id=2)
per_prompt_accuracy_3 = partial(per_prompt_macro_accuracy, p_id=3)
per_prompt_accuracy_4 = partial(per_prompt_macro_accuracy, p_id=4)
per_prompt_accuracy_5 = partial(per_prompt_macro_accuracy, p_id=5)
per_prompt_accuracy_6 = partial(per_prompt_macro_accuracy, p_id=6)
per_prompt_accuracy_7 = partial(per_prompt_macro_accuracy, p_id=7)
per_prompt_accuracy_8 = partial(per_prompt_macro_accuracy, p_id=8)
per_prompt_accuracy_9 = partial(per_prompt_macro_accuracy, p_id=9)
def per_option_macro_accuracy(results: List[Dict[str, Any]], always_opt="a") -> float:
accuracies = {}
for result in results:
question_id, always_same_option, final_answer, gt, category = result
if always_opt != always_same_option:
continue
if category not in accuracies:
accuracies[category] = []
accuracies[category].append(int(final_answer == gt))
for key in accuracies:
accuracies[key] = sum(accuracies[key]) / len(accuracies[key])
eval_logger.info(
f"Prompt - {always_opt.upper()}, category - {key} accuracy: {accuracies[key]}"
)
return np.round(np.mean([v for v in accuracies.values()]), 4)
per_option_macro_accuracy_a = partial(per_option_macro_accuracy, always_opt="A")
per_option_macro_accuracy_b = partial(per_option_macro_accuracy, always_opt="B")
per_option_macro_accuracy_c = partial(per_option_macro_accuracy, always_opt="C")
per_option_macro_accuracy_d = partial(per_option_macro_accuracy, always_opt="D")
per_option_macro_accuracy_e = partial(per_option_macro_accuracy, always_opt="E")
per_option_macro_accuracy_f = partial(per_option_macro_accuracy, always_opt="F")
per_option_macro_accuracy_g = partial(per_option_macro_accuracy, always_opt="G")
per_option_macro_accuracy_h = partial(per_option_macro_accuracy, always_opt="H")
per_option_macro_accuracy_i = partial(per_option_macro_accuracy, always_opt="I")
per_option_macro_accuracy_j = partial(per_option_macro_accuracy, always_opt="J")
options_consistency_rate = partial(utils.options_consistency_rate, labels=LABELS)
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
group: score_robustness
task:
- score_robustness_agieval
- score_robustness_mmlu_pro
- score_robustness_math
metadata:
version: 1.0
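# Example invocation (illustrative assumption, not part of the original config):
#   lm_eval --model hf --model_args pretrained=<model_name> \
#       --tasks score_robustness --batch_size auto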
# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import json
import re
import string
import sys
from functools import partial
from itertools import combinations
from typing import Any, Dict, List
import numpy as np
from datasets import Dataset
from lm_eval.utils import eval_logger
NUMERALS = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"]
ROMAN_NUMERALS = ["I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", "X"]
def __repeat_elements(lst, n):
result = []
for element in lst:
result.extend([element] * n)
return result
def process_docs_add_prompts(
doc: Dataset,
templates_key: str,
template_file_path: str,
dataset_specific_preprocess: callable = None,
) -> Dataset:
try:
with open(template_file_path) as f:
prompt_templates = json.load(f)[templates_key]
except FileNotFoundError:
eval_logger.error("Prompt templates not found")
sys.exit()
if dataset_specific_preprocess is not None:
doc = dataset_specific_preprocess(doc)
def process_batch(batch):
n = len(prompt_templates)
initial_len = len(next(iter(batch.values())))
result = {key: __repeat_elements(values, n) for key, values in batch.items()}
result["prompt_id"] = list(range(n)) * initial_len
result["prompt"] = [prompt_templates[i]["prompt"] for i in result["prompt_id"]]
if "options_format" in prompt_templates[0]:
result["options_format"] = [
prompt_templates[i]["options_format"] for i in result["prompt_id"]
]
return result
return doc.map(process_batch, batched=True)
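# Illustrative example (comments only): with n prompt templates, a batch of k
# rows is expanded to n*k rows; the i-th copy of each row carries prompt_id=i
# and the matching "prompt" template text (plus "options_format", when the
# template defines one).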
def option_order_robustness_process_docs(
doc: Dataset,
template_file_path: str,
templates_key: str,
labels: list,
dataset_specific_preprocess: callable = None,
) -> Dataset:
try:
with open(template_file_path) as f:
prompt_template = json.load(f)[templates_key]
prompt = prompt_template["prompt"]
options_format = prompt_template["options_format"]
except FileNotFoundError:
eval_logger.error("Prompt templates not found")
sys.exit()
if dataset_specific_preprocess is not None:
doc = dataset_specific_preprocess(doc)
def repeat_doc_swap_correct_answer(batched_docs):
initial_len = len(next(iter(batched_docs.values())))
keys = list(batched_docs.keys())
new_batched_docs = {key: [] for key in keys}
new_batched_docs["always_same_option"] = []
new_batched_docs["prompt"] = []
new_batched_docs["options_format"] = []
new_batched_docs["original_answer_index"] = []
for doc_ind in range(initial_len):
for label_ind, label in enumerate(labels):
new_batched_docs["original_answer_index"].append(
batched_docs["answer_index"][doc_ind]
)
for key in keys:
new_batched_docs[key].append(
copy.deepcopy(batched_docs[key][doc_ind])
)
if label_ind < len(batched_docs["options"][doc_ind]):
if key == "options":
# Swap correct answer with label_ind option
new_batched_docs[key][-1][label_ind] = batched_docs[
"options"
][doc_ind][batched_docs["answer_index"][doc_ind]]
new_batched_docs[key][-1][
batched_docs["answer_index"][doc_ind]
] = batched_docs["options"][doc_ind][label_ind]
if key == "answer_index":
new_batched_docs[key][-1] = label_ind
if key == "answer":
new_batched_docs[key][-1] = label
new_batched_docs["always_same_option"].append(label)
new_batched_docs["prompt"].append(prompt)
new_batched_docs["options_format"].append(options_format)
return new_batched_docs
return doc.map(repeat_doc_swap_correct_answer, batched=True)
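# Illustrative example of the swap above (comments only, hypothetical row):
# a doc with options=["w", "x", "y", "z"] and answer_index=2 ("y" is correct),
# with labels A-D, is expanded into four copies; the copy for
# always_same_option="A" has options=["y", "x", "w", "z"], answer_index=0,
# answer="A", original_answer_index=2, so the correct content always sits at
# the probed letter.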
def robustness_doc_to_text(doc: Dataset) -> str:
upper_case = string.ascii_uppercase
lower_case = string.ascii_lowercase
prompt = doc["prompt"]
options_format = doc.get("options_format", "")
question = doc["question"]
category = doc.get("category", "")
options = None
if options_format:
options = "".join(
[
options_format.format(
letter=upper_case[i],
option=doc["options"][i],
numeral=NUMERALS[i],
roman_numeral=ROMAN_NUMERALS[i],
lower_case_letter=lower_case[i],
)
for i in range(len(doc["options"]))
]
)
return prompt.format(question=question, options=options, category=category)
def __postprocess_pred(pred):
if "the best answer is" not in pred.lower():
return pred
pred_proc = (
pred.lower().split("the best answer is ")[-1].split("\n")[0].split(" ")[0]
)
pred_proc = re.sub(r"[^a-zA-Z0-9]", "", pred_proc).strip()
return pred_proc.upper()
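# Example of the extraction above (comments only):
#   "...reasoning...\nThe best answer is (B).\n"  -> "B"
#   a completion that never contains "the best answer is" is returned unchanged.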
def translate_model_answer_to_labels(answer, labels, option_format=None):
answer = answer.upper()
if option_format is None:
return answer
elif "numeral" in option_format:
if "roman" in option_format:
if answer not in ROMAN_NUMERALS:
return answer
else:
return labels[ROMAN_NUMERALS.index(answer)]
if answer not in NUMERALS:
return answer
else:
return labels[NUMERALS.index(answer)]
return answer
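# Example (comments only): with an options_format containing "numeral", a model
# answer of "3" maps to labels[2] == "C"; with a roman-numeral format, "IV" maps
# to labels[3] == "D". Letter-style formats (as used by the MMLU-Pro and AGIEval
# templates here) return the upper-cased answer unchanged.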
def calculate_consistency_rate(responses: List[List[str]]) -> float:
"""
Calculate the Consistency Rate (CR) for a given set of responses.
Args:
responses: List of lists, where each inner list contains responses to the same question.
Returns:
The consistency rate as a float.
"""
total_similarity = 0
total_combinations = 0
for response_set in responses:
pairs = combinations(response_set, 2)
num_pairs = len(response_set) * (len(response_set) - 1) / 2
total_combinations += num_pairs
for answer1, answer2 in pairs:
total_similarity += int(answer1 == answer2)
return total_similarity / total_combinations if total_combinations > 0 else 0.0
def prompt_consistency_rate(results: List[Dict[str, Any]]) -> float:
"""
Calculate the Consistency Rate (CR) across prompts for each question.
Args:
results: list of (question_id, prompt_id, final_answer, gt) tuples.
Returns:
The consistency rate as a float.
"""
question_answers_dict = {}
for result in results:
question_id, prompt_id, final_answer, gt = result
if question_id not in question_answers_dict:
question_answers_dict[question_id] = []
question_answers_dict[question_id].append(final_answer)
question_answers_list = [answers for answers in question_answers_dict.values()]
return calculate_consistency_rate(question_answers_list)
def options_consistency_rate(results: List[Dict[str, Any]], labels) -> float:
"""
Calculate the Consistency Rate (CR) across answer-option orderings for each question.
Args:
results: list of (question_id, always_same_option, final_answer, original_answer_index, answer_index) tuples.
Returns:
The consistency rate as a float.
"""
question_answers_dict = {}
for result in results:
(
question_id,
always_same_option,
final_answer,
original_answer_index,
answer_index,
) = result
if final_answer == labels[original_answer_index]:
final_answer = always_same_option
elif final_answer == always_same_option:
final_answer = labels[original_answer_index]
if question_id not in question_answers_dict:
question_answers_dict[question_id] = []
question_answers_dict[question_id].append(final_answer)
question_answers_list = [answers for answers in question_answers_dict.values()]
return calculate_consistency_rate(question_answers_list)
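# Note on the mapping above (comments only): the correct option's content was
# moved to the probed position (always_same_option) during preprocessing, so the
# swap here is intended to re-express answers in the original option ordering
# before pairs are compared, making the rate measure agreement on option content
# rather than on letter position.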