from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import WSCDataset
WSC_reader_cfg = dict(
input_columns=['span1', 'span2', 'text', 'new_text'],
output_column='answer')
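# PPL-style evaluation: the template keys 0 and 1 below enumerate the candidate
# renderings of each example (here the original text and the rewritten new_text);
# PPLInferencer scores each candidate and predicts the key whose rendering has the
# lowest perplexity, which AccEvaluator then compares against the gold answer.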
WSC_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
0: '{text}',
1: '{new_text}'
}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))
WSC_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
WSC_datasets = [
dict(
type=WSCDataset,
path='json',
abbr='WSC',
data_files='./data/SuperGLUE/WSC/val.jsonl',
split='train',
reader_cfg=WSC_reader_cfg,
infer_cfg=WSC_infer_cfg,
eval_cfg=WSC_eval_cfg,
)
]
from mmengine.config import read_base
with read_base():
from .SuperGLUE_WiC_gen_d06864 import WiC_datasets # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import WiCDatasetV2
from opencompass.utils.text_postprocessors import first_capital_postprocess
WiC_reader_cfg = dict(
input_columns=[
'word',
'sentence1',
'sentence2',
],
output_column='label',
)
WiC_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(round=[
dict(
role='HUMAN',
prompt=
"Sentence 1: {sentence1}\nSentence 2: {sentence2}\nAre '{word}' in the above two sentenses the same?\nA. Yes\nB. No\nAnswer:"
),
]),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
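# The model answers in free text, so first_capital_postprocess reduces the prediction
# to its first capital letter (expected to be 'A' or 'B') before AccEvaluator compares
# it with the gold label.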
WiC_eval_cfg = dict(
evaluator=dict(type=AccEvaluator),
pred_role='BOT',
pred_postprocessor=dict(type=first_capital_postprocess),
)
WiC_datasets = [
dict(
abbr='WiC',
type=WiCDatasetV2,
path='./data/SuperGLUE/WiC/val.jsonl',
reader_cfg=WiC_reader_cfg,
infer_cfg=WiC_infer_cfg,
eval_cfg=WiC_eval_cfg,
)
]
from mmengine.config import read_base
with read_base():
from .SuperGLUE_WiC_ppl_312de9 import WiC_datasets # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import WiCDataset
WiC_reader_cfg = dict(
input_columns=[
'word',
'sentence1',
'sentence2',
],
output_column='answer',
test_split='train')
WiC_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
0:
dict(round=[
dict(
role='HUMAN',
prompt=
"Sentence 1: {sentence1}\nSentence 2: {sentence2}\n'{word}' in the above two sentenses are different."
),
]),
1:
dict(round=[
dict(
role='HUMAN',
prompt=
"Sentence 1: {sentence1}\nSentence 2: {sentence2}\n'{word}' in the above two sentenses are the same."
),
]),
},
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
WiC_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
WiC_datasets = [
dict(
type=WiCDataset,
abbr='WiC',
path='json',
data_files='./data/SuperGLUE/WiC/val.jsonl',
split='train',
reader_cfg=WiC_reader_cfg,
infer_cfg=WiC_infer_cfg,
eval_cfg=WiC_eval_cfg,
)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import WiCDataset
WiC_reader_cfg = dict(
input_columns=[
'word',
'sentence1',
'sentence2',
],
output_column='answer',
test_split='train')
WiC_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
0: '{word} in {sentence1} and {sentence2} is different.',
1: '{word} in {sentence1} and {sentence2} is the same.'
}),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer))
WiC_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
WiC_datasets = [
dict(
type=WiCDataset,
abbr='WiC',
path='json',
data_files='./data/SuperGLUE/WiC/val.jsonl',
split='train',
reader_cfg=WiC_reader_cfg,
infer_cfg=WiC_infer_cfg,
eval_cfg=WiC_eval_cfg)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import PPLInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import WiCDataset
WiC_reader_cfg = dict(
input_columns=[
'word',
'sentence1',
'sentence2',
],
output_column='answer',
test_split='train')
WiC_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template={
0:
dict(round=[
dict(
role='HUMAN',
prompt='{word} in {sentence1} and {sentence2} is different.'),
]),
1:
dict(round=[
dict(role='HUMAN', prompt='{word} in {sentence1} and {sentence2} is the same.'),
]),
},
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=PPLInferencer),
)
WiC_eval_cfg = dict(evaluator=dict(type=AccEvaluator))
WiC_datasets = [
dict(
type=WiCDataset,
abbr='WiC',
path='json',
data_files='./data/SuperGLUE/WiC/val.jsonl',
split='train',
reader_cfg=WiC_reader_cfg,
infer_cfg=WiC_infer_cfg,
eval_cfg=WiC_eval_cfg,
)
]
from mmengine.config import read_base
with read_base():
from .TabMWP_gen_2aef96 import TabMWP_datasets # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import TabMWPDataset, TabMWPEvaluator
# None of the TabMWP datasets on Hugging Face are parsed correctly, so we use our own dataset reader
# Please download the dataset from https://github.com/lupantech/PromptPG/tree/main
input_format='TQ'
output_format='A'
elements = {'Q': 'Question: {question}',
'T': 'Table: {table}',
'S': 'Solution: {solution}',
'A': 'Answer: The answer is {answer}.',
'AS': 'Answer: The answer is {answer}. BECAUSE: {solution}',
'SA': 'Answer: {solution} The answer is {answer}.'}
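# input_format selects and orders the prompt elements: with input_format='TQ' the HUMAN
# prompt below renders as 'Table: {table}\nQuestion: {question}'. output_format is kept
# for reference only and is not used in the template below.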
TabMWP_reader_cfg = dict(
input_columns=['question', 'table'],
output_column='test_elements',
train_split='dev',
)
TabMWP_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt= '\n'.join(elements[label] for label in input_format)
),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
TabMWP_eval_cfg = dict(
evaluator=dict(type=TabMWPEvaluator)
)
TabMWP_datasets = [
dict(
type=TabMWPDataset,
path='./data/tabmwp/',
reader_cfg=TabMWP_reader_cfg,
infer_cfg=TabMWP_infer_cfg,
eval_cfg=TabMWP_eval_cfg,)
]
# TheoremQA
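Evaluate base and chat models on TheoremQA (5-shot generation) with, for example: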
```bash
python3 run.py --models hf_internlm2_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug
python3 run.py --models hf_internlm2_chat_7b --datasets TheoremQA_5shot_gen_6f0af8 --debug
```
## Base Models
| model | TheoremQA |
|:------------------------:|------------:|
| llama-7b-turbomind | 10.25 |
| llama-13b-turbomind | 11.25 |
| llama-30b-turbomind | 14.25 |
| llama-65b-turbomind | 15.62 |
| llama-2-7b-turbomind | 12.62 |
| llama-2-13b-turbomind | 11.88 |
| llama-2-70b-turbomind | 15.62 |
| llama-3-8b-turbomind | 20.25 |
| llama-3-70b-turbomind | 33.62 |
| internlm2-1.8b-turbomind | 10.50 |
| internlm2-7b-turbomind | 21.88 |
| internlm2-20b-turbomind | 26.00 |
| qwen-1.8b-turbomind | 9.38 |
| qwen-7b-turbomind | 15.00 |
| qwen-14b-turbomind | 21.62 |
| qwen-72b-turbomind | 27.12 |
| qwen1.5-0.5b-hf | 5.88 |
| qwen1.5-1.8b-hf | 12.00 |
| qwen1.5-4b-hf | 13.75 |
| qwen1.5-7b-hf | 4.25 |
| qwen1.5-14b-hf | 12.62 |
| qwen1.5-32b-hf | 26.62 |
| qwen1.5-72b-hf | 26.62 |
| qwen1.5-moe-a2-7b-hf | 7.50 |
| mistral-7b-v0.1-hf | 17.00 |
| mistral-7b-v0.2-hf | 16.25 |
| mixtral-8x7b-v0.1-hf | 24.12 |
| mixtral-8x22b-v0.1-hf | 36.75 |
| yi-6b-hf | 13.88 |
| yi-34b-hf | 24.75 |
| deepseek-7b-base-hf | 12.38 |
| deepseek-67b-base-hf | 21.25 |
## Chat Models
| model | TheoremQA |
|:-----------------------------:|------------:|
| qwen1.5-0.5b-chat-hf | 9.00 |
| qwen1.5-1.8b-chat-hf | 9.25 |
| qwen1.5-4b-chat-hf | 13.88 |
| qwen1.5-7b-chat-hf | 12.25 |
| qwen1.5-14b-chat-hf | 13.63 |
| qwen1.5-32b-chat-hf | 19.25 |
| qwen1.5-72b-chat-hf | 22.75 |
| qwen1.5-110b-chat-hf | 17.50 |
| internlm2-chat-1.8b-hf | 13.63 |
| internlm2-chat-1.8b-sft-hf | 12.88 |
| internlm2-chat-7b-hf | 18.50 |
| internlm2-chat-7b-sft-hf | 18.75 |
| internlm2-chat-20b-hf | 23.00 |
| internlm2-chat-20b-sft-hf | 25.12 |
| llama-3-8b-instruct-hf | 19.38 |
| llama-3-70b-instruct-hf | 36.25 |
| llama-3-8b-instruct-lmdeploy | 19.62 |
| llama-3-70b-instruct-lmdeploy | 34.50 |
| mistral-7b-instruct-v0.1-hf | 12.62 |
| mistral-7b-instruct-v0.2-hf | 11.38 |
| mixtral-8x7b-instruct-v0.1-hf | 26.00 |
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import TheoremQADatasetV3, TheoremQA_postprocess_v3, TheoremQAEvaluatorV3
with read_base():
from .TheoremQA_few_shot_examples import examples
num_shot = 5
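# Build the few-shot chat prefix: each of the first `num_shot` worked examples contributes
# a HUMAN 'Problem: ... Solution:' turn and a BOT turn with the reference solution; a final
# HUMAN turn carries the test question placeholder {Question}.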
rounds = []
for index, (query, response) in enumerate(examples[:num_shot]):
if index == 0:
desc = 'You are supposed to provide a solution to a given problem.\n\n'
else:
desc = ''
rounds += [
dict(role='HUMAN', prompt=f'{desc}Problem:\n{query}\nSolution:'),
dict(role='BOT', prompt=f'{response}')
]
rounds += [dict(role='HUMAN', prompt='Problem:\n{Question}\nSolution:')]
TheoremQA_reader_cfg = dict(input_columns=['Question', 'Answer_type'], output_column='Answer', train_split='test', test_split='test')
TheoremQA_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=dict(round=rounds)),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=1024, stopping_criteria=['Problem:', 'Problem']),
)
TheoremQA_eval_cfg = dict(
evaluator=dict(type=TheoremQAEvaluatorV3),
pred_postprocessor=dict(type=TheoremQA_postprocess_v3)
)
TheoremQA_datasets = [
dict(
abbr='TheoremQA',
type=TheoremQADatasetV3,
path='data/TheoremQA/theoremqa_test.json',
reader_cfg=TheoremQA_reader_cfg,
infer_cfg=TheoremQA_infer_cfg,
eval_cfg=TheoremQA_eval_cfg,
)
]
examples = [
(
'In a 10 Gigabit Ethernet network, the average size of a frame is 1500 bytes. If a burst of noise lasting 1ms interrupts the network, how many frames are lost?',
'First, calculate the data rate in bytes/s:\n$$10 Gigabit/s * (1 Byte / 8 bits) = 1.25 * 10^9 Bytes/s$$\nNext, calculate the data loss in bytes due to the noise:\n$$1 ms * 1.25 * 10^9 Bytes/s = 1.25 * 10^6 Bytes$$\nFinally, divide the data loss by the average frame size to get the number of frames lost:\n$$1.25 * 10^6 Bytes / 1500 Bytes/frame \\approx 833.33 frames$$\nThe answer is 833.33',
),
(
'Given x = 0.157, what is the value of $x \\times \\frac{\\prod_{n=1}^\\infty (1 - \\frac{x^2}{n^2 \\pi^2})}{\\sin(x)}$?',
"To evaluate the expression $x \\times \\frac{\\prod_{n=1}^{\\infty} (1 - \\frac{x^2}{n^2 \\pi^2})}{\\sin(x)}$ given x = 0.157, we first recognize that the product in the numerator is related to the sine function through the Euler's reflection formula for the sine function, which can be expressed as:\n$$\\sin(x) = x \\prod_{n=1}^{\\infty} \\left(1 - \\frac{x^2}{n^2 \\pi^2}\\right)$$\nTherefore, the given expression simplifies to: $x \\times \\frac{\\sin(x)}{\\sin(x)}$\nBecause sin(x) in the numerator and denominator cancels out, the expression simplifies further to just x.\nSo, given x = 0.157, the value of the expression is 0.157. This result is derived from the properties of the sine function and does not require computational evaluation.\nThe answer is 0.157",
),
(
'Consider the basis C of $\\mathbb{R}^2$ consisting of vectors $u_1 = [2, 4]$ and $u_2 = [1, -1]$. If $y = [8, 12]$, find the C-coordinate vector of y.',
"The goal is to express y as a linear combination of the basis vectors of C, i.e., $y = a\\cdot u_1 + b\\cdot u_2$, where a and b are the scalar coefficients that we want to find. These coefficients will form the C-coordinate vector of y, which we'll denote as $[a, b]_C$.\nGiven:\n- $u_1 = [2, 4]$,\n- $u_2 = [1, -1]$,\n- $y = [8, 12]$.\nWe need to solve the system of linear equations:\n2a + 1b = 8\n4a - 1b = 12\nLet's solve this system of equations to find a and b.\nThe solution to the system of equations is $a = \\frac{10}{3} and b = \\frac{4}{3}$. Therefore, the C-coordinate vector of y in the basis consisting of vectors $u_1 = [2, 4]$ and $u_2 = [1, -1]$ is $\\left[\\frac{10}{3}, \\frac{4}{3}\\right]_C$.\nLet's calculate the numerical value of $\\left[\\frac{10}{3}, \\frac{4}{3}\\right]_C$ as [3.33, 1.33].\nThe answer is [3.33, 1.33]",
),
(
'One can draw a simple, connected planar graph with 200 vertices and 397 edges. Is this statement True or False?',
"To determine the answer, we can use Euler's formula for planar graphs, which states that for any finite, connected, planar graph, $V - E + F = 2$, where V is the number of vertices, E is the number of edges, and F is the number of faces.\nGiven the modified question, we have V = 200 vertices and E = 397 edges. We want to find if we can have a graph that satisfies these conditions, adhering to Euler's formula.\nFirst, let's rearrange Euler's formula to solve for F: F = E - V + 2\nSubstituting the given values: F = 397 - 200 + 2, F = 199\nThis means a graph with 200 vertices and 397 edges would have 199 faces. However, to determine the truth of this possibility, we should check if this graph doesn't violate any other planar graph constraints, particularly regarding the number of edges.\nFor a simple, connected planar graph, there's also a relationship between vertices, edges, and faces given by the inequality: $E \\leq 3V - 6$\nSubstituting V = 200 gives: $E \\leq 3*200 - 6 = 594$\nWith E = 397, the condition $E \\leq 594$ is satisfied, meaning it's theoretically possible in terms of the edge condition for a planar graph.\nTherefore, one can draw a simple, connected planar graph with 200 vertices and 397 edges, resulting in 199 faces, without violating the conditions for it to be planar according to both Euler's formula and the constraint on the maximum number of edges.\nThe answer is True",
),
(
'Given a finite group G, and a collection of permutations H on a set. Then (a) there always exists H such that G is isomorphic to H; (b) for any H, G is isomorphic to H; (c) G can never be isomorphic to H; (d) none of the above. Which option is correct?',
"This is based on Cayley's theorem, which states that every group G is isomorphic to a subgroup of the symmetric group acting on G.\nIn other words, for every finite group G, there exists a collection of permutations H (which in this context, can be thought of as the set of permutations representing the action of G on itself) such that G is isomorphic to H.\nTherefore, there always exists H such that G is isomorphic to H.\nThe answer is (a)",
),
]
examples = [
(
'In a 10 Gigabit Ethernet network, the average size of a frame is 1500 bytes. If a burst of noise lasting 1ms interrupts the network, how many frames are lost?',
'First, calculate the data rate in bytes/s:\n\n10 Gigabit/s * (1 Byte / 8 bits) = 1.25 * 10^9 Bytes/s\n\nNext, calculate the data loss in bytes due to the noise:\n\n1 ms * 1.25 * 10^9 Bytes/s = 1.25 * 10^6 Bytes\n\nFinally, divide the data loss by the average frame size to get the number of frames lost:\n\n1.25 * 10^6 Bytes / 1500 Bytes/frame ≈ 833.33 frames\nThe answer is 833.33'
),
(
'Given x = 0.157, what is the value of x \\times \\frac{\\prod_{n=1}^\\infty (1 - \\frac{x^2}{n^2 \\pi^2})}{\\sin(x)}?',
"To evaluate the expression $x \\times \\frac{\\prod_{n=1}^{\\infty} (1 - \\frac{x^2}{n^2 \\pi^2})}{\\sin(x)}$ given x = 0.157, we first recognize that the product in the numerator is related to the sine function through the Euler's reflection formula for the sine function, which can be expressed as:\n\n$$\\sin(x) = x \\prod_{n=1}^{\\infty} \\left(1 - \\frac{x^2}{n^2 \\pi^2}\\right)$$\n\nTherefore, the given expression simplifies to: $x \\times \\frac{\\sin(x)}{\\sin(x)}$\n\nBecause sin(x) in the numerator and denominator cancels out, the expression simplifies further to just x.\n\nSo, given x = 0.157, the value of the expression is 0.157. This result is derived from the properties of the sine function and does not require computational evaluation.\nThe answer is 0.157"
),
(
'Consider the basis C of \\mathbb{R}^2 consisting of vectors u_1 = [2, 4] and u_2 = [1, -1]. If y = [8, 12], find the C-coordinate vector of y.',
"The goal is to express y as a linear combination of the basis vectors of C, i.e., $y = a\\cdot u_1 + b\\cdot u_2$, where a and b are the scalar coefficients that we want to find. These coefficients will form the C-coordinate vector of y, which we'll denote as $[a, b]_C$.\n\nGiven:\n- $u_1 = [2, 4]$,\n- $u_2 = [1, -1]$,\n- $y = [8, 12]$.\n\nWe need to solve the system of linear equations:\n2a + 1b = 8\n4a - 1b = 12\n\nLet's solve this system of equations to find a and b.\n\nThe solution to the system of equations is $a = \\frac{10}{3} and b = \\frac{4}{3}$. Therefore, the C-coordinate vector of y in the basis consisting of vectors u_1 = [2, 4] and u_2 = [1, -1] is $\\left[\\frac{10}{3}, \\frac{4}{3}\\right]_C$. \nLet's calculate the numerical value of $\\left[\x0crac{10}{3}, \x0crac{4}{3}\right]_C$ as [3.33, 1.33].\nThe answer is [3.33, 1.33]"
),
(
'One can draw a simple, connected planar graph with 200 vertices and 397 edges. Is this statement True or False?',
"To determine the answer, we can use Euler's formula for planar graphs, which states that for any finite, connected, planar graph, $V - E + F = 2$, where V is the number of vertices, E is the number of edges, and F is the number of faces.\n\nGiven the modified question, we have V = 200 vertices and E = 397 edges. We want to find if we can have a graph that satisfies these conditions, adhering to Euler's formula.\n\nFirst, let's rearrange Euler's formula to solve for F: F = E - V + 2\n\nSubstituting the given values: F = 397 - 200 + 2, F = 199\n\nThis means a graph with 200 vertices and 397 edges would have 199 faces. However, to determine the truth of this possibility, we should check if this graph doesn't violate any other planar graph constraints, particularly regarding the number of edges.\n\nFor a simple, connected planar graph, there's also a relationship between vertices, edges, and faces given by the inequality: $E \\leq 3V - 6$\n\nSubstituting V = 200 gives: $E \\leq 3*200 - 6 = 594$\n\nWith E = 397, the condition $E \\leq 594$ is satisfied, meaning it's theoretically possible in terms of the edge condition for a planar graph.\n\nTherefore, one can draw a simple, connected planar graph with 200 vertices and 397 edges, resulting in 199 faces, without violating the conditions for it to be planar according to both Euler's formula and the constraint on the maximum number of edges.\nThe answer is True"
),
(
'Given a finite group G, and a collection of permutations H on a set. Then (a) there always exists H such that G is isomorphic to H; (b) for any H, G is isomorphic to H; (c) G can never be isomorphic to H; (d) none of the above. Which option is correct?',
"This is based on Cayley's theorem, which states that every group G is isomorphic to a subgroup of the symmetric group acting on G. \nIn other words, for every finite group G, there exists a collection of permutations H (which in this context, can be thought of as the set of permutations representing the action of G on itself) such that G is isomorphic to H.\n\nTherefore, there always exists H such that G is isomorphic to H.\nThe answer is (a)"
)
]
from mmengine.config import read_base
with read_base():
from .TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets # noqa: F401, F403
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import (
TheoremQADataset,
TheoremQA_postprocess_v3,
TheoremQA_postprocess_v4,
TheoremQAEvaluatorV3,
)
TheoremQA_reader_cfg = dict(
input_columns=['Question', 'Answer_type'],
output_column='Answer',
train_split='test',
)
TheoremQA_prompt1 = """You are a mathematician, you are supposed to answer the given question. You need to output the answer in your final sentence like "Therefore, the answer is ...". The answer can only be one of the following forms:
1. a numerical value like 0.1, no symbol and no unit at all.
2. a list of number like [2, 3, 4].
3. True/False.
4. an option like (a), (b), (c), (d)
"""
TheoremQA_prompt2 = "Question: {Question}\nLet's think step by step."
TheoremQA_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=TheoremQA_prompt1 + TheoremQA_prompt2,
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer),
)
# A correct evaluator would need an LLM to extract the answer; this evaluation logic also produces a fairly large number of false negatives (FN).
TheoremQA_eval_cfg = dict(
evaluator=dict(type=TheoremQAEvaluatorV3),
pred_postprocessor=dict(type=TheoremQA_postprocess_v4),
)
TheoremQA_datasets = [
dict(
abbr='TheoremQA',
type=TheoremQADataset,
path='./data/TheoremQA/test.csv',
reader_cfg=TheoremQA_reader_cfg,
infer_cfg=TheoremQA_infer_cfg,
eval_cfg=TheoremQA_eval_cfg,
)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess
TheoremQA_reader_cfg = dict(input_columns=['Question', 'Answer_type'], output_column='Answer', train_split='test')
TheoremQA_prompt1 = (
'Please read a math problem, and then think step by step to derive the answer. The answer is decided by the Answer type. '
'If the Answer type is [bool], the answer needs to be True or False. '
'Else if the Answer type is [integer, float], the answer needs to be in numerical form. '
'Else if the Answer type is [list of integer, list of float], the answer needs to be a list of numbers like [2, 3, 4]. '
'Else if the Answer type is [option], the answer needs to be an option like (a), (b), (c), (d). '
"You need to output the answer in your final sentence like 'Therefore, the answer is ...'."
)
TheoremQA_prompt2 = (
f'Below is an instruction that describes a task, paired with an input that provides further context. '
f'Write a response that appropriately completes the request.\n\n### Instruction:\n{TheoremQA_prompt1}\n\n### Input:\n{{Question}}\nAnswer_type:{{Answer_type}}\n### Response:\n'
)
TheoremQA_infer_cfg = dict(
prompt_template=dict(type=PromptTemplate, template=TheoremQA_prompt2),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
TheoremQA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), pred_postprocessor=dict(type=TheoremQA_postprocess))
TheoremQA_datasets = [
dict(
abbr='TheoremQA',
type=TheoremQADataset,
path='./data/TheoremQA/test.csv',
reader_cfg=TheoremQA_reader_cfg,
infer_cfg=TheoremQA_infer_cfg,
eval_cfg=TheoremQA_eval_cfg,
)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess
TheoremQA_reader_cfg = dict(input_columns=['Question', 'Answer_type'], output_column='Answer', train_split='test')
TheoremQA_prompt1 = """You are a mathematician, you are supposed to answer the given question. You need to output the answer in your final sentence like "Therefore, the answer is ...". The answer can only be one of the following forms:
1. a numerical value like 0.1, no symbol and no unit at all.
2. a list of number like [2, 3, 4].
3. True/False.
4. an option like (a), (b), (c), (d)
"""
TheoremQA_prompt2 = "Question: {Question}\nLet's think step by step."
TheoremQA_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
begin=[
dict(role='SYSTEM', fallback_role='HUMAN', prompt=TheoremQA_prompt1),
],
round=[
dict(role='HUMAN', prompt=TheoremQA_prompt2),
],
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
TheoremQA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), pred_postprocessor=dict(type=TheoremQA_postprocess))
TheoremQA_datasets = [
dict(
abbr='TheoremQA',
type=TheoremQADataset,
path='./data/TheoremQA/test.csv',
reader_cfg=TheoremQA_reader_cfg,
infer_cfg=TheoremQA_infer_cfg,
eval_cfg=TheoremQA_eval_cfg,
)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess
TheoremQA_reader_cfg = dict(input_columns=['Question', 'Answer_type'], output_column='Answer', train_split='test')
TheoremQA_prompt1 = """You are a mathematician, you are supposed to answer the given question. You need to output the answer in your final sentence like "Therefore, the answer is ...". The answer can only be one of the following forms:
1. a numerical value like 0.1, no symbol and no unit at all.
2. a list of number like [2, 3, 4].
3. True/False.
4. an option like (a), (b), (c), (d)
"""
TheoremQA_prompt2 = "Question: {Question}\nLet's think step by step."
TheoremQA_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=TheoremQA_prompt1 + TheoremQA_prompt2,
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
TheoremQA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), pred_postprocessor=dict(type=TheoremQA_postprocess))
TheoremQA_datasets = [
dict(
abbr='TheoremQA',
type=TheoremQADataset,
path='./data/TheoremQA/test.csv',
reader_cfg=TheoremQA_reader_cfg,
infer_cfg=TheoremQA_infer_cfg,
eval_cfg=TheoremQA_eval_cfg,
)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess_v2
TheoremQA_reader_cfg = dict(input_columns=['Question', 'Answer_type'], output_column='Answer', train_split='test')
TheoremQA_prompt1 = """You are a mathematician, you are supposed to answer the given question. You need to output the answer in your final sentence like "Therefore, the answer is ...". The answer can only be one of the following forms:
1. a numerical value like 0.1, no symbol and no unit at all.
2. a list of number like [2, 3, 4].
3. True/False.
4. an option like (a), (b), (c), (d)
"""
TheoremQA_prompt2 = "Question: {Question}\nLet's think step by step."
TheoremQA_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=TheoremQA_prompt1 + TheoremQA_prompt2,
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
# A correct evaluator would need an LLM to extract the answer; this evaluation logic also produces a fairly large number of false negatives (FN).
TheoremQA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), pred_postprocessor=dict(type=TheoremQA_postprocess_v2))
TheoremQA_datasets = [
dict(
abbr='TheoremQA',
type=TheoremQADataset,
path='./data/TheoremQA/test.csv',
reader_cfg=TheoremQA_reader_cfg,
infer_cfg=TheoremQA_infer_cfg,
eval_cfg=TheoremQA_eval_cfg,
)
]
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import TheoremQADataset, TheoremQA_postprocess_v2
TheoremQA_reader_cfg = dict(input_columns=['Question', 'Answer_type'], output_column='Answer', train_split='test')
TheoremQA_prompt1 = """You are a mathematician, you are supposed to answer the given question. You need to output the answer in your final sentence like "Therefore, the answer is ...". The answer can only be one of the following forms:
1. a numerical value like 0.1, no symbol and no unit at all.
2. a list of number like [2, 3, 4].
3. True/False.
4. an option like (a), (b), (c), (d)
"""
TheoremQA_prompt2 = "Question: {Question}\nLet's think step by step."
TheoremQA_infer_cfg = dict(
prompt_template=dict(
type=PromptTemplate,
template=dict(
round=[
dict(
role='HUMAN',
prompt=TheoremQA_prompt1 + TheoremQA_prompt2,
),
]
),
),
retriever=dict(type=ZeroRetriever),
inferencer=dict(type=GenInferencer, max_out_len=512),
)
# A correct evaluator would need an LLM to extract the answer; this evaluation logic also produces a fairly large number of false negatives (FN).
TheoremQA_eval_cfg = dict(evaluator=dict(type=AccEvaluator), pred_postprocessor=dict(type=TheoremQA_postprocess_v2))
TheoremQA_datasets = [
dict(
abbr='TheoremQA',
type=TheoremQADataset,
path='./data/TheoremQA/test.csv',
reader_cfg=TheoremQA_reader_cfg,
infer_cfg=TheoremQA_infer_cfg,
eval_cfg=TheoremQA_eval_cfg,
)
]