Commit 3e5e9da2 authored by lintangsutawika

merged from main

parents d429b47f 7852985b
# Generated by utils.py
dataset_name: eu_osakidetza5e
include: eus_exams_eu
task: eus_exams_eu_osakidetza5e
# Generated by utils.py
dataset_name: eu_osakidetza6e
include: eus_exams_eu
task: eus_exams_eu_osakidetza6e
# Generated by utils.py
dataset_name: eu_osakidetza7e
include: eus_exams_eu
task: eus_exams_eu_osakidetza7e
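The stubs above are emitted programmatically ("# Generated by utils.py"). A hypothetical sketch of the kind of loop that would produce them; the filename pattern and the loop itself are assumptions, not the actual generator code:

```
import yaml

# Hypothetical generator sketch: one small YAML per dataset config,
# each including the shared base task definition.
for config in ["eu_osakidetza5e", "eu_osakidetza6e", "eu_osakidetza7e"]:
    stub = {
        "dataset_name": config,
        "include": "eus_exams_eu",
        "task": f"eus_exams_{config}",
    }
    with open(f"eus_exams_{config}.yaml", "w") as f:
        f.write("# Generated by utils.py\n")
        yaml.dump(stub, f)  # keys are sorted, matching the stub order above
```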
import datasets


def process_docs(dataset: datasets.Dataset):
    """Filter out examples with no answer."""

    def valid_example(example: dict) -> bool:
        """Check if an example is valid."""
        if example["answer"] not in [0, 1, 2, 3]:
            return False
        if example["candidates"] == ["", "", "", ""]:
            return False
        return True

    return dataset.filter(valid_example)
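A quick way to sanity-check this filter (a minimal sketch, not part of the task files; it assumes the `process_docs` above is in scope):

```
import datasets

# Toy in-memory dataset with one valid row and two invalid ones.
docs = datasets.Dataset.from_list(
    [
        {"answer": 2, "candidates": ["a", "b", "c", "d"]},  # kept
        {"answer": 5, "candidates": ["a", "b", "c", "d"]},  # dropped: answer out of range
        {"answer": 0, "candidates": ["", "", "", ""]},      # dropped: all candidates empty
    ]
)
print(len(process_docs(docs)))  # -> 1
```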
# EusProficiency
### Paper
Title: Latxa: An Open Language Model and Evaluation Suite for Basque
Abstract: https://arxiv.org/abs/2403.20266
EusProficiency comprises 5,169 exercises on different topics from past EGA exams, the official C1-level certificate of proficiency in Basque. We collected the atarikoa exercises from EGA exams from 1998 to 2008. Atarikoa is the first qualifying test of EGA, which measures different aspects of language competency, such as reading comprehension, grammar, vocabulary, spelling, and writing. Each test generally has 85 multiple-choice questions, with 4 choices and a single correct answer.
Homepage: https://github.com/hitz-zentroa/latxa
### Citation
```
@misc{etxaniz2024latxa,
title={Latxa: An Open Language Model and Evaluation Suite for Basque},
author={Julen Etxaniz and Oscar Sainz and Naiara Perez and Itziar Aldabe and German Rigau and Eneko Agirre and Aitor Ormazabal and Mikel Artetxe and Aitor Soroa},
year={2024},
eprint={2403.20266},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
### Groups and Tasks
#### Groups
There are no groups.
#### Tasks
* `eus_proficiency`: EusProficiency comprises 5,169 exercises on different topics from past EGA exams, the official C1-level certificate of proficiency in Basque.
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
dataset_path: HiTZ/EusProficiency
dataset_name: default
task: eus_proficiency
doc_to_text: "Galdera: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}\nC: {{candidates[2]}}\nD: {{candidates[3]}}\nErantzuna:"
doc_to_choice: ["A", "B", "C", "D"]
validation_split: null
test_split: test
fewshot_split: test
output_type: multiple_choice
doc_to_target: answer
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
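For reference, the `doc_to_text` above is a Jinja2 template (the harness renders these templates with Jinja2). A minimal sketch of how it renders, using a made-up document with the field names from the config:

```
from jinja2 import Template

# Hypothetical example document; "question", "candidates" and "answer"
# follow the config above.
doc = {
    "question": "Zein da zuzena?",
    "candidates": ["bat", "bi", "hiru", "lau"],
    "answer": 1,
}
template = Template(
    "Galdera: {{question}}\nA: {{candidates[0]}}\nB: {{candidates[1]}}"
    "\nC: {{candidates[2]}}\nD: {{candidates[3]}}\nErantzuna:"
)
print(template.render(**doc))
# Galdera: Zein da zuzena?
# A: bat
# ...
# Erantzuna:
```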
# EusReading
### Paper
Title: Latxa: An Open Language Model and Evaluation Suite for Basque
Abstract: https://arxiv.org/abs/2403.20266
EusReading consists of 352 reading comprehension exercises (irakurmena) sourced from the set of past EGA exams from 1998 to 2008. Each test generally has 10 multiple-choice questions, with 4 choices and a single correct answer. These exercises are more challenging than Belebele due to the complexity and length of the input texts. As a result, EusReading is useful for measuring the long-context understanding of models.
Homepage: https://github.com/hitz-zentroa/latxa
### Citation
```
@misc{etxaniz2024latxa,
title={Latxa: An Open Language Model and Evaluation Suite for Basque},
author={Julen Etxaniz and Oscar Sainz and Naiara Perez and Itziar Aldabe and German Rigau and Eneko Agirre and Aitor Ormazabal and Mikel Artetxe and Aitor Soroa},
year={2024},
eprint={2403.20266},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
### Groups and Tasks
#### Groups
There are no groups.
#### Tasks
* `eus_reading`: EusReading consists of 352 reading comprehension exercises (irakurmena) sourced from the set of past EGA exams from 1998 to 2008.
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
dataset_path: HiTZ/EusReading
dataset_name: default
task: eus_reading
doc_to_text: !function utils.doc_to_text_context
doc_to_choice: !function utils.doc_to_choice
validation_split: null
test_split: test
fewshot_split: test
output_type: multiple_choice
doc_to_target: answer
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
from typing import List

letters = ["A", "B", "C", "D"]


def doc_to_text_context(doc) -> str:
    """
    Converts a document to a formatted string.

    Args:
        doc (dict): A dictionary containing the document information.

    Returns:
        str: A formatted string containing the question and answer choices.
    """
    candidates = doc["candidates"]
    num_choices = len(candidates)
    if num_choices < 2:
        raise ValueError("Invalid number of candidates")
    choices = letters[:num_choices]
    formatted_choices = "\n".join(
        [f"{choice}: {candidates[i]}" for i, choice in enumerate(choices)]
    )
    return f"Pasartea: {doc['context']}\n\nGaldera: {doc['question']}\n{formatted_choices}\nErantzuna:"


def doc_to_choice(doc) -> List[str]:
    """
    Returns the answer choices for a document.

    Args:
        doc (dict): A dictionary containing the document information.

    Returns:
        list: A list of strings containing the answer choices.
    """
    num_choices = len(doc["candidates"])
    if num_choices < 2:
        raise ValueError("Invalid number of candidates")
    return letters[:num_choices]
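A small usage sketch for the helpers above, with a made-up document (the `context`/`question`/`candidates` fields are what the functions expect):

```
doc = {
    "context": "Testu laburra.",
    "question": "Zer dio testuak?",
    "candidates": ["aukera bat", "beste bat", "hirugarrena"],
}
print(doc_to_text_context(doc))
# Pasartea: Testu laburra.
#
# Galdera: Zer dio testuak?
# A: aukera bat
# B: beste bat
# C: hirugarrena
# Erantzuna:
print(doc_to_choice(doc))  # ['A', 'B', 'C'] -- only as many letters as candidates
```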
# EusTrivia
### Paper
Title: Latxa: An Open Language Model and Evaluation Suite for Basque
Abstract: https://arxiv.org/abs/2403.20266
EusTrivia consists of 1,715 trivia questions from multiple online sources. 56.3% of the questions are elementary level (grades 3-6), while the rest are considered challenging. A significant portion of the questions focus specifically on the Basque Country, its language and culture. Each multiple-choice question contains two, three or four choices (3.84 on average) and a single correct answer. Five areas of knowledge are covered:
- **Humanities and Natural Sciences** (27.8%): This category encompasses questions about history, geography, biology, ecology and other social and natural sciences.
- **Leisure and Art** (24.5%): This category includes questions on sports and athletes, performing and plastic arts and artists, architecture, cultural events, and related topics.
- **Music** (16.0%): This category groups all questions about music and musicians, both classical and contemporary.
- **Language and Literature** (17.1%): This category is concerned with all kinds of literary productions and writers, as well as metalinguistic questions (e.g., definitions, synonyms, and word usage).
- **Mathematics and ICT** (14.5%): This category covers mathematical problems and questions about ICT, as well as questions about people known for their contributions to these fields of knowledge.
Homepage: https://github.com/hitz-zentroa/latxa
### Citation
```
@misc{etxaniz2024latxa,
title={Latxa: An Open Language Model and Evaluation Suite for Basque},
author={Julen Etxaniz and Oscar Sainz and Naiara Perez and Itziar Aldabe and German Rigau and Eneko Agirre and Aitor Ormazabal and Mikel Artetxe and Aitor Soroa},
year={2024},
eprint={2403.20266},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
### Groups and Tasks
#### Groups
There are no groups.
#### Tasks
* `eus_trivia`: EusTrivia consists of 1,715 trivia questions from multiple online sources.
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
dataset_path: HiTZ/EusTrivia
dataset_name: default
task: eus_trivia
doc_to_text: !function utils.doc_to_text
doc_to_choice: !function utils.doc_to_choice
validation_split: null
test_split: test
fewshot_split: test
output_type: multiple_choice
doc_to_target: answer
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
from typing import List

letters = ["A", "B", "C", "D"]


def doc_to_text(doc) -> str:
    """
    Converts a document to a formatted string.

    Args:
        doc (dict): A dictionary containing the document information.

    Returns:
        str: A formatted string containing the question and answer choices.
    """
    candidates = doc["candidates"]
    num_choices = len(candidates)
    if num_choices < 2:
        raise ValueError("Invalid number of candidates")
    choices = letters[:num_choices]
    formatted_choices = "\n".join(
        [f"{choice}: {candidates[i]}" for i, choice in enumerate(choices)]
    )
    return f"Galdera: {doc['question']}\n{formatted_choices}\nErantzuna:"


def doc_to_choice(doc) -> List[str]:
    """
    Returns the answer choices for a document.

    Args:
        doc (dict): A dictionary containing the document information.

    Returns:
        list: A list of strings containing the answer choices.
    """
    num_choices = len(doc["candidates"])
    if num_choices < 2:
        raise ValueError("Invalid number of candidates")
    return letters[:num_choices]
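Because EusTrivia questions have two to four choices, the letter list is truncated per document. A sketch with a hypothetical two-choice question:

```
doc = {"question": "Bai ala ez?", "candidates": ["bai", "ez"]}
print(doc_to_text(doc))
# Galdera: Bai ala ez?
# A: bai
# B: ez
# Erantzuna:
print(doc_to_choice(doc))  # ['A', 'B']
```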
@@ -5,11 +5,11 @@ dataset_name: qqp
 output_type: multiple_choice
 training_split: train
 validation_split: validation
-doc_to_text: "\nSentence 1: {{question1}}\nSentence 2: {{question2}}\nAnswer:"
+doc_to_text: "Question 1: {{question1}}\nQuestion 2: {{question2}}\nQuestion: Do both questions ask the same thing?\nAnswer:"
 doc_to_target: label
 doc_to_choice: ["no", "yes"]
 metric_list:
   - metric: acc
   - metric: f1
 metadata:
-  version: 1.0
+  version: 2.0
@@ -7,8 +7,9 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 doc_to_text: !function util.doc_to_text
-doc_to_target: "{{answers}}"
-doc_to_choice: "{{entities}}"
+doc_to_target: !function util.doc_to_target
+doc_to_choice: !function util.doc_to_choice
+process_docs: !function util.process_docs
 process_results: !function util.process_results
 metric_list:
   - metric: f1
@@ -17,4 +18,4 @@ metric_list:
     higher_is_better: True
     aggregation: mean
 metadata:
-  version: 1.0
+  version: 2.0
import datasets
import numpy as np
import transformers.data.metrics.squad_metrics as squad_metrics
@@ -21,6 +22,22 @@ def doc_to_target(doc):
     return format_answer(query=doc["query"], entity=doc["answers"][0])


 def doc_to_choice(doc):
     return [format_answer(query=doc["query"], entity=ans) for ans in doc["entities"]]


+def process_docs(dataset: datasets.Dataset):
+    def _process_doc(doc):
+        return {
+            "passage": doc["passage"],
+            "query": doc["query"],
+            "entities": sorted(list(set(doc["entities"]))),
+            "answers": sorted(list(set(doc["answers"]))),
+        }
+
+    return dataset.map(_process_doc)
+
+
+def process_results(doc, results):
+    # ReCoRD's evaluation is actually deceptively simple:
+    # - Pick the maximum likelihood prediction entity
......
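A side note on the added `process_docs`: ReCoRD passages can list the same entity once per mention, so the raw `entities` field may contain duplicates; deduplicating and sorting yields one stable choice per entity. A toy illustration, not taken from the diff:

```
entities = ["Paris", "UN", "Paris", "EU"]
print(sorted(set(entities)))  # ['EU', 'Paris', 'UN']
```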
# TMMLU+
### Paper
Title: `An Improved Traditional Chinese Evaluation Suite for Foundation Model`
Abstract: `We present TMMLU+, a comprehensive dataset designed for Traditional Chinese massive multitask language understanding. TMMLU+ is a multiple-choice question-answering dataset with 66 subjects from elementary to professional level. Compared to its predecessor, TMMLU, TMMLU+ is six times larger and boasts a more balanced subject distribution. We included benchmark results in TMMLU+ from closed-source models and 24 open-weight Chinese large language models of parameters ranging from 1.8B to 72B. Our findings reveal that Traditional Chinese models still trail behind their Simplified Chinese counterparts. Additionally, current large language models have yet to outperform human performance in average scores. We publicly release our dataset and the corresponding benchmark source code.`
Homepage: [https://huggingface.co/datasets/ikala/tmmluplus](https://huggingface.co/datasets/ikala/tmmluplus)
### Citation
```
@article{ikala2024improved,
title={An Improved Traditional Chinese Evaluation Suite for Foundation Model},
author={Tam, Zhi-Rui and Pai, Ya-Ting and Lee, Yen-Wei and Cheng, Sega and Shuai, Hong-Han},
journal={arXiv preprint arXiv:2403.01858},
year={2024}
}
```
### Groups and Tasks
#### Groups
* `tmmluplus`: `The dataset comprises 22,690 multiple-choice questions from 66 subjects ranging from primary to professional level. `
#### Tasks
The following tasks evaluate subjects in the TMMLU+ dataset using loglikelihood-based multiple-choice scoring:
* `tmmluplus_{subject_english}`
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
dataset_path: ZoneTwelve/tmmluplus # a copy of `ikala/tmmluplus`
test_split: test
fewshot_split: train
fewshot_config:
  sampler: first_n
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.1
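The `doc_to_text` above calls `.strip()` inside the template, which works because Jinja2 permits method calls on context values. A rendering sketch with a hypothetical document (field names assumed from the config):

```
from jinja2 import Template

doc = {"question": " 下列何者正確? ", "choices": ["甲", "乙", "丙", "丁"]}
tmpl = Template(
    "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}"
    "\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
)
print(tmpl.render(**doc))  # leading/trailing spaces around the question are removed
```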
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import argparse
import os
import pandas as pd
import yaml
from tqdm import tqdm
# Copied from https://github.com/iKala/ievals/blob/main/ievals/settings.py
# from the official TMMLU+ example
categories = {
    "STEM": [
        "physics",
        "chemistry",
        "biology",
        "computer science",
        "math",
        "engineering",
    ],
    "humanities": ["history", "philosophy", "law"],
    "social_sciences": [
        "politics",
        "culture",
        "economics",
        "geography",
        "psychology",
        "education",
    ],
    "other": ["other", "business", "health"],  # (business, health, misc.)
}
task_list = [
    "engineering_math",
    "dentistry",
    "traditional_chinese_medicine_clinical_medicine",
    "clinical_psychology",
    "technical",
    "culinary_skills",
    "mechanical",
    "logic_reasoning",
    "real_estate",
    "general_principles_of_law",
    "finance_banking",
    "anti_money_laundering",
    "ttqav2",
    "marketing_management",
    "business_management",
    "organic_chemistry",
    "advance_chemistry",
    "physics",
    "secondary_physics",
    "human_behavior",
    "national_protection",
    "jce_humanities",
    "politic_science",
    "agriculture",
    "official_document_management",
    "financial_analysis",
    "pharmacy",
    "educational_psychology",
    "statistics_and_machine_learning",
    "management_accounting",
    "introduction_to_law",
    "computer_science",
    "veterinary_pathology",
    "accounting",
    "fire_science",
    "optometry",
    "insurance_studies",
    "pharmacology",
    "taxation",
    "education_(profession_level)",
    "economics",
    "veterinary_pharmacology",
    "nautical_science",
    "occupational_therapy_for_psychological_disorders",
    "trust_practice",
    "geography_of_taiwan",
    "physical_education",
    "auditing",
    "administrative_law",
    "basic_medical_science",
    "macroeconomics",
    "trade",
    "chinese_language_and_literature",
    "tve_design",
    "junior_science_exam",
    "junior_math_exam",
    "junior_chinese_exam",
    "junior_social_studies",
    "tve_mathematics",
    "tve_chinese_language",
    "tve_natural_sciences",
    "junior_chemistry",
    "music",
    "education",
    "three_principles_of_people",
    "taiwanese_hokkien",
]
subject2name = {}
# subject2category = {}
SUBJECTS = {}
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_yaml_path", required=True)
    parser.add_argument("--save_prefix_path", default="tmmluplus")
    parser.add_argument("--cot_prompt_path", default=None)
    parser.add_argument("--task_prefix", default="")
    parser.add_argument("--group_prefix", default="")
    parser.add_argument("--subject_file", default="subject.tsv")
    return parser.parse_args()
if __name__ == "__main__":
    args = parse_args()

    from pathlib import Path

    # Initialization
    SUBJECT_FILE = Path(__file__).parent / Path(args.subject_file)

    df = pd.read_csv(SUBJECT_FILE, delimiter="\t")

    for _, row in df.iterrows():
        for _c in categories:
            if row["subject"] in SUBJECTS:
                raise ValueError("Duplicate tasks.")
            if row["category"] in categories[_c]:  # append new item into SUBJECTS
                SUBJECTS[row["subject"]] = _c
                subject2name[row["subject"]] = row["name"]
                break
    # End of SUBJECTS initialization

    # get filename of base_yaml so we can `"include": ` it in our "other" YAMLs.
    base_yaml_name = os.path.split(args.base_yaml_path)[-1]
    with open(args.base_yaml_path) as f:
        base_yaml = yaml.full_load(f)

    if args.cot_prompt_path is not None:
        import json

        with open(args.cot_prompt_path) as f:
            cot_file = json.load(f)

    ALL_CATEGORIES = []
    for subject, category in tqdm(SUBJECTS.items()):
        if category not in ALL_CATEGORIES:
            ALL_CATEGORIES.append(category)

        if args.cot_prompt_path is not None:
            description = cot_file[subject]
        else:
            name_of_subject = subject2name[subject].replace("_", " ")
            description = f"以下為{name_of_subject}的單選題,請提供正確答案的選項。\n\n"
            # description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n"

        yaml_dict = {
            "include": base_yaml_name,
            "group": f"tmmluplus_{args.task_prefix}_{category}"
            if args.task_prefix != ""
            else f"tmmluplus_{category}",
            "group_alias": category.replace("_", " "),
            "task": f"tmmluplus_{args.task_prefix}_{subject}"
            if args.task_prefix != ""
            else f"tmmluplus_{subject}",
            "task_alias": subject.replace("_", " "),
            "dataset_name": subject,
            "description": description,
        }

        file_save_path = args.save_prefix_path + f"_{subject}.yaml"
        # eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
        with open(file_save_path, "w") as yaml_file:
            yaml.dump(
                yaml_dict,
                yaml_file,
                # width=float("inf"),
                allow_unicode=True,
                default_style='"',
            )

    if args.task_prefix != "":
        mmlu_subcategories = [
            f"tmmluplus_{args.task_prefix}_{category}" for category in ALL_CATEGORIES
        ]
    else:
        mmlu_subcategories = [f"tmmluplus_{category}" for category in ALL_CATEGORIES]

    if args.group_prefix != "":
        file_save_path = args.group_prefix + ".yaml"
    else:
        file_save_path = args.save_prefix_path + ".yaml"

    # eval_logger.info(f"Saving benchmark config to {file_save_path}")
    with open(file_save_path, "w") as yaml_file:
        yaml.dump(
            {
                "group": f"tmmluplus_{args.task_prefix}"
                if args.task_prefix != ""
                else "tmmluplus",
                "task": mmlu_subcategories,
            },
            yaml_file,
            indent=4,
            default_flow_style=False,
        )
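The `default_style='"'` argument passed to `yaml.dump` above is what makes every scalar in the generated per-subject files double-quoted, as in the example file further below. A minimal sketch of that behavior:

```
import yaml

# default_style='"' forces double quotes around every scalar, keys included.
print(
    yaml.dump(
        {"dataset_name": "accounting", "include": "_default_template_yaml"},
        allow_unicode=True,
        default_style='"',
    )
)
# "dataset_name": "accounting"
# "include": "_default_template_yaml"
```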
group: tmmluplus
task:
  - tmmluplus_other
  - tmmluplus_social_sciences
  - tmmluplus_humanities
  - tmmluplus_STEM
"dataset_name": "accounting"
"description": "以下為會計學的單選題,請提供正確答案的選項。\n\n"
"group": "tmmluplus_other"
"group_alias": "other"
"include": "_default_template_yaml"
"task": "tmmluplus_accounting"
"task_alias": "accounting"