Commit e15672a4 authored by Yen-Ting Lin

Add UMTCEval tasks and configurations

parent 3c0df6e9
@@ -62,7 +62,7 @@ lm_eval \
--log_samples \
--verbosity DEBUG \
--wandb_args project=lm-eval-harness-integration,job_type=eval,name=$model \
-  --hf_hub_log_args hub_results_org=yentinglin,hub_repo_name=lm-eval-results,push_results_to_hub=True,push_samples_to_hub=True,public_repo=False \
+  --hf_hub_log_args hub_results_org=yentinglin,hub_repo_name=lm-eval-results,push_results_to_hub=True,public_repo=False \
--seed 42 \
--trust_remote_code \
"
......
_default_template_yaml:

dataset_path: VincentUni/UMTCEval
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.1
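
For orientation, this is how the template's doc_to_text renders one processed document. A minimal sketch: the sample row is invented, and the field names (question, choices) follow the YAML above and utils.process_docs further below.

# Minimal sketch of the prompt produced by doc_to_text for one document.
# The sample row is invented for illustration.
doc = {
    "question": "下列何者正確?",
    "choices": ["選項一", "選項二", "選項三", "選項四"],
}
prompt = (
    f"{doc['question'].strip()}\n"
    f"A. {doc['choices'][0]}\n"
    f"B. {doc['choices'][1]}\n"
    f"C. {doc['choices'][2]}\n"
    f"D. {doc['choices'][3]}\n"
    "Answer:"
)
print(prompt)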
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import argparse
import os
import pandas as pd
import yaml
from tqdm import tqdm
categories = {
"law": [
"default",
],
}
task_list = [
"default",
]
subject2name = {}
SUBJECTS = {}
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_yaml_path", default="_default_template_yaml")
parser.add_argument("--save_prefix_path", default="umtceval")
parser.add_argument("--cot_prompt_path", default=None)
parser.add_argument("--task_prefix", default="")
parser.add_argument("--group_prefix", default="")
parser.add_argument("--subject_file", default="../subject.tsv")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
from pathlib import Path
# Initialization
SUBJECT_FILE = Path(__file__).parent / Path(args.subject_file)
df = pd.read_csv(SUBJECT_FILE, delimiter="\t")
for _, row in df.iterrows():
for _c in categories:
if row["subject"] in SUBJECTS:
raise ValueError(f"Duplicate tasks. {row['subject']} already exists.")
if row["category"] in categories[_c]: # append new item into SUBJECTS
SUBJECTS[row["subject"]] = _c
subject2name[row["subject"]] = row["name"]
break
# End of SUBJECTS initialization
# get filename of base_yaml so we can `"include": ` it in our "other" YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path) as f:
base_yaml = yaml.full_load(f)
if args.cot_prompt_path is not None:
import json
with open(args.cot_prompt_path) as f:
cot_file = json.load(f)
ALL_CATEGORIES = []
for subject, category in tqdm(SUBJECTS.items()):
if category not in ALL_CATEGORIES:
ALL_CATEGORIES.append(category)
if args.cot_prompt_path is not None:
description = cot_file[subject]
else:
name_of_subject = subject2name[subject].replace("_", " ")
description = f"以下為{name_of_subject}的單選題,請提供正確答案的選項。\n\n"
# description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n"
yaml_dict = {
"include": base_yaml_name,
"group": f"umtceval_{args.task_prefix}_{category}"
if args.task_prefix != ""
else f"umtceval_{category}",
"group_alias": category.replace("_", " "),
"task": f"umtceval_{args.task_prefix}_{subject}"
if args.task_prefix != ""
else f"umtceval_{subject}",
"task_alias": subject.replace("_", " "),
"dataset_name": subject,
"description": description,
}
file_save_path = args.save_prefix_path + f"_{subject}.yaml"
# eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
# width=float("inf"),
allow_unicode=True,
default_style='"',
)
if args.task_prefix != "":
mmlu_subcategories = [
f"umtceval_{args.task_prefix}_{category}" for category in ALL_CATEGORIES
]
else:
mmlu_subcategories = [f"umtceval_{category}" for category in ALL_CATEGORIES]
if args.group_prefix != "":
file_save_path = args.group_prefix + ".yaml"
else:
file_save_path = args.save_prefix_path + ".yaml"
# eval_logger.info(f"Saving benchmark config to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
yaml.dump(
{
"group": f"umtceval_{args.task_prefix}"
if args.task_prefix != ""
else "umtceval",
"task": mmlu_subcategories,
},
yaml_file,
indent=4,
default_flow_style=False,
)
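
The YAMLs that follow are this script's output for the three subjects in subject.tsv. To regenerate them after editing the TSV, run the script from the task directory; a hedged sketch, assuming the generator is saved as _generate_configs.py (the usual lm-eval-harness layout, not shown in this commit):

# Hedged sketch: regenerate all UMTCEval YAMLs from subject.tsv.
# Assumes the script above is saved as _generate_configs.py in the task dir.
import subprocess

subprocess.run(
    ["python", "_generate_configs.py", "--subject_file", "../subject.tsv"],
    check=True,
)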
umtceval.yaml (the generated group config):

group: umtceval
task:
- umtceval_law
"dataset_name": "hr_law"
"description": "以下為欣興電子人資的單選題,請提供正確答案的選項。\n\n"
"group": "umtceval_law"
"group_alias": "law"
"include": "_default_template_yaml"
"task": "umtceval_hr_law"
"task_alias": "hr law"
"dataset_name": "process_en"
"description": "以下為欣興電子製程的單選題,請提供正確答案的選項。\n\n"
"group": "umtceval_law"
"group_alias": "law"
"include": "_default_template_yaml"
"task": "umtceval_process_en"
"task_alias": "process en"
"dataset_name": "umtc"
"description": "以下為欣興電子的單選題,請提供正確答案的選項。\n\n"
"group": "umtceval_law"
"group_alias": "law"
"include": "_default_template_yaml"
"task": "umtceval_umtc"
"task_alias": "umtc"
utils.py:

import datasets


def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    def _helper(doc):
        # Modifies the contents of a single document in our dataset:
        # gather the four option columns into the `choices` list consumed by
        # the task YAML's doc_to_text. `map` keeps the original columns, so
        # `question` and `answer` stay available to the template as well.
        answer_list = ["A", "B", "C", "D"]
        choices = [doc["A"], doc["B"], doc["C"], doc["D"]]
        out_doc = {
            "question": doc["question"],
            "choices": choices,
            # Index of the gold answer letter; the YAML targets `answer`
            # directly, so this field is informational.
            "goal": answer_list.index(doc["answer"]),
        }
        return out_doc

    return dataset.map(_helper)  # returns a datasets.Dataset object
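
A minimal usage sketch for process_docs, assuming the VincentUni/UMTCEval dataset is accessible and carries the columns the helper expects (question, A, B, C, D, answer):

# Minimal sketch: apply process_docs to one subject's test split.
# Assumes the dataset is reachable and this runs from the task directory,
# so `utils` is importable.
import datasets

from utils import process_docs

raw = datasets.load_dataset("VincentUni/UMTCEval", "hr_law", split="test")
processed = process_docs(raw)
print(processed[0]["choices"])  # the four option strings
print(processed[0]["goal"])     # index of the gold answer letter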
subject.tsv (tab-separated):

subject      name          category
umtc         欣興電子      default
hr_law       欣興電子人資  default
process_en   欣興電子製程  default

The `name` values are the Traditional Chinese display names (Unimicron, Unimicron HR, Unimicron manufacturing process) interpolated into each task's description.
@@ -2,28 +2,57 @@
# Define the models to run
declare -a models=(
"yentinglin/Llama-3-Taiwan-70B-Instruct-rc3"
"yentinglin/Llama-3-Taiwan-70B-Instruct-rc2"
"yentinglin/Llama-3-Taiwan-70B-Instruct-rc1"
"yentinglin/Llama-3-Taiwan-70B-Instruct"
"yentinglin/Taiwan-Llama-3-70B-Cooldown"
"yentinglin/Taiwan-Llama-3-70B"
"yentinglin/Llama-3-Taiwan-8B-Instruct-rc1"
"yentinglin/Taiwan-Llama-3-8B-Instruct"
"yentinglin/Taiwan-Llama-3-8B-Cooldown"
"yentinglin/Taiwan-Llama-3-8B"
"meta-llama/Meta-Llama-3-70B-Instruct"
"meta-llama/Meta-Llama-3-70B"
"meta-llama/Meta-Llama-3-8B-Instruct"
"meta-llama/Meta-Llama-3-8B"
"Qwen/Qwen1.5-110B-Chat"
"Qwen/Qwen1.5-110B"
"Qwen/Qwen1.5-32B"
"Qwen/Qwen1.5-32B-Chat"
"Qwen/Qwen1.5-72B-Chat"
"Qwen/Qwen1.5-72B"
"Qwen/Qwen1.5-MoE-A2.7B"
"Qwen/Qwen1.5-MoE-A2.7B-Chat"
"Qwen/Qwen1.5-4B"
"Qwen/Qwen1.5-4B-Chat"
"Qwen/Qwen1.5-0.5B"
"Qwen/Qwen1.5-0.5B-Chat"
"Qwen/Qwen1.5-1.8B"
"Qwen/Qwen1.5-7B"
"Qwen/Qwen1.5-14B"
"Qwen/Qwen1.5-14B-Chat"
"deepseek-ai/DeepSeek-V2-Chat"
"01-ai/Yi-34B-Chat"
"01-ai/Yi-1.5-34B"
"01-ai/Yi-1.5-34B-Chat"
"01-ai/Yi-1.5-34B-32K"
"01-ai/Yi-1.5-34B-Chat-16K"
"01-ai/Yi-1.5-9B-32K"
"01-ai/Yi-1.5-9B-Chat-16K"
"01-ai/Yi-1.5-9B"
"01-ai/Yi-1.5-9B-Chat"
"01-ai/Yi-1.5-6B"
"01-ai/Yi-1.5-6B-Chat"
"CohereForAI/c4ai-command-r-plus"
"CohereForAI/c4ai-command-r-v01"
"CohereForAI/aya-23-35B"
"CohereForAI/aya-23-8B"
"mistralai/Mixtral-8x22B-Instruct-v0.1"
"mistralai/Mixtral-8x22B-v0.1"
"mistralai/Mistral-7B-Instruct-v0.3"
"mistralai/Mistral-7B-v0.3"
"mistralai/Mistral-7B-Instruct-v0.2"
"mistralai/Mixtral-8x7B-Instruct-v0.1"
"mistralai/Mixtral-8x7B-v0.1"
"mistralai/Mistral-7B-v0.1"
"MediaTek-Research/Breeze-7B-32k-Instruct-v1_0"
"MediaTek-Research/Breeze-7B-Instruct-v0_1"
"MediaTek-Research/Breeze-7B-Base-v0_1"
"MediaTek-Research/Breeze-7B-Instruct-v1_0"
"MediaTek-Research/Breeze-7B-Base-v1_0"
"INX-TEXT/Bailong-instruct-7B"
"taide/Llama3-TAIDE-LX-8B-Chat-Alpha1"
"taide/TAIDE-LX-7B-Chat"
"taide/TAIDE-LX-7B"
@@ -35,7 +64,7 @@ declare -a models=(
SLURM_SCRIPT="harness_eval.slurm"
# Parameters for the script
PARAMS="tmlu,twllm_eval,tw_legal,ccp,pega,tmmluplus,mmlu,pega_mmlu"
PARAMS="tmlu,twllm_eval,tw_legal,ccp,pega,tmmluplus,mmlu,pega_mmlu,umtceval"
# Loop through each model and submit a job
for model in "${models[@]}"
......