Unverified commit cd71c7f0, authored by Yen-Ting Lin and committed by GitHub

clean for pr

parent a2af2101
@@ -20,9 +20,4 @@ ipython_config.py
 lm_eval/caching/.cache
 # don't track files created by wandb
 wandb
-examples/wandb
-evals/
-harness_eval_main_log.txt
-None/
-logs/
-eval_results/
\ No newline at end of file
+examples/wandb
\ No newline at end of file
#!/bin/bash
# this is a multi-node SLURM script using `accelerate` launcher
#SBATCH --job-name=eval-harness
#SBATCH --partition=no_multi,defq
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1 # crucial - only 1 task per node
#SBATCH --gres=gpu:8 # EDIT this if it's not 8-gpus per node
#SBATCH --exclusive
#SBATCH --output=logs/%x-%j.out
#SBATCH --error=logs/%x-%j.err
echo "START TIME: $(date)"
source ~/.bashrc
source activate harness
# auto-fail on any errors in this script
set -eo pipefail
# logging script's variables/commands for future debug needs
set -x
# EDIT the conda env and any startup scripts
# source /path/to/start-xxx-user # if you have something to preload before the job
# conda activate stas-xxx # if you have conda env to activate
LOG_PATH="harness_eval_main_log.txt"
# EDIT if it's not 8-gpus per node
GPUS_PER_NODE=8
NNODES=$SLURM_NNODES
NUM_PROCESSES=$(expr $NNODES \* $GPUS_PER_NODE)
# define the node 0 hostname:port
MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
MASTER_PORT=6000
# note: we don't want `\$SLURM_PROCID` interpolated until `srun` time, since otherwise
# all nodes will get 0 and the launcher will hang
#
# same goes for `\$(hostname -s|tr -dc '0-9')` - we want it to interpolate at `srun` time
LAUNCHER=""
model=$1
tasks=$2
echo "MODEL: $model"
echo "TASKS: $tasks"
# EDIT the path+name of the python script and whatever args it needs
export PROGRAM="\
lm_eval \
--model hf \
--model_args pretrained=$model,parallelize=True,trust_remote_code=True \
--tasks $tasks \
--num_fewshot 0 \
--batch_size 8 \
--output_path eval_results \
--write_out \
--verbosity DEBUG \
--wandb_args project=lm-eval-harness-integration,job_type=eval,name=$model \
--hf_hub_log_args hub_results_org=yentinglin,hub_repo_name=lm-eval-results,push_results_to_hub=True,public_repo=False \
--seed 42 \
--trust_remote_code \
"
export CMD="$LAUNCHER $PROGRAM"
echo $CMD
# EDIT if you want to redirect /tmp to /scratch (some local SSD path) since /tmp is tiny on compute nodes
# export TMPDIR=/scratch
# EDIT: useful for debug if needed
#
# to debug NCCL issues
# export NCCL_DEBUG=INFO
#
# to unravel async errors without the correct traceback - can make everything much slower
# export CUDA_LAUNCH_BLOCKING=1
#
# to force crashing on nccl issues like hanging broadcast
# export NCCL_ASYNC_ERROR_HANDLING=1
# srun error handling:
# --wait=60: wait 60 sec after the first task terminates before terminating all remaining tasks
# --kill-on-bad-exit=1: terminate a step if any task exits with a non-zero exit code
SRUN_ARGS=" \
--wait=60 \
--kill-on-bad-exit=1 \
--jobid $SLURM_JOB_ID \
"
# bash -c is needed for the delayed interpolation of env vars to work
srun $SRUN_ARGS bash -c "$CMD" 2>&1 | tee -a $LOG_PATH
echo "END TIME: $(date)"
dataset_path: Jimmy2005/CCPEval
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.1
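To make the prompt format concrete, here is a minimal sketch of how the `doc_to_text` template above renders for one document; the sample question and choices are hypothetical, and `jinja2` is assumed to be installed (lm-eval evaluates the same Jinja2 template internally).

from jinja2 import Template

# Hypothetical document, mirroring the fields available after utils.process_docs.
doc = {
    "question": "下列何者為強酸?",
    "choices": ["醋酸", "鹽酸", "碳酸", "硼酸"],
}

prompt = Template(
    "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}"
    "\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
).render(**doc)
print(prompt)  # the question, four lettered options, then "Answer:"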
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import argparse
import os
import pandas as pd
import yaml
from tqdm import tqdm
categories = {
"chemical_engineering": [
"chemical_engineering_en",
"chemical_engineering_tw",
],
"identity": [
"self",
],
"truthful_qa": [
"taiwan",
],
}
task_list = [
"chemical_engineering_en",
"chemical_engineering_tw",
"self",
"taiwan",
]
subject2name = {}
SUBJECTS = {}
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_yaml_path", default="_default_template_yaml")
parser.add_argument("--save_prefix_path", default="ccp")
parser.add_argument("--cot_prompt_path", default=None)
parser.add_argument("--task_prefix", default="")
parser.add_argument("--group_prefix", default="")
parser.add_argument("--subject_file", default="../subject.tsv")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
from pathlib import Path
# Initialization
SUBJECT_FILE = Path(__file__).parent / Path(args.subject_file)
df = pd.read_csv(SUBJECT_FILE, delimiter="\t")
for _, row in df.iterrows():
for _c in categories:
if row["subject"] in SUBJECTS:
raise ValueError(f"Duplicate tasks. {row['subject']} already exists.")
if row["category"] in categories[_c]: # append new item into SUBJECTS
SUBJECTS[row["subject"]] = _c
subject2name[row["subject"]] = row["name"]
break
# End of SUBJECTS initialization
# get filename of base_yaml so we can `"include": ` it in our "other" YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path) as f:
base_yaml = yaml.full_load(f)
if args.cot_prompt_path is not None:
import json
with open(args.cot_prompt_path) as f:
cot_file = json.load(f)
ALL_CATEGORIES = []
for subject, category in tqdm(SUBJECTS.items()):
if category not in ALL_CATEGORIES:
ALL_CATEGORIES.append(category)
if args.cot_prompt_path is not None:
description = cot_file[subject]
else:
name_of_subject = subject2name[subject].replace("_", " ")
description = f"以下為{name_of_subject}的單選題,請提供正確答案的選項。\n\n"
# description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n"
yaml_dict = {
"include": base_yaml_name,
"group": f"ccp_{args.task_prefix}_{category}"
if args.task_prefix != ""
else f"ccp_{category}",
"group_alias": category.replace("_", " "),
"task": f"ccp_{args.task_prefix}_{subject}"
if args.task_prefix != ""
else f"ccp_{subject}",
"task_alias": subject.replace("_", " "),
"dataset_name": subject,
"description": description,
}
file_save_path = args.save_prefix_path + f"_{subject}.yaml"
# eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
# width=float("inf"),
allow_unicode=True,
default_style='"',
)
if args.task_prefix != "":
mmlu_subcategories = [
f"ccp_{args.task_prefix}_{category}" for category in ALL_CATEGORIES
]
else:
mmlu_subcategories = [f"ccp_{category}" for category in ALL_CATEGORIES]
if args.group_prefix != "":
file_save_path = args.group_prefix + ".yaml"
else:
file_save_path = args.save_prefix_path + ".yaml"
# eval_logger.info(f"Saving benchmark config to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
yaml.dump(
{
"group": f"ccp_{args.task_prefix}"
if args.task_prefix != ""
else "ccp",
"task": mmlu_subcategories,
},
yaml_file,
indent=4,
default_flow_style=False,
)
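Running the generator (its filename is not shown in this view; the harness convention is `_generate_configs.py`) from the task directory, with `_default_template_yaml` and `../subject.tsv` in place, writes one `ccp_<subject>.yaml` per row of the subject file plus the benchmark-level group file shown next.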
group: ccp
task:
- ccp_chemical_engineering
- ccp_truthful_qa
- ccp_identity
"dataset_name": "CCP"
"description": "以下為長春集團的單選題,請提供正確答案的選項。\n\n"
"group": "ccp_identity"
"group_alias": "identity"
"include": "_default_template_yaml"
"task": "ccp_CCP"
"task_alias": "CCP"
"dataset_name": "chemical_engineering_en"
"description": "以下為chemical engineering的單選題,請提供正確答案的選項。\n\n"
"group": "ccp_chemical_engineering"
"group_alias": "chemical engineering"
"include": "_default_template_yaml"
"task": "ccp_chemical_engineering_en"
"task_alias": "chemical engineering en"
"dataset_name": "chemical_engineering_tw"
"description": "以下為化工的單選題,請提供正確答案的選項。\n\n"
"group": "ccp_chemical_engineering"
"group_alias": "chemical engineering"
"include": "_default_template_yaml"
"task": "ccp_chemical_engineering_tw"
"task_alias": "chemical engineering tw"
"dataset_name": "taiwan"
"description": "以下為台灣的單選題,請提供正確答案的選項。\n\n"
"group": "ccp_truthful_qa"
"group_alias": "truthful qa"
"include": "_default_template_yaml"
"task": "ccp_taiwan"
"task_alias": "taiwan"
import datasets


def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    def _helper(doc):
        # modifies the contents of a single document in our dataset:
        # gather the four options into a `choices` list and record the
        # index of the gold answer as `goal`; `map` merges these new keys
        # with the original columns, so `question` and `answer` remain
        # available to the YAML template.
        answer_list = ["A", "B", "C", "D"]
        choices = [doc["A"], doc["B"], doc["C"], doc["D"]]
        out_doc = {
            "questions": doc["question"],
            "choices": choices,
            "goal": answer_list.index(doc["answer"]),
        }
        return out_doc

    return dataset.map(_helper)  # returns back a datasets.Dataset object
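A quick way to sanity-check `process_docs` is to run it on a tiny in-memory dataset. The row below is hypothetical but mirrors the column layout the task expects (`question`, `A`-`D`, `answer`):

import datasets

from utils import process_docs  # assumes this file is importable as `utils`

raw = datasets.Dataset.from_dict(
    {
        "question": ["下列何者為強酸?"],
        "A": ["醋酸"],
        "B": ["鹽酸"],
        "C": ["碳酸"],
        "D": ["硼酸"],
        "answer": ["B"],
    }
)
processed = process_docs(raw)
print(processed[0]["choices"])  # ['醋酸', '鹽酸', '碳酸', '硼酸']
print(processed[0]["goal"])     # 1, i.e. answer_list.index("B")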
subject	name	category
chemical_engineering_en	chemical engineering	chemical_engineering_en
chemical_engineering_tw	化工	chemical_engineering_tw
taiwan	台灣	taiwan
CCP	長春集團	self
\ No newline at end of file
dataset_path: yentinglin/PegaEval
test_split: train
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.1
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import argparse
import os
import pandas as pd
import yaml
from tqdm import tqdm
categories = {
"software": [
"default",
],
}
task_list = [
"default",
]
subject2name = {}
SUBJECTS = {}
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_yaml_path", default="_default_template_yaml")
parser.add_argument("--save_prefix_path", default="pega")
parser.add_argument("--cot_prompt_path", default=None)
parser.add_argument("--task_prefix", default="")
parser.add_argument("--group_prefix", default="")
parser.add_argument("--subject_file", default="../subject.tsv")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
from pathlib import Path
# Initialization
SUBJECT_FILE = Path(__file__).parent / Path(args.subject_file)
df = pd.read_csv(SUBJECT_FILE, delimiter="\t")
for _, row in df.iterrows():
for _c in categories:
if row["subject"] in SUBJECTS:
raise ValueError(f"Duplicate tasks. {row['subject']} already exists.")
if row["category"] in categories[_c]: # append new item into SUBJECTS
SUBJECTS[row["subject"]] = _c
subject2name[row["subject"]] = row["name"]
break
# End of SUBJECTS initialization
# get filename of base_yaml so we can `"include": ` it in our "other" YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path) as f:
base_yaml = yaml.full_load(f)
if args.cot_prompt_path is not None:
import json
with open(args.cot_prompt_path) as f:
cot_file = json.load(f)
ALL_CATEGORIES = []
for subject, category in tqdm(SUBJECTS.items()):
if category not in ALL_CATEGORIES:
ALL_CATEGORIES.append(category)
if args.cot_prompt_path is not None:
description = cot_file[subject]
else:
name_of_subject = subject2name[subject].replace("_", " ")
description = f"以下為{name_of_subject}的單選題,請提供正確答案的選項。\n\n"
# description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n"
yaml_dict = {
"include": base_yaml_name,
"group": f"pega_{args.task_prefix}_{category}"
if args.task_prefix != ""
else f"pega_{category}",
"group_alias": category.replace("_", " "),
"task": f"pega_{args.task_prefix}_{subject}"
if args.task_prefix != ""
else f"pega_{subject}",
"task_alias": subject.replace("_", " "),
"dataset_name": subject,
"description": description,
}
file_save_path = args.save_prefix_path + f"_{subject}.yaml"
# eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
# width=float("inf"),
allow_unicode=True,
default_style='"',
)
if args.task_prefix != "":
mmlu_subcategories = [
f"pega_{args.task_prefix}_{category}" for category in ALL_CATEGORIES
]
else:
mmlu_subcategories = [f"pega_{category}" for category in ALL_CATEGORIES]
if args.group_prefix != "":
file_save_path = args.group_prefix + ".yaml"
else:
file_save_path = args.save_prefix_path + ".yaml"
# eval_logger.info(f"Saving benchmark config to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
yaml.dump(
{
"group": f"pega_{args.task_prefix}"
if args.task_prefix != ""
else "pega",
"task": mmlu_subcategories,
},
yaml_file,
indent=4,
default_flow_style=False,
)
group: pega
task:
- pega_software
"dataset_name": "default"
"description": "以下為軟體的單選題,請提供正確答案的選項。\n\n"
"group": "pega_software"
"group_alias": "software"
"include": "_default_template_yaml"
"task": "pega_default"
"task_alias": "default"
import datasets


def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    def _helper(doc):
        # modifies the contents of a single
        # document in our dataset.
        answer_list = ["A", "B", "C", "D"]
        choices = [doc["A"], doc["B"], doc["C"], doc["D"]]
        out_doc = {
            "questions": doc["question"],
            "choices": choices,
            "goal": answer_list.index(doc["answer"]),
        }
        return out_doc

    return dataset.map(_helper)  # returns back a datasets.Dataset object
subject	name	category
default	軟體	default
dataset_path: kennyponpon/pega_llm_mmlu_test
test_split: test
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.1
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import argparse
import os
import pandas as pd
import yaml
from tqdm import tqdm
categories = {
"law": [
"default",
],
}
task_list = [
"default",
]
subject2name = {}
SUBJECTS = {}
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_yaml_path", default="_default_template_yaml")
parser.add_argument("--save_prefix_path", default="pega_mmlu")
parser.add_argument("--cot_prompt_path", default=None)
parser.add_argument("--task_prefix", default="")
parser.add_argument("--group_prefix", default="")
parser.add_argument("--subject_file", default="../subject.tsv")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
from pathlib import Path
# Initialization
SUBJECT_FILE = Path(__file__).parent / Path(args.subject_file)
df = pd.read_csv(SUBJECT_FILE, delimiter="\t")
for _, row in df.iterrows():
for _c in categories:
if row["subject"] in SUBJECTS:
raise ValueError(f"Duplicate tasks. {row['subject']} already exists.")
if row["category"] in categories[_c]: # append new item into SUBJECTS
SUBJECTS[row["subject"]] = _c
subject2name[row["subject"]] = row["name"]
break
# End of SUBJECTS initialization
# get filename of base_yaml so we can `"include": ` it in our "other" YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path) as f:
base_yaml = yaml.full_load(f)
if args.cot_prompt_path is not None:
import json
with open(args.cot_prompt_path) as f:
cot_file = json.load(f)
ALL_CATEGORIES = []
for subject, category in tqdm(SUBJECTS.items()):
if category not in ALL_CATEGORIES:
ALL_CATEGORIES.append(category)
if args.cot_prompt_path is not None:
description = cot_file[subject]
else:
name_of_subject = subject2name[subject].replace("_", " ")
description = f"以下為{name_of_subject}的單選題,請提供正確答案的選項。\n\n"
# description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n"
yaml_dict = {
"include": base_yaml_name,
"group": f"pega_mmlu_{args.task_prefix}_{category}"
if args.task_prefix != ""
else f"pega_mmlu_{category}",
"group_alias": category.replace("_", " "),
"task": f"pega_mmlu_{args.task_prefix}_{subject}"
if args.task_prefix != ""
else f"pega_mmlu_{subject}",
"task_alias": subject.replace("_", " "),
"dataset_name": subject,
"description": description,
}
file_save_path = args.save_prefix_path + f"_{subject}.yaml"
# eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
# width=float("inf"),
allow_unicode=True,
default_style='"',
)
if args.task_prefix != "":
mmlu_subcategories = [
f"pega_mmlu_{args.task_prefix}_{category}" for category in ALL_CATEGORIES
]
else:
mmlu_subcategories = [f"pega_mmlu_{category}" for category in ALL_CATEGORIES]
if args.group_prefix != "":
file_save_path = args.group_prefix + ".yaml"
else:
file_save_path = args.save_prefix_path + ".yaml"
# eval_logger.info(f"Saving benchmark config to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
yaml.dump(
{
"group": f"pega_mmlu_{args.task_prefix}"
if args.task_prefix != ""
else "pega_mmlu",
"task": mmlu_subcategories,
},
yaml_file,
indent=4,
default_flow_style=False,
)
group: pega_mmlu
task:
- pega_mmlu_law
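Once the generated YAMLs are in place, the three benchmarks can be evaluated together with the same entry point used in the SLURM script above, e.g. `lm_eval --model hf --model_args pretrained=<model> --tasks ccp,pega,pega_mmlu` (adding `--include_path <task_dir>` if these task files live outside the harness's built-in task tree).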