Unverified Commit 9822b06e authored by Lintang Sutawika's avatar Lintang Sutawika Committed by GitHub
Browse files

Merge branch 'main' into weight_by_size

parents 51f27158 b177c82c
include: _arc_yaml
task: arc_te
dataset_path: alexandrainst/m_arc
dataset_name: te
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_uk
dataset_path: alexandrainst/m_arc
dataset_name: uk
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_vi
dataset_path: alexandrainst/m_arc
dataset_name: vi
training_split: train
validation_split: validation
test_split: test
include: _arc_yaml
task: arc_zh
dataset_path: alexandrainst/m_arc
dataset_name: zh
training_split: train
validation_split: validation
test_split: test
import re
import datasets
def preprocess(text):
    """Normalize a raw instruction/option string for prompt construction.

    Returns a single space when *text* is None so downstream string
    concatenation never fails. Otherwise strips surrounding whitespace,
    turns " [title]" section markers into sentence breaks, removes any
    remaining bracketed annotations (e.g. "[ref]"), and collapses the
    double spaces those removals leave behind.
    """
    if text is None:
        return " "
    text = text.strip()
    text = text.replace(" [title]", ". ")
    # Raw string for the regex: non-greedily drop bracketed markup.
    text = re.sub(r"\[.*?\]", "", text)
    # BUG FIX: the original replaced a single space with a single space
    # (a no-op); the intent is to collapse the double spaces produced by
    # the substitutions above.
    text = text.replace("  ", " ")
    return text
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    """Map raw m_arc rows to the harness's multiple-choice schema.

    Each row is rewritten to carry an ``id``, a "Question: ...\nAnswer:"
    ``query`` built from the instruction, the five preprocessed answer
    ``choices`` (options a-e), and ``gold``, the 0-based index of the
    letter in the row's ``answer`` field.
    """
    labels = ["A", "B", "C", "D", "E"]

    def _process_doc(doc):
        # NOTE: removed a leftover commented-out breakpoint() debug call.
        return {
            "id": doc["id"],
            "query": "Question: " + preprocess(doc["instruction"]) + "\nAnswer:",
            "choices": [
                preprocess(doc[f"option_{letter}"]) for letter in "abcde"
            ],
            "gold": labels.index(doc["answer"]),
        }

    return dataset.map(_process_doc)
import datasets
import re
import datasets
def preprocess(text):
text = text.strip()
......
group:
- m_mmlu
dataset_path: alexandrainst/m_mmlu
test_split: test
fewshot_split: train
fewshot_config:
sampler: first_n
output_type: multiple_choice
doc_to_text: "{{instruction.strip()}}\nA. {{option_a}}\nB. {{option_b}}\nC. {{option_c}}\nD. {{option_d}}\nAnswer:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: answer
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
version: 0.0
import datasets
import yaml
from tqdm import tqdm
def main() -> None:
    """Generate one ``m_mmlu_<subset>.yaml`` task config per dataset subset.

    Queries the Hub for all subsets of ``alexandrainst/m_mmlu`` and writes
    a minimal config (include/task/dataset_name) for each, skipping files
    that already exist.
    """
    dataset_path = "alexandrainst/m_mmlu"
    for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()):
        file_name = f"m_mmlu_{task}.yaml"
        try:
            # BUG FIX: mode "w" can never raise FileExistsError, so the
            # handler below was dead and existing configs were silently
            # overwritten; "x" creates the file only if it does not exist,
            # matching the skip-if-present intent.
            with open(file_name, "x") as f:
                f.write("# Generated by _generate_configs.py\n")
                yaml.dump(
                    {
                        "include": "_default_yaml",
                        "task": f"{dataset_path.split('/')[-1]}_{task}",
                        "dataset_name": task,
                    },
                    f,
                )
        except FileExistsError:
            pass


if __name__ == "__main__":
    main()
# Generated by _generate_configs.py
dataset_name: ar
include: _default_yaml
task: m_mmlu_ar
# Generated by _generate_configs.py
dataset_name: bn
include: _default_yaml
task: m_mmlu_bn
# Generated by _generate_configs.py
dataset_name: ca
include: _default_yaml
task: m_mmlu_ca
# Generated by _generate_configs.py
dataset_name: da
include: _default_yaml
task: m_mmlu_da
# Generated by _generate_configs.py
dataset_name: de
include: _default_yaml
task: m_mmlu_de
# Generated by _generate_configs.py
dataset_name: en
include: _default_yaml
task: m_mmlu_en
# Generated by _generate_configs.py
dataset_name: es
include: _default_yaml
task: m_mmlu_es
# Generated by _generate_configs.py
dataset_name: eu
include: _default_yaml
task: m_mmlu_eu
# Generated by _generate_configs.py
dataset_name: fr
include: _default_yaml
task: m_mmlu_fr
# Generated by _generate_configs.py
dataset_name: gu
include: _default_yaml
task: m_mmlu_gu
# Generated by _generate_configs.py
dataset_name: hi
include: _default_yaml
task: m_mmlu_hi
# Generated by _generate_configs.py
dataset_name: hr
include: _default_yaml
task: m_mmlu_hr
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment