Commit 948f120f authored by Baber's avatar Baber
Browse files

Merge branch 'main' into autobatchtest

# Conflicts:
#	lm_eval/models/huggingface.py
parents a5b1c7a8 bd80a6c0
"fewshot_split": "zho_Hans"
"include": "_default_template_yaml"
"task": "belebele_zho_Hans"
"test_split": "zho_Hans"
dataset_name: zho_Hans
fewshot_split: test
include: _default_template_yaml
task: belebele_zho_Hans
test_split: test
"fewshot_split": "zho_Hant"
"include": "_default_template_yaml"
"task": "belebele_zho_Hant"
"test_split": "zho_Hant"
dataset_name: zho_Hant
fewshot_split: test
include: _default_template_yaml
task: belebele_zho_Hant
test_split: test
"fewshot_split": "zsm_Latn"
"include": "_default_template_yaml"
"task": "belebele_zsm_Latn"
"test_split": "zsm_Latn"
dataset_name: zsm_Latn
fewshot_split: test
include: _default_template_yaml
task: belebele_zsm_Latn
test_split: test
"fewshot_split": "zul_Latn"
"include": "_default_template_yaml"
"task": "belebele_zul_Latn"
"test_split": "zul_Latn"
dataset_name: zul_Latn
fewshot_split: test
include: _default_template_yaml
task: belebele_zul_Latn
test_split: test
......@@ -4,48 +4,51 @@ task:
# ANLI R1
- group: anli_r1_flan
group_alias: ANLI R1
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: anli_r1
- task: anli_r1_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-7
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r1
- task: anli_r1_prompt-8
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
......@@ -53,48 +56,51 @@ task:
# ANLI R2
- group: anli_r2_flan
group_alias: ANLI R2
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: anli_r2
- task: anli_r2_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-7
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r2
- task: anli_r2_prompt-8
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
......@@ -102,48 +108,51 @@ task:
# ANLI R3
- group: anli_r3_flan
group_alias: ANLI R3
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: anli_r3
- task: anli_r3_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-7
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "Can we draw the following hypothesis from the context (see options)? \n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
- task: anli_r3
- task: anli_r3_prompt-8
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
......@@ -151,38 +160,41 @@ task:
# Arc Easy
- group: arc_easy_flan
group_alias: Arc Easy
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: arc_easy
- task: arc_easy_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
- task: arc_easy_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
- task: arc_easy_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
- task: arc_easy_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
- task: arc_easy_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
- task: arc_easy_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_easy
- task: arc_easy_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
......@@ -190,38 +202,41 @@ task:
# Arc Challenge
- group: arc_challenge_flan
group_alias: Arc Challenge
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: arc_challenge
- task: arc_challenge_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
- task: arc_challenge_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
- task: arc_challenge_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
- task: arc_challenge_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
- task: arc_challenge_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
- task: arc_challenge_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
- task: arc_challenge
- task: arc_challenge_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
......@@ -229,53 +244,56 @@ task:
# BoolQ
- group: boolq_flan
group_alias: BoolQ
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: boolq
- task: boolq_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nCan we conclude that {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nIs it true that {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\n{{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "Text: {{passage}}\n\nQuestion: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nWhat's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "{{passage}}\nBased on the above text what's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "{{passage}}\nAnswer this question making sure that the answer is supposed by the text: {{question}}?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-7
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-8
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
doc_to_target: "{{['no', 'yes'][label]}}"
- task: boolq
- task: boolq_prompt-9
task_alias: prompt-9
include: _held_in_template_yaml
doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{passage}}\n\nOPTIONS:\n- no\n- yes"
......@@ -283,48 +301,51 @@ task:
# RTE
- group: rte_flan
group_alias: RTE
aggregate_metric_list:
- metric: acc
weight_by_size: True
task:
- task: rte
- task: rte_prompt-0
task_alias: prompt-0
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\n\nQuestion with options: Based on the paragraph above can we conclude that \"{{sentence2}}\"?\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
- task: rte_prompt-1
task_alias: prompt-1
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\n\nBased on that paragraph can we conclude that the sentence below is true?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
  - task: rte_prompt-2
task_alias: prompt-2
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\n\nQ with options: Can we draw the following conclusion?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
- task: rte_prompt-3
task_alias: prompt-3
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\nDoes this next sentence follow, given the preceding text?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
- task: rte_prompt-4
task_alias: prompt-4
include: _held_in_template_yaml
doc_to_text: "{{sentence1}}\nOPTIONS:\n- yes\n- no\nQuestion: Can we infer the following?\n{{sentence2}}"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
- task: rte_prompt-5
task_alias: prompt-5
include: _held_in_template_yaml
doc_to_text: "Read the following paragraph and determine if the hypothesis is true. Select from options at the end:\n\n{{sentence1}}\n\nHypothesis: {{sentence2}}\nOPTIONS:\n- yes\n- no\nThe answer is"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
- task: rte_prompt-6
task_alias: prompt-6
include: _held_in_template_yaml
doc_to_text: "Read the text and determine if the sentence is true:\n\n{{sentence1}}\n\nSentence: {{sentence2}}\nOPTIONS:\n- yes\n- no\nA:"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
- task: rte_prompt-7
task_alias: prompt-7
include: _held_in_template_yaml
doc_to_text: "Question with options: can we draw the following hypothesis from the context? \n\nContext:\n\n{{sentence1}}\n\nHypothesis: {{sentence2}}\nOPTIONS:\n- yes\n- no\nA:"
doc_to_target: "{{['yes', 'no'][label]}}"
- task: rte
- task: rte_prompt-8
task_alias: prompt-8
include: _held_in_template_yaml
doc_to_text: "Determine if the sentence is true based on the text below. Choose from options.\n{{sentence2}}\n\n{{sentence1}}\nOPTIONS:\n- yes\n- no"
......
......@@ -7,3 +7,9 @@ task:
- minerva_math_num_theory
- minerva_math_prealgebra
- minerva_math_precalc
aggregate_metric_list:
- metric: exact_match
aggregation: mean
weight_by_size: true
metadata:
version: 1.0
......@@ -15,3 +15,7 @@ task:
task_alias: "professional_medicine (mmlu)"
- task: mmlu_college_biology
task_alias: "college_biology (mmlu)"
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: True
group: bertaqa
tag: bertaqa
dataset_path: HiTZ/BertaQA
dataset_name: null
validation_split: null
......
group: blimp
task:
- "blimp_adjunct_island"
- "blimp_anaphor_gender_agreement"
- "blimp_anaphor_number_agreement"
- "blimp_animate_subject_passive"
- "blimp_animate_subject_trans"
- "blimp_causative"
- "blimp_complex_NP_island"
- "blimp_coordinate_structure_constraint_complex_left_branch"
- "blimp_coordinate_structure_constraint_object_extraction"
- "blimp_determiner_noun_agreement_1"
- "blimp_determiner_noun_agreement_2"
- "blimp_determiner_noun_agreement_irregular_1"
- "blimp_determiner_noun_agreement_irregular_2"
- "blimp_determiner_noun_agreement_with_adj_2"
- "blimp_determiner_noun_agreement_with_adj_irregular_1"
- "blimp_determiner_noun_agreement_with_adj_irregular_2"
- "blimp_determiner_noun_agreement_with_adjective_1"
- "blimp_distractor_agreement_relational_noun"
- "blimp_distractor_agreement_relative_clause"
- "blimp_drop_argument"
- "blimp_ellipsis_n_bar_1"
- "blimp_ellipsis_n_bar_2"
- "blimp_existential_there_object_raising"
- "blimp_existential_there_quantifiers_1"
- "blimp_existential_there_quantifiers_2"
- "blimp_existential_there_subject_raising"
- "blimp_expletive_it_object_raising"
- "blimp_inchoative"
- "blimp_intransitive"
- "blimp_irregular_past_participle_adjectives"
- "blimp_irregular_past_participle_verbs"
- "blimp_irregular_plural_subject_verb_agreement_1"
- "blimp_irregular_plural_subject_verb_agreement_2"
- "blimp_left_branch_island_echo_question"
- "blimp_left_branch_island_simple_question"
- "blimp_matrix_question_npi_licensor_present"
- "blimp_npi_present_1"
- "blimp_npi_present_2"
- "blimp_only_npi_licensor_present"
- "blimp_only_npi_scope"
- "blimp_passive_1"
- "blimp_passive_2"
- "blimp_principle_A_c_command"
- "blimp_principle_A_case_1"
- "blimp_principle_A_case_2"
- "blimp_principle_A_domain_1"
- "blimp_principle_A_domain_2"
- "blimp_principle_A_domain_3"
- "blimp_principle_A_reconstruction"
- "blimp_regular_plural_subject_verb_agreement_1"
- "blimp_regular_plural_subject_verb_agreement_2"
- "blimp_sentential_negation_npi_licensor_present"
- "blimp_sentential_negation_npi_scope"
- "blimp_sentential_subject_island"
- "blimp_superlative_quantifiers_1"
- "blimp_superlative_quantifiers_2"
- "blimp_tough_vs_raising_1"
- "blimp_tough_vs_raising_2"
- "blimp_transitive"
- "blimp_wh_island"
- "blimp_wh_questions_object_gap"
- "blimp_wh_questions_subject_gap"
- "blimp_wh_questions_subject_gap_long_distance"
- "blimp_wh_vs_that_no_gap"
- "blimp_wh_vs_that_no_gap_long_distance"
- "blimp_wh_vs_that_with_gap"
- "blimp_wh_vs_that_with_gap_long_distance"
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: False
metadata:
version: 2.0
group: blimp
dataset_path: blimp
output_type: multiple_choice
validation_split: train
......
# CatalanBench
### Paper
CatalanBench is a benchmark for evaluating language models in Catalan tasks. This is, it evaluates the ability of a language model to understand and generate Catalan text. CatalanBench offers a combination of pre-existing, open datasets and datasets developed exclusively for this benchmark. All the details of CatalanBench will be published in a paper soon.
The new evaluation datasets included in CatalanBench are:
| Task | Category | Homepage |
|:-------------:|:-----:|:-----:|
| ARC_ca | Question Answering | https://huggingface.co/datasets/projecte-aina/arc_ca |
| MGSM_ca | Math | https://huggingface.co/datasets/projecte-aina/mgsm_ca |
| OpenBookQA_ca | Question Answering | https://huggingface.co/datasets/projecte-aina/openbookqa_ca |
| Parafraseja | Paraphrasing | https://huggingface.co/datasets/projecte-aina/Parafraseja |
| PIQA_ca | Question Answering | https://huggingface.co/datasets/projecte-aina/piqa_ca |
| SIQA_ca | Question Answering | https://huggingface.co/datasets/projecte-aina/siqa_ca |
| XStoryCloze_ca | Commonsense Reasoning | https://huggingface.co/datasets/projecte-aina/xstorycloze_ca |
The datasets included in CatalanBench that have been made public in previous publications are:
| Task | Category | Paper title | Homepage |
|:-------------:|:-----:|:-------------:|:-----:|
| Belebele_ca | Reading Comprehension | [The Belebele Benchmark: a Parallel Reading Comprehension Dataset in 122 Language Variants](https://arxiv.org/abs/2308.16884) | https://huggingface.co/datasets/facebook/belebele |
| caBREU | Summarization | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/caBreu |
| CatalanQA | Question Answering | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/catalanqa |
| CatCoLA | Linguistic Acceptability | CatCoLA: Catalan Corpus of Linguistic Acceptability | https://huggingface.co/datasets/nbel/CatCoLA |
| COPA-ca | Commonsense Reasoning | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/COPA-ca |
| CoQCat | Question Answering | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/CoQCat |
| FLORES_ca | Translation | [The FLORES-101 Evaluation Benchmark for Low-Resource and Multilingual Machine Translation](https://arxiv.org/abs/2106.03193) | https://huggingface.co/datasets/facebook/flores |
| PAWS-ca | Paraphrasing | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/PAWS-ca |
| TE-ca | Natural Language Inference | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/teca |
| VeritasQA_ca | Truthfulness | VeritasQA: A Truthfulness Benchmark Aimed at Multilingual Transferability | TBA |
| WNLI-ca | Natural Language Inference | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/wnli-ca |
| XNLI-ca | Natural Language Inference | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/xnli-ca |
| XQuAD-ca | Question Answering | [Building a Data Infrastructure for a Mid-Resource Language: The Case of Catalan](https://aclanthology.org/2024.lrec-main.231/) | https://huggingface.co/datasets/projecte-aina/xquad-ca |
### Citation
Paper for CatalanBench coming soon.
<!--```bibtex
@inproceedings{baucells-2024-iberobench,
title = "IberoBench: A Benchmark for LLM Evaluation in Iberian Languages",
author = "Baucells, Irene and
AUTHORS, ADD",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
year = "2024",
publisher = "Association for Computational Linguistics",
}
```
-->
### Groups and Tasks
#### Groups
- `catalan_bench`: All tasks included in CatalanBench.
- `flores_ca`: All FLORES translation tasks from or to Catalan.
#### Tags
- `cabreu`: Three CaBREU tasks for each type of summary (extractive, abstractive and extreme).
- `phrases_va`: Two Phrases_va tasks for language adaptation between Catalan and Valencian.
#### Tasks
The following tasks evaluate models on the CatalanBench dataset using various scoring methods.
- `arc_ca_challenge`
- `arc_ca_easy`
- `belebele_cat_Latn`
- `cabreu`
- `catalanqa`
- `catcola`
- `copa_ca`
- `coqcat`
- `flores_ca`
- `flores_ca-de`
- `flores_ca-en`
- `flores_ca-es`
- `flores_ca-eu`
- `flores_ca-fr`
- `flores_ca-gl`
- `flores_ca-it`
- `flores_ca-pt`
- `flores_de-ca`
- `flores_en-ca`
- `flores_es-ca`
- `flores_eu-ca`
- `flores_fr-ca`
- `flores_gl-ca`
- `flores_it-ca`
- `flores_pt-ca`
- `mgsm_direct_ca`
- `openbookqa_ca`
- `parafraseja`
- `paws_ca`
- `phrases_ca`
- `piqa_ca`
- `siqa_ca`
- `teca`
- `veritasqa_gen_ca`
- `veritasqa_mc1_ca`
- `veritasqa_mc2_ca`
- `wnli_ca`
- `xnli_ca`
- `xquad_ca`
- `xstorycloze_ca`
Some of these tasks are taken from benchmarks already available in LM Evaluation Harness. These are:
- `belebele_cat_Latn`: Belebele Catalan
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation?
* [ ] Yes, original implementation contributed by author of the benchmark
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
tag: arc_ca
dataset_path: projecte-aina/arc_ca
output_type: multiple_choice
training_split: null
validation_split: validation
test_split: test
doc_to_text: "Pregunta: {{question}}\nResposta:"
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
should_decontaminate: true
doc_to_decontamination_query: "Pregunta: {{question}}\nResposta:"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
tag: cabreu
dataset_path: projecte-aina/caBreu
dataset_name: null
output_type: generate_until
test_split: test
training_split: train
validation_split: validation
process_docs: !function utils.process_doc_cabreu
metric_list:
- metric: bleu
aggregation: bleu
higher_is_better: true
- metric: !function utils.rouge1
aggregation: !function utils.rouge1_agg
higher_is_better: true
metadata:
version: 1.0
task: arc_ca_challenge
dataset_name: ARC-Challenge
include: _arc_ca_common_yaml
task: arc_ca_easy
dataset_name: ARC-Easy
include: _arc_ca_common_yaml
include: _cabreu_common_yaml
task: cabreu_abstractive
description: "Examina el text següent i genera'n un resum abstractiu, expressant el significat del text original d'una manera més natural i concisa.\n"
doc_to_text: >-
Text: {{content}}
Resum:
doc_to_target: '{{summaries["abstractive"]["a1"]}}'
include: _cabreu_common_yaml
task: cabreu_extractive
description: "Examina el text següent i genera'n un resum extractiu, utilitzant les frases o oracions més rellevants del text original.\n"
doc_to_text: >-
Text: {{content}}
Resum:
doc_to_target: '{{summaries["extractive"]["a1"]}}'
include: _cabreu_common_yaml
task: cabreu_extreme
description: "Examina el text següent i genera'n un resum que sigui el més concís possible i que preservi el significat del text original.\n"
doc_to_text: >-
Text: {{content}}
Resum:
doc_to_target: '{{summaries["extreme"]["a1"]}}'
group: catalan_bench
task:
- belebele_cat_Latn
- xnli_ca
- catcola
- copa_ca
- openbookqa_ca
- parafraseja
- paws_ca
- piqa_ca
- siqa_ca
- teca
- wnli_ca
- arc_ca_easy
- arc_ca_challenge
- xstorycloze_ca
- xquad_ca
- catalanqa
- coqcat
- flores_ca
- cabreu
- mgsm_direct_ca
- phrases_va
metadata:
version: 1.0
task: catalanqa
dataset_path: projecte-aina/catalanqa
dataset_name: null
output_type: generate_until
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Context: {{context}}\n\nPregunta: {{question}}\n\nResposta:"
doc_to_target: '{{answers[0]["text"]}}'
target_delimiter: ' '
process_results: !function utils.process_results_qa
generation_kwargs:
until:
- "\n"
do_sample: false
temperature: 0.0
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
- metric: f1
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment