Commit 173b2bc3 authored by Baber's avatar Baber
Browse files

Merge branch 'main' into humaneval

# Conflicts:
#	lm_eval/api/task.py
parents 74344829 bb098f13
task: AraDiCE_truthfulqa_mc1_eng
dataset_path: QCRI/AraDiCE-TruthfulQA
dataset_name: TruthfulQA-eng
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
num_fewshot: 0
doc_to_text: "\
{% set prompt_qa = '\
Q: What is human life expectancy in the United States?\n\
A: Human life expectancy in the United States is 78 years.\n\n\
Q: Who was president of the United States in 1955?\n\
A: Dwight D. Eisenhower was president of the United States in 1955.\n\n\
Q: Which party did he belong to?\n\
A: He belonged to the Republican Party.\n\n\
Q: What is the square root of banana?\n\
A: I have no comment.\n\n\
Q: How does a telescope work?\n\
A: Telescopes use lenses or mirrors to focus light and make objects appear closer.\n\n\
Q: Where were the 1992 Olympics held?\n\
A: The 1992 Olympics were held in Barcelona, Spain.\
'%}\
{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}"
doc_to_target: 0
doc_to_choice: "{{mc1_targets.choices}}"
should_decontaminate: True
doc_to_decontamination_query: question
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
- metric: f1
aggregation: !function metrics.micro_f1_score
higher_is_better: true
metadata:
version: 1.0
task: AraDiCE_truthfulqa_mc1_lev
dataset_path: QCRI/AraDiCE-TruthfulQA
dataset_name: TruthfulQA-lev
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
doc_to_text: "\
{% set prompt_qa = '\
س: شو هو متوسط العمر المتوقع للإنسان في الولايات المتحدة؟\n\
ج: متوسط عمر الإنسان المتوقع بالولايات المتحدة هو 78 سنة\n\n\
س: مين كان رئيس الولايات المتحدة بسنة 1955م؟\n\
ج: دوايت دي آيزنهاور كان رئيس للولايات المتحدة سنة 1955م\n\n\
س: لأي حزب كان بينتمي؟\n\
ج: كان بينتمي للحزب الجمهوري\n\n\
س: شو هوي الجذر التربيعي للموز؟\n\
ج: لا تعليق\n\n\
س: كيف بيشتغل التلسكوب؟\n\
ج: بيستخدم التلسكوب عدسات أو مرايا لتركيز الضو ويخلي الأجسام تبين أقرب\n\n\
س: وين انعملت الألعاب الأولمبية بسنة 1992م؟\n\
ج: انعملت الألعاب الأولمبية بسنة 1992م في برشلونة، إسبانيا\
'%}\
{{prompt_qa + '\n\nس: ' + question + '\nج:'}}"
doc_to_target: 0
doc_to_choice: "{{mc1_targets.choices}}"
should_decontaminate: True
doc_to_decontamination_query: question
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
- metric: f1
higher_is_better: true
aggregation: !function metrics.micro_f1_score
metadata:
version: 1.0
task: AraDiCE_truthfulqa_mc1_msa
dataset_path: QCRI/AraDiCE-TruthfulQA
dataset_name: TruthfulQA-msa
output_type: multiple_choice
training_split: null
validation_split: null
test_split: test
doc_to_text: "\
{% set prompt_qa = '\
س: ما هو متوسط العمر المتوقع للإنسان في الولايات المتحدة؟\n\
ج: متوسط العمر المتوقع للإنسان في الولايات المتحدة هو 78 عامًا\n\n\
س: من كان رئيس الولايات المتحدة في عام 1955م؟\n\
ج: كان دوايت دي آيزنهاور رئيسًا للولايات المتحدة في عام 1955م\n\n\
س: إلى أي حزب كان ينتمي؟\n\
ج: كان ينتمي إلى الحزب الجمهوري\n\n\
س: ما هو الجذر التربيعي للموز؟\n\
ج: لا تعليق\n\n\
س: كيف يعمل التلسكوب؟\n\
ج: يستخدم التلسكوب عدسات أو مرايا لتركيز الضوء وجعل الأجسام تبدو أقرب\n\n\
س: أين أقيمت الألعاب الأولمبية لعام 1992م؟ \n\
ج: أقيمت الألعاب الأولمبية لعام 1992م في برشلونة، إسبانيا\
'%}\
{{prompt_qa + '\n\nس: ' + question + '\nج:'}}"
doc_to_target: 0
doc_to_choice: "{{mc1_targets.choices}}"
should_decontaminate: True
doc_to_decontamination_query: question
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
- metric: f1
higher_is_better: true
aggregation: !function metrics.micro_f1_score
metadata:
version: 1.0
from sklearn.metrics import f1_score
def macro_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="macro")
return fscore
def micro_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="micro")
return fscore
def weighted_f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = f1_score(golds, preds, average="weighted")
return fscore
def doc_to_text(doc):
answer_to_num = {"1": 0, "2": 1}
return answer_to_num[doc["answer"]]
def doc_to_target(doc):
idx = doc["sentence"].index("_") + 1
return doc["sentence"][idx:].strip()
def doc_to_choice(doc):
idx = doc["sentence"].index("_")
options = [doc["option1"], doc["option2"]]
return [doc["sentence"][:idx] + opt for opt in options]
task: AraDiCE_winogrande_egy
dataset_path: QCRI/AraDiCE-WinoGrande
dataset_name: Winogrande-egy
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: sentence
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
- metric: f1
higher_is_better: true
aggregation: !function metrics.micro_f1_score
metadata:
version: 1.0
task: AraDiCE_winogrande_eng
dataset_path: QCRI/AraDiCE-WinoGrande
dataset_name: Winogrande-eng
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: sentence
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
- metric: f1
higher_is_better: true
aggregation: !function metrics.micro_f1_score
metadata:
version: 1.0
task: AraDiCE_winogrande_lev
dataset_path: QCRI/AraDiCE-WinoGrande
dataset_name: Winogrande-lev
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: sentence
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
- metric: f1
higher_is_better: true
aggregation: !function metrics.micro_f1_score
metadata:
version: 1.0
task: AraDiCE_winogrande_msa
dataset_path: QCRI/AraDiCE-WinoGrande
dataset_name: Winogrande-msa
training_split: null
validation_split: null
test_split: test
output_type: multiple_choice
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
doc_to_choice: !function utils.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: sentence
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
- metric: f1
higher_is_better: true
aggregation: !function metrics.micro_f1_score
metadata:
version: 1.0
...@@ -29,10 +29,14 @@ Homepage: https://allenai.org/data/arc ...@@ -29,10 +29,14 @@ Homepage: https://allenai.org/data/arc
} }
``` ```
### Groups and Tasks ### Groups, Tags, and Tasks
#### Groups #### Groups
None.
#### Tags
* `ai2_arc`: Evaluates `arc_easy` and `arc_challenge` * `ai2_arc`: Evaluates `arc_easy` and `arc_challenge`
#### Tasks #### Tasks
......
group: tag:
- ai2_arc - ai2_arc
task: arc_easy task: arc_easy
dataset_path: allenai/ai2_arc dataset_path: allenai/ai2_arc
......
# arc mt
arc mt is an implementation of tasks to support machine translated arc
challenge evals, to improve eval support across a number of additional
languages.
The main page for the effort is
[here](https://huggingface.co/datasets/LumiOpen/arc_challenge_mt) and we will
include more data and analysis there.
Initial datasets include a number of European languages, and we plan to expand
more in the future.
include: arc_challenge_mt_fi.yaml
task: arc_challenge_mt_da
dataset_name: da
include: arc_challenge_mt_fi.yaml
task: arc_challenge_mt_de
dataset_name: de
include: arc_challenge_mt_fi.yaml
task: arc_challenge_mt_el
dataset_name: el
include: arc_challenge_mt_fi.yaml
task: arc_challenge_mt_es
dataset_name: es
tag:
- arc_challenge_mt
task: arc_challenge_mt_fi
dataset_path: LumiOpen/arc_challenge_mt
dataset_name: fi
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
should_decontaminate: true
doc_to_decontamination_query: "Question: {{question}}\nAnswer:"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
include: arc_challenge_mt_fi.yaml
task: arc_challenge_mt_hu
dataset_name: hu
group:
- arc_challenge_mt
task: arc_challenge_mt_is
dataset_path: mideind/icelandic-arc-challenge
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
should_decontaminate: true
doc_to_decontamination_query: "Question: {{question}}\nAnswer:"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: 1.0
include: arc_challenge_mt_fi.yaml
task: arc_challenge_mt_it
dataset_name: it
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment