Commit 2184b8de authored by lintangsutawika

Merge branch 'cont-metrics' of https://github.com/EleutherAI/lm-evaluation-harness into alt_worlds

parents b1ba4e71 1522009c
"dataset_name": "ukr_Cyrl"
"include": "_default_template_yaml"
"task": "belebele_ukr_Cyrl"
"dataset_name": "urd_Arab"
"include": "_default_template_yaml"
"task": "belebele_urd_Arab"
"dataset_name": "urd_Latn"
"include": "_default_template_yaml"
"task": "belebele_urd_Latn"
"dataset_name": "uzn_Latn"
"include": "_default_template_yaml"
"task": "belebele_uzn_Latn"
"dataset_name": "vie_Latn"
"include": "_default_template_yaml"
"task": "belebele_vie_Latn"
"dataset_name": "war_Latn"
"include": "_default_template_yaml"
"task": "belebele_war_Latn"
"dataset_name": "wol_Latn"
"include": "_default_template_yaml"
"task": "belebele_wol_Latn"
"dataset_name": "xho_Latn"
"include": "_default_template_yaml"
"task": "belebele_xho_Latn"
"dataset_name": "yor_Latn"
"include": "_default_template_yaml"
"task": "belebele_yor_Latn"
"dataset_name": "zho_Hans"
"include": "_default_template_yaml"
"task": "belebele_zho_Hans"
"dataset_name": "zho_Hant"
"include": "_default_template_yaml"
"task": "belebele_zho_Hant"
"dataset_name": "zsm_Latn"
"include": "_default_template_yaml"
"task": "belebele_zsm_Latn"
"dataset_name": "zul_Latn"
"include": "_default_template_yaml"
"task": "belebele_zul_Latn"
 group: flan-cot
-output_type: greedy_until
+output_type: generate_until
 validation_split: validation
 doc_to_target: "{{answer}}"
 metric_list:
...
-output_type: greedy_until
+output_type: generate_until
 validation_split: validation
 metric_list:
 - metric: exact_match
...
@@ -9,4 +9,4 @@ task:
 - wsc
 - ai2_arc
 - blimp
-- hendrycksTest*
+- mmlu
@@ -6,7 +6,7 @@ task:
 use_prompt: promptsource:*
 training_split: train
 validation_split: validation
-output_type: greedy_until
+output_type: generate_until
 metric_list:
 - metric: exact_match
   aggregation: mean
@@ -19,7 +19,7 @@ task:
 use_prompt: promptsource:*
 training_split: train
 validation_split: validation
-output_type: greedy_until
+output_type: generate_until
 metric_list:
 - metric: exact_match
   aggregation: mean
@@ -32,7 +32,7 @@ task:
 use_prompt: promptsource:*
 training_split: train
 validation_split: validation
-output_type: greedy_until
+output_type: generate_until
 metric_list:
 - metric: exact_match
   aggregation: mean
@@ -44,7 +44,7 @@ task:
 use_prompt: promptsource:*
 training_split: train
 validation_split: validation
-output_type: greedy_until
+output_type: generate_until
 metric_list:
 - metric: exact_match
   aggregation: mean
@@ -56,7 +56,7 @@ task:
 use_prompt: promptsource:*
 training_split: train_r1
 validation_split: dev_r1
-output_type: greedy_until
+output_type: generate_until
 metric_list:
 - metric: exact_match
   aggregation: mean
@@ -68,7 +68,7 @@ task:
 use_prompt: promptsource:*
 training_split: train_r2
 validation_split: dev_r2
-output_type: greedy_until
+output_type: generate_until
 metric_list:
 - metric: exact_match
   aggregation: mean
@@ -80,7 +80,7 @@ task:
 use_prompt: promptsource:*
 training_split: train_r3
 validation_split: dev_r3
-output_type: greedy_until
+output_type: generate_until
 metric_list:
 - metric: exact_match
   aggregation: mean
@@ -93,7 +93,7 @@ task:
 use_prompt: promptsource:*
 training_split: train
 validation_split: validation
-output_type: greedy_until
+output_type: generate_until
 metric_list:
 - metric: exact_match
   aggregation: mean
@@ -105,7 +105,7 @@ task:
 use_prompt: promptsource:*
 training_split: train
 validation_split: validation
-output_type: greedy_until
+output_type: generate_until
 metric_list:
 - metric: exact_match
   aggregation: mean
@@ -118,7 +118,7 @@ task:
 use_prompt: promptsource:*
 training_split: train
 validation_split: validation
-output_type: greedy_until
+output_type: generate_until
 metric_list:
 - metric: exact_match
   aggregation: mean
...
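Every hunk above makes the same mechanical rename: the task option "output_type: greedy_until" becomes "output_type: generate_until", with no behavioral change to the tasks themselves. For local task YAMLs not touched by this commit, a hypothetical migration helper could apply the same substitution; the helper and the root path are assumptions, and only the two option names come from the diff.

    import pathlib

    # Hypothetical migration helper: rewrite any task YAML that still uses
    # the old output_type name. Not part of this commit.
    def migrate_output_type(root: str) -> None:
        for yaml_path in pathlib.Path(root).rglob("*.yaml"):
            text = yaml_path.read_text()
            if "output_type: greedy_until" in text:
                yaml_path.write_text(
                    text.replace(
                        "output_type: greedy_until",
                        "output_type: generate_until",
                    )
                )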
from textdistance import levenshtein
from transformers import AutoTokenizer

# Change this tokenizer to match the model you are evaluating.
# (max_new_tokens is a generation argument, not a tokenizer one, so it is
# not passed here.)
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-2.8b")


def token_edit_distance(references, predictions, **kwargs):
    # Tokenize the first reference and the first prediction, then compute
    # the Levenshtein edit distance over the resulting token-id sequences.
    ref_tokens = tokenizer.encode(references[0])
    pred_tokens = tokenizer.encode(predictions[0])
    return levenshtein.distance(ref_tokens, pred_tokens)
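A quick usage sketch for the metric above, assuming the module has been loaded. The exact score depends on the chosen tokenizer, so no specific value is asserted here.

    # Compare one reference/prediction pair; both arguments are lists
    # because the harness passes batched inputs.
    score = token_edit_distance(["the quick brown fox"], ["the quick red fox"])
    print(score)  # small integer: number of token edits between the strings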
@@ -175,8 +175,8 @@ all_subtasks = [
 def main() -> None:
     for path, task_type in zip(
-        ["multiple_choice", "greedy_until"],
-        ["multiple_choice_template_yaml", "greedy_until_template_yaml"],
+        ["multiple_choice", "generate_until"],
+        ["multiple_choice_template_yaml", "generate_until_template_yaml"],
     ):
         os.makedirs(path, exist_ok=True)
         for task in all_subtasks:
...
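After this hunk, the task generator writes its per-subtask YAMLs under a generate_until/ directory instead of greedy_until/. One way to sanity-check the renamed output, sketched as a hypothetical snippet (the directory names come from the diff; the check itself is not part of the commit):

    import os

    # After running main(), both per-output-type directories should exist.
    for path in ["multiple_choice", "generate_until"]:
        assert os.path.isdir(path), f"expected main() to create {path}/"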