add TED to BigBench and Brier score to MMLU

2b7d8c2d · lintangsutawika · 5cc65a79 · 2b7d8c2d · 2b7d8c2d · 2b7d8c2d
Commit 2b7d8c2d authored Nov 07, 2023 by lintangsutawika
3 changed files
--- a/lm_eval/tasks/bigbench/aux_metric.py
+++ b/lm_eval/tasks/bigbench/aux_metric.py
+from textdistance import levenshtein
+from transformers import AutoTokenizer
+
+# Change this tokenizer to match the model you are evaluating.
+tokenizer = AutoTokenizer.from_pretrained("EleutherAI/pythia-2.8b")
+
+def token_edit_distance(references, predictions, **kwargs):
+    """Return the Levenshtein distance between tokenized reference and prediction.
+
+    Only the first element of ``references`` and the first element of
+    ``predictions`` are used; any further elements are ignored. Both are
+    encoded with the module-level ``tokenizer`` and the distance is taken
+    over the two encoded sequences rather than over characters.
+
+    Args:
+        references: Indexable whose first item is the reference passed to
+            ``tokenizer.encode``.
+        predictions: Indexable whose first item is the prediction passed to
+            ``tokenizer.encode``.
+        **kwargs: Accepted but not used by this function.
+
+    Returns:
+        The result of ``levenshtein.distance`` on the two encoded sequences.
+    """
+    # NOTE(review): the result depends on which tokenizer is loaded at module
+    # level, so scores are only comparable across runs using the same one.
+    ref_tokens = tokenizer.encode(references[0])
+    pred_tokens = tokenizer.encode(predictions[0])
+    return levenshtein.distance(ref_tokens, pred_tokens)
--- a/lm_eval/tasks/bigbench/generate_until_template_yaml
+++ b/lm_eval/tasks/bigbench/generate_until_template_yaml
 group: bigbench
-dataset_path: bigbench # will switch to `hails/bigbench` when all tasks are pushed
+dataset_path: hails/bigbench # switched from `bigbench` to the `hails/bigbench` HF dataset
 output_type: generate_until
 dataset_kwargs:
  # num_shots: 0 # TODO: num of shots for `bigbench` HF dataset should be controlled through this, not through the typical methods
@@ -14,3 +14,6 @@ metric_list:
    aggregation: mean
    higher_is_better: true
    ignore_punctuation: true
+  - metric: !function aux_metric.token_edit_distance # pip install textdistance
+    aggregation: mean
+    higher_is_better: false
\ No newline at end of file
--- a/lm_eval/tasks/mmlu/default/_default_template_yaml
+++ b/lm_eval/tasks/mmlu/default/_default_template_yaml
@@ -15,3 +15,6 @@ metric_list:
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
+  - metric: brier_score
+    aggregation: mean
+    higher_is_better: false
\ No newline at end of file