remove direct util, update common yaml

3eab4b9a · JessicaOjo · 21fb0db7
Commit 3eab4b9a authored May 10, 2024 by JessicaOjo
Showing 2 changed files with 31 additions and 17 deletions

lm_eval/tasks/afrimgsm/direct/afrimgsm_common_yaml +31 -0

lm_eval/tasks/afrimgsm/direct/utils.py +0 -17

No files found.
--- a/lm_eval/tasks/afrimgsm/direct/afrimgsm_common_yaml
+++ b/lm_eval/tasks/afrimgsm/direct/afrimgsm_common_yaml
+group: mgsm_direct
+task: afrimgsm_direct
+dataset_path: masakhane/afrimgsm
+output_type: generate_until
+training_split: train
+test_split: test
+fewshot_split: train
+target_delimiter: ""
+doc_to_target: '{% if answer is not none %}{{answer}}{% else %}{{answer_number|string}}{% endif %}'
+doc_to_text: '{% if answer is not none %}{{question+"\nAnswer:"}}{% else %}{{"Question: "+question+"\nAnswer:"}}{% endif %}'
+generation_kwargs:
+  do_sample: false
+  until:
+  - 'Question:'
+  - </s>
+  - <|im_end|>
+filter_list:
+  - filter:
+    - function: regex-numbers
+      group_select: -1
+      regex_pattern: (-?[0-9.,]{2,})|(-?[0-9]+)
+    - function: take_first
+    name: flexible-extract
+metric_list:
+  - metric: squad
+    aggregation: squad_f1
+    average: weighted
+    hf_evaluate: False
+    higher_is_better: True
+metadata:
+  version: 1.0
--- a/lm_eval/tasks/afrimgsm/direct/utils.py
+++ b/lm_eval/tasks/afrimgsm/direct/utils.py
-import evaluate
-
-
-def squad_f1(items):
-    unzipped_list = list(zip(*items))
-    print(unzipped_list)
-    ref_squad, pred_squad = unzipped_list[0], unzipped_list[1]
-    reference, prediction = [], []
-    for index in range(len(reference)):
-        pred_dict = {'prediction_text': str(reference[index]), 'id': str(index)}
-        ref_dict = {'answers': {'answer_start': [1], 'text': str(prediction[index])}, 'id': str(index)}
-        reference.append(pred_dict)
-        prediction.append(ref_dict)
-
-    squad_metric = evaluate.load("squad")
-    results_squad = squad_metric.compute(predictions=pred_squad, references=ref_squad)
-    return round(results_squad['f1'], 2)