Commit 60ca1a27 authored by lintangsutawika

update squadv2

parent 30aa9c33
# SQuADv2

### Paper

Title: `Know What You Don't Know: Unanswerable Questions for SQuAD`

Abstract: https://arxiv.org/abs/1806.03822

SQuAD 2.0 is a reading comprehension benchmark built from questions posed by crowdworkers on Wikipedia articles. It combines the answerable questions of SQuAD 1.1 with over 50,000 unanswerable questions written adversarially to look similar to answerable ones. To do well, a system must answer questions when the passage supports an answer and abstain when it does not.

Homepage: https://rajpurkar.github.io/SQuAD-explorer/

### Citation

```
@misc{rajpurkar2018know,
    title={Know What You Don't Know: Unanswerable Questions for SQuAD},
    author={Pranav Rajpurkar and Robin Jia and Percy Liang},
    year={2018},
    eprint={1806.03822},
    archivePrefix={arXiv},
    primaryClass={cs.CL}
}
```

### Subtasks

* `squadv2`: Generative question answering over SQuAD 2.0, where the model must either produce the answer span from the passage or respond with "unanswerable".
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: squadv2
dataset_path: squad_v2
output_type: greedy_until
training_split: train
validation_split: validation
doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:"
doc_to_target: "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['unanswerable']}}{% endif %}"
target_delimiter: ""
should_decontaminate: true
doc_to_decontamination_query: context
process_results: !function utils.process_results
generation_kwargs:
  until:
    - "\n\n"
    - "\n"
  do_sample: false
  temperature: 0.0
filter_list:
  - name: remove_whitespace
    filter:
      - function: remove_whitespace
      - function: take_first
metric_list:
  - metric: !function metric.exact
    aggregation: mean
    higher_is_better: true
  - metric: !function metric.f1
    aggregation: mean
    higher_is_better: true
  - metric: !function metric.HasAns_exact
    aggregation: mean
    higher_is_better: true
  - metric: !function metric.HasAns_f1
    aggregation: mean
    higher_is_better: true
  - metric: !function metric.NoAns_exact
    aggregation: mean
    higher_is_better: true
  - metric: !function metric.NoAns_f1
    aggregation: mean
    higher_is_better: true
  - metric: !function metric.best_exact
    aggregation: mean
    higher_is_better: true
  - metric: !function metric.best_f1
    aggregation: mean
    higher_is_better: true
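As a quick illustration (not part of the commit), the sketch below renders the `doc_to_text` and `doc_to_target` templates from the config above for one made-up record in the `squad_v2` schema, using the `jinja2` library directly; all document contents are hypothetical.

```python
# Minimal sketch: render the task's Jinja templates for one made-up record.
from jinja2 import Template

context = "The Normans were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France."
doc = {
    "id": "example-0001",
    "title": "Normans",
    "context": context,
    "question": "In what country is Normandy located?",
    "answers": {"text": ["France"], "answer_start": [context.index("France")]},
}

doc_to_text = Template(
    "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:"
)
doc_to_target = Template(
    "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['unanswerable']}}{% endif %}"
)

print(doc_to_text.render(**doc))    # the prompt built from the document
print(doc_to_target.render(**doc))  # "['France']" here; "['unanswerable']" when answers.text is empty
```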
import evaluate
from functools import partial


def _squad_metric(predictions, references):
    squad_metric = evaluate.load("squad_v2")
    return squad_metric.compute(predictions=predictions, references=references)


# Exact match (the normalized answer exactly matches the gold answer)
def exact(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("exact", 0)


# The F-score of predicted tokens versus the gold answer
def f1(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("f1", 0)


# Exact match on the answerable subset only
def HasAns_exact(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("HasAns_exact", 0)


# F-score on the answerable subset only
def HasAns_f1(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("HasAns_f1", 0)


# Exact match on the unanswerable subset only
def NoAns_exact(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("NoAns_exact", 0)


# F-score on the unanswerable subset only
def NoAns_f1(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("NoAns_f1", 0)


# Best exact match (with varying no-answer threshold)
def best_exact(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("best_exact", 0)


# Best F1 (with varying no-answer threshold)
def best_f1(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("best_f1", 0)
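As a usage sketch (not part of the commit), the `metric.*` wrappers above can be called directly with predictions and references in the format the Hugging Face `squad_v2` metric expects; both example records below are made up.

```python
# Minimal sketch: score two made-up examples, one answerable and one unanswerable.
predictions = [
    {"id": "q1", "prediction_text": "France", "no_answer_probability": 0.0},
    {"id": "q2", "prediction_text": "", "no_answer_probability": 0.9},
]
references = [
    {"id": "q1", "answers": {"text": ["France"], "answer_start": [0]}},
    {"id": "q2", "answers": {"text": [], "answer_start": []}},  # unanswerable question
]

print(exact(predictions, references))         # overall exact match, 0-100 scale
print(f1(predictions, references))            # overall token-level F1, 0-100 scale
print(HasAns_exact(predictions, references))  # exact match on answerable questions only
print(NoAns_f1(predictions, references))      # F1 on unanswerable questions only
```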
import evaluate
from math import exp
from functools import partial


def process_results(doc, results):
    continuation = results[0]
    no_answer_probability = 0  # placeholder for exp(logprob_unanswerable)

    predictions = {
        "id": doc["id"],
        "prediction_text": continuation,
        "no_answer_probability": no_answer_probability,
    }

    references = {
        "id": doc["id"],
        "answers": doc["answers"],
    }

    # The HF squad_v2 metric expects lists of examples, so wrap the single document.
    return _squad_metric(predictions=[predictions], references=[references])


def _squad_metric(predictions, references):
    squad_metric = evaluate.load("squad_v2")
    return squad_metric.compute(predictions=predictions, references=references)


# Exact match (the normalized answer exactly matches the gold answer)
def exact(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("exact", 0)


# The F-score of predicted tokens versus the gold answer
def f1(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("f1", 0)


# Exact match on the answerable subset only
def HasAns_exact(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("HasAns_exact", 0)


# F-score on the answerable subset only
def HasAns_f1(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("HasAns_f1", 0)


# Exact match on the unanswerable subset only
def NoAns_exact(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("NoAns_exact", 0)


# F-score on the unanswerable subset only
def NoAns_f1(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("NoAns_f1", 0)


# Best exact match (with varying no-answer threshold)
def best_exact(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("best_exact", 0)


# Best F1 (with varying no-answer threshold)
def best_f1(predictions, references):
    return _squad_metric(predictions=predictions, references=references).get("best_f1", 0)
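And a short sketch (not part of the commit) of roughly how `utils.process_results` above would be invoked for a single document; the `doc` fields are made up and `results` stands in for the model's greedy continuation.

```python
# Minimal sketch: score one made-up document via process_results.
doc = {
    "id": "example-0001",
    "answers": {"text": ["France"], "answer_start": [110]},
}
results = ["France"]  # first (and only) generation for this request

scores = process_results(doc, results)
print(scores["exact"], scores["f1"])  # per-example exact match and F1, 0-100 scale
```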