Unverified Commit 2c8ffb80 authored by Kajetan Dymkiewicz, committed by GitHub

Fix for mc2 calculation (#2768)



* fix for mc2 calculation

* increment versions and changelog

---------
Co-authored-by: Baber <baber@hey.com>
parent c8044f30
@@ -45,3 +45,6 @@ If other tasks on this dataset are already supported:
 * [ ] Is the "Main" variant of this task clearly denoted?
 * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
 * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
+
+### Changelog
+*_mc2 v2.0 (2024-Mar-11) PR #2768 - original code assumed labels were in sorted order - not always true
@@ -9,4 +9,4 @@ metric_list:
     aggregation: mean
     higher_is_better: true
 metadata:
-  version: 1.0
+  version: 2.0
@@ -46,13 +46,17 @@ def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
 
 def process_results_mc2(doc, results):
-    lls, is_greedy = zip(*results)
-
-    # Split on the first `0` as everything before it is true (`1`).
-    split_idx = list(doc["mc2_targets"]["labels"]).index(0)
+    ll, _ = zip(*results)
+    ll = np.array(ll)
+    # Convert log-likelihoods to probabilities.
+    probs = np.exp(ll)
+    # Normalize probabilities.
+    probs_norm = probs / np.sum(probs)
+    labels = np.array(doc["mc2_targets"]["labels"])
 
     # Compute the normalized probability mass for the correct answer.
-    ll_true, ll_false = lls[:split_idx], lls[split_idx:]
-    p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))
-    p_true = p_true / (sum(p_true) + sum(p_false))
+    pm_true = np.sum(probs_norm[labels == 1])
 
-    return {"acc": sum(p_true)}
+    return {"acc": pm_true}
@@ -51,3 +51,6 @@ If other tasks on this dataset are already supported:
 * [ ] Is the "Main" variant of this task clearly denoted?
 * [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
 * [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
+
+### Changelog
+mc2 version 3.0 (2024-Mar-11) PR #2768 - original code assumed labels were in sorted order - not always true
@@ -10,4 +10,4 @@ metric_list:
     aggregation: mean
     higher_is_better: true
 metadata:
-  version: 2.0
+  version: 3.0
@@ -8,16 +8,20 @@ ROUGE_SCORER = None
 
 def process_results_mc2(doc, results):
-    lls, is_greedy = zip(*results)
-
-    # Split on the first `0` as everything before it is true (`1`).
-    split_idx = list(doc["mc2_targets"]["labels"]).index(0)
+    ll, _ = zip(*results)
+    ll = np.array(ll)
+    # Convert log-likelihoods to probabilities.
+    probs = np.exp(ll)
+    # Normalize probabilities.
+    probs_norm = probs / np.sum(probs)
+    labels = np.array(doc["mc2_targets"]["labels"])
 
     # Compute the normalized probability mass for the correct answer.
-    ll_true, ll_false = lls[:split_idx], lls[split_idx:]
-    p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false))
-    p_true = p_true / (sum(p_true) + sum(p_false))
+    pm_true = np.sum(probs_norm[labels == 1])
 
-    return {"acc": sum(p_true)}
+    return {"acc": pm_true}
 
 
 def process_docs_gen(dataset: datasets.Dataset) -> datasets.Dataset: