Commit 215de045 authored by nicholaskross
Browse files

rewrote based on BoolQ implementation

parent 57c751fa
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
import json import json
import random import random
import os import os
from lm_eval.base import Dataset from lm_eval.base import Dataset, rf, mean
from tqdm import auto as tqdm_lib from tqdm import auto as tqdm_lib
from . common import simple_accuracy_metric from . common import simple_accuracy_metric
import numpy as np import numpy as np
...@@ -96,21 +96,36 @@ class SATAnalogies(Dataset): ...@@ -96,21 +96,36 @@ class SATAnalogies(Dataset):
return text return text
def doc_to_target(self, doc):
    """Return the gold answer stored on a document.

    Args:
        doc: Mapping that has an ``'answer_key'`` entry.

    Returns:
        The value of ``doc['answer_key']``, unchanged.

    Raises:
        KeyError: If ``doc`` has no ``'answer_key'`` entry.
    """
    # assumes answer_key is the true-answer's letter
    # NOTE(review): confirm whether answer_key is a letter or an integer
    # index; the earlier evaluate() compared it directly with an argmax index.
    return doc['answer_key']
def construct_requests(self, ctx):
    """Build one log-likelihood request per answer letter.

    Args:
        ctx: Context passed unchanged as the first argument of every
            ``rf.loglikelihood`` call.

    Returns:
        A 5-tuple holding the results of ``rf.loglikelihood`` for the
        continuations ``' a'``, ``' b'``, ``' c'``, ``' d'`` and ``' e'``,
        in that order.
    """
    # assumes the output is the predicted-answer's letter
    # Each continuation keeps its leading space, exactly as before.
    return tuple(rf.loglikelihood(ctx, ' ' + letter) for letter in 'abcde')
def process_results(self, doc, results):
    """Score one document from the per-letter results.

    Args:
        doc: Mapping that has an ``'answer_key'`` entry. The key may be an
            integer index into the five choices or one of the letters
            ``a``-``e`` (case and surrounding whitespace are ignored).
        results: Iterable of scores, one per answer letter, in the order
            a, b, c, d, e. Presumably these are log-likelihoods from the
            requests built by ``construct_requests`` -- confirm with caller.

    Returns:
        A one-element list with a dict describing the ``"acc"`` submetric:
        value ``1.`` when the highest-scoring position matches the gold
        answer and ``0.`` otherwise.
    """
    predicted_odds = np.array(list(results))
    gold = doc["answer_key"]
    if isinstance(gold, str):
        # np.argmax yields a position, so a letter key has to be mapped to
        # its position first; comparing the raw letter would never match.
        letters = ('a', 'b', 'c', 'd', 'e')
        key = gold.strip().lower()
        # An unrecognised key maps to -1, which no position can equal, so the
        # document still scores 0 instead of raising.
        gold = letters.index(key) if key in letters else -1
    acc = 1. if np.argmax(predicted_odds) == gold else 0.
    return [
        {
            "submetric": "acc",
            "value": acc,
            "higher_is_better": True,
            "aggregation": mean,
        }
    ]
def evaluate(self, docs, lm):
    """Unsupported entry point kept only for interface compatibility.

    The earlier body looped over the documents, scored every choice with
    ``lm.loglikelihood`` and took the argmax. That functionality is already
    implemented by the request-based methods defined above this one.

    Args:
        docs: Unused.
        lm: Unused.

    Raises:
        NotImplementedError: Always.
    """
    # functionality already implemented above
    raise NotImplementedError()
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment