Unverified commit 41eb4a65 authored by Leo Gao, committed by GitHub

Merge pull request #150 from zphang/record_update

ReCoRD fix
parents 487b5313 2719a522
@@ -44,7 +44,7 @@ TASK_REGISTRY = {
     "cb": superglue.CommitmentBank,
     "copa": superglue.Copa,
     "multirc": superglue.MultiRC,
-    #"record": superglue.ReCoRD,
+    "record": superglue.ReCoRD,
     "wic": superglue.WordsInContext,
     "wsc": superglue.SGWinogradSchemaChallenge,
...
@@ -272,9 +272,8 @@ class ReCoRD(HFTask):
     def training_docs(self):
         # In ReCoRD, each doc manifests multiple "examples" in the context of few shot example packing.
         # Each doc consists of multiple answer candidates, each of which is scored yes/no.
-        # Hence, we one "doc" for each (context + passage, answer) pair.
+        # Hence, we create one "doc" for each (context + passage, answer) pair.
         # Moreover, we only use the correct answers for context packing
-        # (This is not an issue for evaluation, where we can directly score multiple candidates at once).
         if self._training_docs is None:
             self._training_docs = []
             for doc in self.data["train"]:
@@ -288,13 +287,14 @@ class ReCoRD(HFTask):
         return self._training_docs
 
     def validation_docs(self):
-        for doc in self.data["validation"]:
-            for entity in list(set(doc["entities"])):
+        for example_idx, doc in enumerate(self.data["validation"]):
+            for entity in sorted(list(set(doc["entities"]))):
                 yield {
                     "passage": doc["passage"],
                     "query": doc["query"],
                     "entity": entity,
                     "label": entity in doc["answers"],
+                    "example_idx": example_idx,
                 }
 
     def doc_to_text(self, doc):
@@ -313,26 +313,22 @@ class ReCoRD(HFTask):
 
     def construct_requests(self, doc, ctx):
         requests = [
-            rf.loglikelihood(ctx, self.format_answer(query=doc["query"], entity=entity))
-            for entity in doc["entity"]
+            rf.loglikelihood(ctx, self.format_answer(query=doc["query"], entity=doc["entity"]))
         ]
         return requests
 
     def process_results(self, doc, results):
-        # ReCoRD's evaluation is actually deceptively simple:
-        # - Pick the maximum likelihood prediction entity
-        # - Evaluate the accuracy and token F1 PER EXAMPLE
-        # - Average over all examples
-        max_idx = np.argmax(np.array(results))
-
-        prediction = doc["entities"][max_idx]
-        gold_label_set = list(set(doc["answers"]))
-        f1 = metric_max_over_ground_truths(squad_metrics.compute_f1, prediction, gold_label_set)
-        em = metric_max_over_ground_truths(squad_metrics.compute_exact, prediction, gold_label_set)
+        # We defer the actual meat of ReCoRD's evaluation until we start collating the results across "docs"
+        assert len(results) == 1
+        scoring_info = {
+            "example_idx": doc["example_idx"],
+            "pred_score": results[0][0],
+            "entity": doc["entity"],
+            "label": doc["label"],
+        }
         return {
-            "f1": f1,
-            "em": em,
+            "f1": scoring_info,
+            "em": scoring_info,
         }
 
     def higher_is_better(self):
@@ -343,10 +339,49 @@ class ReCoRD(HFTask):
 
     def aggregation(self):
         return {
"f1": mean, "f1": self.record_eval_em,
"em": mean, "em": self.record_eval_f1,
} }
 
+    @classmethod
+    def record_eval_aggregation(cls, items, scoring_function):
+        # ReCoRD's evaluation is actually deceptively simple:
+        # - Pick the maximum likelihood prediction entity
+        # - Evaluate the accuracy and token F1 PER EXAMPLE
+        # - Average over all examples
+
+        # Reconstruct an example_idx -> example results mapping
+        # (remember, each example spans multiple docs)
+        example_dict = {}
+        for item in items:
+            example_idx = item["example_idx"]
+            if example_idx not in example_dict:
+                example_dict[example_idx] = []
+            example_dict[example_idx].append(item)
+
+        # Compute score for each example
+        score_list = []
+        for example in example_dict.values():
+            max_idx = int(np.argmax(np.array([result["pred_score"] for result in example])))
+            entities = [result["entity"] for result in example]
+            prediction = entities[max_idx]
+            gold_label_set = list(set(result["entity"] for result in example if result["label"]))
+            if not gold_label_set:
+                # When we limit the number of docs processed, some examples may not have any valid answers.
+                # We skip these examples.
+                continue
+            per_example_score = metric_max_over_ground_truths(scoring_function, prediction, gold_label_set)
+            score_list.append(per_example_score)
+        return np.mean(score_list)
+
+    @classmethod
+    def record_eval_em(cls, items):
+        return cls.record_eval_aggregation(items, scoring_function=squad_metrics.compute_exact)
+
+    @classmethod
+    def record_eval_f1(cls, items):
+        return cls.record_eval_aggregation(items, scoring_function=squad_metrics.compute_f1)
+
 
 class WordsInContext(HFTask):
     DATASET_PATH = "super_glue"
...
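
For orientation, here is a minimal, self-contained sketch of the two-stage evaluation this PR introduces: process_results now emits one scoring_info dict per (example, entity) doc, and the aggregation step regroups those dicts by example_idx, picks the highest-likelihood entity per example, scores it against that example's gold answers, and averages over examples. The function name toy_record_aggregation, the hard-coded items list, and the exact_match helper below are illustrative stand-ins only, not part of the harness; the real code scores with metric_max_over_ground_truths and squad_metrics.compute_exact / compute_f1.

from collections import defaultdict

import numpy as np


def toy_record_aggregation(items, scoring_function):
    # Regroup the per-(example, entity) items back into examples, keyed by example_idx.
    grouped = defaultdict(list)
    for item in items:
        grouped[item["example_idx"]].append(item)

    scores = []
    for example in grouped.values():
        # Pick the entity whose continuation received the highest log-likelihood.
        best = max(example, key=lambda r: r["pred_score"])
        golds = [r["entity"] for r in example if r["label"]]
        if not golds:
            # Mirrors the PR: skip examples whose gold answers were not among the processed docs.
            continue
        # Score the predicted entity against every gold answer and keep the best match.
        scores.append(max(scoring_function(best["entity"], g) for g in golds))
    return float(np.mean(scores))


def exact_match(prediction, gold):
    # Stand-in scoring function; the harness uses squad_metrics.compute_exact / compute_f1.
    return float(prediction == gold)


# One validation example expanded into two docs (one per candidate entity).
items = [
    {"example_idx": 0, "pred_score": -1.2, "entity": "Paris", "label": True},
    {"example_idx": 0, "pred_score": -4.7, "entity": "London", "label": False},
]
print(toy_record_aggregation(items, exact_match))  # 1.0: the top-scoring entity is a gold answer

Keying every doc on example_idx is what lets the harness treat each candidate entity as its own doc for request construction while still reporting the official per-example ReCoRD metrics at aggregation time.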