Commit 37c3139d authored by thefazzer

Merge remote-tracking branch 'origin/master' into fazz/refactor-task-coqa

parents 79c9b68a 7ad6bf45
env
*.pyc
data/
.idea
lm_cache
\ No newline at end of file
import abc import abc
import random import random
import numpy as np import numpy as np
import sklearn
import math from lm_eval.metrics import mean
class LM(abc.ABC): class LM(abc.ABC):
...@@ -15,7 +15,8 @@ class LM(abc.ABC): ...@@ -15,7 +15,8 @@ class LM(abc.ABC):
:param requests: list :param requests: list
A list of pairs (context, continuation) A list of pairs (context, continuation)
context: str context: str
Context string Context string. Implementations of LM must be able to handle an
empty context string.
continuation: str continuation: str
The continuation over which log likelihood will be calculated. If The continuation over which log likelihood will be calculated. If
there is a word boundary, the space should be in the continuation. there is a word boundary, the space should be in the continuation.
...@@ -29,6 +30,7 @@ class LM(abc.ABC): ...@@ -29,6 +30,7 @@ class LM(abc.ABC):
""" """
pass pass
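For reference, a minimal sketch of how these (context, continuation) requests are consumed. The ConstantLM class and its scores are invented for illustration, assuming (as in this diff) that loglikelihood and greedy_until are the only abstract methods on lm_eval.base.LM:

from lm_eval.base import LM

class ConstantLM(LM):
    # Toy model: every continuation gets log-likelihood -1.0 and is never the greedy output.
    def loglikelihood(self, requests):
        return [(-1.0, False) for _ in requests]

    def greedy_until(self, requests):
        return ["" for _ in requests]

lm = ConstantLM()
# The context may be the empty string; implementations must handle that case.
print(lm.loglikelihood([("The capital of France is", " Paris"), ("", "Hello world")]))
# -> [(-1.0, False), (-1.0, False)]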
# TODO: Add an optional max length
@abc.abstractmethod @abc.abstractmethod
def greedy_until(self, requests): def greedy_until(self, requests):
"""Generate greedily until a stopping sequence """Generate greedily until a stopping sequence
...@@ -37,9 +39,9 @@ class LM(abc.ABC): ...@@ -37,9 +39,9 @@ class LM(abc.ABC):
A list of pairs (context, until) A list of pairs (context, until)
context: str context: str
Context string Context string
until: str until: [str]
The string sequence to generate until. This string sequence may The string sequences to generate until. These string sequences
span across multiple tokens, or may be part of one token. may each span across multiple tokens, or may be part of one token.
:return: list :return: list
A list of strings continuation A list of strings continuation
continuation: str continuation: str
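Because `until` is now a list of stop sequences, an implementation typically cuts the generated text at the first occurrence of any of them. A standalone sketch of that trimming step (the example strings are made up; the GPT-2 implementation later in this diff does the same thing with split):

def trim_at_stop_sequences(generated, until):
    # Keep only the text before the earliest stop sequence.
    for stop in until:
        generated = generated.split(stop)[0]
    return generated

print(trim_at_stop_sequences("42\n\nQuestion: what is 3+3?", ["\n\n", "Question:"]))
# -> "42"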
...@@ -60,6 +62,14 @@ class LM(abc.ABC): ...@@ -60,6 +62,14 @@ class LM(abc.ABC):
class Task(abc.ABC):
"""A task represents an entire benchmark, including its dataset, problems,
answers, and evaluation methods. See BoolQ for a simple example implementation.

A `doc` can be any python object which represents one instance of evaluation.
This is usually a dictionary, e.g.
{"question": ..., "answer": ...} or
(question, answer)
"""
def __init__(self): def __init__(self):
self.download() self.download()
self._training_docs = None self._training_docs = None
...@@ -147,9 +157,9 @@ class Task(abc.ABC): ...@@ -147,9 +157,9 @@ class Task(abc.ABC):
@abc.abstractmethod @abc.abstractmethod
def aggregation(self): def aggregation(self):
""" """
:returns: {str: [float] -> float} :returns: {str: [metric_score] -> float}
A dictionary where keys are the names of submetrics and values are A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics functions that aggregate a list of metric scores
""" """
pass pass
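Concretely, the aggregation map pairs each submetric name with a function that reduces the per-document scores into one number; a small illustration using helpers from the new lm_eval.metrics module (the scores below are made up):

from lm_eval.metrics import mean, perplexity

acc_scores = [1, 0, 1, 1]            # per-document 0/1 accuracy scores
loglikelihoods = [-2.3, -1.7, -3.1]  # per-document log-likelihoods

aggregation = {"acc": mean, "ppl": perplexity}
print(aggregation["acc"](acc_scores))      # 0.75
print(aggregation["ppl"](loglikelihoods))  # exp(-mean(lls)), roughly 10.7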
...@@ -212,62 +222,9 @@ class MultipleChoiceTask(Task): ...@@ -212,62 +222,9 @@ class MultipleChoiceTask(Task):
} }
def mean(arr):
return sum(arr) / len(arr)
def median(arr):
return arr[len(arr) // 2]
def matthews_corrcoef(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
return sklearn.metrics.matthews_corrcoef(golds, preds)
def f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = sklearn.metrics.f1_score(golds, preds)
return np.max(fscore)
def acc_all(items):
# Only count as correct if all answers are labeled correctly for each question
question_scoring_dict = {}
preds = list(zip(*items))[0]
docs = list(zip(*items))[1]
for doc, pred in zip(docs, preds):
question_id = doc["idx"]["question"]
if question_id not in question_scoring_dict:
question_scoring_dict[question_id] = []
gold_label = doc["label"] == 1
question_scoring_dict[question_id].append(gold_label == pred)
acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
return acc
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
"""Compute max metric between prediction and each ground truth."""
scores_for_ground_truths = []
for ground_truth in ground_truths:
score = metric_fn(prediction, ground_truth)
scores_for_ground_truths.append(score)
return max(scores_for_ground_truths)
def perplexity(items):
return math.exp(-mean(items))
req_ret_lens = {
'loglikelihood': 2,
'greedy_until': None,
}
import os
...@@ -275,13 +232,9 @@ import json
import hashlib
from sqlitedict import SqliteDict
def hash_args(args): def hash_args(attr, args):
dat = b"" dat = json.dumps([attr] + list(args))
for arg in args: return hashlib.sha256(dat.encode('utf-8')).hexdigest()
assert isinstance(arg, str) or isinstance(arg, int)
dat += str(arg).encode()
dat += b"\0"
return hashlib.sha256(dat).hexdigest()
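The new hash_args keys the cache on both the method name and the request arguments; re-stated here so the snippet runs standalone (the example requests are illustrative):

import hashlib
import json

def hash_args(attr, args):
    # Same scheme as the new hash_args above: JSON-serialise [method, *args], then SHA-256.
    dat = json.dumps([attr] + list(args))
    return hashlib.sha256(dat.encode('utf-8')).hexdigest()

# Identical args now hash differently for different request types.
print(hash_args("loglikelihood", ("Question: 2+2=", " 4")))
print(hash_args("greedy_until", ("Question: 2+2=", ["\n"])))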
class CachingLM: class CachingLM:
...@@ -298,7 +251,7 @@ class CachingLM: ...@@ -298,7 +251,7 @@ class CachingLM:
# figure out which ones are cached and which ones are new # figure out which ones are cached and which ones are new
for req in requests: for req in requests:
hsh = attr + '_' + hash_args(req) hsh = hash_args(attr, req)
if hsh in self.dbdict: if hsh in self.dbdict:
ob = self.dbdict[hsh] ob = self.dbdict[hsh]
...@@ -320,9 +273,9 @@ class CachingLM: ...@@ -320,9 +273,9 @@ class CachingLM:
res[resptr] = r res[resptr] = r
# caching # caching
hsh = attr + '_' + hash_args(req) hsh = hash_args(attr, req)
self.dbdict[hsh] = r self.dbdict[hsh] = r
self.dbdict.commit()
return res return res
return fn return fn
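The caching flow above (hash the request, look it up, fall through to the real model only for misses, then write back and commit) can be sketched with a plain dict standing in for SqliteDict; this is a simplified illustration, not the actual CachingLM:

class DictCachingLM:
    def __init__(self, lm):
        self.lm = lm
        self.cache = {}  # SqliteDict in the real implementation

    def loglikelihood(self, requests):
        res = [None] * len(requests)
        miss_idx, misses = [], []
        for i, req in enumerate(requests):
            key = ("loglikelihood",) + tuple(req)
            if key in self.cache:
                res[i] = self.cache[key]
            else:
                miss_idx.append(i)
                misses.append(req)
        # Only cache misses are forwarded to the underlying model.
        for i, r in zip(miss_idx, self.lm.loglikelihood(misses)):
            self.cache[("loglikelihood",) + tuple(requests[i])] = r
            res[i] = r
        return res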
...@@ -338,12 +291,19 @@ class Request: ...@@ -338,12 +291,19 @@ class Request:
self.index = index self.index = index
def __iter__(self): def __iter__(self):
if req_ret_lens[self.type] is None:
raise IndexError('This request type does not return multiple arguments!')
i = 0 i = 0
for i in range(req_ret_lens[self.type]): for i in range(req_ret_lens[self.type]):
yield Request(self.type, self.args, i) yield Request(self.type, self.args, i)
def __getitem__(self, i): def __getitem__(self, i):
if req_ret_lens[self.type] is None:
raise IndexError('This request type does not return multiple arguments!')
return Request(self.type, self.args, i) return Request(self.type, self.args, i)
def __eq__(self, other):
return self.type == other.type and self.args == other.args and self.index == other.index
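The __iter__/__getitem__ pair is what lets tasks write `ll, is_greedy = rf.loglikelihood(ctx, cont)`: unpacking a loglikelihood request yields one indexed sub-request per return slot, while request types without a fixed return arity (greedy_until) now raise IndexError instead of silently iterating. A rough illustration, assuming the rf factory exported from lm_eval.base:

from lm_eval.base import rf

req = rf.loglikelihood("Question: 2+2=", " 4")
ll, is_greedy = req                # unpacks into sub-requests with index 0 and 1
print(ll.index, is_greedy.index)   # -> 0 1

gen = rf.greedy_until("Question: 2+2=", ["\n"])
# iter(gen) or gen[0] now raises IndexError, since req_ret_lens['greedy_until'] is None.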
class RequestFactory: class RequestFactory:
......
...@@ -39,6 +39,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit): ...@@ -39,6 +39,7 @@ def evaluate(lm, task_dict, provide_description, num_fewshot, limit):
)
reqs = task.construct_requests(doc, ctx)
if not isinstance(reqs, (list, tuple)): reqs = [reqs]
for i, req in enumerate(reqs):
requests[req.type].append(req)
......
import math
from pprint import pprint
import numpy as np
import sacrebleu
import sklearn
def mean(arr):
return sum(arr) / len(arr)
def median(arr):
return arr[len(arr) // 2]
def matthews_corrcoef(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
return sklearn.metrics.matthews_corrcoef(golds, preds)
def f1_score(items):
unzipped_list = list(zip(*items))
golds = unzipped_list[0]
preds = unzipped_list[1]
fscore = sklearn.metrics.f1_score(golds, preds)
return np.max(fscore)
def acc_all(items):
# Only count as correct if all answers are labeled correctly for each question
question_scoring_dict = {}
preds = list(zip(*items))[0]
docs = list(zip(*items))[1]
for doc, pred in zip(docs, preds):
question_id = doc["idx"]["question"]
if question_id not in question_scoring_dict:
question_scoring_dict[question_id] = []
gold_label = doc["label"] == 1
question_scoring_dict[question_id].append(gold_label == pred)
acc = np.mean([int(all(x)) for x in question_scoring_dict.values()])
return acc
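acc_all implements the MultiRC-style rule that every answer belonging to a question must be classified correctly; a worked example with two questions (the documents below are fabricated):

from lm_eval.metrics import acc_all

items = [
    # (model prediction as bool, SuperGLUE-style doc)
    (True,  {"idx": {"question": 0}, "label": 1}),
    (True,  {"idx": {"question": 0}, "label": 1}),
    (False, {"idx": {"question": 1}, "label": 1}),  # miss: gold is True
    (True,  {"idx": {"question": 1}, "label": 0}),  # miss: gold is False
]
print(acc_all(items))  # question 0 fully correct, question 1 not -> 0.5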
def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
"""Compute max metric between prediction and each ground truth."""
scores_for_ground_truths = []
for ground_truth in ground_truths:
score = metric_fn(prediction, ground_truth)
scores_for_ground_truths.append(score)
return max(scores_for_ground_truths)
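metric_max_over_ground_truths is the usual SQuAD-style trick of scoring the prediction against every reference and keeping the best score; a quick illustration with a hand-rolled exact-match metric:

from lm_eval.metrics import metric_max_over_ground_truths

def exact_match(prediction, ground_truth):
    return float(prediction.strip().lower() == ground_truth.strip().lower())

print(metric_max_over_ground_truths(
    exact_match, "the eiffel tower", ["Eiffel Tower", "The Eiffel Tower"]))
# -> 1.0: the second reference matches after normalisation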
def perplexity(items):
return math.exp(-mean(items))
def bleu(items):
"""The Bilingual Evaluation Understudy Score, or BLEU for short, is a metric
for evaluating a generated sentence to a reference sentence. It counts matching
n-grams in the candidate translation to n-grams in the reference text, where
1-gram or unigram would be each token and a bigram comparison would be each
word pair. The comparison is made regardless of word order
Source: https://machinelearningmastery.com/calculate-bleu-score-for-text-python/
Paper: https://www.aclweb.org/anthology/P02-1040/
Higher is better
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
refs, preds = _sacreformat(refs, preds)
return sacrebleu.corpus_bleu(preds, refs).score
def chrf(items):
"""chrF++ is a tool for automatic evaluation of machine translation output
based on character n-gram precision and recall enhanced with word n-grams.
Source: https://github.com/m-popovic/chrF
Paper: https://www.aclweb.org/anthology/W15-3049.pdf
Higher is better
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
refs, preds = _sacreformat(refs, preds)
return sacrebleu.corpus_chrf(preds, refs).score
def ter(items):
"""Translation Error Rate is an error metric for machine translation that
measures the number of edits required to change a system output into one
of the references
Source: http://www.cs.umd.edu/~snover/tercom/
Paper: http://mt-archive.info/AMTA-2006-Snover.pdf
Lower is better
"""
refs = list(zip(*items))[0]
preds = list(zip(*items))[1]
refs, preds = _sacreformat(refs, preds)
return sacrebleu.corpus_ter(preds, refs).score
def _sacreformat(refs, preds):
"""Format refs and preds for sacrebleu corpus calculation. It is very particular"""
# Sacrebleu expects (List[str], List[List[str])
# e.g. sacrebleu.corpus_bleu([pred_t], [[ref1_stream], [ref2_stream], ...])
# Note [ref1_stream] is the first reference for each pred.
# So lists are size N and (M, N) for N preds and M possible refs for each pred
# This is a different order of dimensions than I would expect
# We expect refs to be List[str] or List[List[str]], the outer list corresponding to preds
# Must become List[List[str]] with the inner list corresponding to preds
if not isinstance(refs, list):
refs = list(refs)
if not isinstance(refs[0], list):
refs = [[ref] for ref in refs]
refs = list(zip(*refs))
# Note the number of refs in each ref list must match the number of preds
# We expect preds to be List[str] or List[List[str]]. Must become List[str]
if not isinstance(preds, list):
preds = list(preds)
if isinstance(preds[0], list):
assert len(preds[0]) == 1, f"Pred must be a str, was {preds[0]}"
preds = [pred[0] for pred in preds]
return refs, preds
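The reshaping in _sacreformat exists because sacrebleu's corpus functions take the hypotheses as a flat list and the references transposed, one inner list per reference set rather than one per prediction. A minimal direct call with those shapes (the sentences are made up):

import sacrebleu

preds = ["the cat sat on the mat", "hello world"]
# One inner list per reference set, each aligned with preds (here: 2 preds, 1 reference set).
refs = [["the cat sat on the mat", "hello there world"]]

print(sacrebleu.corpus_bleu(preds, refs).score)
print(sacrebleu.corpus_chrf(preds, refs).score)
print(sacrebleu.corpus_ter(preds, refs).score)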
...@@ -19,5 +19,9 @@ class DummyLM(LM): ...@@ -19,5 +19,9 @@ class DummyLM(LM):
return res return res
def greedy_until(self, requests): def greedy_until(self, requests):
# TODO: implement res = []
pass
for _ in requests:
res.append("lol")
return res
...@@ -7,41 +7,75 @@ from tqdm import tqdm ...@@ -7,41 +7,75 @@ from tqdm import tqdm
class GPT2LM(LM): class GPT2LM(LM):
def __init__(self, device="cpu"): MAX_GEN_TOKS = 256
def __init__(self, device="cpu", pretrained='gpt2'):
self.device = torch.device(device) self.device = torch.device(device)
self.gpt2 = transformers.GPT2LMHeadModel.from_pretrained('gpt2').to(self.device) self.gpt2 = transformers.GPT2LMHeadModel.from_pretrained(pretrained).to(self.device)
self.gpt2.eval() self.gpt2.eval()
self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2') self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained(pretrained)
self.tokenizer.pad_token = "<|endoftext|>" self.tokenizer.pad_token = "<|endoftext|>"
@classmethod @classmethod
def create_from_arg_string(cls, arg_string): def create_from_arg_string(cls, arg_string):
args = utils.simple_parse_args_string(arg_string) args = utils.simple_parse_args_string(arg_string)
return cls(device=args.get("device", "cpu")) return cls(device=args.get("device", "cpu"), pretrained=args.get("pretrained", "gpt2"))
def loglikelihood(self, requests): def loglikelihood(self, requests):
res = [] res = []
# TODO: vectorize properly with torch.no_grad():
for context, continuation in tqdm(requests): # TODO: vectorize properly
# when too long to fit in context, truncate from the left # TODO: automatic batch size detection for vectorization
context_enc = self.tokenizer.encode(context) for context, continuation in tqdm(requests):
continuation_enc = self.tokenizer.encode(continuation) # when too long to fit in context, truncate from the left
inp = torch.tensor([(context_enc + continuation_enc)[-1024:]], dtype=torch.long).to(self.device)
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - 1024) if context == "":
# end of text as context
cont_toks = inp[:, ctxlen:] # [batch, seq] context_enc = [50256]
logits = F.log_softmax(self.gpt2(inp)[0], dim=-1)[:, ctxlen - 1:-1] # [batch, seq, vocab] else:
context_enc = self.tokenizer.encode(context)
greedy_tokens = logits.argmax(dim=-1)
max_equal = (greedy_tokens == cont_toks).all() continuation_enc = self.tokenizer.encode(continuation)
inp = torch.tensor([(context_enc + continuation_enc)[-1024:]], dtype=torch.long).to(self.device)
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - 1024)
cont_toks = inp[:, ctxlen:] # [batch, seq]
logits = F.log_softmax(self.gpt2(inp)[0], dim=-1)[:, ctxlen - 1:-1] # [batch, seq, vocab]
greedy_tokens = logits.argmax(dim=-1)
max_equal = (greedy_tokens == cont_toks).all()
logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [batch, seq] logits = torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # [batch, seq]
res.append((float(logits.sum()), bool(max_equal))) res.append((float(logits.sum()), bool(max_equal)))
return res return res
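The truncation above keeps only the last 1024 tokens of context + continuation and then scores just the continuation slice; the ctxlen arithmetic can be checked in isolation (the token counts below are invented, and the continuation is assumed to fit in the window):

MAX_LEN = 1024

def continuation_slice(n_ctx, n_cont, max_len=MAX_LEN):
    # Mirrors: ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - 1024)
    overflow = max(0, n_ctx + n_cont - max_len)
    ctxlen = n_ctx - overflow
    kept = min(n_ctx + n_cont, max_len)
    return ctxlen, kept - ctxlen  # (tokens of context kept, tokens scored)

print(continuation_slice(10, 5))     # (10, 5): nothing truncated
print(continuation_slice(1100, 30))  # (994, 30): 106 context tokens dropped from the left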
def greedy_until(self, requests): def greedy_until(self, requests):
# TODO: implement # TODO: implement fully general `until` that handles untils that are
pass # multiple tokens or that span multiple tokens correctly
res = []
for context, until in tqdm(requests):
if isinstance(until, str): until = [until]
context_enc = torch.tensor([self.tokenizer.encode(context)]).to(self.device)
primary_until, = self.tokenizer.encode(until[0])
cont = self.gpt2.generate(
context_enc,
max_length=context_enc.shape[1] + self.MAX_GEN_TOKS,
eos_token_id=primary_until,
do_sample=False
)
s = self.tokenizer.decode(cont[0].tolist()[context_enc.shape[1]:])
for term in until:
s = s.split(term)[0]
res.append(s)
return res
...@@ -72,7 +72,12 @@ class GPT3LM(LM): ...@@ -72,7 +72,12 @@ class GPT3LM(LM):
inps = [] inps = []
ctxlens = [] ctxlens = []
for context, continuation in chunk: for context, continuation in chunk:
context_enc = self.tokenizer.encode(context) if context == "":
# end of text as context
context_enc = [50256]
else:
context_enc = self.tokenizer.encode(context)
continuation_enc = self.tokenizer.encode(continuation) continuation_enc = self.tokenizer.encode(continuation)
inp = (context_enc + continuation_enc)[-self.MAX_LENGTH:] inp = (context_enc + continuation_enc)[-self.MAX_LENGTH:]
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - self.MAX_LENGTH) ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - self.MAX_LENGTH)
...@@ -108,6 +113,7 @@ class GPT3LM(LM): ...@@ -108,6 +113,7 @@ class GPT3LM(LM):
max_tokens=self.MAX_GEN_TOKS, max_tokens=self.MAX_GEN_TOKS,
temperature=0., temperature=0.,
logprobs=10, logprobs=10,
stop=until
) )
res.append(response.choices[0]['text']) res.append(response.choices[0]['text'])
......
from pprint import pprint
from . import superglue from . import superglue
from . import glue from . import glue
from . import arc from . import arc
...@@ -21,7 +23,10 @@ from . import triviaqa ...@@ -21,7 +23,10 @@ from . import triviaqa
from . import pubmedqa from . import pubmedqa
from . import sciq from . import sciq
from . import webqs from . import webqs
from . import qa4mre
from . import translation
from . import headqa
from . import mathqa
TASK_REGISTRY = { TASK_REGISTRY = {
# GLUE # GLUE
...@@ -49,19 +54,26 @@ TASK_REGISTRY = { ...@@ -49,19 +54,26 @@ TASK_REGISTRY = {
"lambada": lambada.LAMBADA, "lambada": lambada.LAMBADA,
"piqa": piqa.PiQA, "piqa": piqa.PiQA,
# Science related
"pubmedqa" : pubmedqa.Pubmed_QA, "pubmedqa" : pubmedqa.Pubmed_QA,
"sciq" : sciq.SciQ, "sciq" : sciq.SciQ,
#"qa4mre" : qa4mre.QA4MRE,
"qa4mre_2011" : qa4mre.QA4MRE_2011,
"qa4mre_2012" : qa4mre.QA4MRE_2012,
"qa4mre_2013" : qa4mre.QA4MRE_2013,
#"triviaqa": triviaqa.TriviaQA, #"triviaqa": triviaqa.TriviaQA,
"arc_easy": arc.ARCEasy, "arc_easy": arc.ARCEasy,
"arc_challenge": arc.ARCChallenge, "arc_challenge": arc.ARCChallenge,
# "quac": quac.QuAC, # not implemented yet # "quac": quac.QuAC, # not implemented yet
"hellaswag": hellaswag.HellaSwag, # not implemented yet "hellaswag": hellaswag.HellaSwag, # not implemented yet
# "openbookqa": openbookqa.OpenBookQA, # not implemented yet "openbookqa": openbookqa.OpenBookQA,
# "sat": sat.SATAnalogies, # not implemented yet # "sat": sat.SATAnalogies, # not implemented yet
# "squad": squad.SQuAD, # not implemented yet # "squad": squad.SQuAD, # not implemented yet
"race": race.RACE, "race": race.RACE,
# "naturalqs": naturalqs.NaturalQs, # not implemented yet # "naturalqs": naturalqs.NaturalQs, # not implemented yet
"headqa": headqa.HeadQA,
"mathqa": mathqa.MathQA,
"webqs": webqs.WebQs, "webqs": webqs.WebQs,
"wsc273": wsc273.WinogradSchemaChallenge273, "wsc273": wsc273.WinogradSchemaChallenge273,
"winogrande": winogrande.Winogrande, "winogrande": winogrande.Winogrande,
...@@ -80,6 +92,11 @@ TASK_REGISTRY = { ...@@ -80,6 +92,11 @@ TASK_REGISTRY = {
"arithmetic_2dm": arithmetic.Arithmetic2DMultiplication, "arithmetic_2dm": arithmetic.Arithmetic2DMultiplication,
"arithmetic_1dc": arithmetic.Arithmetic1DComposite, "arithmetic_1dc": arithmetic.Arithmetic1DComposite,
# TODO Perhaps make these groups of tasks
# e.g. anli, arithmetic, openai_translations, harness_translations
# e.g. wmt14-fr-en
**translation.create_tasks_from_benchmarks(translation.selected_benchmarks)
} }
...@@ -87,7 +104,12 @@ ALL_TASKS = sorted(list(TASK_REGISTRY)) ...@@ -87,7 +104,12 @@ ALL_TASKS = sorted(list(TASK_REGISTRY))
def get_task(task_name): def get_task(task_name):
return TASK_REGISTRY[task_name] try:
return TASK_REGISTRY[task_name]
except KeyError as e:
print("Available tasks:")
pprint(TASK_REGISTRY)
raise KeyError(f"Missing task {task_name}")
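With the new error handling, a typo in a task name prints the registry before re-raising, while lookups of valid names are unchanged; a quick usage sketch (assuming the package is importable as lm_eval.tasks):

from lm_eval import tasks

print(tasks.get_task("lambada"))  # -> <class 'lm_eval.tasks.lambada.LAMBADA'>
try:
    tasks.get_task("lambda")      # typo: prints the available tasks, then raises
except KeyError as err:
    print(err)                    # 'Missing task lambda'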
def get_task_dict(task_name_list): def get_task_dict(task_name_list):
......
import numpy as np import numpy as np
from lm_eval.base import rf, mean from lm_eval.base import rf
from ..metrics import mean
from . common import HFTask from . common import HFTask
class ANLIBase(HFTask): class ANLIBase(HFTask):
...@@ -39,7 +40,7 @@ class ANLIBase(HFTask): ...@@ -39,7 +40,7 @@ class ANLIBase(HFTask):
# of the prompt (yes, repeating it!). also, " True, False, or Neither?" is directly # of the prompt (yes, repeating it!). also, " True, False, or Neither?" is directly
# appended onto the question, with no "Answer:" or even a newline. Do we *really* # appended onto the question, with no "Answer:" or even a newline. Do we *really*
# want to do it exactly as OA did? # want to do it exactly as OA did?
return doc['premise'] + '\nQuestion: ' + doc['hypothesis'] + '\nTrue, False, or Neither?' return doc['premise'] + '\nQuestion: ' + doc['hypothesis'] + ' True, False, or Neither?\nAnswer:'
def doc_to_target(self, doc): def doc_to_target(self, doc):
# True = entailment # True = entailment
......
import numpy as np import numpy as np
from lm_eval.base import rf, mean from lm_eval.base import MultipleChoiceTask
from ..metrics import mean
from . common import HFTask from . common import HFTask
class ARCEasy(HFTask): class ARCEasy(HFTask, MultipleChoiceTask):
DATASET_PATH = "ai2_arc" DATASET_PATH = "ai2_arc"
DATASET_NAME = "ARC-Easy" DATASET_NAME = "ARC-Easy"
letter_to_num = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4}
def __init__(self):
super().__init__()
self.data = self.__clean_data()
def __clean_data(self):
""" Resolves various edge cases in the unprocessed HF ARC dataset. """
# NOTE: Some `doc["answerKey"]`s are in numeric string format being one
# of {'1', '2', '3', '4', '5'}. We map them back to letters.
num_to_letter = {'1': 'A', '2': 'B', '3': 'C', '4': 'D', '5': 'E'}
result = {}
for split, data in self.data.items():
result[split] = []
for doc in data:
# Ensure all `answerKey`s and `label`s are in letter format.
doc["answerKey"] = num_to_letter.get(doc["answerKey"], doc["answerKey"])
doc["choices"]["label"] = [
num_to_letter.get(label, label) for label in doc["choices"]["label"]
]
result[split].append(doc)
return result
def has_training_docs(self): def has_training_docs(self):
return True return True
...@@ -39,68 +17,41 @@ class ARCEasy(HFTask): ...@@ -39,68 +17,41 @@ class ARCEasy(HFTask):
def has_test_docs(self): def has_test_docs(self):
return True return True
def fewshot_description(self): def _convert_standard(self, doc):
# TODO: figure out description # NOTE: Some `doc["answerKey"]`s are in numeric string format being one
return "" # of {'1', '2', '3', '4', '5'}. We map them back to letters.
num_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"}
def doc_to_text(self, doc): doc["answerKey"] = num_to_letter.get(doc["answerKey"], doc["answerKey"])
return "Question: " + doc['question'] + '\nAnswer:' out_doc = {
"id": doc["id"],
def doc_to_target(self, doc): "query": "Question: " + doc["question"] + "\nAnswer:",
index = self.letter_to_num[doc["answerKey"]] "choices": doc["choices"]["text"],
return " " + doc['choices']['text'][index] "gold": ["A", "B", "C", "D", "E"].index(doc["answerKey"]),
}
return out_doc
def construct_requests(self, doc, ctx): def _load_docs(self, docs):
""" Uses RequestFactory to construct Requests and returns an iterable of for record in docs:
Requests which will be sent to the LM. yield self._convert_standard(record)
:param doc: def training_docs(self):
The document as returned from training_docs, validation_docs, or test_docs. docs = super().training_docs()
:param ctx: str return self._load_docs(docs)
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
ll_choices = []
for choice in doc["choices"]["text"]:
ll_choices.append(rf.loglikelihood(ctx, " " + choice)[0])
return ll_choices
def process_results(self, doc, results): def validation_docs(self):
"""Take a single document and the LM results and evaluates, returning a docs = super().validation_docs()
dict where keys are the names of submetrics and values are the values of return self._load_docs(docs)
the metric for that one document
:param doc: def test_docs(self):
The document as returned from training_docs, validation_docs, or test_docs. docs = super().test_docs()
:param results: return self._load_docs(docs)
The results of the requests created in construct_requests.
"""
gold = self.letter_to_num[doc["answerKey"]]
pred = np.argmax(results)
return {
"acc": pred == gold
}
def aggregation(self): def fewshot_description(self):
""" # TODO: figure out description
:returns: {str: [float] -> float} return ""
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
"acc": mean
}
def higher_is_better(self): def doc_to_text(self, doc):
""" return doc["query"]
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
"acc": True
}
class ARCChallenge(ARCEasy): class ARCChallenge(ARCEasy):
......
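The ARC refactor above normalises each HF record into the MultipleChoiceTask layout (query/choices/gold), mapping the occasional numeric answer keys back to letters. Applying the same conversion by hand to an invented record:

num_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"}

doc = {  # fabricated ARC-style record
    "id": "Mercury_0000",
    "question": "Which gas do plants absorb from the air?",
    "choices": {"text": ["Oxygen", "Carbon dioxide", "Nitrogen", "Helium"],
                "label": ["1", "2", "3", "4"]},
    "answerKey": "2",
}

answer_key = num_to_letter.get(doc["answerKey"], doc["answerKey"])  # "2" -> "B"
out_doc = {
    "id": doc["id"],
    "query": "Question: " + doc["question"] + "\nAnswer:",
    "choices": doc["choices"]["text"],
    "gold": ["A", "B", "C", "D", "E"].index(answer_key),  # -> 1
}
print(out_doc["choices"][out_doc["gold"]])  # -> "Carbon dioxide"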
...@@ -2,7 +2,8 @@ import abc ...@@ -2,7 +2,8 @@ import abc
import json import json
import os import os
from collections import namedtuple from collections import namedtuple
from lm_eval.base import Task, mean, rf from lm_eval.base import Task, rf
from lm_eval.metrics import mean
from best_download import download_file from best_download import download_file
ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion']) ArithmeticDoc = namedtuple('ArithmeticDoc', ['context', 'completion'])
...@@ -56,14 +57,17 @@ class Arithmetic(Task): ...@@ -56,14 +57,17 @@ class Arithmetic(Task):
return doc.completion return doc.completion
def load_doc(self, doc_json): def load_doc(self, doc_json):
return ArithmeticDoc(context=doc_json['context'], completion=doc_json['completion']) return ArithmeticDoc(context=doc_json['context'].strip()
.replace('\n\n', '\n')
.replace('Q:', 'Question:')
.replace('A:', 'Answer:'), completion=doc_json['completion'])
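The string cleanup in load_doc rewrites the raw arithmetic prompts into the harness's Question/Answer style; applied to a made-up raw context:

raw_context = "Q: What is 17 plus 25?\n\nA:"
context = (raw_context.strip()
           .replace("\n\n", "\n")
           .replace("Q:", "Question:")
           .replace("A:", "Answer:"))
print(repr(context))  # -> 'Question: What is 17 plus 25?\nAnswer:'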
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
ll, is_prediction = rf.loglikelihood(ctx, doc.completion) ll, is_prediction = rf.loglikelihood(ctx, doc.completion)
return is_prediction return is_prediction
def process_results(self, doc, results): def process_results(self, doc, results):
ll, is_prediction = results is_prediction, = results
return { return {
"acc": is_prediction "acc": is_prediction
} }
......
import datasets import datasets
import numpy as np import numpy as np
import lm_eval.metrics
from ..base import Task from ..base import Task
...@@ -44,7 +46,7 @@ class HFTask(Task): ...@@ -44,7 +46,7 @@ class HFTask(Task):
def simple_accuracy_metric(preds, golds): def simple_accuracy_metric(preds, golds):
acc = float((np.array(preds) == np.array(golds)).mean()) acc = float(lm_eval.metrics.mean())
return { return {
"major": acc, "major": acc,
"minor": {"acc": acc}, "minor": {"acc": acc},
......
import numpy as np import numpy as np
from lm_eval.base import rf, mean, f1_score, matthews_corrcoef from lm_eval.base import rf
from ..metrics import mean, matthews_corrcoef, f1_score
from scipy.stats import pearsonr, spearmanr from scipy.stats import pearsonr, spearmanr
from tqdm import auto as tqdm_lib from tqdm import auto as tqdm_lib
from . common import HFTask, yesno from . common import HFTask, yesno
from ..utils import general_detokenize
# Single-Sentence Tasks # Single-Sentence Tasks
...@@ -22,17 +23,18 @@ class CoLA(HFTask): ...@@ -22,17 +23,18 @@ class CoLA(HFTask):
return True return True
def fewshot_description(self): def fewshot_description(self):
return "Does this sentence make sense?:\tTrue or False?" # TODO
return ""
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "Sentence: {}\nAnswer:".format(doc["sentence"]) return "{}\nQuestion: Does this sentence make sense?\nAnswer:".format(doc["sentence"])
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " {}".format({1: "True", 0: "False"}[doc["label"]]) return " {}".format({1: "yes", 0: "no"}[doc["label"]])
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
ll_true, _ = rf.loglikelihood(ctx, " True") ll_true, _ = rf.loglikelihood(ctx, " yes")
ll_false, _ = rf.loglikelihood(ctx, " False") ll_false, _ = rf.loglikelihood(ctx, " no")
return ll_true, ll_false return ll_true, ll_false
def process_results(self, doc, results): def process_results(self, doc, results):
...@@ -68,19 +70,19 @@ class SST(HFTask): ...@@ -68,19 +70,19 @@ class SST(HFTask):
return True return True
def fewshot_description(self): def fewshot_description(self):
return "Indicate if each sentence is Positive or Negative." return "Indicate if the sentiment of each sentence is positive or negative."
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "sentence:\t{}\t\nanswer:".format( return "{}\nQuestion: Is this sentence positive or negative?\nAnswer:".format(
doc["sentence"], general_detokenize(doc["sentence"]),
) )
def doc_to_target(self, doc): def doc_to_target(self, doc):
return " {}".format({1: "Positive", 0: "Negative"}[doc["label"]]) return " {}".format({1: "positive", 0: "negative"}[doc["label"]])
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
ll_positive, _ = rf.loglikelihood(ctx, " Positive") ll_positive, _ = rf.loglikelihood(ctx, " positive")
ll_negative, _ = rf.loglikelihood(ctx, " Negative") ll_negative, _ = rf.loglikelihood(ctx, " negative")
return ll_positive, ll_negative return ll_positive, ll_negative
def process_results(self, doc, results): def process_results(self, doc, results):
...@@ -127,9 +129,9 @@ class MNLI(HFTask): ...@@ -127,9 +129,9 @@ class MNLI(HFTask):
return self.data["test_matched"] return self.data["test_matched"]
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format( return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format(
doc["premise"], doc["premise"],
doc["hypothesis"], doc["hypothesis"].strip() + ('' if doc["hypothesis"].strip().endswith('.') else '.'),
) )
def doc_to_target(self, doc): def doc_to_target(self, doc):
...@@ -187,7 +189,7 @@ class QNLI(HFTask): ...@@ -187,7 +189,7 @@ class QNLI(HFTask):
return True return True
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "question:\t{}\nresponse:\t{}\nDoes this answer the question, Yes or No?:".format( return "{}\n{}\nQuestion: Does this response answer the question?\nAnswer:".format(
doc["question"], doc["question"],
doc["sentence"], doc["sentence"],
) )
...@@ -195,11 +197,11 @@ class QNLI(HFTask): ...@@ -195,11 +197,11 @@ class QNLI(HFTask):
def doc_to_target(self, doc): def doc_to_target(self, doc):
# True = entailment # True = entailment
# False = not entailment # False = not entailment
return " {}".format({0: "Yes", 1: "No"}[doc["label"]]) return " {}".format({0: "yes", 1: "no"}[doc["label"]])
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
ll_yes, _ = rf.loglikelihood(ctx, " Yes") ll_yes, _ = rf.loglikelihood(ctx, " yes")
ll_no, _ = rf.loglikelihood(ctx, " No") ll_no, _ = rf.loglikelihood(ctx, " no")
return ll_yes, ll_no return ll_yes, ll_no
def process_results(self, doc, results): def process_results(self, doc, results):
...@@ -235,7 +237,7 @@ class WNLI(HFTask): ...@@ -235,7 +237,7 @@ class WNLI(HFTask):
return True return True
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format( return "{}\nQuestion: {} True, False or Neither?\nAnswer:".format(
doc["sentence1"], doc["sentence1"],
doc["sentence2"], doc["sentence2"],
) )
...@@ -284,7 +286,7 @@ class RTE(HFTask): ...@@ -284,7 +286,7 @@ class RTE(HFTask):
return True return True
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "{}\nquestion:\t{}\tTrue or False?\nanswer:".format( return "{}\nQuestion: {} True or False?\nAnswer:".format(
doc["sentence1"], doc["sentence1"],
doc["sentence2"], doc["sentence2"],
) )
...@@ -338,9 +340,9 @@ class MRPC(HFTask): ...@@ -338,9 +340,9 @@ class MRPC(HFTask):
return "Indicate if both sentences mean the same thing." return "Indicate if both sentences mean the same thing."
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format( return "Sentence 1: {}\nSentence 2: {}\nQuestion: Do both sentences mean the same thing?\nAnswer:".format(
doc["sentence1"], general_detokenize(doc["sentence1"]),
doc["sentence2"], general_detokenize(doc["sentence2"]),
) )
def doc_to_target(self, doc): def doc_to_target(self, doc):
...@@ -390,7 +392,7 @@ class QQP(HFTask): ...@@ -390,7 +392,7 @@ class QQP(HFTask):
return "Indicate if both questions ask the same thing." return "Indicate if both questions ask the same thing."
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "question 1:\t{}\nquestion 2:\t{}\nanswer:".format( return "Question 1: {}\nQuestion 2: {}\nQuestion: Do both questions ask the same thing?\nAnswer:".format(
doc["question1"], doc["question1"],
doc["question2"], doc["question2"],
) )
...@@ -443,7 +445,7 @@ class STSB(HFTask): ...@@ -443,7 +445,7 @@ class STSB(HFTask):
"where 5 means identical and 0 means unrelated." "where 5 means identical and 0 means unrelated."
def doc_to_text(self, doc): def doc_to_text(self, doc):
return "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format( return "sentence 1: {}\nsentence 2: {}\nAnswer:".format(
doc["sentence1"], doc["sentence1"],
doc["sentence2"], doc["sentence2"],
) )
......
from . common import HFTask
from lm_eval.base import MultipleChoiceTask
class HeadQA(HFTask, MultipleChoiceTask):
DATASET_PATH = "head_qa"
DATASET_NAME = None
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def _convert_standard(self, doc):
out_doc = {
"id": doc["qid"],
"query": "Question: " + doc["qtext"] + "\nAnswer:",
"choices": [answer["atext"] for answer in doc["answers"]],
"gold": int(doc["ra"]) - 1,
}
return out_doc
def _load_docs(self, docs):
for doc in docs:
yield self._convert_standard(doc)
def training_docs(self):
docs = super().training_docs()
return self._load_docs(docs)
def validation_docs(self):
docs = super().validation_docs()
return self._load_docs(docs)
def test_docs(self):
docs = super().test_docs()
return self._load_docs(docs)
def fewshot_description(self):
# TODO: figure out description
return ""
def doc_to_text(self, doc):
return doc["query"]
import re import re
import numpy as np from lm_eval.base import MultipleChoiceTask
from ..base import rf, mean
from . common import HFTask from . common import HFTask
class HellaSwag(HFTask): class HellaSwag(HFTask, MultipleChoiceTask):
DATASET_PATH = "hellaswag" DATASET_PATH = "hellaswag"
DATASET_NAME = None DATASET_NAME = None
@classmethod
def remove_brackets(cls, text):
""" Removes brackets from HellaSwag documents.
NOTE: The brackets are artifacts of the WikiHow dataset portion underlying
HellaSwag.
"""
text = re.sub('\[.*?\]', '', text)
return text
def has_training_docs(self): def has_training_docs(self):
return True return True
...@@ -24,19 +14,37 @@ class HellaSwag(HFTask): ...@@ -24,19 +14,37 @@ class HellaSwag(HFTask):
return True return True
def has_test_docs(self): def has_test_docs(self):
return True return False
@classmethod
def preprocess(cls, text):
text = text.strip()
# NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
text = text.replace(" [title]", ". ")
text = re.sub('\\[.*?\\]', '', text)
text = text.replace("  ", " ")
return text
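The preprocess step strips the WikiHow artifacts noted in the comment; re-stated standalone on a fabricated snippet so its effect is visible (the double-space collapse on the last replace is assumed from context):

import re

def preprocess(text):
    # Same cleanup as HellaSwag.preprocess above.
    text = text.strip()
    text = text.replace(" [title]", ". ")
    text = re.sub("\\[.*?\\]", "", text)
    text = text.replace("  ", " ")
    return text

print(preprocess("How to fold a shirt [title] Lay the shirt flat [step] on a table."))
# -> "How to fold a shirt. Lay the shirt flat on a table."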
def _convert_standard(self, doc):
ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
out_doc = {
"query": self.preprocess(doc['activity_label'] + ': ' + ctx),
"choices": [self.preprocess(ending) for ending in doc['endings']],
"gold": int(doc['label']),
}
return out_doc
def _load_docs(self, docs):
for record in docs:
yield self._convert_standard(record)
def training_docs(self): def training_docs(self):
if self.has_training_docs(): docs = super().training_docs()
return self.data["train"] return self._load_docs(docs)
def validation_docs(self): def validation_docs(self):
if self.has_validation_docs(): docs = super().validation_docs()
return self.data["validation"] return self._load_docs(docs)
def test_docs(self):
if self.has_test_docs():
return self.data["test"]
def fewshot_description(self): def fewshot_description(self):
return "Label for the relevant action: Sentences describing the " \ return "Label for the relevant action: Sentences describing the " \
...@@ -44,73 +52,4 @@ class HellaSwag(HFTask): ...@@ -44,73 +52,4 @@ class HellaSwag(HFTask):
"plausibly completes the situation." "plausibly completes the situation."
def doc_to_text(self, doc): def doc_to_text(self, doc):
text = doc['activity_label'] + ': ' + doc['ctx'] + '\n' return doc["query"]
return self.remove_brackets(text)
def doc_to_target(self, doc):
letter_answer = doc['label']
if letter_answer == '0':
index = 0
elif letter_answer == '1':
index = 1
elif letter_answer == '2':
index = 2
elif letter_answer == '3':
index = 3
else:
raise ValueError(
"HellaSwag from HF datasets contained an invalid answer key")
target = doc['endings'][index]
return " " + self.remove_brackets(target)
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
ll_answers = []
for i in range(4):
continuation = " " + self.remove_brackets(doc['endings'][i])
ll_answers.append(rf.loglikelihood(ctx, continuation))
return ll_answers
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
gold = int(doc['label'])
pred = np.argmax(results)
acc = 1. if pred == gold else 0.
return {
"acc": acc
}
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
return {
"acc": mean
}
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
return {
"acc": True
}
from lm_eval.base import Task, rf, mean, perplexity from lm_eval.base import Task, rf
from lm_eval.metrics import mean, perplexity
from lm_eval.utils import sh from lm_eval.utils import sh
import json import json
import math import math
...@@ -9,7 +10,7 @@ class LAMBADA(Task): ...@@ -9,7 +10,7 @@ class LAMBADA(Task):
def download(self): def download(self):
sh("mkdir -p data/lambada") sh("mkdir -p data/lambada")
download_file( download_file(
"https://storage.googleapis.com/gpt-2/data/lambada_test.jsonl", "http://eaidata.bmk.sh/data/lambada_test.jsonl",
"data/lambada/lambada_test.jsonl", "data/lambada/lambada_test.jsonl",
"4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226" "4aa8d02cd17c719165fc8a7887fddd641f43fcafa4b1c806ca8abc31fabdb226"
) )
...@@ -53,18 +54,18 @@ class LAMBADA(Task): ...@@ -53,18 +54,18 @@ class LAMBADA(Task):
ll, is_greedy = results ll, is_greedy = results
return { return {
'perplexity': ll, 'ppl': ll,
'accuracy': int(is_greedy) 'acc': int(is_greedy)
} }
def aggregation(self): def aggregation(self):
return { return {
'perplexity': perplexity, 'ppl': perplexity,
'accuracy': mean 'acc': mean
} }
def higher_is_better(self): def higher_is_better(self):
return { return {
'perplexity': False, 'ppl': False,
'accuracy': True 'acc': True
} }
from . common import HFTask
from lm_eval.base import mean, rf, MultipleChoiceTask
import re
class MathQA(HFTask, MultipleChoiceTask):
DATASET_PATH = "math_qa"
DATASET_NAME = None
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def _convert_standard(self, doc):
answer_idx = ['a', 'b', 'c', 'd', 'e'].index(doc['correct'])
choices = [c[4:].rstrip(" ,") for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", doc['options'])]
out_doc = {
"query": "Question: " + doc['Problem'] +"\nAnswer:",
"choices": choices,
"gold": answer_idx,
}
return out_doc
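The regex in _convert_standard splits MathQA's flat options string into individual choices; running it on a representative options string (the values are invented):

import re

options = "a ) 38 , b ) 27.625 , c ) 30 , d ) data inadequate , e ) none of these"
choices = [c[4:].rstrip(" ,")
           for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", options)]
print(choices)
# -> ['38', '27.625', '30', 'data inadequate', 'none of these']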
def _load_docs(self, docs):
for record in docs:
yield self._convert_standard(record)
def training_docs(self):
docs = super().training_docs()
return self._load_docs(docs)
def validation_docs(self):
docs = super().validation_docs()
return self._load_docs(docs)
def test_docs(self):
docs = super().test_docs()
return self._load_docs(docs)
def fewshot_description(self):
# TODO: figure out description
return ""
def doc_to_text(self, doc):
return doc["query"]
import numpy as np from lm_eval.base import MultipleChoiceTask
from scipy.stats import pearsonr, spearmanr from .common import HFTask
from sklearn.metrics import f1_score, matthews_corrcoef
from tqdm import auto as tqdm_lib
from . common import HFTask, simple_accuracy_metric, yesno
class OpenBookQA(HFTask):
class OpenBookQA(HFTask, MultipleChoiceTask):
DATASET_PATH = "openbookqa" DATASET_PATH = "openbookqa"
DATASET_NAME = "main" DATASET_NAME = "main"
...@@ -17,82 +15,34 @@ class OpenBookQA(HFTask): ...@@ -17,82 +15,34 @@ class OpenBookQA(HFTask):
def has_test_docs(self): def has_test_docs(self):
return True return True
def _convert_standard(self, doc):
out_doc = {
"id": doc["id"],
"query": doc["question_stem"],
"choices": doc["choices"]["text"],
"gold": ["A", "B", "C", "D"].index(doc["answerKey"].strip()),
}
return out_doc
def _load_docs(self, docs):
for record in docs:
yield self._convert_standard(record)
def training_docs(self): def training_docs(self):
if self.has_training_docs(): docs = super().training_docs()
if self._training_docs is None: return self._load_docs(docs)
self._training_docs = list(self.data["train"])
return self._training_docs
def validation_docs(self): def validation_docs(self):
if self.has_validation_docs(): docs = super().validation_docs()
return self.data["validation"] return self._load_docs(docs)
def test_docs(self): def test_docs(self):
if self.has_test_docs(): docs = super().test_docs()
return self.data["test"] return self._load_docs(docs)
def fewshot_description(self): def fewshot_description(self):
# TODO: figure out fewshot description # TODO: figure out fewshot description
return "" return ""
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc['question_stem'] + '\n' return doc["query"]
def doc_to_target(self, doc):
letter_answer = doc['answerKey']
if letter_answer == 'A':
index = 0
elif letter_answer == 'B':
index = 1
elif letter_answer == 'C':
index = 2
elif letter_answer == 'D':
index = 3
else:
raise ValueError("OpenBookQA from HF datasets contained an invalid answer key")
return doc['choices']['text'][index] + '.'
def construct_requests(self, doc, ctx):
""" Uses RequestFactory to construct Requests and returns an iterable of
Requests which will be sent to the LM.
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param ctx: str
The context string, generated by fewshot_context. This includes the natural
language description, as well as the few shot examples, and the question
part of the document for `doc`.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def process_results(self, doc, results):
"""Take a single document and the LM results and evaluates, returning a
dict where keys are the names of submetrics and values are the values of
the metric for that one document
:param doc:
The document as returned from training_docs, validation_docs, or test_docs.
:param results:
The results of the requests created in construct_requests.
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def aggregation(self):
"""
:returns: {str: [float] -> float}
A dictionary where keys are the names of submetrics and values are
functions that aggregate a list of metrics
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
def higher_is_better(self):
"""
:returns: {str: bool}
A dictionary where keys are the names of submetrics and values are
whether a higher value of the submetric is better
"""
# TODO: implement evaluation.
raise NotImplementedError('Evaluation not implemented')
import numpy as np import numpy as np
from lm_eval.base import rf, mean from lm_eval.base import rf
from ..metrics import mean
from . common import HFTask from . common import HFTask
...@@ -21,15 +22,15 @@ class PiQA(HFTask): ...@@ -21,15 +22,15 @@ class PiQA(HFTask):
return "" return ""
def doc_to_text(self, doc): def doc_to_text(self, doc):
return doc["goal"] + "\n" return "Question: "+doc["goal"] + "\nAnswer:"
def doc_to_target(self, doc): def doc_to_target(self, doc):
solutions = [doc["sol1"], doc["sol2"]] solutions = [doc["sol1"], doc["sol2"]]
return solutions[doc["label"]] return " " + solutions[doc["label"]]
def construct_requests(self, doc, ctx): def construct_requests(self, doc, ctx):
ll_1, _ = rf.loglikelihood(ctx, doc['sol1']) ll_1, _ = rf.loglikelihood(ctx, " " + doc['sol1'])
ll_2, _ = rf.loglikelihood(ctx, doc['sol2']) ll_2, _ = rf.loglikelihood(ctx, " " + doc['sol2'])
return ll_1, ll_2 return ll_1, ll_2
def process_results(self, doc, results): def process_results(self, doc, results):
......
...@@ -2,7 +2,8 @@ import numpy as np ...@@ -2,7 +2,8 @@ import numpy as np
import json import json
import random import random
from .common import HFTask from .common import HFTask
from lm_eval.base import rf, mean from lm_eval.base import rf
from ..metrics import mean
class Pubmed_QA(HFTask): class Pubmed_QA(HFTask):
...@@ -30,7 +31,7 @@ class Pubmed_QA(HFTask): ...@@ -30,7 +31,7 @@ class Pubmed_QA(HFTask):
def doc_to_text(self, doc): def doc_to_text(self, doc):
ctxs = "\n".join(doc["context"]["contexts"]) ctxs = "\n".join(doc["context"]["contexts"])
return "abstract: {}\nquestion: {}\nanswer:".format( return "Abstract: {}\nQuestion: {}\nAnswer:".format(
ctxs, ctxs,
doc["question"], doc["question"],
doc["final_decision"] doc["final_decision"]
......