Unverified Commit ffbaef21 authored by Stella Biderman's avatar Stella Biderman Committed by GitHub

Merge pull request #3 from zphang/refactor

LM Eval Refactor; GPT-3; GLUE tasks
parents 7a32afeb e7a87e71
import abc
import random
class LM(abc.ABC):
@abc.abstractmethod
def generate(self, context, until):
pass
@abc.abstractmethod
def loglikelihood(self, context, continuation):
pass
class Dataset(abc.ABC):
@abc.abstractmethod
def has_training_docs(self):
pass
@abc.abstractmethod
def has_validation_docs(self):
pass
@abc.abstractmethod
def training_docs(self):
pass
@abc.abstractmethod
def validation_docs(self):
pass
@abc.abstractmethod
def test_docs(self):
pass
def fewshot_examples(self, k):
traindocs = list(self.training_docs())
random.seed(123)
random.shuffle(traindocs)
return traindocs[:k]
@abc.abstractmethod
def fewshot_description(self):
pass
@abc.abstractmethod
def doc_to_text(self, doc, include_target=True):
pass
@abc.abstractmethod
def evaluate(self, docs, lm, provide_description, num_fewshot):
pass
# Datasets loaded via the `nlp` library generally do not require separately downloading data
#coqa
mkdir -p data/coqa
......
import transformers
from base import LM
import torch
import torch.nn.functional as F
class GPT2LM(LM):
def __init__(self, dev='cpu'):
self.gpt2 = transformers.GPT2LMHeadModel.from_pretrained('gpt2').to(dev)
self.tok = transformers.GPT2Tokenizer.from_pretrained('gpt2')
self.dev = dev
def generate(self, context, until):
context = torch.tensor([self.tok.encode(context.strip())], dtype=torch.long).to(self.dev)
res = self.gpt2.generate(context, eos_token_id=self.tok.encoder[until], do_sample=False, max_length=1024)
# chop off the prompt and the final eos token
return self.tok.decode(res[0][len(context[0]):-1]).strip()
def loglikelihood(self, context, continuation):
print('likelihood:', context, continuation)
inp = torch.tensor([self.tok.encode(context + continuation)], dtype=torch.long).to(self.dev)
ctxlen = len(self.tok.encode(context.strip()))
cont_toks = inp[:, ctxlen:] # [batch, seq]
logits = F.log_softmax(self.gpt2(inp)[0], dim=-1)[:, ctxlen - 1:-1] # [batch, seq, vocab]
return torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1)
import base
import nlp
def yesno(x):
if x: return 'yes'
else: return 'no'
def mean(x):
return sum(x) / len(x)
class BoolQ(base.Dataset):
def __init__(self):
self.dataset = nlp.load_dataset('boolq')
def training_docs(self):
yield from self.dataset['train']
def validation_docs(self):
yield from self.dataset['validation']
def test_docs(self):
return []
def fewshot_description(self):
return "Read the following passages and answer each question with a yes or a no."
def doc_to_text(self, doc, include_target=True):
return f"{doc['passage']}\nquestion: {doc['question']}\nanswer: " + (yesno(doc['answer']) if include_target else "")
def evaluate(self, docs, lm, provide_description, num_fewshot):
acc = []
for doc in docs:
ctx = '\n\n'.join(map(self.doc_to_text, self.fewshot_examples(k=num_fewshot))) + '\n\n'
ctx += self.doc_to_text(doc, include_target=False).strip()
ctx = ((self.fewshot_description() + "\n\n") if provide_description else "") + ctx
ans = lm.loglikelihood(ctx, 'yes') > lm.loglikelihood(ctx, 'no')
acc.append(int(ans == doc['answer']))
return mean(acc)
\ No newline at end of file
import abc
import random
class LM(abc.ABC):
@abc.abstractmethod
def generate(self, context, max_gen_length):
"""Conditional text generation with an LM
:param context: str
Context string for conditional generation
:param max_gen_length: int
Maximum number of tokens to generate
:return: str
"""
pass
@abc.abstractmethod
def loglikelihood(self, context, continuation):
"""Compute log-likelihood of a generation a continuation from a context
Assume that the final text will simple be
context + continuation
:param context: str
Context string for conditional generation
        :param continuation: str
            The continuation whose log-likelihood is computed, given the context
:return: float
"""
pass
@classmethod
def create_from_arg_string(cls, arg_string):
"""Constructor method, in case models need additional arguments
e.g. OpenAI API engine, paths for loading, other params
:param arg_string: str
Left up to individual model class to handle
"""
return cls()
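# Illustrative use of LM.create_from_arg_string (model names come from later in this
# PR): get_model("gpt2").create_from_arg_string("device=cuda") builds a GPT2LM on
# CUDA, while this default implementation ignores the string and just calls cls().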
class Dataset(abc.ABC):
@abc.abstractmethod
def has_training_docs(self):
"""Whether the task has a training set"""
pass
@abc.abstractmethod
def has_validation_docs(self):
"""Whether the task has a validation set"""
pass
@abc.abstractmethod
def has_test_docs(self):
"""Whether the task has a test set"""
pass
@abc.abstractmethod
def training_docs(self):
"""
:return: Iterable[obj]
            An iterable of any object that doc_to_text can handle
"""
pass
@abc.abstractmethod
def validation_docs(self):
pass
@abc.abstractmethod
def test_docs(self):
pass
def fewshot_examples(self, k):
traindocs = list(self.training_docs())
random.shuffle(traindocs)
return traindocs[:k]
@abc.abstractmethod
def doc_to_text(self, doc, include_target=True):
pass
@abc.abstractmethod
def evaluate(self, docs, lm, provide_description, num_fewshot):
"""Take iterable of docs and evaluates, returning a dict with the following format:
{
"major": float,
"minor": dict,
"higher_is_better": bool,
}
* `major` should be a single, representative number, for programmatic comparison
* `minor` should be a dictionary containing all relevant sub-metrics
* `higher_is_better` determines whether a higher metric is better
"""
pass
def fewshot_description(self):
return ""
def fewshot_context(self, doc, num_fewshot, provide_description):
raw_description = self.fewshot_description()
description = (raw_description + "\n\n") if provide_description and raw_description else ""
labeled_examples = "\n\n".join(
map(self.doc_to_text, self.fewshot_examples(k=num_fewshot))
) + "\n\n"
example = self.doc_to_text(doc, include_target=False).strip()
return description + labeled_examples + example
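    # Illustrative prompt layout produced above (provide_description=True, num_fewshot=1):
    #
    #   <fewshot_description()>
    #
    #   <doc_to_text(fewshot example, include_target=True)>
    #
    #   <doc_to_text(doc, include_target=False)>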
class Registry:
def __init__(self, registry_name):
self.registry_name = registry_name
self.registry = {}
def register(self, name):
def register_cls(new_cls):
if name in self.registry:
                raise ValueError('Cannot register duplicate {} ({})'.format(self.registry_name, name))
self.registry[name] = new_cls
return new_cls
return register_cls
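# Usage sketch (illustrative names): a Registry instance acts as a class decorator.
#
#   EXAMPLE_REGISTRY = Registry(registry_name="examples")
#
#   @EXAMPLE_REGISTRY.register("foo")
#   class Foo:
#       pass
#
# afterwards EXAMPLE_REGISTRY.registry["foo"] is Foo, and registering the name
# "foo" a second time raises ValueError.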
import importlib
import os
from lm_eval.base import Registry
MODEL_REGISTRY = Registry(registry_name="models")
# Load all modules in models directory to populate registry
models_dir = os.path.dirname(__file__)
for file in os.listdir(models_dir):
path = os.path.join(models_dir, file)
if (
not file.startswith('_')
and not file.startswith('.')
and (file.endswith('.py') or os.path.isdir(path))
):
module_name = file[:file.find('.py')] if file.endswith('.py') else file
module = importlib.import_module('lm_eval.models.' + module_name)
def get_model(model_name):
return MODEL_REGISTRY.registry[model_name]
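# e.g. get_model("gpt2") returns the GPT2LM class registered below; a configured
# instance is then built with get_model("gpt2").create_from_arg_string("device=cpu").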
from lm_eval.base import LM
from . import MODEL_REGISTRY
@MODEL_REGISTRY.register("dummy")
class DummyLM(LM):
def generate(self, context, max_gen_length):
return "lol"
def loglikelihood(self, context, continuation):
return 0.0
import transformers
import torch
import torch.nn.functional as F
from lm_eval.base import LM
from lm_eval import utils
from . import MODEL_REGISTRY
@MODEL_REGISTRY.register("gpt2")
class GPT2LM(LM):
def __init__(self, device="cpu"):
self.device = torch.device(device)
self.gpt2 = transformers.GPT2LMHeadModel.from_pretrained('gpt2').to(self.device)
self.tokenizer = transformers.GPT2Tokenizer.from_pretrained('gpt2')
@classmethod
def create_from_arg_string(cls, arg_string):
args = utils.simple_parse_args_string(arg_string)
return cls(device=args.get("device", "cpu"))
def generate(self, context, max_gen_length):
context = torch.tensor([self.tokenizer.encode(context.strip())], dtype=torch.long).to(self.device)
res = self.gpt2.generate(
context,
eos_token_id=self.tokenizer.eos_token_id,
do_sample=False,
max_length=max_gen_length,
)
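        # note: `max_length` in transformers' generate() counts prompt tokens plus
        # generated tokens, so this bounds the total sequence length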
# chop off the prompt and the final eos token
return self.tokenizer.decode(res[0][len(context[0]):-1]).strip()
def loglikelihood(self, context, continuation):
inp = torch.tensor([self.tokenizer.encode(context + continuation)], dtype=torch.long).to(self.device)
ctxlen = len(self.tokenizer.encode(context.strip()))
cont_toks = inp[:, ctxlen:] # [batch, seq]
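        # positions ctxlen - 1 through -2 of the logits predict tokens ctxlen through -1,
        # i.e. exactly the continuation tokens scored below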
logits = F.log_softmax(self.gpt2(inp)[0], dim=-1)[:, ctxlen - 1:-1] # [batch, seq, vocab]
        # sum the per-token log-probs into a single float, as the LM interface documents
        return torch.gather(logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1).sum().item()
import os
import openai
import transformers
from lm_eval.base import LM
from lm_eval import utils
from . import MODEL_REGISTRY
@MODEL_REGISTRY.register("gpt3")
class GPT3LM(LM):
def __init__(self, engine):
self.engine = engine
self.tokenizer = transformers.GPT2Tokenizer.from_pretrained('gpt2')
# Read from environment variable OPENAI_API_SECRET_KEY
openai.api_key = os.environ["OPENAI_API_SECRET_KEY"]
@classmethod
def create_from_arg_string(cls, arg_string):
args = utils.simple_parse_args_string(arg_string)
return cls(engine=args.get("engine", "davinci"))
def generate(self, context, max_gen_length):
response = openai.Completion.create(
engine=self.engine,
prompt=context,
max_tokens=max_gen_length,
temperature=0.0,
)
return response.choices[0]["text"]
def loglikelihood(self, context, continuation):
full_text = context + continuation
full_text_length = len(self.tokenizer.tokenize(full_text))
context_length = len(self.tokenizer.tokenize(context))
continuation_length = len(self.tokenizer.tokenize(continuation))
assert full_text_length == context_length + continuation_length
response = openai.Completion.create(
engine=self.engine,
prompt=full_text,
echo=True,
max_tokens=0, temperature=0.0,
logprobs=0,
)
logprobs = response.choices[0]["logprobs"]["token_logprobs"]
continuation_logprobs = logprobs[-continuation_length:]
return sum(continuation_logprobs)
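    # Scoring note: with echo=True, max_tokens=0 and logprobs set, the Completions API
    # returns per-token logprobs for the prompt itself, so the last `continuation_length`
    # entries correspond to the continuation tokens summed above.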
import importlib
import os
from lm_eval.base import Registry
TASK_REGISTRY = Registry(registry_name="tasks")
# Load all modules in tasks directory to populate registry
tasks_dir = os.path.dirname(__file__)
for file in os.listdir(tasks_dir):
path = os.path.join(tasks_dir, file)
if (
not file.startswith('_')
and not file.startswith('.')
and (file.endswith('.py') or os.path.isdir(path))
):
module_name = file[:file.find('.py')] if file.endswith('.py') else file
module = importlib.import_module('lm_eval.tasks.' + module_name)
ALL_TASKS = sorted(list(TASK_REGISTRY.registry))
def get_task(task_name):
    return TASK_REGISTRY.registry[task_name]
import nlp
import numpy as np
import random
from ..base import Dataset
class NLP_TASK(Dataset):
NLP_PATH = None
NLP_NAME = None
def _load_nlp_dataset(self):
return nlp.load_dataset(path=self.NLP_PATH, name=self.NLP_NAME)
def training_docs(self):
if self.has_training_docs():
return self._load_nlp_dataset()["train"]
def validation_docs(self):
if self.has_validation_docs():
return self._load_nlp_dataset()["validation"]
def test_docs(self):
if self.has_test_docs():
return self._load_nlp_dataset()["test"]
def fewshot_examples(self, k):
training_docs = self.training_docs()
n = len(training_docs)
indices = random.sample(range(n), k)
return [training_docs[i] for i in indices]
def simple_accuracy_metric(preds, golds):
acc = float((np.array(preds) == np.array(golds)).mean())
return {
"major": acc,
"minor": {"acc": acc},
"higher_is_better": True,
}
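# e.g. simple_accuracy_metric(preds=[1, 0, 1], golds=[1, 1, 1]) returns
# {"major": 0.666..., "minor": {"acc": 0.666...}, "higher_is_better": True}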
def yesno(x):
if x:
return 'yes'
else:
return 'no'
import os
import json
import random
from lm_eval.base import Dataset
from . import TASK_REGISTRY
@TASK_REGISTRY.register("coqa")
class CoQA(Dataset):
    def has_training_docs(self):
        return True
......
import numpy as np
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import f1_score, matthews_corrcoef
from tqdm import auto as tqdm_lib
from . common import NLP_TASK, simple_accuracy_metric, yesno
from . import TASK_REGISTRY
def get_accuracy_and_f1(preds, golds):
golds = np.array(golds)
preds = np.array(preds)
acc = float((preds == golds).mean())
f1 = float(f1_score(y_true=golds, y_pred=preds))
minor = {
"acc": acc,
"f1": f1,
"acc_and_f1": (acc + f1) / 2,
}
return {
"major": minor["acc_and_f1"],
"minor": minor,
"higher_is_better": True,
}
@TASK_REGISTRY.register("cola")
class CoLA(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "cola"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Does this sentence make sense?:\tTrue or False?"
def doc_to_text(self, doc, include_target=True):
text = "\nSentence:{}\nAnswer: ".format(doc["sentence"])
if include_target:
text += " {}".format({1: "True", 0: "False"}[doc["label"]])
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' True') > lm.loglikelihood(ctx, ' False'))
golds = np.array(golds)
preds = np.array(preds)
mcc = float(matthews_corrcoef(y_true=golds, y_pred=preds))
return {
"major": mcc,
"minor": {"mcc": mcc},
"higher_is_better": True,
}
@TASK_REGISTRY.register("mnli")
class MNLI(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "mnli"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def validation_docs(self):
if self.has_validation_docs():
return self._load_nlp_dataset()["validation_matched"]
def test_docs(self):
if self.has_test_docs():
return self._load_nlp_dataset()["test_matched"]
def doc_to_text(self, doc, include_target=True):
text = "{}\nquestion:\t{}\tTrue, False or Neither?\nanswer:".format(
doc["premise"],
doc["hypothesis"],
)
if include_target:
# True = entailment
# False = contradiction
# Neither = neutral
text += " {}".format({0: "True", 1: "Neither", 2: "False"}[doc["label"]])
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
probs = np.array([
                lm.loglikelihood(ctx, ' True'),
                lm.loglikelihood(ctx, ' Neither'),
                lm.loglikelihood(ctx, ' False'),
])
preds.append(np.argmax(probs))
return simple_accuracy_metric(preds=preds, golds=golds)
@TASK_REGISTRY.register("mrpc")
class MRPC(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "mrpc"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Indicate if both sentences mean the same thing."
def doc_to_text(self, doc, include_target=True):
text = "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
doc["sentence1"],
doc["sentence2"],
)
if include_target:
text += " {}".format(yesno(doc["label"]))
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
return get_accuracy_and_f1(preds=preds, golds=golds)
@TASK_REGISTRY.register("rte")
class RTE(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "rte"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def doc_to_text(self, doc, include_target=True):
text = "{}\nquestion:\t{}\tTrue or False?\nanswer:".format(
doc["sentence1"],
doc["sentence2"],
)
if include_target:
# 0 = entailment
# 1 = not_entailment
text += " {}".format({0: "True", 1: "False"}[doc["label"]])
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' False') > lm.loglikelihood(ctx, ' True'))
return simple_accuracy_metric(preds=preds, golds=golds)
@TASK_REGISTRY.register("qnli")
class QNLI(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "qnli"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def doc_to_text(self, doc, include_target=True):
text = "{}\nquestion:\t{}\tTrue or False?\nanswer:".format(
doc["question"],
doc["sentence"],
)
if include_target:
# True = entailment
# False = not entailment
text += " {}".format({0: "True", 1: "False"}[doc["label"]])
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
            preds.append(lm.loglikelihood(ctx, ' False') > lm.loglikelihood(ctx, ' True'))
return simple_accuracy_metric(preds=preds, golds=golds)
@TASK_REGISTRY.register("qqp")
class QQP(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "qqp"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Indicate if both sentences mean the same thing."
def doc_to_text(self, doc, include_target=True):
text = "question 1:\t{}\nquestion 2:\t{}\nanswer:".format(
doc["question1"],
doc["question2"],
)
if include_target:
text += " {}".format(yesno(doc["label"]))
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
return get_accuracy_and_f1(preds=preds, golds=golds)
@TASK_REGISTRY.register("stsb")
class STSB(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "stsb"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Indicate if both sentences mean the same thing from a scale of 0-5, " \
"where 5 means identical and 0 means unrelated."
def doc_to_text(self, doc, include_target=True):
text = "sentence 1:\t{}\nsentence 2:\t{}\nanswer:".format(
doc["sentence1"],
doc["sentence2"],
)
        if include_target:
            # the STS-B target is a similarity score in [0, 5], not a yes/no label
            text += " {}".format(doc["label"])
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
            output = lm.generate(context=ctx, max_gen_length=5).strip()
            try:
                # accept decimal outputs such as "3.5" and clamp to the valid [0, 5] range
                pred = max(min(float(output.split()[0]), 5.0), 0.0)
            except (IndexError, ValueError):
                # fall back to the midpoint when the output is empty or non-numeric
                pred = 2.5
preds.append(pred)
pearson_corr = float(pearsonr(preds, golds)[0])
spearman_corr = float(spearmanr(preds, golds)[0])
minor = {
"pearson": pearson_corr,
"spearmanr": spearman_corr,
"corr": (pearson_corr + spearman_corr) / 2,
}
return {
"major": minor["corr"],
"minor": minor,
"higher_is_better": True,
}
@TASK_REGISTRY.register("sst")
class SST(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "sst2"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Indicate if each sentence is Positive or Negative."
def doc_to_text(self, doc, include_target=True):
text = "sentence:\t{}\t\nanswer:".format(
doc["sentence"],
)
if include_target:
text += " {}".format({1: "Positive", 0: "Negative"}[doc["label"]])
return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' Positive') > lm.loglikelihood(ctx, ' Negative'))
return simple_accuracy_metric(preds=preds, golds=golds)
@TASK_REGISTRY.register("wnli")
class WNLI(NLP_TASK):
NLP_PATH = "glue"
NLP_NAME = "wnli"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
    def doc_to_text(self, doc, include_target=True):
        text = "{}\nquestion:\t{}\tTrue or False?\nanswer:".format(
            doc["sentence1"],
            doc["sentence2"],
        )
        if include_target:
            # WNLI is two-class:
            # 1 = entailment ("True")
            # 0 = not_entailment ("False")
            text += " {}".format({1: "True", 0: "False"}[doc["label"]])
        return text
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in tqdm_lib.tqdm(docs):
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
            preds.append(lm.loglikelihood(ctx, ' True') > lm.loglikelihood(ctx, ' False'))
return simple_accuracy_metric(preds=preds, golds=golds)
from . common import NLP_TASK, simple_accuracy_metric, yesno
from . import TASK_REGISTRY
@TASK_REGISTRY.register("boolq")
class BoolQ(NLP_TASK):
NLP_PATH = "super_glue"
NLP_NAME = "boolq"
def has_training_docs(self):
return True
def has_validation_docs(self):
return True
def has_test_docs(self):
return True
def fewshot_description(self):
return "Read the following passages and answer each question with a yes or a no."
def doc_to_text(self, doc, include_target=True):
return f"{doc['passage']}\nquestion: {doc['question']}\nanswer: " \
+ (yesno(doc['label']) if include_target else "")
def evaluate(self, docs, lm, provide_description, num_fewshot):
golds = [doc["label"] for doc in docs]
preds = []
for doc in docs:
ctx = self.fewshot_context(
doc=doc,
provide_description=provide_description,
num_fewshot=num_fewshot,
)
preds.append(lm.loglikelihood(ctx, ' yes') > lm.loglikelihood(ctx, ' no'))
return simple_accuracy_metric(preds=preds, golds=golds)
def simple_parse_args_string(args_string):
"""
Parses something like
        arg1=val1,arg2=val2
    into a dictionary
"""
args_string = args_string.strip()
if not args_string:
return {}
arg_list = args_string.split(",")
args_dict = {}
for arg in arg_list:
k, v = arg.split("=")
args_dict[k] = v
return args_dict
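# e.g. simple_parse_args_string("device=cuda,engine=davinci") returns
# {"device": "cuda", "engine": "davinci"}; values are left as strings.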
from gpt2 import GPT2LM
lm = GPT2LM()
print(lm.generate('1 + 1 = 2.\n3 + 5 = 8.\n4 + 9 = 13.\n4 + 3 = 7.\n2 + 3 =', '.'))
\ No newline at end of file
import argparse
import json
import numpy as np
import random
from lm_eval import models, tasks
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument('--model', required=True)
parser.add_argument('--model_args', default="")
parser.add_argument('--tasks', default="all_tasks")
parser.add_argument('--provide_description', action="store_true")
parser.add_argument('--num_fewshot', type=int, default=1)
parser.add_argument('--seed', type=int, default=1234)
return parser.parse_args()
def main():
args = parse_args()
random.seed(args.seed)
np.random.seed(args.seed)
lm = models.get_model(args.model).create_from_arg_string(args.model_args)
if args.tasks == "all_tasks":
task_names = tasks.ALL_TASKS
else:
task_names = args.tasks.split(",")
task_dict = {
task_name: tasks.get_task(task_name)()
for task_name in task_names
}
results = {}
for task_name, task in task_dict.items():
if not task.has_validation_docs():
continue
result = task.evaluate(
docs=task.validation_docs(),
lm=lm,
provide_description=args.provide_description,
num_fewshot=args.num_fewshot,
)
results[task_name] = result
print(json.dumps(results, indent=2))
if __name__ == "__main__":
main()
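# Example invocation (assuming this entry point is saved as main.py):
#   python main.py --model gpt2 --model_args device=cpu --tasks cola,sst --num_fewshot 1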