Commit 8458afa8 authored by Jonathan Tow

Add initial mc-answer-prompt-experiment features

parent e63d1396
lm_eval/base.py
...@@ -513,43 +513,8 @@ class Task(abc.ABC):
        example = self.doc_to_text(doc)
        return description + labeled_examples + example
-class MultipleChoiceTask(Task, abc.ABC):
-    def doc_to_target(self, doc):
-        return " " + doc['choices'][doc['gold']]
-    def construct_requests(self, doc, ctx):
-        lls = [
-            rf.loglikelihood(ctx, " {}".format(choice))[0]
-            for choice in doc['choices']
-        ]
-        return lls
-    def process_results(self, doc, results):
-        gold = doc["gold"]
-        acc = 1. if np.argmax(results) == gold else 0.
-        completion_len = np.array([float(len(i)) for i in doc["choices"]])
-        acc_norm = 1. if np.argmax(results / completion_len) == gold else 0.
-        return {
-            "acc": acc,
-            "acc_norm": acc_norm,
-        }
-    def higher_is_better(self):
-        return {
-            "acc": True,
-            "acc_norm": True,
-        }
-    def aggregation(self):
-        return {
-            "acc": mean,
-            "acc_norm": mean,
-        }
+from lm_eval.mctask_experimental import MULTIPLE_CHOICE_TASK
+MultipleChoiceTask = MULTIPLE_CHOICE_TASK

class PerplexityTask(Task, abc.ABC):
...
lm_eval/evaluator.py
import collections
import itertools
+import os
import random
import lm_eval.metrics
import lm_eval.models
...@@ -107,6 +108,9 @@ def evaluate(lm, task_dict, provide_description=None, num_fewshot=0, limit=None,
        Dictionary of results
    """
    # TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces
+    print(f"{'='*20}")
+    print(f"Task Module: {lm_eval.base.MultipleChoiceTask.__name__}")
+    print(f"{'='*20}")
    # TODO: todo: implement proper description-providing system
    assert not provide_description  # not implemented.
...

lm_eval/mctask_experimental.py (new file)
""" Multiple Choice Format Experiments.
TODO: Generalize the formatting of fewshot examples.
"""
import os
import abc
import hashlib
from dataclasses import dataclass, field
import typing
import numpy as np
import lm_eval.base as base
from lm_eval.metrics import mean
@dataclass
class MultipleChoiceDoc:
question: str
# The possible answer keys, e.g. `["A", "B", "C", "D"]`.
# Should these be the same type as `gold`?
keys: typing.List[str]
options: typing.List[str]
gold: int
id: str = field(init=False)  # Set in __post_init__ to the SHA-224 hex digest of the question.
def __post_init__(self):
self.id = hashlib.sha224(self.question.encode('utf-8')).hexdigest()
class BaseMultipleChoiceTask(base.Task, abc.ABC):
def doc_to_text(self, doc: MultipleChoiceDoc):
return self.format_prompt(doc)
@abc.abstractclassmethod
def format_prompt(cls, doc: MultipleChoiceDoc) -> str:
pass
@abc.abstractmethod
def doc_to_target(self, doc: MultipleChoiceDoc) -> str:
pass
@abc.abstractmethod
def loglikelihood_continuation(self, doc: MultipleChoiceDoc) -> typing.List[str]:
pass
def construct_requests(self, doc: MultipleChoiceDoc, ctx: str):
lls = []
conts = self.loglikelihood_continuation(doc)
for cont in conts:
lls.append(base.rf.loglikelihood(ctx, f" {cont}")[0])
return lls
def process_results(self, doc: MultipleChoiceDoc, results: typing.List):
gold = doc.gold
ans = np.argmax(results)
is_correct = 1. if ans == gold else 0.
# Normalize by completion length.
conts = self.loglikelihood_continuation(doc)
completion_len = np.array([float(len(i)) for i in conts])
acc_norm = 1. if np.argmax(results / completion_len) == gold else 0.
return {
"acc": is_correct,
"acc_norm": acc_norm,
# Bundle answers: (id, answer key, answer index, is correct).
"answer_bundle": (doc.id, doc.keys[ans], ans, is_correct),
}
def higher_is_better(self):
return {
"acc": True,
"acc_norm": True,
"answer_bundle": True,
}
def aggregation(self):
return {
"acc": mean,
"acc_norm": mean,
"answer_bundle": answer_bundle
}
def answer_bundle(items):
""" Bundles answers into a csv file. """
from pathlib import Path
import csv
cols = ["question_id", "model_answer", "model_answer_index", "is_correct"]
rows = [*items]
path = os.environ["QUESTION_RESULT_PATH"]
with open(f'{path}/question-by-question-results.csv', 'w') as f:
write = csv.writer(f)
write.writerow(cols)
write.writerows(rows)
return 0
def key2num(doc: MultipleChoiceDoc, key: str) -> str:
return str(doc.keys.index(key) + 1) # `+ 1` for 1-based indexing.
def format_key(key: str, type: str):
""" Formats a multiple choice key. E.g.
format_key("A", "period") => "A."
format_key("A", "parens") => "(A)"
format_key("A", "colon") => "A:"
Args:
- type: "period" | "parens" | "colon"
"""
if type == "parens":
return f"({key})"
elif type == "period":
return f"{key}."
elif type == "colon":
return f"{key}:"
else:
    raise ValueError(f"Unknown key format type: {type}")
class MC_NoOptionList_OptionLL_Task(BaseMultipleChoiceTask):
"""
Format:
Question: <question>
Answer:
Continuation:
loglikelihood_continuation = <option_i>
"""
def format_prompt(cls, doc: MultipleChoiceDoc) -> str:
prompt = "Question: " + doc.question + "\n"
prompt += "Answer:"
return prompt
# return _format_prompt(doc, list_options=False)
def doc_to_target(self, doc: MultipleChoiceDoc) -> str:
return " " + doc.options[doc.gold]
def loglikelihood_continuation(self, doc: MultipleChoiceDoc) -> typing.List[str]:
return [option for option in doc.options]
class MC_WithOptionList_OptionLL_Task(BaseMultipleChoiceTask):
"""
Format:
Question: <question>
<key1>: <option1>
<key2>: <option2>
...
Answer:
Continuation:
loglikelihood_continuation = <option_i>
"""
def format_prompt(cls, doc: MultipleChoiceDoc) -> str:
prompt = "Question: " + doc.question + "\n"
prompt += "\n".join([
f"{format_key(doc.keys[i], 'colon')} {option}"
for i, option in enumerate(doc.options)
])
prompt += "\nAnswer:"
return prompt
def doc_to_target(self, doc: MultipleChoiceDoc) -> str:
return " " + doc.options[doc.gold]
def loglikelihood_continuation(self, doc: MultipleChoiceDoc) -> typing.List[str]:
return [option for option in doc.options]
class MC_WithOptionList_LetterLL_Task(BaseMultipleChoiceTask):
"""
Format:
Question: <question>
<key1>: <option1>
<key2>: <option2>
...
Answer:
Continuation:
loglikelihood_continuation = <key_i>
"""
def format_prompt(cls, doc: MultipleChoiceDoc) -> str:
prompt = "Question: " + doc.question + "\n"
prompt += "\n".join([
f"{format_key(doc.keys[i], 'colon')} {option}"
for i, option in enumerate(doc.options)
])
prompt += "\nAnswer:"
return prompt
def doc_to_target(self, doc: MultipleChoiceDoc) -> str:
return " " + doc.keys[doc.gold]
def loglikelihood_continuation(self, doc: MultipleChoiceDoc) -> typing.List[str]:
return [key for key in doc.keys]
class MC_WithOptionList_NumLL_Task(BaseMultipleChoiceTask):
"""
Format:
Question: <question>
1: <option1>
2: <option2>
...
Answer:
Continuation:
loglikelihood_continuation = <key2num(key_i)>
"""
def format_prompt(cls, doc: MultipleChoiceDoc) -> str:
prompt = "Question: " + doc.question + "\n"
prompt += "\n".join([
f"{format_key(key2num(doc, doc.keys[i]), 'colon')} {option}"
for i, option in enumerate(doc.options)
])
prompt += "\nAnswer:"
return prompt
def doc_to_target(self, doc: MultipleChoiceDoc) -> str:
return f" {doc.gold + 1}" # `+ 1` for 1-based indexing.
def loglikelihood_continuation(self, doc: MultipleChoiceDoc) -> typing.List[str]:
return [key2num(doc, key) for key in doc.keys]
# TODO: Try to come up with a way to do this at runtime.
if os.environ["MC_SETTING"] == "freeform":
MULTIPLE_CHOICE_TASK = MC_NoOptionList_OptionLL_Task
elif os.environ["MC_SETTING"] == "option":
MULTIPLE_CHOICE_TASK = MC_WithOptionList_OptionLL_Task
elif os.environ["MC_SETTING"] == "letter":
MULTIPLE_CHOICE_TASK = MC_WithOptionList_LetterLL_Task
elif os.environ["MC_SETTING"] == "number":
MULTIPLE_CHOICE_TASK = MC_WithOptionList_NumLL_Task
else:
    raise ValueError(f"No such MC_SETTING: {os.environ['MC_SETTING']}")
\ No newline at end of file
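
The four task classes above vary only in whether the options are listed in the prompt and in which string is scored as the continuation. As a rough standalone illustration (not code from this commit; the question, keys, and options below are invented), the prompts and continuations for each MC_SETTING come out as follows:

# Illustrative sketch only -- hypothetical values, mirroring the format_prompt /
# loglikelihood_continuation logic of the classes above.
question = "Which planet is known as the Red Planet?"
keys = ["A", "B", "C", "D"]
options = ["Venus", "Mars", "Jupiter", "Saturn"]

# "freeform" (MC_NoOptionList_OptionLL_Task): no option list; each option text is scored.
freeform_prompt = f"Question: {question}\nAnswer:"
freeform_continuations = options

# "option" and "letter" (MC_WithOptionList_{Option,Letter}LL_Task): list the options,
# then score either the option text ("option") or the key letter ("letter").
option_list = "\n".join(f"{k}: {o}" for k, o in zip(keys, options))
listed_prompt = f"Question: {question}\n{option_list}\nAnswer:"
letter_continuations = keys

# "number" (MC_WithOptionList_NumLL_Task): same layout, but with 1-based numbers as keys
# and the number strings as the scored continuations.
number_continuations = [str(i + 1) for i in range(len(options))]

print(freeform_prompt)
print(listed_prompt)
print(freeform_continuations, letter_continuations, number_continuations)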
lm_eval/models/gpt3.py
...@@ -66,12 +66,13 @@ def oa_completion(**kwargs):
        except openai.error.OpenAIError:
            import traceback
            traceback.print_exc()
+            with open(os.path.join(os.environ["QUESTION_RESULT_PATH"], "traceback.txt"), "a") as tb_file:
+                traceback.print_exc(file=tb_file)
            time.sleep(backoff_time)
            backoff_time *= 1.5
class GPT3LM(BaseLM):
-    REQ_CHUNK_SIZE = 20
+    REQ_CHUNK_SIZE = 40

    def __init__(self, engine, truncate=False, api_key=None, pass_strings=False):
        """
...@@ -87,6 +88,7 @@ class GPT3LM(BaseLM):
        import openai
        self.engine = engine
+        print(self.max_length)
        self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
        self.pass_strings = pass_strings
...@@ -156,7 +158,9 @@ class GPT3LM(BaseLM):
                inp = self.tok_decode(inp)
                inps.append(inp)
                ctxlens.append(ctxlen)
+            response = None
+            while True:
+                try:
                    response = oa_completion(
                        engine=self.engine,
                        prompt=inps,
...@@ -164,6 +168,12 @@ class GPT3LM(BaseLM):
                        max_tokens=1,
                        logprobs=10,
                    )
+                    break
+                except Exception as e:
+                    print(e)
+                    print("pausing")
+                    time.sleep(1)
+                    continue
            for resp, ctxlen, (cache_key, context_enc, continuation_enc) in zip(response.choices, ctxlens, chunk):
                answer = get_result(resp, ctxlen)
...@@ -204,10 +214,14 @@ class GPT3LM(BaseLM):
        for chunk, until in tqdm(list(sameuntil_chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))):
            inps = []
            for context, _ in chunk:
-                context_enc = self.tok_encode(context)
+                context_enc = self.tok_encode(context, max_length=self.max_length, truncation=False)
                inp = context_enc[-(self.max_length - self.max_gen_toks):]
                inps.append(self.tok_decode(inp))
+            response = None
+            while True:
+                try:
                    response = oa_completion(
                        engine=self.engine,
                        prompt=inps,
...@@ -217,6 +231,13 @@ class GPT3LM(BaseLM):
                        stop=until,
                    )
+                    break
+                except Exception as e:
+                    print(e)
+                    print("pausing")
+                    time.sleep(1)
+                    continue
            for resp, (context, until_) in zip(response.choices, chunk):
                s = resp['text']
...@@ -242,7 +263,6 @@ class GPT3LM(BaseLM):
class GooseAILM(GPT3LM):
    def __init__(self, engine, truncate=False, api_key=None, force_pile_tokenizer=False):
        super().__init__(engine, truncate=truncate, api_key=api_key or os.environ["GOOSEAI_API_SECRET_KEY"], pass_strings=True)
-        self.REQ_CHUNK_SIZE = 1
        import openai
        openai.api_base = "https://api.goose.ai/v1"
...
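
The changes above wrap each oa_completion call in an unbounded `while True` retry with a fixed one-second pause (on top of the backoff already inside oa_completion). Purely as a hedged sketch of an alternative, not code from this commit, a bounded retry helper with exponential backoff could look like the following; `request_fn` is a hypothetical stand-in for the completion call:

import time

def retry_with_backoff(request_fn, max_retries=5, base_delay=1.0):
    """Call `request_fn` until it succeeds or `max_retries` attempts all fail."""
    for attempt in range(max_retries):
        try:
            return request_fn()
        except Exception as exc:  # broad except mirrors the loops in the diff above
            if attempt == max_retries - 1:
                raise
            delay = base_delay * (2 ** attempt)
            print(f"Request failed ({exc}); retrying in {delay:.1f}s")
            time.sleep(delay)

# Hypothetical usage, mirroring the loglikelihood call above:
# response = retry_with_backoff(lambda: oa_completion(engine=self.engine, prompt=inps, max_tokens=1, logprobs=10))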
lm_eval/tasks/hendrycks_test.py
import csv
import random
from lm_eval.base import MultipleChoiceTask
+from lm_eval.mctask_experimental import MultipleChoiceDoc
from ..utils import sh
from pathlib import Path
from best_download import download_file
...@@ -63,26 +64,13 @@ class GeneralHendrycksTest(MultipleChoiceTask):
        return True

    def _convert_standard(self, doc):
-        def format_example(doc, choices):
-            """
-            Question: <prompt>
-            Choices:
-            A. <choice1>
-            B. <choice2>
-            C. <choice3>
-            D. <choice4>
-            Answer:
-            """
-            prompt = "Question: " + doc[0] + "\nChoices:\n"
-            prompt += "".join([f"{choices[j]}. {doc[j+1]}\n" for j in range(4)])
-            prompt += "Answer:"
-            return prompt
-        choices = ['A', 'B', 'C', 'D']
-        return {
-            "query": format_example(doc, choices),
-            "choices": doc[1:5],
-            "gold": choices.index(doc[5])
-        }
+        keys = ['A', 'B', 'C', 'D']
+        return MultipleChoiceDoc(
+            question=doc[0],
+            keys=keys,
+            options=doc[1:5],
+            gold=keys.index(doc[5])
+        )
    def _load_docs(self, filename):
        reader = csv.reader(open(filename, 'r'), quotechar='"', delimiter=',')
...@@ -113,6 +101,3 @@ class GeneralHendrycksTest(MultipleChoiceTask):
            self._fewshot_docs = list(self._load_docs(filename))
        return rnd.sample(list(self._fewshot_docs), k)
-    def doc_to_text(self, doc):
-        return doc["query"]
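
As a hedged, standalone sketch (row contents invented; assumes this commit's module is importable and MC_SETTING is set), the rewritten _convert_standard now turns one Hendrycks-test CSV row of the form [question, option1..option4, answer_letter] into a MultipleChoiceDoc like so:

import os
os.environ.setdefault("MC_SETTING", "letter")  # the module selects a task class at import time
from lm_eval.mctask_experimental import MultipleChoiceDoc

row = ["Which gas do plants absorb during photosynthesis?", "Oxygen", "Carbon dioxide", "Nitrogen", "Helium", "B"]
keys = ["A", "B", "C", "D"]
doc = MultipleChoiceDoc(question=row[0], keys=keys, options=row[1:5], gold=keys.index(row[5]))
print(doc.id, doc.gold)  # `id` is the SHA-224 digest of the question; `gold` is 1 here

mc-answer-prompt-experiment.sh (new file)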
# Usage:
# sh mc-answer-prompt-experiment.sh \
# -e <engine> \
# -k <number of examples> \
# -s <mc-setting = "freeform" | "option" | "letter" | "number"> \
while getopts e:k:s: flag
do
case "${flag}" in
e) engine=${OPTARG};;
k) k_shot=${OPTARG};;
s) setting=${OPTARG};;
esac
done
ENGINE=$engine
KSHOT=$k_shot
MC_SETTING=$setting
# Set environment variables.
#export GOOSEAI_API_SECRET_KEY=sk-
export MC_SETTING=$setting
# Setup paths.
RESULT_DIR=$(pwd)/mc-task-results/$ENGINE/$KSHOT-shot
mkdir -p $RESULT_DIR
export QUESTION_RESULT_PATH=$RESULT_DIR/$MC_SETTING
mkdir -p $RESULT_DIR/$MC_SETTING
# Tasks to run.
HENDRYCKS_TEST=hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions
# Runner function.
run_experiment(){
local curr_engine=$1
local setting=$2
local output_path=$RESULT_DIR/$setting
# Log stuff.
echo "\n"
echo "###################################################"
echo "PID: $PPID"
echo "MC Setting: $setting"
echo "Few-shot: $KSHOT"
echo "Current Engine: $curr_engine"
echo "Current Results Dir:\n$output_path"
echo "Start Time: $(date)"
echo "###################################################"
echo "\n"
python3 -m scripts.write_out --output_base_path $output_path --tasks hendrycksTest-abstract_algebra --sets test --num_fewshot $KSHOT
mv $output_path/hendrycksTest-abstract_algebra $output_path/example_prompt
python3 main.py \
--model gooseai \
--model_args engine=$curr_engine \
--tasks $HENDRYCKS_TEST \
--output_path $output_path/results.json \
--num_fewshot $KSHOT
# Test Call.
# python3 main.py \
# --device cpu \
# --model gpt2 \
# --tasks anagrams1 \
# --limit 2 \
# --output_path $output_path/results.json
}
# Run experiment.
touch $RESULT_DIR/$MC_SETTING/out.log
run_experiment $ENGINE $MC_SETTING > $RESULT_DIR/$MC_SETTING/out.log
# Setup subshells?
# ()