"docs/en/user_guides/backends_support.md" did not exist on "8a8da91bd8e4bca84557132ae67508c5d3b7385c"
Commit 8458afa8 authored by Jonathan Tow

Add initial mc-answer-prompt-experiment features

parent e63d1396
@@ -513,43 +513,8 @@ class Task(abc.ABC):
        example = self.doc_to_text(doc)
        return description + labeled_examples + example


class MultipleChoiceTask(Task, abc.ABC):

    def doc_to_target(self, doc):
        return " " + doc['choices'][doc['gold']]

    def construct_requests(self, doc, ctx):
        lls = [
            rf.loglikelihood(ctx, " {}".format(choice))[0]
            for choice in doc['choices']
        ]
        return lls

    def process_results(self, doc, results):
        gold = doc["gold"]
        acc = 1. if np.argmax(results) == gold else 0.
        completion_len = np.array([float(len(i)) for i in doc["choices"]])
        acc_norm = 1. if np.argmax(results / completion_len) == gold else 0.
        return {
            "acc": acc,
            "acc_norm": acc_norm,
        }

    def higher_is_better(self):
        return {
            "acc": True,
            "acc_norm": True,
        }

    def aggregation(self):
        return {
            "acc": mean,
            "acc_norm": mean,
        }


from lm_eval.mctask_experimental import MULTIPLE_CHOICE_TASK
MultipleChoiceTask = MULTIPLE_CHOICE_TASK
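# With MC_SETTING set in the environment, lm_eval.mctask_experimental picks one
# of its experimental prompt formats as MULTIPLE_CHOICE_TASK, and the alias
# above makes every task that subclasses MultipleChoiceTask use that format.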
class PerplexityTask(Task, abc.ABC):
......
import collections
import itertools
import os
import random
import lm_eval.metrics
import lm_eval.models
@@ -107,6 +108,9 @@ def evaluate(lm, task_dict, provide_description=None, num_fewshot=0, limit=None,
        Dictionary of results
    """
    # TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces

    print(f"{'='*20}")
    print(f"Task Module: {lm_eval.base.MultipleChoiceTask.__name__}")
    print(f"{'='*20}")

    # TODO: todo: implement proper description-providing system
    assert not provide_description  # not implemented.
......
""" Multiple Choice Format Experiments.
TODO: Generalize the formatting of fewshot examples.
"""
import os
import abc
import hashlib
from argparse import ArgumentError
from dataclasses import dataclass
import typing
from attr import field
import numpy as np
import lm_eval.base as base
from lm_eval.metrics import mean
@dataclass
class MultipleChoiceDoc:
question: str
# The possible answer keys, e.g. `["A", "B", "C", "D"]`.
# These should be the type as gold?
keys: typing.List[str]
options: typing.List[str]
gold: int
id: int = field(init=False)
def __post_init__(self):
self.id = hashlib.sha224(self.question.encode('utf-8')).hexdigest()
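
# Example (hypothetical values) of the resulting document shape:
#   doc = MultipleChoiceDoc(
#       question="What is the capital of France?",
#       keys=["A", "B", "C", "D"],
#       options=["Paris", "London", "Berlin", "Madrid"],
#       gold=0,
#   )
#   doc.id  # => SHA-224 hexdigest of the question string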

class BaseMultipleChoiceTask(base.Task, abc.ABC):

    def doc_to_text(self, doc: MultipleChoiceDoc):
        return self.format_prompt(doc)

    @abc.abstractclassmethod
    def format_prompt(cls, doc: MultipleChoiceDoc) -> str:
        pass

    @abc.abstractmethod
    def doc_to_target(self, doc: MultipleChoiceDoc) -> str:
        pass

    @abc.abstractmethod
    def loglikelihood_continuation(self, doc: MultipleChoiceDoc) -> typing.List[str]:
        pass

    def construct_requests(self, doc: MultipleChoiceDoc, ctx: str):
        lls = []
        conts = self.loglikelihood_continuation(doc)
        for cont in conts:
            lls.append(base.rf.loglikelihood(ctx, f" {cont}")[0])
        return lls

    def process_results(self, doc: MultipleChoiceDoc, results: typing.List):
        gold = doc.gold
        ans = np.argmax(results)
        is_correct = 1. if ans == gold else 0.

        # Normalize by completion length.
        conts = self.loglikelihood_continuation(doc)
        completion_len = np.array([float(len(i)) for i in conts])
        acc_norm = 1. if np.argmax(results / completion_len) == gold else 0.

        return {
            "acc": is_correct,
            "acc_norm": acc_norm,
            # Bundle answers: (id, answer key, answer index, is correct).
            "answer_bundle": (doc.id, doc.keys[ans], ans, is_correct),
        }

    def higher_is_better(self):
        return {
            "acc": True,
            "acc_norm": True,
            "answer_bundle": True,
        }

    def aggregation(self):
        return {
            "acc": mean,
            "acc_norm": mean,
            "answer_bundle": answer_bundle,
        }

def answer_bundle(items):
    """ Bundles answers into a csv file. """
    from pathlib import Path
    import csv

    cols = ["question_id", "model_answer", "model_answer_index", "is_correct"]
    rows = [*items]
    path = os.environ["QUESTION_RESULT_PATH"]
    with open(f'{path}/question-by-question-results.csv', 'w') as f:
        write = csv.writer(f)
        write.writerow(cols)
        write.writerows(rows)
    return 0
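
# `answer_bundle` is the aggregation registered for the "answer_bundle" metric
# above: `items` is the list of per-document tuples
# (question_id, model_answer, model_answer_index, is_correct), so the CSV at
# $QUESTION_RESULT_PATH/question-by-question-results.csv ends up with one row
# per question.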

def key2num(doc: MultipleChoiceDoc, key: str) -> str:
    return str(doc.keys.index(key) + 1)  # `+ 1` for 1-based indexing.


def format_key(key: str, type: str):
    """ Formats a multiple choice key. E.g.
        format_key("A", "period") => "A."
        format_key("A", "parens") => "(A)"
        format_key("A", "colon")  => "A:"
    Args:
    - type: "period" | "parens" | "colon"
    """
    if type == "parens":
        return f"({key})"
    elif type == "period":
        return f"{key}."
    elif type == "colon":
        return f"{key}:"
    else:
        raise ArgumentError(None, f"Unknown key format type: {type}")

class MC_NoOptionList_OptionLL_Task(BaseMultipleChoiceTask):
    """
    Format:
        Question: <question>
        Answer:
    Continuation:
        loglikelihood_continuation = <option_i>
    """

    def format_prompt(cls, doc: MultipleChoiceDoc) -> str:
        prompt = "Question: " + doc.question + "\n"
        prompt += "Answer:"
        return prompt
        # return _format_prompt(doc, list_options=False)

    def doc_to_target(self, doc: MultipleChoiceDoc) -> str:
        return " " + doc.options[doc.gold]

    def loglikelihood_continuation(self, doc: MultipleChoiceDoc) -> typing.List[str]:
        return [option for option in doc.options]

class MC_WithOptionList_OptionLL_Task(BaseMultipleChoiceTask):
    """
    Format:
        Question: <question>
        <key1>: <option1>
        <key2>: <option2>
        ...
        Answer:
    Continuation:
        loglikelihood_continuation = <option_i>
    """

    def format_prompt(cls, doc: MultipleChoiceDoc) -> str:
        prompt = "Question: " + doc.question + "\n"
        prompt += "\n".join([
            f"{format_key(doc.keys[i], 'colon')} {option}"
            for i, option in enumerate(doc.options)
        ])
        prompt += "\nAnswer:"
        return prompt

    def doc_to_target(self, doc: MultipleChoiceDoc) -> str:
        return " " + doc.options[doc.gold]

    def loglikelihood_continuation(self, doc: MultipleChoiceDoc) -> typing.List[str]:
        return [option for option in doc.options]

class MC_WithOptionList_LetterLL_Task(BaseMultipleChoiceTask):
    """
    Format:
        Question: <question>
        <key1>: <option1>
        <key2>: <option2>
        ...
        Answer:
    Continuation:
        loglikelihood_continuation = <key_i>
    """

    def format_prompt(cls, doc: MultipleChoiceDoc) -> str:
        prompt = "Question: " + doc.question + "\n"
        prompt += "\n".join([
            f"{format_key(doc.keys[i], 'colon')} {option}"
            for i, option in enumerate(doc.options)
        ])
        prompt += "\nAnswer:"
        return prompt

    def doc_to_target(self, doc: MultipleChoiceDoc) -> str:
        return " " + doc.keys[doc.gold]

    def loglikelihood_continuation(self, doc: MultipleChoiceDoc) -> typing.List[str]:
        return [key for key in doc.keys]

class MC_WithOptionList_NumLL_Task(BaseMultipleChoiceTask):
    """
    Format:
        Question: <question>
        1: <option1>
        2: <option2>
        ...
        Answer:
    Continuation:
        loglikelihood_continuation = <key2num(key_i)>
    """

    def format_prompt(cls, doc: MultipleChoiceDoc) -> str:
        prompt = "Question: " + doc.question + "\n"
        prompt += "\n".join([
            f"{format_key(key2num(doc, doc.keys[i]), 'colon')} {option}"
            for i, option in enumerate(doc.options)
        ])
        prompt += "\nAnswer:"
        return prompt

    def doc_to_target(self, doc: MultipleChoiceDoc) -> str:
        return f" {doc.gold + 1}"  # `+ 1` for 1-based indexing.

    def loglikelihood_continuation(self, doc: MultipleChoiceDoc) -> typing.List[str]:
        return [key2num(doc, key) for key in doc.keys]
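
# Illustration (hypothetical doc: keys=["A", "B", "C", "D"],
# options=["Paris", "London", "Berlin", "Madrid"], gold=0):
#   - "freeform": no option list; scores " Paris", " London", ... as continuations.
#   - "option":   lists "A: Paris" ... "D: Madrid", still scores the option strings.
#   - "letter":   same option list, but scores the keys " A", " B", " C", " D".
#   - "number":   renders the list as "1: Paris" ... "4: Madrid" and scores " 1".." 4".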

# TODO: Try to come up with a way to do this at runtime.
if os.environ["MC_SETTING"] == "freeform":
    MULTIPLE_CHOICE_TASK = MC_NoOptionList_OptionLL_Task
elif os.environ["MC_SETTING"] == "option":
    MULTIPLE_CHOICE_TASK = MC_WithOptionList_OptionLL_Task
elif os.environ["MC_SETTING"] == "letter":
    MULTIPLE_CHOICE_TASK = MC_WithOptionList_LetterLL_Task
elif os.environ["MC_SETTING"] == "number":
    MULTIPLE_CHOICE_TASK = MC_WithOptionList_NumLL_Task
else:
    raise ValueError("No such MC_SETTING: " + os.environ["MC_SETTING"])
\ No newline at end of file
@@ -66,12 +66,13 @@ def oa_completion(**kwargs):
        except openai.error.OpenAIError:
            import traceback
            traceback.print_exc()
            # Also save the traceback into the current run's result directory.
            with open(os.path.join(os.environ["QUESTION_RESULT_PATH"], "traceback.txt"), "a") as tb_file:
                traceback.print_exc(file=tb_file)
            time.sleep(backoff_time)
            backoff_time *= 1.5


class GPT3LM(BaseLM):
    REQ_CHUNK_SIZE = 20
    REQ_CHUNK_SIZE = 40

    def __init__(self, engine, truncate=False, api_key=None, pass_strings=False):
        """
@@ -87,6 +88,7 @@ class GPT3LM(BaseLM):
        import openai

        self.engine = engine
        print(self.max_length)
        self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
        self.pass_strings = pass_strings
@@ -156,14 +158,22 @@ class GPT3LM(BaseLM):
                inp = self.tok_decode(inp)
                inps.append(inp)
                ctxlens.append(ctxlen)

            response = oa_completion(
                engine=self.engine,
                prompt=inps,
                echo=True,
                max_tokens=1,
                logprobs=10,
            )
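            # Unlike oa_completion, which only retries on openai.error.OpenAIError,
            # the loop below retries the whole batched request on any Exception
            # (e.g. connection errors), pausing for a second between attempts.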
            response = None
            while True:
                try:
                    response = oa_completion(
                        engine=self.engine,
                        prompt=inps,
                        echo=True,
                        max_tokens=1,
                        logprobs=10,
                    )
                    break
                except Exception as e:
                    print(e)
                    print("pausing")
                    time.sleep(1)
                    continue

            for resp, ctxlen, (cache_key, context_enc, continuation_enc) in zip(response.choices, ctxlens, chunk):
                answer = get_result(resp, ctxlen)
@@ -204,18 +214,29 @@ class GPT3LM(BaseLM):
        for chunk, until in tqdm(list(sameuntil_chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))):
            inps = []
            for context, _ in chunk:
                context_enc = self.tok_encode(context)
                context_enc = self.tok_encode(context, max_length=self.max_length, truncation=False)
                inp = context_enc[-(self.max_length - self.max_gen_toks):]
                inps.append(self.tok_decode(inp))

            response = oa_completion(
                engine=self.engine,
                prompt=inps,
                max_tokens=self.max_gen_toks,
                temperature=0.,
                # logprobs=10,
                stop=until,
            )
            response = None
            while True:
                try:
                    response = oa_completion(
                        engine=self.engine,
                        prompt=inps,
                        max_tokens=self.max_gen_toks,
                        temperature=0.,
                        # logprobs=10,
                        stop=until,
                    )
                    break
                except Exception as e:
                    print(e)
                    print("pausing")
                    time.sleep(1)
                    continue

            for resp, (context, until_) in zip(response.choices, chunk):
                s = resp['text']
@@ -242,7 +263,6 @@ class GPT3LM(BaseLM):
class GooseAILM(GPT3LM):

    def __init__(self, engine, truncate=False, api_key=None, force_pile_tokenizer=False):
        super().__init__(engine, truncate=truncate, api_key=api_key or os.environ["GOOSEAI_API_SECRET_KEY"], pass_strings=True)
        self.REQ_CHUNK_SIZE = 1
        import openai
        openai.api_base = "https://api.goose.ai/v1"
@@ -264,4 +284,4 @@ class GooseAILM(GPT3LM):
    @property
    def max_gen_toks(self):
        return 64
\ No newline at end of file
        return 64
import csv
import random
from lm_eval.base import MultipleChoiceTask
from lm_eval.mctask_experimental import MultipleChoiceDoc
from ..utils import sh
from pathlib import Path
from best_download import download_file
@@ -63,26 +64,13 @@ class GeneralHendrycksTest(MultipleChoiceTask):
        return True

    def _convert_standard(self, doc):
        def format_example(doc, choices):
            """
            Question: <prompt>
            Choices:
            A. <choice1>
            B. <choice2>
            C. <choice3>
            D. <choice4>
            Answer:
            """
            prompt = "Question: " + doc[0] + "\nChoices:\n"
            prompt += "".join([f"{choices[j]}. {doc[j+1]}\n" for j in range(4)])
            prompt += "Answer:"
            return prompt

        choices = ['A', 'B', 'C', 'D']
        return {
            "query": format_example(doc, choices),
            "choices": doc[1:5],
            "gold": choices.index(doc[5])
        }
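        # In the experimental setup the raw CSV row maps straight onto a
        # MultipleChoiceDoc: doc[0] is the question, doc[1:5] the four options,
        # and doc[5] the gold answer letter ("A".."D"), converted to its
        # 0-based index. Prompt formatting now lives in the task class itself.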
        keys = ['A', 'B', 'C', 'D']
        return MultipleChoiceDoc(
            question=doc[0],
            keys=keys,
            options=doc[1:5],
            gold=keys.index(doc[5])
        )

    def _load_docs(self, filename):
        reader = csv.reader(open(filename, 'r'), quotechar='"', delimiter=',')
@@ -113,6 +101,3 @@ class GeneralHendrycksTest(MultipleChoiceTask):
            self._fewshot_docs = list(self._load_docs(filename))

        return rnd.sample(list(self._fewshot_docs), k)

    def doc_to_text(self, doc):
        return doc["query"]
# Usage:
#   sh mc-answer-prompt-experiment.sh \
#     -e <engine> \
#     -k <number of examples> \
#     -s <mc-setting = "freeform" | "option" | "letter" | "number">
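#
# Example (hypothetical engine name):
#   sh mc-answer-prompt-experiment.sh -e gpt-neo-20b -k 5 -s letter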
while getopts e:k:s: flag
do
    case "${flag}" in
        e) engine=${OPTARG};;
        k) k_shot=${OPTARG};;
        s) setting=${OPTARG};;
    esac
done
ENGINE=$engine
KSHOT=$k_shot
MC_SETTING=$setting
# Set environment variables.
#export GOOSEAI_API_SECRET_KEY=sk-
export MC_SETTING=$setting
# Setup paths.
RESULT_DIR=$(pwd)/mc-task-results/$ENGINE/$KSHOT-shot
mkdir -p $RESULT_DIR
export QUESTION_RESULT_PATH=$RESULT_DIR/$MC_SETTING
mkdir -p $RESULT_DIR/$MC_SETTING
# Tasks to run.
HENDRYCKS_TEST=hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions
# Runner function.
run_experiment(){
local curr_engine=$1
local setting=$2
local output_path=$RESULT_DIR/$setting
# Log stuff.
echo "\n"
echo "###################################################"
echo "PID: $PPID"
echo "MC Setting: $setting"
echo "Few-shot: $KSHOT"
echo "Current Engine: $curr_engine"
echo "Current Results Dir:\n$output_path"
echo "Start Time: $(date)"
echo "###################################################"
echo "\n"
python3 -m scripts.write_out --output_base_path $output_path --tasks hendrycksTest-abstract_algebra --sets test --num_fewshot $KSHOT
mv $output_path/hendrycksTest-abstract_algebra $output_path/example_prompt
python3 main.py \
--model gooseai \
--model_args engine=$curr_engine \
--tasks $HENDRYCKS_TEST \
--output_path $output_path/results.json \
--num_fewshot $KSHOT
# Test Call.
# python3 main.py \
# --device cpu \
# --model gpt2 \
# --tasks anagrams1 \
# --limit 2 \
# --output_path $output_path/results.json
}
# Run experiment.
touch $RESULT_DIR/$MC_SETTING/out.log
run_experiment $ENGINE $MC_SETTING > $RESULT_DIR/$MC_SETTING/out.log
# Setup subshells?
# ()