gaoqiong / lm-evaluation-harness
"docs/en/user_guides/backends_support.md" did not exist on "8a8da91bd8e4bca84557132ae67508c5d3b7385c"
Commit 8458afa8, authored Feb 24, 2022 by Jonathan Tow
Add initial mc-answer-prompt-experiment features
Parent: e63d1396
Showing 6 changed files with 358 additions and 80 deletions (+358, -80)
lm_eval/base.py                  +2    -37
lm_eval/evaluator.py             +4    -0
lm_eval/mctask_experimental.py   +229  -0
lm_eval/models/gpt3.py           +40   -20
lm_eval/tasks/hendrycks_test.py  +8    -23
mc-answer-prompt-experiment.sh   +75   -0
lm_eval/base.py

@@ -513,43 +513,8 @@ class Task(abc.ABC):
        example = self.doc_to_text(doc)
        return description + labeled_examples + example

-class MultipleChoiceTask(Task, abc.ABC):
-    def doc_to_target(self, doc):
-        return " " + doc['choices'][doc['gold']]
-
-    def construct_requests(self, doc, ctx):
-        lls = [
-            rf.loglikelihood(ctx, " {}".format(choice))[0]
-            for choice in doc['choices']
-        ]
-        return lls
-
-    def process_results(self, doc, results):
-        gold = doc["gold"]
-        acc = 1. if np.argmax(results) == gold else 0.
-        completion_len = np.array([float(len(i)) for i in doc["choices"]])
-        acc_norm = 1. if np.argmax(results / completion_len) == gold else 0.
-        return {
-            "acc": acc,
-            "acc_norm": acc_norm,
-        }
-
-    def higher_is_better(self):
-        return {
-            "acc": True,
-            "acc_norm": True,
-        }
-
-    def aggregation(self):
-        return {
-            "acc": mean,
-            "acc_norm": mean,
-        }
+from lm_eval.mctask_experimental import MULTIPLE_CHOICE_TASK
+MultipleChoiceTask = MULTIPLE_CHOICE_TASK


class PerplexityTask(Task, abc.ABC):
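With this change, MultipleChoiceTask in lm_eval/base.py is no longer defined in place; it becomes an alias for whichever experimental class lm_eval/mctask_experimental.py selects from the MC_SETTING environment variable, so task modules that import MultipleChoiceTask pick up the chosen prompt format without modification. A minimal standalone sketch of that import-time selection pattern follows (the module, class, and setting names in the sketch are illustrative, not from the harness):

# alias_demo.py -- illustrative sketch only, not part of this commit.
import os

class FreeformTask:
    """Stand-in for one prompt-format variant."""

class LetterTask:
    """Stand-in for another prompt-format variant."""

_SETTINGS = {"freeform": FreeformTask, "letter": LetterTask}

# The public name is bound once, at import time; downstream code that does
# "from alias_demo import SelectedTask" never sees the indirection.
SelectedTask = _SETTINGS[os.environ.get("MC_SETTING", "freeform")]

Unlike this sketch, the commit reads os.environ["MC_SETTING"] directly, so the variable must be exported (the runner script at the end of this commit does that) before anything imports lm_eval.base.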
lm_eval/evaluator.py

import collections
import itertools
import os
import random

import lm_eval.metrics
import lm_eval.models

@@ -107,6 +108,9 @@ def evaluate(lm, task_dict, provide_description=None, num_fewshot=0, limit=None,
        Dictionary of results
    """
    # TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces
+    print(f"{'=' * 20}")
+    print(f"Task Module: {lm_eval.base.MultipleChoiceTask.__name__}")
+    print(f"{'=' * 20}")

    # TODO: todo: implement proper description-providing system
    assert not provide_description  # not implemented.
lm_eval/mctask_experimental.py (new file, 0 → 100644)

""" Multiple Choice Format Experiments.

TODO: Generalize the formatting of fewshot examples.
"""
import os
import abc
import hashlib
from argparse import ArgumentError
from dataclasses import dataclass
import typing

from attr import field

import numpy as np

import lm_eval.base as base
from lm_eval.metrics import mean


@dataclass
class MultipleChoiceDoc:
    question: str
    # The possible answer keys, e.g. `["A", "B", "C", "D"]`.
    # These should be the same type as `gold`?
    keys: typing.List[str]
    options: typing.List[str]
    gold: int
    id: int = field(init=False)

    def __post_init__(self):
        self.id = hashlib.sha224(self.question.encode('utf-8')).hexdigest()


class BaseMultipleChoiceTask(base.Task, abc.ABC):

    def doc_to_text(self, doc: MultipleChoiceDoc):
        return self.format_prompt(doc)

    @abc.abstractclassmethod
    def format_prompt(cls, doc: MultipleChoiceDoc) -> str:
        pass

    @abc.abstractmethod
    def doc_to_target(self, doc: MultipleChoiceDoc) -> str:
        pass

    @abc.abstractmethod
    def loglikelihood_continuation(self, doc: MultipleChoiceDoc) -> typing.List[str]:
        pass

    def construct_requests(self, doc: MultipleChoiceDoc, ctx: str):
        lls = []
        conts = self.loglikelihood_continuation(doc)
        for cont in conts:
            lls.append(base.rf.loglikelihood(ctx, f" {cont}")[0])
        return lls

    def process_results(self, doc: MultipleChoiceDoc, results: typing.List):
        gold = doc.gold
        ans = np.argmax(results)
        is_correct = 1. if ans == gold else 0.

        # Normalize by completion length.
        conts = self.loglikelihood_continuation(doc)
        completion_len = np.array([float(len(i)) for i in conts])
        acc_norm = 1. if np.argmax(results / completion_len) == gold else 0.

        return {
            "acc": is_correct,
            "acc_norm": acc_norm,
            # Bundle answers: (id, answer key, answer index, is correct).
            "answer_bundle": (doc.id, doc.keys[ans], ans, is_correct),
        }

    def higher_is_better(self):
        return {
            "acc": True,
            "acc_norm": True,
            "answer_bundle": True,
        }

    def aggregation(self):
        return {
            "acc": mean,
            "acc_norm": mean,
            "answer_bundle": answer_bundle
        }


def answer_bundle(items):
    """ Bundles answers into a csv file. """
    from pathlib import Path
    import csv
    cols = ["question_id", "model_answer", "model_answer_index", "is_correct"]
    rows = [*items]
    path = os.environ["QUESTION_RESULT_PATH"]
    with open(f'{path}/question-by-question-results.csv', 'w') as f:
        write = csv.writer(f)
        write.writerow(cols)
        write.writerows(rows)
    return 0


def key2num(doc: MultipleChoiceDoc, key: str) -> int:
    return str(doc.keys.index(key) + 1)  # `+ 1` for 1-based indexing.


def format_key(key: str, type: str):
    """ Formats a multiple choice key. E.g.
        format_key("A", "period") => "A."
        format_key("A", "parens") => "(A)"
        format_key("A", "colon")  => "A:"
    Args:
        - type: "period" | "parens" | "colon"
    """
    if type == "parens":
        return f"({key})"
    elif type == "period":
        return f"{key}."
    elif type == "colon":
        return f"{key}:"
    else:
        raise ArgumentError()


class MC_NoOptionList_OptionLL_Task(BaseMultipleChoiceTask):
    """
    Format:
        Question: <question>
        Answer:
    Continuation:
        loglikelihood_continuation = <option_i>
    """

    def format_prompt(cls, doc: MultipleChoiceDoc) -> str:
        prompt = "Question: " + doc.question + "\n"
        prompt += "Answer:"
        return prompt
        # return _format_prompt(doc, list_options=False)

    def doc_to_target(self, doc: MultipleChoiceDoc) -> str:
        return " " + doc.options[doc.gold]

    def loglikelihood_continuation(self, doc: MultipleChoiceDoc) -> typing.List[str]:
        return [option for option in doc.options]


class MC_WithOptionList_OptionLL_Task(BaseMultipleChoiceTask):
    """
    Format:
        Question: <question>
        <key1>: <option1>
        <key2>: <option2>
        ...
        Answer:
    Continuation:
        loglikelihood_continuation = <option_i>
    """

    def format_prompt(cls, doc: MultipleChoiceDoc) -> str:
        prompt = "Question: " + doc.question + "\n"
        prompt += "\n".join([
            f"{format_key(doc.keys[i], 'colon')} {option}"
            for i, option in enumerate(doc.options)
        ])
        prompt += "\nAnswer:"
        return prompt

    def doc_to_target(self, doc: MultipleChoiceDoc) -> str:
        return " " + doc.options[doc.gold]

    def loglikelihood_continuation(self, doc: MultipleChoiceDoc) -> typing.List[str]:
        return [option for option in doc.options]


class MC_WithOptionList_LetterLL_Task(BaseMultipleChoiceTask):
    """
    Format:
        Question: <question>
        <key1>: <option1>
        <key2>: <option2>
        ...
        Answer:
    Continuation:
        loglikelihood_continuation = <key_i>
    """

    def format_prompt(cls, doc: MultipleChoiceDoc) -> str:
        prompt = "Question: " + doc.question + "\n"
        prompt += "\n".join([
            f"{format_key(doc.keys[i], 'colon')} {option}"
            for i, option in enumerate(doc.options)
        ])
        prompt += "\nAnswer:"
        return prompt

    def doc_to_target(self, doc: MultipleChoiceDoc) -> str:
        return " " + doc.keys[doc.gold]

    def loglikelihood_continuation(self, doc: MultipleChoiceDoc) -> typing.List[str]:
        return [key for key in doc.keys]


class MC_WithOptionList_NumLL_Task(BaseMultipleChoiceTask):
    """
    Format:
        Question: <question>
        1: <option1>
        2: <option2>
        ...
        Answer:
    Continuation:
        loglikelihood_continuation = <key2num(key_i)>
    """

    def format_prompt(cls, doc: MultipleChoiceDoc) -> str:
        prompt = "Question: " + doc.question + "\n"
        prompt += "\n".join([
            f"{format_key(key2num(doc, doc.keys[i]), 'colon')} {option}"
            for i, option in enumerate(doc.options)
        ])
        prompt += "\nAnswer:"
        return prompt

    def doc_to_target(self, doc: MultipleChoiceDoc) -> str:
        return f"{doc.gold + 1}"  # `+ 1` for 1-based indexing.

    def loglikelihood_continuation(self, doc: MultipleChoiceDoc) -> typing.List[str]:
        return [key2num(doc, key) for key in doc.keys]


# TODO: Try to come up with a way to do this at runtime.
if os.environ["MC_SETTING"] == "freeform":
    MULTIPLE_CHOICE_TASK = MC_NoOptionList_OptionLL_Task
elif os.environ["MC_SETTING"] == "option":
    MULTIPLE_CHOICE_TASK = MC_WithOptionList_OptionLL_Task
elif os.environ["MC_SETTING"] == "letter":
    MULTIPLE_CHOICE_TASK = MC_WithOptionList_LetterLL_Task
elif os.environ["MC_SETTING"] == "number":
    MULTIPLE_CHOICE_TASK = MC_WithOptionList_NumLL_Task
else:
    print("No such MC_SETTING:", os.environ["MC_SETTING"])
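For orientation, the following standalone sketch renders the prompt text that each MC_SETTING variant above produces for a made-up question (the question and options are invented; the real classes build these strings through format_prompt and format_key):

# Illustrative only: mirrors the prompt construction above without importing the harness.
question = "Which gas makes up most of Earth's atmosphere?"
keys = ["A", "B", "C", "D"]
options = ["Oxygen", "Nitrogen", "Argon", "Carbon dioxide"]

# "freeform" (MC_NoOptionList_OptionLL_Task): no option list; the scored
# continuations are the option strings themselves.
freeform = f"Question: {question}\nAnswer:"

# "option" and "letter" (MC_WithOptionList_{Option,Letter}LL_Task): letter-keyed
# option list; the two settings differ only in whether the option text or the
# letter key is scored as the continuation.
lettered = (
    f"Question: {question}\n"
    + "\n".join(f"{k}: {o}" for k, o in zip(keys, options))
    + "\nAnswer:"
)

# "number" (MC_WithOptionList_NumLL_Task): 1-based numeric keys; the scored
# continuations are the numbers.
numbered = (
    f"Question: {question}\n"
    + "\n".join(f"{i + 1}: {o}" for i, o in enumerate(options))
    + "\nAnswer:"
)

print(freeform, lettered, numbered, sep="\n\n---\n\n")

Note that the module picks MULTIPLE_CHOICE_TASK once at import time: an unset MC_SETTING raises KeyError, and an unrecognized value only prints a message, which then surfaces as an ImportError when lm_eval/base.py imports MULTIPLE_CHOICE_TASK.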
lm_eval/models/gpt3.py

@@ -66,12 +66,13 @@ def oa_completion(**kwargs):
        except openai.error.OpenAIError:
            import traceback
-            traceback.print_exc()
+            traceback.print_exc(file=os.path.join(os.environ["QUESTION_RESULT_PATH"], "traceback.txt"))
            time.sleep(backoff_time)
            backoff_time *= 1.5


class GPT3LM(BaseLM):
-    REQ_CHUNK_SIZE = 20
+    REQ_CHUNK_SIZE = 40

    def __init__(self, engine, truncate=False, api_key=None, pass_strings=False):
        """

@@ -87,6 +88,7 @@ class GPT3LM(BaseLM):
        import openai

        self.engine = engine
        print(self.max_length)
        self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
        self.pass_strings = pass_strings

@@ -156,14 +158,22 @@ class GPT3LM(BaseLM):
                inp = self.tok_decode(inp)
                inps.append(inp)
                ctxlens.append(ctxlen)

-            response = oa_completion(
-                engine=self.engine,
-                prompt=inps,
-                echo=True,
-                max_tokens=1,
-                logprobs=10,
-            )
+            response = None
+            while True:
+                try:
+                    response = oa_completion(
+                        engine=self.engine,
+                        prompt=inps,
+                        echo=True,
+                        max_tokens=1,
+                        logprobs=10,
+                    )
+                    break
+                except Exception as e:
+                    print(e)
+                    print("pausing")
+                    time.sleep(1)
+                    continue

            for resp, ctxlen, (cache_key, context_enc, continuation_enc) in zip(response.choices, ctxlens, chunk):
                answer = get_result(resp, ctxlen)

@@ -204,18 +214,29 @@ class GPT3LM(BaseLM):
        for chunk, until in tqdm(list(sameuntil_chunks(reord.get_reordered(), self.REQ_CHUNK_SIZE))):
            inps = []
            for context, _ in chunk:
-                context_enc = self.tok_encode(context)
+                context_enc = self.tok_encode(context, max_length=self.max_length, truncation=False)
                inp = context_enc[-(self.max_length - self.max_gen_toks):]
                inps.append(self.tok_decode(inp))

-            response = oa_completion(
-                engine=self.engine,
-                prompt=inps,
-                max_tokens=self.max_gen_toks,
-                temperature=0.,
-                # logprobs=10,
-                stop=until,
-            )
+            response = None
+            while True:
+                try:
+                    response = oa_completion(
+                        engine=self.engine,
+                        prompt=inps,
+                        max_tokens=self.max_gen_toks,
+                        temperature=0.,
+                        # logprobs=10,
+                        stop=until,
+                    )
+                    break
+                except Exception as e:
+                    print(e)
+                    print("pausing")
+                    time.sleep(1)
+                    continue

            for resp, (context, until_) in zip(response.choices, chunk):
                s = resp['text']

@@ -242,7 +263,6 @@ class GPT3LM(BaseLM):
class GooseAILM(GPT3LM):
    def __init__(self, engine, truncate=False, api_key=None, force_pile_tokenizer=False):
        super().__init__(engine, truncate=truncate, api_key=api_key or os.environ["GOOSEAI_API_SECRET_KEY"], pass_strings=True)
        self.REQ_CHUNK_SIZE = 1
        import openai
        openai.api_base = "https://api.goose.ai/v1"

@@ -264,4 +284,4 @@ class GooseAILM(GPT3LM):
    @property
    def max_gen_toks(self):
-        return 64
\ No newline at end of file
+        return 64
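The loglikelihood and greedy-generation paths above now wrap each oa_completion call in a while True / try loop that prints the error, sleeps one second, and retries indefinitely. A reusable helper in the same spirit could look like the sketch below; it is illustrative only, not part of the commit (the commit keeps the retry inline with a fixed one-second pause, while this helper adds optional attempt capping and backoff):

import time

def call_with_retry(fn, max_tries=None, delay=1.0, backoff=1.0):
    """Call fn() until it succeeds, pausing between attempts."""
    attempt = 0
    while True:
        try:
            return fn()
        except Exception as e:  # the inline loops above also catch bare Exception
            attempt += 1
            if max_tries is not None and attempt >= max_tries:
                raise
            print(e)
            print("pausing")
            time.sleep(delay)
            delay *= backoff

# Hypothetical usage (oa_completion and its arguments as defined above):
# response = call_with_retry(lambda: oa_completion(engine=engine, prompt=inps, max_tokens=1))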
lm_eval/tasks/hendrycks_test.py

import csv
import random
from lm_eval.base import MultipleChoiceTask
+from lm_eval.mctask_experimental import MultipleChoiceDoc
from ..utils import sh
from pathlib import Path
from best_download import download_file

@@ -63,26 +64,13 @@ class GeneralHendrycksTest(MultipleChoiceTask):
        return True

    def _convert_standard(self, doc):
-        def format_example(doc, choices):
-            """
-            Question: <prompt>
-            Choices:
-            A. <choice1>
-            B. <choice2>
-            C. <choice3>
-            D. <choice4>
-            Answer:
-            """
-            prompt = "Question: " + doc[0] + "\nChoices:\n"
-            prompt += "".join([f"{choices[j]}. {doc[j + 1]}\n" for j in range(4)])
-            prompt += "Answer:"
-            return prompt
-        choices = ['A', 'B', 'C', 'D']
-        return {
-            "query": format_example(doc, choices),
-            "choices": doc[1:5],
-            "gold": choices.index(doc[5])
-        }
+        keys = ['A', 'B', 'C', 'D']
+        return MultipleChoiceDoc(
+            question=doc[0],
+            keys=keys,
+            options=doc[1:5],
+            gold=keys.index(doc[5])
+        )

    def _load_docs(self, filename):
        reader = csv.reader(open(filename, 'r'), quotechar='"', delimiter=',')

@@ -113,6 +101,3 @@ class GeneralHendrycksTest(MultipleChoiceTask):
            self._fewshot_docs = list(self._load_docs(filename))
        return rnd.sample(list(self._fewshot_docs), k)
-
-    def doc_to_text(self, doc):
-        return doc["query"]
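_convert_standard now turns each raw MMLU CSV row (question, four options, gold answer letter) directly into a MultipleChoiceDoc rather than a query/choices/gold dict, leaving prompt construction to the MC_SETTING-selected task class. A small illustration with a made-up row, written with plain values so it runs without the harness:

# Hypothetical CSV row: question, four options, then the gold answer letter.
row = ["What is 2 + 2?", "3", "4", "5", "6", "B"]
keys = ["A", "B", "C", "D"]

# Mirrors _convert_standard: the doc keeps the raw fields; prompt text is
# produced later by the selected task's format_prompt.
doc_fields = {
    "question": row[0],          # "What is 2 + 2?"
    "keys": keys,                # ["A", "B", "C", "D"]
    "options": row[1:5],         # ["3", "4", "5", "6"]
    "gold": keys.index(row[5]),  # 1
}
print(doc_fields)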
mc-answer-prompt-experiment.sh (new file, 0 → 100644)

# Usage:
#   sh mc-answer-prompt-experiment.sh \
#       -e <engine> \
#       -k <number of examples> \
#       -s <mc-setting = "freeform" | "option" | "letter" | "number"> \

while getopts e:k:s: flag
do
    case "${flag}" in
        e) engine=${OPTARG};;
        k) k_shot=${OPTARG};;
        s) setting=${OPTARG};;
    esac
done

ENGINE=$engine
KSHOT=$k_shot
MC_SETTING=$setting

# Set environment variables.
#export GOOSEAI_API_SECRET_KEY=sk-
export MC_SETTING=$setting

# Setup paths.
RESULT_DIR=$(pwd)/mc-task-results/$ENGINE/$KSHOT-shot
mkdir -p $RESULT_DIR
export QUESTION_RESULT_PATH=$RESULT_DIR/$MC_SETTING
mkdir -p $RESULT_DIR/$MC_SETTING

# Tasks to run.
HENDRYCKS_TEST=hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions

# Runner function.
run_experiment(){
    local curr_engine=$1
    local setting=$2
    local output_path=$RESULT_DIR/$setting

    # Log stuff.
    echo "\n"
    echo "###################################################"
    echo "PID: $PPID"
    echo "MC Setting: $setting"
    echo "Few-shot: $KSHOT"
    echo "Current Engine: $curr_engine"
    echo "Current Results Dir: \n $output_path"
    echo "Start Time: $(date)"
    echo "###################################################"
    echo "\n"

    python3 -m scripts.write_out --output_base_path $output_path --tasks hendrycksTest-abstract_algebra --sets test --num_fewshot $KSHOT
    mv $output_path/hendrycksTest-abstract_algebra $output_path/example_prompt

    python3 main.py \
        --model gooseai \
        --model_args engine=$curr_engine \
        --tasks $HENDRYCKS_TEST \
        --output_path $output_path/results.json \
        --num_fewshot $KSHOT

    # Test Call.
    # python3 main.py \
    #     --device cpu \
    #     --model gpt2 \
    #     --tasks anagrams1 \
    #     --limit 2 \
    #     --output_path $output_path/results.json
}

# Run experiment.
touch $RESULT_DIR/$MC_SETTING/out.log
run_experiment $ENGINE $MC_SETTING > $RESULT_DIR/$MC_SETTING/out.log

# Setup subshells?
# ()