gaoqiong / lm-evaluation-harness · Commits

Commit 884c29fb
Authored Mar 21, 2021 by Charles Foster

Bring SQuAD fork up to date with EAI upstream

Parents: 232c9ab6, 8809c5f1
Showing 6 changed files with 181 additions and 1 deletion (+181 -1)

  scripts/cost_estimate.py                   +69  -0
  scripts/fewshot_description_experiment.py  +83  -0
  scripts/get_prompts.py                     +19  -0
  scripts/make_table_tasks.py                 +0  -0
  tests/test_models.py                        +7  -1
  tests/test_tasks.py                         +3  -0
scripts/cost_estimate.py (new file, mode 0 → 100644)
import argparse
import json
import numpy as np
import random
import itertools
import collections
import logging

from lm_eval import models, tasks, evaluator, base
import random
from lm_eval.base import LM

import transformers


class DryrunLM(LM):
    def __init__(self):
        self.tokencost = 0
        self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')
        self.tokenizer.pad_token = "<|endoftext|>"

    @classmethod
    def create_from_arg_string(cls, arg_string):
        return cls()

    def loglikelihood(self, requests):
        res = []

        for ctx, cont in requests:
            res.append((-random.random(), False))
            self.tokencost += len(self.tokenizer.tokenize(ctx + cont))

        return res

    def greedy_until(self, requests):
        res = []

        for ctx, until in requests:
            res.append("lol")

            # assume worst case - generates until 256
            self.tokencost += len(self.tokenizer.tokenize(ctx)) + 256

        return res


def main():
    lm = DryrunLM()

    values = []
    for taskname in list(tasks.TASK_REGISTRY.keys()):
        lm.tokencost = 0
        evaluator.evaluate(lm, {taskname: tasks.get_task(taskname)()}, False, 0, None)
        print(taskname, lm.tokencost)
        values.append([taskname, lm.tokencost, lm.tokencost / 1000 * 0.06])

    from pytablewriter import MarkdownTableWriter

    writer = MarkdownTableWriter()
    writer.headers = ["Task", "Tokens", "Davinci Cost"]

    values.sort(key=lambda x: -x[1])
    totcost = sum([x[1] for x in values])
    values.append(["**Total**", totcost, totcost / 1000 * 0.06])

    writer.value_matrix = values

    print(writer.dumps())


if __name__ == "__main__":
    main()
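For reference, the dollar figure this script prints per task is just a token count scaled by the Davinci rate it assumes ($0.06 per 1,000 tokens). A minimal standalone sketch of the same accounting for one loglikelihood request, reusing the GPT-2 tokenizer from above; the context and continuation strings are made up for illustration:

import transformers

# Same tokenizer DryrunLM uses, so token counts need no API calls.
tokenizer = transformers.GPT2TokenizerFast.from_pretrained('gpt2')

ctx = "Question: Who wrote War and Peace?\nAnswer:"  # hypothetical context
cont = " Leo Tolstoy"                                # hypothetical continuation

# A loglikelihood request is costed on context + continuation tokens.
tokens = len(tokenizer.tokenize(ctx + cont))

# Davinci pricing assumed in cost_estimate.py: $0.06 per 1,000 tokens.
print(tokens, "tokens -> $%.6f" % (tokens / 1000 * 0.06))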
scripts/fewshot_description_experiment.py (new file, mode 0 → 100644)
import argparse
import json
import numpy as np
import random
import itertools
import collections
import logging

from lm_eval import models, tasks, evaluator, base

logging.getLogger("openai").setLevel(logging.WARNING)

fewshot_descriptions = ["foo", "bar"]
task = "lambada"
num_fewshot = 0
model = "gpt2"
model_args = ""
limit = None
no_cache = False


class CustomDescTask:
    def __init__(self, task, desc):
        self.task = task
        self.desc = desc

        def fewshot_description():
            return self.desc

        self.task.fewshot_description = fewshot_description

    def __getattr__(self, attr):
        return getattr(self.task, attr)


def main():
    random.seed(42)
    np.random.seed(42)

    lm = models.get_model(model).create_from_arg_string(model_args)

    if limit:
        print("WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.")

    if not no_cache:
        lm = base.CachingLM(lm, 'lm_cache/' + model + '_' + model_args.replace('=', '-').replace(',', '_') + '.db')

    task_dict = tasks.get_task_dict([task])

    for desc in fewshot_descriptions:
        custom_task_dict = {k: CustomDescTask(v, desc) for k, v in task_dict.items()}

        results = evaluator.evaluate(lm, custom_task_dict, True, num_fewshot, limit)

        dumped = json.dumps(results, indent=2)

        print('Description:', desc)
        print(dumped)

        # MAKE TABLE
        from pytablewriter import MarkdownTableWriter

        writer = MarkdownTableWriter()
        writer.headers = ["Task", "Metric", "Value"]

        values = []

        for k, dic in results.items():
            for m, v in dic.items():
                values.append([k, m, '%.4f' % v])
                k = ""
        writer.value_matrix = values

        print(writer.dumps())


if __name__ == "__main__":
    main()
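The piece worth calling out in this script is CustomDescTask: it swaps in a custom zero-shot description on the wrapped task and lets __getattr__ delegate every other attribute. A self-contained illustration of that pattern follows; DummyTask is invented for this sketch and is not part of lm_eval:

class DummyTask:
    # Stand-in for a real lm_eval task, invented only for this sketch.
    def fewshot_description(self):
        return "original description"

    def doc_to_text(self, doc):
        return doc["text"]


class CustomDescTask:
    # Same wrapper as defined in the script above.
    def __init__(self, task, desc):
        self.task = task
        self.desc = desc

        def fewshot_description():
            return self.desc

        self.task.fewshot_description = fewshot_description

    def __getattr__(self, attr):
        # Anything not found on the wrapper falls through to the real task.
        return getattr(self.task, attr)


wrapped = CustomDescTask(DummyTask(), "Answer the question.")
print(wrapped.fewshot_description())        # prints: Answer the question.
print(wrapped.doc_to_text({"text": "hi"}))  # prints: hi (delegated unchanged)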
scripts/get_prompts.py (new file, mode 0 → 100644)
from lm_eval import tasks
from itertools import islice

ct = 3

for tname, Task in tasks.TASK_REGISTRY.items():  # [('record', tasks.superglue.ReCoRD)]:#
    task = Task()

    print('#', tname)

    docs = islice(task.validation_docs() if task.has_validation_docs() else task.test_docs(), ct)

    print()
    print('**Zero-Shot Prompt**:', "\n```\n" + task.fewshot_description() + "\n```\n")

    for i in range(ct):
        print()
        doc = next(docs)
        print("**Context**:", "\n```\n" + task.doc_to_text(doc) + "\n```\n")
        print()
        print('**Target**:', "\n```\n" + task.doc_to_target(doc) + "\n```\n")
        print()
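The commented-out list on the for line suggests previewing a single task instead of the whole registry. A sketch of that narrower use, relying only on the registry and task methods already used above ('lambada' is just an example key and may differ by harness version):

from itertools import islice
from lm_eval import tasks

# Preview a few documents for one registered task.
# 'lambada' is only an example; any key in tasks.TASK_REGISTRY works.
task = tasks.TASK_REGISTRY['lambada']()

docs = task.validation_docs() if task.has_validation_docs() else task.test_docs()
for doc in islice(docs, 3):
    print(task.doc_to_text(doc))
    print(task.doc_to_target(doc))
    print('---')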
scripts/make_table.py → scripts/make_table_tasks.py (file moved, no content changes)
tests/test_models.py

@@ -12,4 +12,10 @@ def test_gpt2():
     assert not ig_cat
 
     # test empty context
-    gpt2.loglikelihood([('', 'test')])
\ No newline at end of file
+    gpt2.loglikelihood([('', 'test')])
+
+    gen, = gpt2.greedy_until([
+        ('The quick brown fox jumps over the lazy', ['.', '\n'])
+    ])
+
+    assert gen == ', lazy fox and they both fall to the ground'
\ No newline at end of file
tests/test_tasks.py

@@ -75,6 +75,9 @@ def test_documents_and_requests(taskname, Task):
         assert tgt[0] == ' ' or txt[-1] == '\n'
 
         reqs = task.construct_requests(doc, txt)
+        # construct_requests can return just one request
+        if not isinstance(reqs, (list, tuple)):
+            reqs = [reqs]
 
         # todo: mock lm after refactoring evaluator.py to not be a mess
         for req in reqs:
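The three added lines normalize the return value of construct_requests, which may be a single request or a list/tuple of requests. The same idiom shown standalone; the placeholder strings merely stand in for real request objects:

def as_request_list(reqs):
    # construct_requests may return one request or a list/tuple of them;
    # downstream loops always want something iterable.
    if not isinstance(reqs, (list, tuple)):
        reqs = [reqs]
    return list(reqs)

print(as_request_list('single request'))    # -> ['single request']
print(as_request_list(['req a', 'req b']))  # -> ['req a', 'req b']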