gaoqiong / lm-evaluation-harness · Commits

Commit 6862fa7d (unverified)
Authored Jul 14, 2023 by Lintang Sutawika; committed by GitHub on Jul 14, 2023

Merge pull request #676 from EleutherAI/new-flags

[Refactor] Miscellaneous fixes

Parents: 98c85d73, f7dde0c3

Showing 10 changed files with 90 additions and 65 deletions (+90 −65)
lm_eval/api/instance.py                            +7   −7
lm_eval/api/task.py                                +12  −15
lm_eval/evaluator.py                               +34  −17
lm_eval/models/huggingface.py                      +2   −0
lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py      +4   −6
lm_eval/tasks/pubmedqa/pubmedqa.yaml               +1   −1
lm_eval/tasks/race/preprocess_race.py              +8   −4
lm_eval/tasks/swag/swag.yaml                       +1   −1
lm_eval/tasks/winogrande/preprocess_winogrande.py  +4   −1
main.py                                            +17  −13
lm_eval/api/instance.py

@@ -4,13 +4,13 @@ from typing import Literal, Tuple

 @dataclass
 class Instance:
-    request_type: str = Literal["loglikelihood", "loglikelihood_rolling", "greedy_until"]
-    doc: dict = None
-    arguments: tuple = None
-    idx: int = None
-    metadata: tuple = Tuple[str, int, int]  # TODO: better typehints here
+    request_type: Literal["loglikelihood", "loglikelihood_rolling", "greedy_until"]
+    doc: dict
+    arguments: tuple
+    idx: int
+    metadata: Tuple[str, int, int] = field(
+        default_factory=lambda: (None, None, None)
+    )  # TODO: better typehints here

     resps: list = field(default_factory=list)
     filtered_resps: dict = field(default_factory=dict)
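Read alongside the diff, a minimal construction sketch (values are illustrative; the (task_name, doc_id, repeats) reading of metadata is an assumption, not stated here): the first four fields are now required rather than defaulting to None, and metadata falls back to (None, None, None).

    from lm_eval.api.instance import Instance

    inst = Instance(
        request_type="loglikelihood",
        doc={"question": "1 + 1 = ?", "answer": "2"},  # illustrative document
        arguments=("1 + 1 = ?", " 2"),                 # (context, continuation) pair
        idx=0,
        metadata=("some_task", 0, 1),                  # assumed (task_name, doc_id, repeats)
    )
    print(inst.resps, inst.filtered_resps)             # -> [] {} until responses are recorded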
lm_eval/api/task.py

@@ -8,6 +8,7 @@ import evaluate
 import random
 import itertools
+import functools
 from tqdm import tqdm
 import datasets
 import numpy as np
@@ -217,8 +218,8 @@ class Task(abc.ABC):
             self._filters.append(filter_pipeline)

         self.sampler = samplers.Sampler(
-            list(self.fewshot_docs()), self, rnd=random.Random()
-        )
+            # TODO: pass the correct docs in here
+            list(self.fewshot_docs()), self, rnd=random.Random(1234)
+        )

     def download(self, data_dir=None, cache_dir=None, download_mode=None):
         """Downloads and returns the task dataset.
@@ -366,13 +367,18 @@ class Task(abc.ABC):
             False
         ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"

+        eval_logger.info(
+            f"Building contexts for task '{self._config.task}' on rank {rank}..."
+        )
+
         instances = []
         for doc_id, doc in utils.create_iterator(enumerate(docs), rank, world_size, limit):
             # sample fewshot context #TODO: need to offset doc_id by rank now!
             fewshot_ctx = self.fewshot_context(
-                doc, self._config.num_fewshot, rnd=random.Random()
+                doc,
+                self._config.num_fewshot,
             )

             # TODO: we should override self._config.repeats if doing greedy gen so users don't waste time+compute
@@ -453,7 +459,7 @@ class Task(abc.ABC):
         return len(re.split(r"\s+", doc))

     @utils.positional_deprecated
-    def fewshot_context(self, doc, num_fewshot, rnd=None):
+    def fewshot_context(self, doc, num_fewshot):
         """Returns a fewshot context string that is made up of a prepended description
         (if provided), the `num_fewshot` number of examples, and an appended prompt example.

@@ -461,15 +467,9 @@ class Task(abc.ABC):
             The document as returned from training_docs, validation_docs, or test_docs.
         :param num_fewshot: int
             The number of fewshot examples to provide in the returned context string.
-        :param rnd: random.Random
-            The pseudo-random number generator used to randomly sample examples.
-            WARNING: This is currently a required arg although it's optionalized with a default `None`.
         :returns: str
             The fewshot context.
         """
-        assert (
-            rnd is not None
-        ), "A `random.Random` generator argument must be provided to `rnd`"
         if num_fewshot == 0:
             # always prepend the (possibly empty) task description
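The corresponding call-site change, sketched with placeholder task and doc objects: callers drop the rnd argument, since few-shot example sampling now goes through the task's sampler, which is seeded once with random.Random(1234) at construction.

    # old signature (removed): rnd was nominally optional but asserted non-None
    # ctx = task.fewshot_context(doc, num_fewshot, rnd=random.Random(1234))

    # new signature: the sampler owns the RNG
    ctx = task.fewshot_context(doc, num_fewshot)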
@@ -625,7 +625,7 @@ class ConfigurableTask(Task):
         if self.fewshot_docs() is not None:
             self.sampler = samplers.Sampler(
-                list(self.fewshot_docs()), self, rnd=random.Random()
+                list(self.fewshot_docs()), self, rnd=random.Random(1234)
             )

     def download(self, dataset_kwargs=None):
@@ -1004,13 +1004,10 @@ class PerplexityTask(Task):
         assert k == 0
         return []

-    def fewshot_context(self, doc, num_fewshot, rnd=None):
+    def fewshot_context(self, doc, num_fewshot):
         assert (
             num_fewshot == 0
         ), "The number of fewshot examples must be 0 for perplexity tasks."
-        assert (
-            rnd is not None
-        ), "A `random.Random` generator argument must be provided to `rnd`."
         return ""
lm_eval/evaluator.py

@@ -45,6 +45,7 @@ def simple_evaluate(
     check_integrity=False,
     decontamination_ngrams_path=None,
     write_out=False,
+    log_samples=True,
 ):
     """Instantiate and evaluate a model on a list of tasks.
@@ -72,12 +73,17 @@ def simple_evaluate(
     :param check_integrity: bool
         Whether to run the relevant part of the test suite for the tasks
     :param write_out: bool
-        If True, write details about prompts and logits to json for all tasks
+        If True, write out an example document and model input for checking task integrity
+    :param log_samples: bool
+        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
     :return
         Dictionary of results
     """
-    random.seed(1234)
+    random.seed(0)
     np.random.seed(1234)
+    torch.manual_seed(
+        1234
+    )  # TODO: this may affect training runs that are run with evaluation mid-run.

     assert tasks != [], "No tasks specified"
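A hedged usage sketch of simple_evaluate with the new flag; the model name, model_args, and task below are placeholders, not taken from this commit.

    from lm_eval import evaluator

    results = evaluator.simple_evaluate(
        model="hf",                      # placeholder model registry name
        model_args="pretrained=gpt2",    # placeholder checkpoint
        tasks=["pubmedqa"],
        num_fewshot=0,
        write_out=False,   # now only emits an example document/model input for integrity checks
        log_samples=True,  # keep per-sample outputs in the returned dictionary
    )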
@@ -118,6 +124,7 @@ def simple_evaluate(
         bootstrap_iters=bootstrap_iters,
         decontamination_ngrams_path=decontamination_ngrams_path,
         write_out=write_out,
+        log_samples=log_samples,
     )

     if lm.rank == 0:
@@ -154,6 +161,7 @@ def evaluate(
     bootstrap_iters=100000,
     decontamination_ngrams_path=None,
     write_out=False,
+    log_samples=True,
 ):
     """Instantiate and evaluate a model on a list of tasks.

@@ -168,7 +176,9 @@ def evaluate(
     :param bootstrap_iters:
         Number of iterations for bootstrap statistics
     :param write_out: bool
-        If True, write all prompts, logits and metrics to json for offline analysis
+        If True, write out an example document and model input for checking task integrity
+    :param log_samples: bool
+        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
     :return
         Dictionary of results
     """
@@ -213,7 +223,10 @@ def evaluate(
         # aggregate Instances by LM method requested to get output.
         reqtype = (
             "loglikelihood"
-            if (task.OUTPUT_TYPE == "multiple_choice" or task.OUTPUT_TYPE == "winograd_schema")
+            if (
+                task.OUTPUT_TYPE == "multiple_choice"
+                or task.OUTPUT_TYPE == "winograd_schema"
+            )
             else task.OUTPUT_TYPE
         )  # TODO: this is hacky, fix in task.py
         requests[reqtype].extend(task.instances)
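The expression above reads as: multiple_choice and winograd_schema tasks are served by loglikelihood requests, everything else by its own output type. A compact, equivalent sketch (requests is the defaultdict built earlier in evaluate()):

    reqtype = (
        "loglikelihood"
        if task.OUTPUT_TYPE in ("multiple_choice", "winograd_schema")  # equivalent membership test
        else task.OUTPUT_TYPE
    )
    requests[reqtype].extend(task.instances)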
@@ -279,17 +292,18 @@ def evaluate(
                 metrics = task.process_results(
                     doc, [req.filtered_resps[key] for req in requests]
                 )
-                target = task.doc_to_target(doc)
-                example = {
-                    "doc_id": doc_id,
-                    "doc": doc,
-                    "target": target,
-                    "arguments": requests[0].args,
-                    "resps": [req.resps for req in requests],
-                    "filtered_resps": [req.filtered_resps[key] for req in requests],
-                }
-                example.update(metrics)
-                samples[task_name].append(example)
+                if log_samples:
+                    target = task.doc_to_target(doc)
+                    example = {
+                        "doc_id": doc_id,
+                        "doc": doc,
+                        "target": target,
+                        "arguments": [req.args for req in requests],
+                        "resps": [req.resps for req in requests],
+                        "filtered_resps": [req.filtered_resps[key] for req in requests],
+                    }
+                    example.update(metrics)
+                    samples[task_name].append(example)
                 for metric, value in metrics.items():
                     vals[(task_name, key, metric)].append(value)
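For reference, a hedged sketch of one per-sample record assembled above when log_samples is enabled (values are illustrative, not from a real run; for a loglikelihood request each raw response is a (logprob, is_greedy) pair):

    example = {
        "doc_id": 0,
        "doc": {"question": "...", "final_decision": "yes"},
        "target": " yes",
        "arguments": [("Question: ...\nAnswer:", " yes")],  # one args tuple per request
        "resps": [[(-1.23, True)]],                         # raw model responses
        "filtered_resps": [(-1.23, True)],                  # responses after the filter pipeline
    }
    example.update({"acc": 1.0})  # metric values are merged into the record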
@@ -359,12 +373,15 @@ def evaluate(
                 if stderr is not None:
                     results[task_name][metric + "_stderr" + "," + key] = stderr(items)

-        return {
+        results_dict = {
             "results": dict(results),
             "configs": dict(configs),
             "versions": dict(versions),
-            "samples": samples,
         }
+        if log_samples:
+            results_dict["samples"] = dict(samples)
+
+        return results_dict
     else:
         return None
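The net effect on the return value, sketched under the assumption of a rank-0 process with lm and task_dict already built:

    results = evaluate(lm, task_dict, log_samples=True)
    sorted(results.keys())   # ['configs', 'results', 'samples', 'versions']

    results = evaluate(lm, task_dict, log_samples=False)
    "samples" in results     # False: per-sample records are simply omitted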
lm_eval/models/huggingface.py

@@ -70,6 +70,7 @@ class HFLM(LM):
         batch_size: Optional[int] = 1,
         low_cpu_mem_usage: Optional[bool] = True,
         trust_remote_code: Optional[bool] = False,
+        use_fast_tokenizer: Optional[bool] = True,
         # arguments used for splitting a model across GPUs naively.
         # only used if `parallelize=True`.
         parallelize: Optional[bool] = False,

@@ -216,6 +217,7 @@ class HFLM(LM):
             pretrained if tokenizer is None else tokenizer,
             revision=revision,
             trust_remote_code=trust_remote_code,
+            use_fast=use_fast_tokenizer,
         )
         self.vocab_size = self.tokenizer.vocab_size
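A hedged sketch of the new knob (the checkpoint is a placeholder): use_fast_tokenizer=False makes HFLM request the slow, Python-based tokenizer from transformers instead of the Rust-backed fast one.

    from lm_eval.models.huggingface import HFLM

    lm = HFLM(pretrained="gpt2", use_fast_tokenizer=False)  # forwarded as use_fast=False to AutoTokenizer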
lm_eval/tasks/pubmedqa/preprocess_pubmedqa.py

@@ -4,13 +4,11 @@ def doc_to_text(doc):
         ctxs, doc["question"], doc["final_decision"]
     )

-def doc_to_target(doc):
-    return " {}".format(doc["final_decision"])
-
 def gold_alias(doc):
-    dict_to_label = {'yes': 0, 'no': 1, 'maybe': 2}
-    return dict_to_label[doc["final_decision"]]
\ No newline at end of file
+    dict_to_label = {"yes": 0, "no": 1, "maybe": 2}
+    return dict_to_label[doc["final_decision"]]
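A toy illustration with a fabricated document: gold_alias maps the textual PubMedQA label onto the index used for accuracy scoring.

    gold_alias({"final_decision": "maybe"})  # -> 2  ("yes" -> 0, "no" -> 1, "maybe" -> 2)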
lm_eval/tasks/pubmedqa/pubmedqa.yaml

@@ -14,4 +14,4 @@ gold_alias: !function preprocess_pubmedqa.gold_alias
 metric_list:
   - metric: acc
     aggregation: mean
-    higher_is_better: true
\ No newline at end of file
+    higher_is_better: true
lm_eval/tasks/race/preprocess_race.py

 import ast


 def process_ast(string):
     return ast.literal_eval(string)


 def last_problem(doc):
     return process_ast(doc["problems"])[-1]


 def get_answer_option(problem):
     letter_to_num = {"A": 0, "B": 1, "C": 2, "D": 3}
     answer = letter_to_num[problem["answer"]]
     return problem["options"][answer]


 def create_choices(doc):
     problem = last_problem(doc)
     choices = [problem["options"][i] for i in range(4)]
     return choices


 def doc_to_text(doc):
     text = "Article: " + doc["article"] + "\n\n"
     for problem in process_ast(doc["problems"])[:-1]:
         if problem["question"][-6:] == "  _  .":
-            text += (
-                problem["question"][-5:] + get_answer_option(problem) + "\n"
-            )
+            text += problem["question"][-5:] + get_answer_option(problem) + "\n"
         else:
             question = "Question: " + problem["question"] + "\n"
             answer = "Answer: " + get_answer_option(problem) + "\n"

@@ -30,6 +33,7 @@ def doc_to_text(doc):
     text += last_problem(doc)["question"]
     return text

+
 def doc_to_target(doc):
     letter_to_num = {"A": 0, "B": 1, "C": 2, "D": 3}
     answer = letter_to_num[last_problem(doc)["answer"]]
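A toy walk-through with a fabricated document (not from RACE), assuming the elided else-branch appends question + answer to text: earlier problems become in-context Q/A pairs and the final problem's question is left for the model.

    doc = {
        "article": "The cat sat on the mat.",
        "problems": str([
            {"question": "What did the cat sit on?",
             "options": ["the mat", "a bed", "a sofa", "a chair"], "answer": "A"},
            {"question": "Where was the cat?",
             "options": ["on the mat", "outside", "upstairs", "in a box"], "answer": "A"},
        ]),
    }
    print(doc_to_text(doc))
    # Article: The cat sat on the mat.
    #
    # Question: What did the cat sit on?
    # Answer: the mat
    # Where was the cat?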
lm_eval/tasks/swag/swag.yaml

@@ -17,4 +17,4 @@ metric_list:
     higher_is_better: true
   - metric: acc_norm
     aggregation: mean
-    higher_is_better: true
\ No newline at end of file
+    higher_is_better: true
lm_eval/tasks/winogrande/preprocess_winogrande.py

@@ -4,11 +4,13 @@ def partial_context(doc, option):
     pronoun_loc = doc["sentence"].index("_")
     return doc["sentence"][:pronoun_loc] + option


 def partial_target(doc):
+    # The target is everything after the document specified pronoun.
     pronoun_loc = doc["sentence"].index("_") + 1
     return doc["sentence"][pronoun_loc:].strip()


 def create_choices(doc):
     choices = []
     for option in [doc["option1"], doc["option2"]]:

@@ -16,6 +18,7 @@ def create_choices(doc):
         choices.append(partial_ctx)
     return choices


 def gold_alias(doc):
     answer_to_num = {"1": 0, "2": 1}
-    return answer_to_num[doc['answer']]
\ No newline at end of file
+    return answer_to_num[doc["answer"]]
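A toy illustration with a fabricated document of how these helpers split a Winogrande sentence around the "_" placeholder:

    doc = {
        "sentence": "The trophy didn't fit in the suitcase because _ was too big.",
        "option1": "the trophy",
        "option2": "the suitcase",
        "answer": "1",
    }
    partial_context(doc, doc["option1"])  # -> "The trophy didn't fit in the suitcase because the trophy"
    partial_target(doc)                   # -> "was too big."
    gold_alias(doc)                       # -> 0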
main.py

@@ -43,6 +43,7 @@ def parse_args():
     parser.add_argument("--decontamination_ngrams_path", default=None)
     parser.add_argument("--check_integrity", action="store_true")
     parser.add_argument("--write_out", action="store_true", default=False)
+    parser.add_argument("--log_samples", action="store_true", default=True)
     return parser.parse_args()

@@ -89,10 +90,12 @@ def main():
         decontamination_ngrams_path=args.decontamination_ngrams_path,
         check_integrity=args.check_integrity,
         write_out=args.write_out,
+        log_samples=args.log_samples,
     )

     if results is not None:
-        samples = results.pop("samples")
+        if args.log_samples:
+            samples = results.pop("samples")
         dumped = json.dumps(results, indent=2, default=lambda o: str(o))
         print(dumped)

@@ -104,19 +107,20 @@ def main():
         with open(args.output_path, "w") as f:
             f.write(dumped)

-        for task_name, config in results["configs"].items():
-            output_name = "{}_{}".format(re.sub("/", "__", args.model_args), task_name)
-            if os.path.isdir(args.output_path):
-                filename = f"./{args.output_path}/{output_name}.jsonl"
-            elif os.path.isfile(args.output_path):
-                filename = (
-                    f"./{os.path.dirname(args.output_path)}/{output_name}.jsonl"
-                )
-            with jsonlines.open(filename, "w") as f:
-                f.write_all(samples[task_name])
+        if args.log_samples:
+            for task_name, config in results["configs"].items():
+                output_name = "{}_{}".format(re.sub("/", "__", args.model_args), task_name)
+                if os.path.isdir(args.output_path):
+                    filename = f"./{args.output_path}/{output_name}.jsonl"
+                elif os.path.isfile(args.output_path):
+                    filename = (
+                        f"./{os.path.dirname(args.output_path)}/{output_name}.jsonl"
+                    )
+                with jsonlines.open(filename, "w") as f:
+                    f.write_all(samples[task_name])

     print(
         f"{args.model} ({args.model_args}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
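Downstream, the per-task .jsonl files written above can be re-read for post-hoc analysis; a minimal sketch (the path follows the output_name pattern and is a placeholder, and the record fields are those assembled in evaluator.py above):

    import jsonlines

    with jsonlines.open("./results/pretrained=gpt2_pubmedqa.jsonl") as reader:
        for sample in reader:
            print(sample["doc_id"], sample["target"], sample["acc"])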