gaoqiong / lm-evaluation-harness · Commits

Commit 318bd988 (unverified)
Authored Jul 04, 2023 by Wang, Yi; committed by GitHub on Jul 04, 2023
Parents: 35f1b5a7, 25dfd3f6

Merge branch 'EleutherAI:master' into fix_ptun
Showing 5 changed files with 224 additions and 57 deletions:

    lm_eval/tasks/triviaqa.py   +20  -13
    lm_eval/utils.py            +46  -3
    main.py                     +8   -41
    scripts/regression.py       +149 -0
    setup.py                    +1   -0
lm_eval/tasks/triviaqa.py

```diff
@@ -10,11 +10,10 @@ high quality distant supervision for answering the questions.
 Homepage: https://nlp.cs.washington.edu/triviaqa/
 """
 import inspect
-import lm_eval.datasets.triviaqa.triviaqa
+import string
 from lm_eval.base import Task, rf
 from lm_eval.metrics import mean


 _CITATION = """
 @InProceedings{JoshiTriviaQA2017,
     author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},
@@ -29,9 +28,9 @@ _CITATION = """
 class TriviaQA(Task):
-    VERSION = 1
-    DATASET_PATH = inspect.getfile(lm_eval.datasets.triviaqa.triviaqa)
-    DATASET_NAME = None
+    VERSION = 2
+    DATASET_PATH = "trivia_qa"
+    DATASET_NAME = "rc.nocontext"

     def has_training_docs(self):
         return True
@@ -74,19 +73,27 @@ class TriviaQA(Task):
         return ret

     def construct_requests(self, doc, ctx):
-        ret = []
-        for alias in self._remove_prefixes(doc["answer"]["aliases"]):
-            _, is_prediction = rf.loglikelihood(ctx, " " + alias)
-            ret.append(is_prediction)
-        return ret
+        """Uses RequestFactory to construct Requests and returns an iterable of
+        Requests which will be sent to the LM.
+        :param doc:
+            The document as returned from training_docs, validation_docs, or test_docs.
+        :param ctx: str
+            The context string, generated by fewshot_context. This includes the natural
+            language description, as well as the few shot examples, and the question
+            part of the document for `doc`.
+        """
+        continuation = rf.greedy_until(ctx, {"until": ["\n", ".", ","]})
+        return continuation

     def process_results(self, doc, results):
-        return {"acc": float(any(results))}
+        continuation = results[0].strip().lower().translate(str.maketrans("", "", string.punctuation))
+        list_of_candidates = [alias.lower().translate(str.maketrans("", "", string.punctuation)) for alias in self._remove_prefixes(doc["answer"]["aliases"])]
+        return {"em": float(continuation in list_of_candidates)}

     def aggregation(self):
         return {
-            "acc": mean,
+            "em": mean,
         }

     def higher_is_better(self):
-        return {"acc": True}
+        return {"em": True}
```
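This change moves TriviaQA from scoring each gold alias by loglikelihood (reported as `acc`) to free-form generation scored by exact match (`em`): the model generates until a newline, period, or comma, and the output counts as correct when, after stripping, lowercasing, and removing punctuation, it equals any gold alias. A minimal standalone sketch of that normalization and comparison (the generation and aliases below are illustrative, not a real dataset record):

```python
import string

def normalize(text: str) -> str:
    # Mirror the normalization in the new process_results: strip whitespace,
    # lowercase, and drop ASCII punctuation.
    return text.strip().lower().translate(str.maketrans("", "", string.punctuation))

# Hypothetical model output and gold aliases, for illustration only.
generation = " Mount Everest."
aliases = ["Mount Everest", "Everest", "Chomolungma"]

candidates = [normalize(alias) for alias in aliases]
print(float(normalize(generation) in candidates))  # 1.0
```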
lm_eval/utils.py

```diff
@@ -5,8 +5,10 @@ import collections
 import functools
 import inspect
 import sys
+import fnmatch
 from typing import List, Union

+import gc
 import torch
 from omegaconf import OmegaConf
@@ -63,11 +65,11 @@ def join_iters(iters):
         yield from iter


-def chunks(iter, n):
+def chunks(iter, n=0, fn=None):
     arr = []
-    for x in iter:
+    for i, x in enumerate(iter):
         arr.append(x)
-        if len(arr) == n:
+        if len(arr) == (fn(i) if fn else n):
             yield arr
             arr = []
@@ -84,6 +86,42 @@ def group(arr, fn):
     return list(res.values())


+def _is_json_task(task_name):
+    return task_name == "json" or task_name.startswith("json=")
+
+
+class MultiChoice:
+    def __init__(self, choices):
+        self.choices = choices
+
+    # Simple wildcard support (linux filename patterns)
+    def __contains__(self, values):
+        for value in values.split(","):
+            if len(fnmatch.filter(self.choices, value)) == 0 and not _is_json_task(value):
+                return False
+        return True
+
+    def __iter__(self):
+        for choice in self.choices:
+            yield choice
+
+
+# Returns a list containing all values of the source_list that
+# match at least one of the patterns
+def pattern_match(patterns, source_list):
+    task_names = set()
+    for pattern in patterns:
+        if _is_json_task(pattern):
+            task_names.add(pattern)
+
+        for matching in fnmatch.filter(source_list, pattern):
+            task_names.add(matching)
+    return sorted(list(task_names))
+
+
 def general_detokenize(string):
     string = string.replace(" n't", "n't")
     string = string.replace(" )", ")")
@@ -246,3 +284,8 @@ def run_task_tests(task_list: List[str]):
         raise ValueError(
             f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}"
         )
+
+
+def clear_torch_cache():
+    gc.collect()
+    torch.cuda.empty_cache()
```
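Two of the additions are worth calling out: `chunks` now takes an optional `fn` that maps the index of the current element to a target chunk size, so callers can vary batch sizes across a stream instead of fixing one `n`, and `pattern_match` expands shell-style wildcards against the task registry. A small usage sketch (the task list is an illustrative subset, not the real registry):

```python
from lm_eval import utils

# Wildcard task selection: fnmatch-style patterns against known task names.
known_tasks = ["hellaswag", "lambada_openai", "lambada_standard", "wikitext"]
print(utils.pattern_match(["lambada_*"], known_tasks))
# ['lambada_openai', 'lambada_standard']

# Fixed-size chunking versus an index-dependent size via fn.
print(list(utils.chunks(iter(range(6)), n=3)))
# [[0, 1, 2], [3, 4, 5]]
print(list(utils.chunks(iter(range(7)), fn=lambda i: 2 if i < 4 else 3)))
# [[0, 1], [2, 3], [4, 5, 6]]
```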
main.py

```diff
 import argparse
 import json
 import logging
-import fnmatch
 import os

-from lm_eval import tasks, evaluator
+from lm_eval import tasks, evaluator, utils

 logging.getLogger("openai").setLevel(logging.WARNING)


-def _is_json_task(task_name):
-    return task_name == "json" or task_name.startswith("json=")
-
-
-class MultiChoice:
-    def __init__(self, choices):
-        self.choices = choices
-
-    # Simple wildcard support (linux filename patterns)
-    def __contains__(self, values):
-        for value in values.split(","):
-            if len(fnmatch.filter(self.choices, value)) == 0 and not _is_json_task(value):
-                return False
-        return True
-
-    def __iter__(self):
-        for choice in self.choices:
-            yield choice
-
-
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model", required=True)
     parser.add_argument("--model_args", default="")
-    parser.add_argument("--tasks", default=None, choices=MultiChoice(tasks.ALL_TASKS))
+    parser.add_argument("--tasks", default=None, choices=utils.MultiChoice(tasks.ALL_TASKS))
     parser.add_argument("--provide_description", action="store_true")
     parser.add_argument("--num_fewshot", type=int, default=0)
     parser.add_argument("--batch_size", type=str, default=None)
+    parser.add_argument("--max_batch_size", type=int, default=None, help="Maximal batch size to try with --batch_size auto")
     parser.add_argument("--device", type=str, default=None)
     parser.add_argument("--output_path", default=None)
     parser.add_argument("--limit", type=float, default=None,
@@ -56,19 +34,6 @@ def parse_args():
     return parser.parse_args()


-# Returns a list containing all values of the source_list that
-# match at least one of the patterns
-def pattern_match(patterns, source_list):
-    task_names = set()
-    for pattern in patterns:
-        if _is_json_task(pattern):
-            task_names.add(pattern)
-
-        for matching in fnmatch.filter(source_list, pattern):
-            task_names.add(matching)
-    return sorted(list(task_names))
-
-
 def main():
     args = parse_args()
@@ -82,7 +47,7 @@ def main():
     if args.tasks is None:
         task_names = tasks.ALL_TASKS
     else:
-        task_names = pattern_match(args.tasks.split(","), tasks.ALL_TASKS)
+        task_names = utils.pattern_match(args.tasks.split(","), tasks.ALL_TASKS)

     print(f"Selected Tasks: {task_names}")
@@ -97,6 +62,7 @@ def main():
         tasks=task_names,
         num_fewshot=args.num_fewshot,
         batch_size=args.batch_size,
+        max_batch_size=args.max_batch_size,
         device=args.device,
         no_cache=args.no_cache,
         limit=args.limit,
@@ -115,9 +81,10 @@ def main():
         with open(args.output_path, "w") as f:
             f.write(dumped)

+    batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))
     print(
         f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
-        f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
+        f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
     )
     print(evaluator.make_table(results))
```
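With these changes, `--batch_size auto` asks the harness to probe for the largest batch that fits in memory, the new `--max_batch_size` caps that search, and the detected sizes are echoed in the summary line via `results["config"]["batch_sizes"]`. A hypothetical invocation (model and tasks chosen purely for illustration):

```
python3 main.py \
    --model hf-causal-experimental \
    --model_args pretrained=EleutherAI/pythia-160m \
    --tasks lambada_openai,hellaswag \
    --num_fewshot 0 \
    --batch_size auto \
    --max_batch_size 32 \
    --device cuda:0 \
    --no_cache
```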
scripts/regression.py (new file, mode 100644)

```python
import argparse
import json
import os
import subprocess
import time
from pathlib import Path

from lm_eval import tasks, utils

seq2seq_models = ["google/flan-t5-small"]
causal_models = ["gpt2", "facebook/opt-125m", "EleutherAI/gpt-neo-125m", "EleutherAI/pythia-160m"]
model_names = seq2seq_models + causal_models


completion_tasks = ["boolq", "lambada_openai", "winogrande"]
choice_tasks = ["hellaswag", "openbookqa", "piqa"]
perplexity_tasks = ["wikitext"]
generation_tasks = []
task_names = completion_tasks + choice_tasks + perplexity_tasks + generation_tasks


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--branches", default=[])
    parser.add_argument("--models", default=model_names)
    parser.add_argument("--tasks", default=task_names)
    parser.add_argument("--acc_norm", type=bool, default=False)
    parser.add_argument("--perplexity", default=None)
    # TODO: implement num_fewshot and limit per task, e.g. task1:5,task2:1:100,task3::1000
    parser.add_argument("--num_fewshot", type=int, default=0)
    parser.add_argument("--limit", type=float, default=None)
    # TODO: implement hf-auto to pick between causal and seq2seq models so we don't need this
    parser.add_argument("--model", default="hf-causal-experimental")
    # Use whatever is faster here
    parser.add_argument("--model_args", default="use_accelerate=True,load_in_8bit=True")
    parser.add_argument("--batch_size", default="auto")
    return parser.parse_args()


def eval_models(args, branch=None):
    if branch is not None:
        if os.system(f"git checkout {branch}") != 0:
            return {}, 0

    branch = branch or initial_branch

    start_time = time.time()

    results = {}

    for model in args.models:
        model_type = "hf-causal-experimental" if model in causal_models \
            else "hf-seq2seq" if model in seq2seq_models else args.model
        model_args = f"pretrained={model},{args.model_args}"
        # TODO: split_and_pad_windows in AutoSeq2SeqLM doesn't exist, #527
        tasks = args.tasks if model in causal_models or model_type == "hf-causal-experimental" \
            else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
        # TODO: OOM with auto for seq2seq models, also can OOM with llama
        batch_size = args.batch_size if model in causal_models or model_type == "hf-causal-experimental" \
            else 64 if args.batch_size == "auto" else args.batch_size
        output_path = f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
        command = f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} " \
                  f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} " \
                  f"--batch_size {batch_size} --no_cache --output_path {output_path}"

        print(f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}")

        ret = os.system(command)

        results[model] = json.load(open(output_path)) if ret == 0 else {"results": {}}

    end_time = time.time()

    return results, end_time - start_time


def extract_value(args, results, model, task, err=False):
    if model not in results:
        return 0
    results = results[model]["results"]
    if task not in results:
        return 0
    results = results[task]
    if args.acc_norm and "acc_norm" in results:
        return results["acc_norm"] if not err else results["acc_norm_stderr"]
    if "acc" in results:
        return results["acc"] if not err else results["acc_stderr"]
    if (args.perplexity or "word_perplexity") in results:
        return results[args.perplexity or "word_perplexity"] if not err else 0
    return 0


def format_value(args, results, model, task):
    val = 100 * extract_value(args, results, model, task)
    err = 100 * extract_value(args, results, model, task, err=True)
    return f"{val:.2f}{f' ± {err:.2f}' if err != 0 else ''}"


def format_diff(args, results1, results2, model, task):
    val1 = 100 * extract_value(args, results1, model, task)
    val2 = 100 * extract_value(args, results2, model, task)
    diff = val2 - val1
    return f"**+{diff:.2f}**" if diff > 0 else f"{diff:.2f}"


def main():
    args = parse_args()

    args.branches = args.branches.split(",") if type(args.branches) == str else args.branches
    args.models = args.models.split(",") if type(args.models) == str else args.models
    args.tasks = tasks.ALL_TASKS if args.tasks == "all_tasks" \
        else utils.pattern_match(args.tasks.split(",") if type(args.tasks) == str else args.tasks, tasks.ALL_TASKS)

    global initial_branch
    initial_branch = subprocess.check_output("git branch --show-current", shell=True).decode("ascii").strip()

    # TODO: implement proper timing for each task
    # TODO: reduce IO by sharing tasks between models?

    results, runtime = eval_models(args)
    print(results, runtime)

    runs = []
    for branch in args.branches:
        runs.append((branch, *eval_models(args, branch)))

    os.system(f"git checkout {initial_branch}")

    print("")
    print(f"|task|{'|'.join(map(lambda model: Path(model).name, args.models))}|")
    print(f"|--|{'--|' * len(args.models)}")
    for task in args.tasks:
        print(f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|")
        for branch, branch_results, branch_runtime in runs:
            print(f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|")
            print(f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|")

    print("")
    print("|branch|runtime|%|")
    print("|--|--|--|")
    print(f"|{initial_branch}|{runtime:.1f}s|100%|")
    for branch, _, branch_runtime in runs:
        print(f"|{branch}|{branch_runtime:.1f}s|{100 * branch_runtime / runtime:.2f}%|")


if __name__ == "__main__":
    main()
```
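The script evaluates every model on every task at the current branch, repeats the run on each branch given via `--branches` (checking each out with `git checkout`), and finally prints markdown tables comparing per-task metrics and total runtime. A hypothetical run comparing the working branch against a feature branch on two small models (the branch name is a placeholder):

```
python3 scripts/regression.py \
    --branches my-feature-branch \
    --models gpt2,EleutherAI/pythia-160m \
    --tasks boolq,piqa \
    --limit 100
```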
setup.py

```diff
@@ -45,5 +45,6 @@ setuptools.setup(
         "multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
         "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
         "auto-gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
+        "anthropic": ["anthropic"],
     },
 )
```
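With this extra defined, the Anthropic client can be pulled in through pip's extras syntax; a sketch assuming an editable install from the repository root:

```
pip install -e ".[anthropic]"
```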