gaoqiong / lm-evaluation-harness · Commits · b5efc813

Commit b5efc813
Authored Jun 15, 2023 by gk
Merge branch 'master' into big-refactor-merge
Parents: 7dec84a0, b018a7d5

Showing 7 changed files with 25 additions and 1225 deletions (+25 / -1225):
lm_eval/evaluator.py             +10    -1
lm_eval/models/hf_wip.py          +0    -1203
lm_eval/tasks/__init__.py         +0    -3
lm_eval/utils.py                  +0    -1
main.py                           +1    -1
scripts/make_table_results.py     +0    -3
scripts/regression.py            +14    -13
lm_eval/evaluator.py  (+10 / -1)

@@ -167,13 +167,22 @@ def evaluate(
     # get lists of each type of request
     for task_name, task in task_dict.items():
         versions[task_name] = task.VERSION
         # TODO: don't access a private attribute here ; for non-YAML tasks handle this case
         configs[task_name] = dict(task.dump_config())

         # deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
         # task_docs = list(task_doc_func())
         # rnd = random.Random()
         # rnd.seed(42)
         # rnd.shuffle(task_docs)
+        if limit is not None:
+            if task.has_test_docs():
+                task_docs = task.test_docs()
+            elif task.has_validation_docs():
+                task_docs = task.validation_docs()
+            else:
+                raise RuntimeError("Task has neither test_docs nor validation_docs")
+            limit = int(len(task_docs) * limit) if limit < 1.0 else int(limit)
         task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size)
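The block added here makes a fractional --limit work: a value below 1.0 is read as a share of the task's documents, while larger values remain an absolute cap. A minimal, standalone sketch of that expression (the helper name and numbers are illustrative, not part of the harness):

    # Sketch of the limit resolution added in the hunk above (illustrative helper).
    def resolve_limit(limit, num_docs):
        if limit is None:
            return None  # no cap: evaluate every document
        # fraction of the doc set vs. absolute count
        return int(num_docs * limit) if limit < 1.0 else int(limit)

    print(resolve_limit(0.1, 2000))   # -> 200
    print(resolve_limit(500, 2000))   # -> 500
    print(resolve_limit(None, 2000))  # -> None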
lm_eval/models/hf_wip.py  deleted  100644 → 0  (+0 / -1203)

(diff collapsed: the file is removed in its entirety)
lm_eval/tasks/__init__.py  (+0 / -3)

@@ -16,9 +16,6 @@ from lm_eval.api.registry import (
 )

-ALL_TASKS = sorted(list(TASK_REGISTRY.keys()) + list(GROUP_REGISTRY.keys()))

 def get_task_name_from_config(task_config):
     return "{dataset_path}_{dataset_name}".format(**task_config)
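For reference, the helper kept as context above derives a task name from a task's dataset configuration. A small illustrative call (the config values here are made up):

    def get_task_name_from_config(task_config):
        return "{dataset_path}_{dataset_name}".format(**task_config)

    # e.g. a config pointing at the super_glue/boolq dataset
    print(get_task_name_from_config({"dataset_path": "super_glue", "dataset_name": "boolq"}))
    # -> super_glue_boolq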
lm_eval/utils.py  (+0 / -1)

@@ -19,7 +19,6 @@ from omegaconf import OmegaConf
 from jinja2 import BaseLoader, Environment, StrictUndefined
 from itertools import islice
-from lm_eval import tasks
 from lm_eval.logger import eval_logger
main.py  (+1 / -1)

@@ -16,7 +16,7 @@ def parse_args():
     parser.add_argument("--tasks", default=None, choices=utils.MultiChoice(sorted(ALL_TASKS)))
     parser.add_argument("--config", default=None)
     parser.add_argument("--num_fewshot", type=int, default=0)
-    parser.add_argument("--batch_size", type=str, default=None)
+    parser.add_argument("--batch_size", type=int, default=1)
     parser.add_argument("--max_batch_size", type=int, default=None,
                         help="Maximal batch size to try with --batch_size auto")
     parser.add_argument("--device", type=str, default=None)
scripts/make_table_results.py  (+0 / -3)

@@ -3,7 +3,6 @@ Usage:
     python make_table_tasks.py --output <markdown_filename>
 """
 import logging
-from lm_eval import tasks
 from pytablewriter import MarkdownTableWriter, LatexTableWriter
 import os
 import json

@@ -54,8 +53,6 @@ def make_table(result_dict):
 if __name__ == "__main__":
-    task_names = tasks.ALL_TASKS
     # loop dirs and subdirs in results dir
     # for each dir, load json files
     for dirpath, dirnames, filenames in os.walk("../results"):
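The script's main loop, shown as context above, walks the results directory and loads each JSON file before handing the values to pytablewriter. A minimal sketch of that pattern, with a made-up results layout and column names:

    import json
    import os

    from pytablewriter import MarkdownTableWriter

    rows = []
    # Walk every run directory under ../results and collect (file, task, metric, value) rows.
    for dirpath, dirnames, filenames in os.walk("../results"):
        for filename in filenames:
            if not filename.endswith(".json"):
                continue
            with open(os.path.join(dirpath, filename)) as f:
                data = json.load(f)
            for task, metrics in data.get("results", {}).items():
                for metric, value in metrics.items():
                    rows.append([filename, task, metric, value])

    # Render the collected rows as a Markdown table.
    writer = MarkdownTableWriter()
    writer.headers = ["file", "task", "metric", "value"]
    writer.value_matrix = rows
    print(writer.dumps())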
scripts/regression.py  (+14 / -13)

@@ -5,7 +5,8 @@ import subprocess
 import time
 from pathlib import Path

-from lm_eval import tasks, utils
+from lm_eval import evaluator, utils
+from lm_eval.api.registry import ALL_TASKS

 seq2seq_models = ["google/flan-t5-small"]

@@ -31,7 +32,7 @@ def parse_args():
     parser.add_argument("--num_fewshot", type=int, default=0)
     parser.add_argument("--limit", type=float, default=None)
     # TODO: implement hf-auto to pick between causal and seq2seq models so we don't need this
-    parser.add_argument("--model", default="hf-causal-experimental")
+    parser.add_argument("--model", default="hf-causal")
     # Use whatever is faster here
     parser.add_argument("--model_args", default="use_accelerate=True,load_in_8bit=True")
     parser.add_argument("--batch_size", default="auto")

@@ -50,14 +51,14 @@ def eval_models(args, branch=None):
     results = {}

     for model in args.models:
-        model_type = "hf-causal-experimental" if model in causal_models \
+        model_type = "hf-causal" if model in causal_models \
             else "hf-seq2seq" if model in seq2seq_models else args.model
         model_args = f"pretrained={model},{args.model_args}"
         # TODO: split_and_pad_windows in AutoSeq2SeqLM doesn"t exist, #527
-        tasks = args.tasks if model in causal_models or model_type == "hf-causal-experimental" \
+        tasks = args.tasks if model in causal_models or model_type == "hf-causal" \
             else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
         # TODO: OOM with auto for seq2seq models, also can OOM with llama
-        batch_size = args.batch_size if model in causal_models or model_type == "hf-causal-experimental" \
+        batch_size = args.batch_size if model in causal_models or model_type == "hf-causal" \
             else 64 if args.batch_size == "auto" else args.batch_size
         output_path = f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"

@@ -83,12 +84,12 @@ def extract_value(args, results, model, task, err=False):
     if task not in results:
         return 0
     results = results[task]
-    if args.acc_norm and "acc_norm" in results:
-        return results["acc_norm"] if not err else results["acc_norm_stderr"]
-    if "acc" in results:
-        return results["acc"] if not err else results["acc_stderr"]
-    if (args.perplexity or "word_perplexity") in results:
-        return results[args.perplexity or "word_perplexity"] if not err else 0
+    if args.acc_norm and "acc_norm,none" in results:
+        return results["acc_norm,none"] if not err else results["acc_norm_stderr,none"]
+    if "acc,none" in results:
+        return results["acc,none"] if not err else results["acc_stderr,none"]
+    if (args.perplexity or "word_perplexity") + ",none" in results:
+        return results[(args.perplexity or "word_perplexity") + ",none"] if not err else 0
     return 0

@@ -110,8 +111,8 @@ def main():
     args.branches = args.branches.split(",") if type(args.branches) == str else args.branches
     args.models = args.models.split(",") if type(args.models) == str else args.models
-    args.tasks = tasks.ALL_TASKS if args.tasks == "all_tasks" \
-        else utils.pattern_match(args.tasks.split(",") if type(args.tasks) == str else args.tasks, tasks.ALL_TASKS)
+    args.tasks = ALL_TASKS if args.tasks == "all_tasks" \
+        else utils.pattern_match(args.tasks.split(","), ALL_TASKS) if type(args.tasks) == str else args.tasks

     global initial_branch
     initial_branch = subprocess.check_output("git branch --show-current", shell=True).decode("ascii").strip()
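The extract_value() changes track the refactored harness's results schema: each metric in a task's results dict is now keyed as "<metric>,<filter>" (e.g. "acc,none") rather than a bare metric name. A small illustrative sketch of the lookup, with made-up numbers:

    # Hypothetical per-task results in the new "<metric>,<filter>" key format.
    task_results = {"acc,none": 0.75, "acc_stderr,none": 0.01}

    def extract_acc(results, err=False):
        # Fall back to 0 when the key is missing, mirroring the script's behaviour.
        key = "acc_stderr,none" if err else "acc,none"
        return results.get(key, 0)

    print(extract_acc(task_results))            # -> 0.75
    print(extract_acc(task_results, err=True))  # -> 0.01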