gaoqiong / lm-evaluation-harness · Commits

Commit 1ed76cfa, authored Sep 19, 2023 by lintangsutawika
Parent: 10d8ed64

    moved benchmarks back to tasks/

Showing 8 changed files with 88 additions and 164 deletions (+88, -164).
Changed files:

  lm_eval/benchmarks/__init__.py          +0   -63
  lm_eval/evaluator.py                    +0   -1
  lm_eval/tasks/__init__.py               +40  -3
  lm_eval/tasks/benchmarks/pythia.yaml    +0   -0
  lm_eval/tasks/benchmarks/t0_eval.yaml   +0   -0
  lm_eval/tasks/squadv2/default.yaml      +11  -33
  lm_eval/tasks/squadv2/utils.py          +36  -63
  main.py                                 +1   -1
lm_eval/benchmarks/__init__.py  (deleted, file mode 100644 → 0)

import os
import yaml

from lm_eval import utils
from lm_eval.tasks import register_configurable_task, check_prompt_config
from lm_eval.logger import eval_logger
from lm_eval.api.registry import (
    TASK_REGISTRY,
    GROUP_REGISTRY,
    ALL_TASKS,
)


def include_benchmarks(task_dir: str) -> None:
    for root, subdirs, file_list in os.walk(task_dir):
        if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
            for f in file_list:
                if f.endswith(".yaml"):
                    try:
                        benchmark_path = os.path.join(root, f)
                        with open(benchmark_path, "rb") as file:
                            yaml_config = yaml.full_load(file)

                        assert "group" in yaml_config
                        group = yaml_config["group"]
                        all_task_list = yaml_config["task"]
                        config_list = [task for task in all_task_list if type(task) != str]
                        task_list = [task for task in all_task_list if type(task) == str]

                        for task_config in config_list:
                            var_configs = check_prompt_config(
                                {
                                    **task_config,
                                    **{"group": group},
                                }
                            )
                            for config in var_configs:
                                register_configurable_task(config)

                        task_names = utils.pattern_match(task_list, ALL_TASKS)
                        for task in task_names:
                            if (task in TASK_REGISTRY) or (task in GROUP_REGISTRY):
                                if group in GROUP_REGISTRY:
                                    GROUP_REGISTRY[group].append(task)
                                else:
                                    GROUP_REGISTRY[group] = [task]
                                    ALL_TASKS.add(group)
                    except Exception as error:
                        eval_logger.warning(
                            "Failed to load benchmark in\n"
                            f"{benchmark_path}\n"
                            " Benchmark will not be added to registry\n"
                            f" Error: {error}"
                        )


task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_benchmarks(task_dir)
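
The deleted loader expected each benchmark YAML to declare a group name plus a task list that may mix bare task names with inline task configs. Below is a minimal sketch of that split in plain Python; the embedded YAML is hypothetical (the actual contents of pythia.yaml and t0_eval.yaml are not shown in this diff), and the group and task names are only illustrative.

# Sketch of how include_benchmarks split a benchmark YAML. Only the
# group/task handling mirrors the deleted code; the YAML itself is made up.
import yaml

benchmark_yaml = """
group: my_benchmark
task:
  - lambada_openai
  - piqa
  - task: squadv2
    num_fewshot: 1
"""

yaml_config = yaml.full_load(benchmark_yaml)
group = yaml_config["group"]
all_task_list = yaml_config["task"]

# Inline dict entries are registered as configurable tasks under the group...
config_list = [task for task in all_task_list if type(task) != str]
# ...while bare strings are pattern-matched against ALL_TASKS and appended
# to GROUP_REGISTRY[group].
task_list = [task for task in all_task_list if type(task) == str]

print(group)        # my_benchmark
print(config_list)  # [{'task': 'squadv2', 'num_fewshot': 1}]
print(task_list)    # ['lambada_openai', 'piqa']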

lm_eval/evaluator.py

@@ -11,7 +11,6 @@ import numpy as np
 
 import lm_eval.api
 import lm_eval.tasks
-import lm_eval.benchmarks
 import lm_eval.models
 import lm_eval.api.metrics
 import lm_eval.api.registry

lm_eval/tasks/__init__.py

@@ -37,6 +37,37 @@ def register_configurable_task(config: Dict[str, str]) -> int:
     return 0
 
 
+def register_configurable_group(config: Dict[str, str]) -> int:
+    group = config["group"]
+    all_task_list = config["task"]
+    config_list = [task for task in all_task_list if type(task) != str]
+    task_list = [task for task in all_task_list if type(task) == str]
+
+    for task_config in config_list:
+        var_configs = check_prompt_config(
+            {
+                **task_config,
+                **{"group": group},
+            }
+        )
+        for config in var_configs:
+            register_configurable_task(config)
+
+    task_names = utils.pattern_match(task_list, ALL_TASKS)
+    for task in task_names:
+        if (task in TASK_REGISTRY) or (task in GROUP_REGISTRY):
+            if group in GROUP_REGISTRY:
+                GROUP_REGISTRY[group].append(task)
+            else:
+                GROUP_REGISTRY[group] = [task]
+                ALL_TASKS.add(group)
+
+    return 0
+
+
 def check_prompt_config(config: Dict[str, str]) -> List[Dict[str, str]]:
     all_configs = []

@@ -87,9 +118,15 @@ def include_task_folder(task_dir: str) -> None:
                     yaml_path = os.path.join(root, f)
                     try:
                         config = utils.load_yaml_config(yaml_path)
-                        all_configs = check_prompt_config(config)
-                        for config in all_configs:
-                            register_configurable_task(config)
+
+                        # If a `task` in config is a list,
+                        # that means it's a benchmark
+                        if type(config["task"]) == list:
+                            register_configurable_group(config)
+                        else:
+                            all_configs = check_prompt_config(config)
+                            for config in all_configs:
+                                register_configurable_task(config)
                     except Exception as error:
                         eval_logger.warning(
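
With the group loader folded into lm_eval/tasks, the routing added to include_task_folder reduces to a type check on the loaded config, as the new branch above shows. A small standalone sketch of that dispatch follows; the register_* functions here are print-only stand-ins for the real registry code.

# Stand-alone sketch of the new dispatch: a config whose `task` value is a
# list is treated as a benchmark (group); anything else is a single task.
def register_configurable_group(config):
    print(f"group registered: {config['group']} -> {config['task']}")


def register_configurable_task(config):
    print(f"task registered: {config['task']}")


def dispatch(config):
    if type(config["task"]) == list:  # benchmark-style config
        register_configurable_group(config)
    else:  # ordinary single-task config
        register_configurable_task(config)


dispatch({"group": "pythia", "task": ["lambada_openai", "piqa"]})
dispatch({"task": "squadv2"})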

lm_eval/benchmarks/pythia.yaml → lm_eval/tasks/benchmarks/pythia.yaml  (file moved, no content changes)
lm_eval/benchmarks/t0_eval.yaml → lm_eval/tasks/benchmarks/t0_eval.yaml  (file moved, no content changes)

lm_eval/tasks/squadv2/default.yaml

@@ -4,44 +4,22 @@ output_type: greedy_until
 training_split: train
 validation_split: validation
 doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\nAnswer:"
-doc_to_target: "{% if answers.text|length > 0 %}{{answers.text}}{% else %}{{['unanswerable']}}{% endif %}"
+doc_to_target: "{% if answers.text|length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}"
 target_delimiter: " "
 should_decontaminate: true
 doc_to_decontamination_query: context
-process_results: !function utils.process_results
 generation_kwargs:
   until:
-    - "\n\n"
     - "\n"
-  do_sample: false
-  temperature: 0.0
-# filter_list:
-#   - name: remove_whitespace
-#     filter:
-#       - function: remove_whitespace
-#       - function: take_first
+filter_list:
+  - name: remove_whitespace
+    filter:
+      - function: remove_whitespace
+      - function: take_first
 metric_list:
-  - metric: exact
-    aggregation: !function utils.exact
+  - metric: !function utils.exact
+    aggregation: mean
     higher_is_better: true
-# - metric: f1
-#   aggregation: mean
-#   higher_is_better: true
-# - metric: HasAns_exact
-#   aggregation: mean
-#   higher_is_better: true
-# - metric: HasAns_f1
-#   aggregation: mean
-#   higher_is_better: true
-# - metric: NoAns_exact
-#   aggregation: mean
-#   higher_is_better: true
-# - metric: NoAns_f1
-#   aggregation: mean
-#   higher_is_better: true
-# - metric: best_exact
-#   aggregation: mean
-#   higher_is_better: true
-# - metric: best_f1
-#   aggregation: mean
-#   higher_is_better: true
+  - metric: !function utils.f1
+    aggregation: mean
+    higher_is_better: true
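
For reference, here is what the updated doc_to_target template evaluates to for an answerable versus an unanswerable example. This sketch renders the template with jinja2 directly on made-up documents; the harness renders these templates through its own utilities, so this is only an approximation of the runtime behavior.

# Render the squadv2 doc_to_target template on two toy documents.
# jinja2 is used directly here purely for illustration.
from jinja2 import Template

doc_to_target = (
    "{% if answers.text|length > 0 %}{{answers.text}}"
    "{% else %}{{['']}}{% endif %}"
)
template = Template(doc_to_target)

answerable = {"answers": {"text": ["Normandy"], "answer_start": [159]}}
unanswerable = {"answers": {"text": [], "answer_start": []}}

print(template.render(**answerable))    # ['Normandy']
print(template.render(**unanswerable))  # ['']   (was ['unanswerable'] before this commit)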

lm_eval/tasks/squadv2/utils.py

Removed (the previous implementation built inputs for HuggingFace's evaluate "squad_v2" metric and still contained leftover debugging statements):

import evaluate
from math import exp
from functools import partial


def process_results(doc, results):
    continuation = results[0]
    no_answer_probability = 0  # exp(logprob_unanswerable)

    predictions = {
        "id": doc["id"],
        "prediction_text": continuation,
        "no_answer_probability": no_answer_probability,
    }

    references = {
        "id": doc["id"],
        "answers": doc["answers"],
    }

    return {"predictions": predictions, "reference": references}
    # return _squad_metric([predictions], [references])
    # return {key: value if key in metrics for key, value in score.items()}


def _squad_metric(predictions, references):
    squad_metric = evaluate.load("squad_v2")
    return squad_metric.compute(predictions=predictions, references=references)


# Exact match (the normalized answer exactly match the gold answer)
def exact(items):
    print(items)
    import sys; sys.exit()
    predictions, references = zip(*items)
    return _squad_metric(predictions=predictions, references=references)["exact"]


# The F-score of predicted tokens versus the gold answer
def f1(predictions, references):
    return _squad_metric(predictions=predictions, references=references)["f1"]


# Exact match (the normalized answer exactly match the gold answer)
def HasAns_exact(predictions, references):
    return _squad_metric(predictions=predictions, references=references)["HasAns_exact"]


# The F-score of predicted tokens versus the gold answer
def HasAns_f1(predictions, references):
    return _squad_metric(predictions=predictions, references=references)["HasAns_f1"]


# Exact match (the normalized answer exactly match the gold answer)
def NoAns_exact(predictions, references):
    return _squad_metric(predictions=predictions, references=references)["NoAns_exact"]


# The F-score of predicted tokens versus the gold answer
def NoAns_f1(predictions, references):
    return _squad_metric(predictions=predictions, references=references)["NoAns_f1"]


# Best exact match (with varying threshold)
def best_exact(predictions, references):
    return _squad_metric(predictions=predictions, references=references)["best_exact"]


# Best F1 (with varying threshold)
def best_f1(predictions, references):
    return _squad_metric(predictions=predictions, references=references)["best_f1"]

Added (the replacement computes per-example exact match and token-level F1 directly):

import re
import string
import collections


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
        return re.sub(regex, ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def get_tokens(s):
    if not s:
        return []
    return normalize_answer(s).split()


# Exact match (the normalized answer exactly match the gold answer)
def exact(predictions, references):
    return int(normalize_answer(references[0]) == normalize_answer(predictions[0]))


# The F-score of predicted tokens versus the gold answer
def f1(predictions, references):
    gold_toks = get_tokens(references[0])
    pred_toks = get_tokens(predictions[0])
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
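
A quick sanity check of the new per-example metrics. The call shape (single-element sequences holding the model prediction and the gold answer) is inferred from the [0] indexing above rather than documented in this diff, and the import assumes the package is installed from this tree.

# Assumed usage: each metric gets the prediction and reference as
# one-element sequences, per the `[0]` indexing in the functions above.
from lm_eval.tasks.squadv2.utils import exact, f1

print(exact(["The Eiffel Tower"], ["the eiffel tower"]))  # 1 (case/articles normalized away)
print(exact(["Paris"], ["London"]))                       # 0
print(f1(["the tall Eiffel Tower"], ["Eiffel Tower"]))    # 0.8 (token-level F1)
print(f1([""], [""]))                                     # 1 (both sides are no-answer)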

main.py

@@ -11,7 +11,7 @@ from lm_eval import evaluator, utils
 from lm_eval.api.registry import ALL_TASKS
 from lm_eval.logger import eval_logger, SPACING
 from lm_eval.tasks import include_task_folder
-from lm_eval.benchmarks import include_benchmarks
+# from lm_eval.benchmarks import include_benchmarks
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"