gaoqiong / lm-evaluation-harness / Commits

Commit f5d763dc, authored Jun 28, 2024 by Nathan Habib
Parent: 4619f7c1

    cleanup

Showing 5 changed files with 95 additions and 76 deletions:
  lm_eval/__main__.py                    (+8, -4)
  lm_eval/api/task.py                    (+10, -36)
  lm_eval/evaluator.py                   (+6, -1)
  lm_eval/loggers/evaluation_tracker.py  (+70, -35)
  lm_eval/models/huggingface.py          (+1, -0)
lm_eval/__main__.py (view file @ f5d763dc)
```diff
@@ -5,7 +5,6 @@ import os
 import sys
 from functools import partial
 from typing import Union
 
-from accelerate import Accelerator
 from lm_eval import evaluator, utils
 from lm_eval.evaluator import request_caching_arg_to_dict
```
```diff
@@ -293,6 +292,13 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
             "If fewshot_as_multiturn is set, apply_chat_template must be set to True."
         )
 
+    if (
+        args.num_fewshot is None or args.num_fewshot == 0
+    ) and args.fewshot_as_multiturn:
+        raise ValueError(
+            "If fewshot_as_multiturn is set, num_fewshot must be greater than 0."
+        )
+
     if args.include_path is not None:
         eval_logger.info(f"Including path: {args.include_path}")
 
     task_manager = TaskManager(args.verbosity, include_path=args.include_path)
```
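The new guard mirrors the existing `apply_chat_template` check: a multiturn fewshot context needs at least one fewshot turn to split. A minimal sketch of the added condition in isolation (the `check_fewshot_args` helper is illustrative, not part of the repo):

```python
from argparse import Namespace

def check_fewshot_args(args: Namespace) -> None:
    # Same condition as the guard added above.
    if (args.num_fewshot is None or args.num_fewshot == 0) and args.fewshot_as_multiturn:
        raise ValueError(
            "If fewshot_as_multiturn is set, num_fewshot must be greater than 0."
        )

check_fewshot_args(Namespace(num_fewshot=5, fewshot_as_multiturn=True))  # passes
check_fewshot_args(Namespace(num_fewshot=0, fewshot_as_multiturn=True))  # raises ValueError
```

The bad combination is now rejected up front, before any model is loaded.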
```diff
@@ -394,9 +400,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
         **request_caching_args,
     )
 
-    accelerator = Accelerator()
-    if results is not None and accelerator.is_main_process:
+    if results is not None:
         if args.log_samples:
             samples = results.pop("samples")
 
         dumped = json.dumps(
```
lm_eval/api/task.py (view file @ f5d763dc)
```diff
@@ -376,7 +376,8 @@ class Task(abc.ABC):
         system_instruction: Optional[str] = None,
         apply_chat_template: bool = False,
         fewshot_as_multiturn: bool = False,
-        lm=None,
+        chat_template: Optional[Callable] = None,
+        tokenizer_name: str = "",
     ) -> None:
         """Build a set of Instances for a task, and store them in task.instances"""
```
```diff
@@ -391,7 +392,7 @@ class Task(abc.ABC):
             if system_instruction is not None
             else ""
         )
-        cache_key += f"-tokenizer{lm.tokenizer_name}" if apply_chat_template else ""
+        cache_key += f"-tokenizer{tokenizer_name}"
 
         cached_instances = load_from_cache(file_name=cache_key)
```
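With the tokenizer name passed in as a plain string, the instance cache key no longer depends on an `lm` handle, and the tokenizer suffix is now appended unconditionally (it is simply empty when no chat template is in play). An illustrative reconstruction of the key composition (the helper below is not the repo's code; the real key also folds in the system instruction as shown above):

```python
from typing import Optional

def make_cache_key(
    base_key: str,
    system_instruction: Optional[str] = None,
    tokenizer_name: str = "",
) -> str:
    # Illustrative: the key varies with the tokenizer name string,
    # not with a model object.
    key = base_key
    key += f"-sysinst-{system_instruction}" if system_instruction is not None else ""
    key += f"-tokenizer{tokenizer_name}"
    return key

make_cache_key("requests-hellaswag", tokenizer_name="meta-llama/Meta-Llama-3-8B-Instruct")
# -> 'requests-hellaswag-tokenizermeta-llama/Meta-Llama-3-8B-Instruct'
```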
```diff
@@ -436,7 +437,7 @@ class Task(abc.ABC):
                 system_instruction,
                 apply_chat_template,
                 fewshot_as_multiturn,
-                lm,
+                chat_template,
             )
 
         # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
```
```diff
@@ -444,7 +445,6 @@ class Task(abc.ABC):
                 doc=doc,
                 ctx=fewshot_ctx,
                 metadata=(self.config["task"], doc_id, self.config.repeats),
-                apply_chat_template=apply_chat_template,
             )
 
             if not isinstance(inst, list):
```
```diff
@@ -987,28 +987,6 @@ class ConfigurableTask(Task):
             return super().fewshot_docs()
 
-    @staticmethod
-    def append_target_question(
-        labeled_examples: List[Dict[str, str]],
-        question: str,
-        fewshot_as_multiturn: bool = False,
-    ) -> None:
-        """Adds a target question to the labeled examples list.
-        If fewshot_as_multiturn is True, or labeled_examples is empty, or the last entry is a system turn, appends the question as a new user entry.
-        Otherwise, it is appended to the last user entry, ensuring that the conversation alternates between the user and the assistant.
-        """
-        if not fewshot_as_multiturn:
-            # if no messages or last message is system, append as new user entry
-            if len(labeled_examples) == 0 or labeled_examples[-1]["role"] == "system":
-                labeled_examples.append({"role": "user", "content": question})
-            # if last message is user, append to it to avoid two user messages in a row
-            else:
-                labeled_examples[-1]["content"] += question
-        else:
-            return self.sampler.fewshot_delimiter + "".join(
-                f"{s['role']}: {s['content']}" + self.sampler.fewshot_delimiter
-                for s in chat_history
-            )
-
     @staticmethod
     def append_target_question(
         labeled_examples: List[Dict[str, str]],
         question: str,
```
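The removed copy was a broken duplicate (a `@staticmethod` declared `-> None` whose `else` branch returns a string built from `self` and an undefined `chat_history`); the surviving copy mutates the chat list in place. A small usage sketch of the kept behavior, with the multiturn branch reconstructed from the docstring since the diff truncates the surviving method:

```python
from typing import Dict, List

def append_target_question(
    labeled_examples: List[Dict[str, str]],
    question: str,
    fewshot_as_multiturn: bool = False,
) -> None:
    # Behavior as described by the surviving method's docstring.
    if not fewshot_as_multiturn:
        # if no messages or last message is system, append as new user entry
        if len(labeled_examples) == 0 or labeled_examples[-1]["role"] == "system":
            labeled_examples.append({"role": "user", "content": question})
        # otherwise merge into the last user entry to keep user/assistant alternation
        else:
            labeled_examples[-1]["content"] += question
    else:
        # multiturn: each question becomes its own user turn
        labeled_examples.append({"role": "user", "content": question})

chat = [{"role": "user", "content": "Q1? A1\n"}]
append_target_question(chat, "Q2?")  # merged into the existing user turn
assert len(chat) == 1
append_target_question(chat, "Q3?", fewshot_as_multiturn=True)  # new user turn
assert len(chat) == 2
```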
```diff
@@ -1037,7 +1015,7 @@ class ConfigurableTask(Task):
         system_instruction: Optional[str] = None,
         apply_chat_template: bool = False,
         fewshot_as_multiturn: bool = False,
-        lm=None,
+        chat_template: Optional[Callable] = None,
     ) -> str:
         """Returns a fewshot context string that is made up of a prepended description
         (if provided), the `num_fewshot` number of examples, and an appended prompt example.
```
```diff
@@ -1052,8 +1030,8 @@ class ConfigurableTask(Task):
             Whether to apply the chat template to the fewshot context.
         :param fewshot_as_multiturn: bool
             Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
-        :param lm:
-            Language model with definition of the tokenizer/function to use for applying the chat template.
+        :param chat_template: Callable
+            Chat template to be applied to the fewshot context.
         :returns: str
             The fewshot context.
         """
```
```diff
@@ -1100,7 +1078,7 @@ class ConfigurableTask(Task):
         example = self.doc_to_text(doc)
         if apply_chat_template:
             if self.multiple_input:
-                return lm.apply_chat_template(labeled_examples)
+                return chat_template(labeled_examples)
             if isinstance(example, str):
                 self.append_target_question(
                     labeled_examples, example, fewshot_as_multiturn
```
```diff
@@ -1112,7 +1090,7 @@ class ConfigurableTask(Task):
                 for ex in example:
                     chat = deepcopy(labeled_examples)
                     self.append_target_question(chat, ex, fewshot_as_multiturn)
-                    labeled_examples_list.append(lm.apply_chat_template(chat))
+                    labeled_examples_list.append(chat_template(chat))
                 return labeled_examples_list
             # if example is an integer, append the choice or convert to string
             elif isinstance(example, int):
```
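All of these call sites now go through the injected `chat_template` callable, so the task can render chat-formatted contexts without ever importing model code. A toy template showing the multiple-input path above (`toy_chat_template` is a hypothetical stand-in for `lm.apply_chat_template` or a tokenizer's chat template):

```python
from copy import deepcopy
from typing import Dict, List

def toy_chat_template(chat: List[Dict[str, str]]) -> str:
    # Hypothetical stand-in for a real chat template.
    return "".join(f"<|{m['role']}|>{m['content']}" for m in chat)

labeled_examples: List[Dict[str, str]] = [{"role": "user", "content": "Q: "}]
examples = ["What is 2+2?", "What is 3+3?"]  # doc_to_text returned a list

labeled_examples_list = []
for ex in examples:
    chat = deepcopy(labeled_examples)
    chat[-1]["content"] += ex  # roughly what append_target_question does
    labeled_examples_list.append(toy_chat_template(chat))
# one rendered context per input, none of which needed an lm object
```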
```diff
@@ -1126,7 +1104,7 @@ class ConfigurableTask(Task):
                     labeled_examples, str(example), fewshot_as_multiturn
                 )
             # return lm.apply_chat_template(labeled_examples)
-            return lm.apply_chat_template(labeled_examples)
+            return chat_template(labeled_examples)
         else:
             if self.multiple_input:
                 return labeled_examples
```
```diff
@@ -1293,8 +1271,6 @@ class ConfigurableTask(Task):
         elif self.OUTPUT_TYPE == "multiple_choice":
             choices = self.doc_to_choice(doc)
             target_delimiter = self.config.target_delimiter
-            if kwargs.get("apply_chat_template", False) is True:
-                target_delimiter = ""
             if self.multiple_input:
                 # If there are multiple inputs, choices are placed in the ctx
                 cont = self.doc_to_target(doc)
```
```diff
@@ -1304,7 +1280,6 @@ class ConfigurableTask(Task):
             else:
                 # Otherwise they are placed in the continuation
                 arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices]
-            kwargs.pop("apply_chat_template")
 
         request_list = [
             Instance(
```
```diff
@@ -1341,7 +1316,6 @@ class ConfigurableTask(Task):
         elif self.OUTPUT_TYPE == "generate_until":
             arguments = (ctx, deepcopy(self.config.generation_kwargs))
 
-        kwargs.pop("apply_chat_template")
         return Instance(
             request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs
         )
```
lm_eval/evaluator.py (view file @ f5d763dc)
```diff
@@ -399,7 +399,12 @@ def evaluate(
             system_instruction=system_instruction,
             apply_chat_template=apply_chat_template,
             fewshot_as_multiturn=fewshot_as_multiturn,
-            lm=lm,
+            chat_template=getattr(lm, "apply_chat_template")
+            if apply_chat_template
+            else None,
+            tokenizer_name=getattr(lm, "tokenizer_name", "")
+            if apply_chat_template
+            else "",
         )
 
         eval_logger.debug(
             f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}"
```
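The evaluator now hands the task a bound method and a string instead of the model object itself, which keeps `Task` free of any model dependency. A minimal sketch of the pattern (`FakeLM` and `build_context` are hypothetical; only the `getattr` wiring mirrors the diff):

```python
from typing import Callable, Dict, List, Optional

class FakeLM:
    # Hypothetical model wrapper exposing the two attributes used above.
    tokenizer_name = "my-org/my-tokenizer"

    def apply_chat_template(self, chat: List[Dict[str, str]]) -> str:
        return "\n".join(f"{m['role']}: {m['content']}" for m in chat)

def build_context(
    chat: List[Dict[str, str]], chat_template: Optional[Callable] = None
) -> str:
    # The task side only ever sees the callable, never the model.
    return chat_template(chat) if chat_template is not None else chat[-1]["content"]

lm = FakeLM()
apply_chat_template = True
ctx = build_context(
    [{"role": "user", "content": "2+2="}],
    chat_template=getattr(lm, "apply_chat_template") if apply_chat_template else None,
)
```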
lm_eval/loggers/evaluation_tracker.py (view file @ f5d763dc)
```python
import json
import os
import re
import time
from collections import defaultdict
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path

from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status

from datasets import load_dataset
from datasets.utils.metadata import MetadataConfigs
```
```diff
@@ -212,17 +210,21 @@ class EvaluationTracker:
             file_results_aggregated.open("w", encoding="utf-8").write(dumped)
 
         if self.api and self.push_results_to_hub:
-            repo_id = "open-llm-leaderboard/results_v2"
+            repo_id = (
+                self.hub_results_repo
+                if self.public_repo
+                else self.hub_results_repo_private
+            )
             self.api.create_repo(
                 repo_id=repo_id,
                 repo_type="dataset",
                 private=not self.public_repo,
                 exist_ok=True,
             )
-            self.api.upload_file(
+            self.api.upload_folder(
                 repo_id=repo_id,
-                path_or_fileobj=str(path.joinpath(f"results_{self.date_id}.json")),
-                path_in_repo=os.path.join(
-                    self.general_config_tracker.model_name,
-                    f"results_{self.date_id}.json",
-                ),
+                folder_path=str(path),
+                path_in_repo=self.general_config_tracker.model_name_sanitized,
                 repo_type="dataset",
                 commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
             )
```
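Switching from `upload_file` to `upload_folder` pushes every results file for the model in a single commit rather than one API call per JSON. A hedged sketch of the equivalent standalone calls (`repo_id` and the paths are hypothetical; `HfApi.create_repo` and `HfApi.upload_folder` are real `huggingface_hub` APIs):

```python
from huggingface_hub import HfApi

api = HfApi()
repo_id = "my-org/eval-results"  # hypothetical; chosen by the public/private logic above

api.create_repo(repo_id=repo_id, repo_type="dataset", private=False, exist_ok=True)
api.upload_folder(
    repo_id=repo_id,
    folder_path="outputs/my_model",  # local dir holding results_*.json files
    path_in_repo="my_model",         # mirrors model_name_sanitized above
    repo_type="dataset",
    commit_message="Adding aggregated results for my_model",
)
```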
```diff
@@ -276,7 +278,6 @@ class EvaluationTracker:
                 sample["resps"] = sanitize_list(sample["resps"])
                 sample["filtered_resps"] = sanitize_list(sample["filtered_resps"])
                 sample["arguments"] = arguments
                 sample["target"] = str(sample["target"])
 
                 sample_dump = (
                     json.dumps(
```
```diff
@@ -302,13 +303,6 @@ class EvaluationTracker:
                 private=not self.public_repo,
                 exist_ok=True,
             )
-            headers = build_hf_headers()
-            r = get_session().put(
-                url=f"https://huggingface.co/api/datasets/{repo_id}/settings",
-                headers=headers,
-                json={"gated": "auto"},
-            )
-            hf_raise_for_status(r)
             self.api.upload_folder(
                 repo_id=repo_id,
                 folder_path=str(path),
```
```diff
@@ -366,10 +360,7 @@ class EvaluationTracker:
                     results_datetime,
                 )
                 latest_task_results_datetime[samples_key] = latest_datetime
-                latest_task_results_datetime[results_key] = max(
-                    latest_task_results_datetime[results_key],
-                    latest_datetime,
-                )
+                latest_task_results_datetime[results_key] = latest_datetime
 
         # Create metadata card
         card_metadata = MetadataConfigs()
```
```diff
@@ -386,15 +377,14 @@ class EvaluationTracker:
             sanitized_last_eval_date_results = re.sub(
                 r"[^\w\.]", "_", latest_task_results_datetime[config_name]
             )
-            # Ensure that all results files are listed in the metadata card
-            current_results = card_metadata.get(config_name, {"data_files": []})
-            current_results["data_files"].append(
-                {"split": eval_date_sanitized, "path": [str(results_filename)]}
-            )
-            card_metadata[config_name] = current_results
-            # If the results file is the newest, update the "latest" field in the metadata card
             if eval_date_sanitized == sanitized_last_eval_date_results:
+                # Ensure that all results files are listed in the metadata card
+                current_results = card_metadata.get(config_name, {"data_files": []})
+                current_results["data_files"].append(
+                    {"split": eval_date_sanitized, "path": [str(results_filename)]}
+                )
+                card_metadata[config_name] = current_results
+                # If the results file is the newest, update the "latest" field in the metadata card
                 card_metadata[config_name]["data_files"].append(
                     {"split": "latest", "path": [str(results_filename)]}
                 )
```
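After the move, a dated `results` split is only recorded for the newest run, with the `latest` split appended alongside it. The resulting `card_metadata` entry should look roughly like this (config name and paths illustrative):

```python
# Illustrative shape of one entry produced by the block above:
card_metadata = {
    "my_model__hellaswag": {
        "data_files": [
            {"split": "2024_06_28T12_00_00", "path": ["results_2024-06-28T12-00-00.json"]},
            {"split": "latest", "path": ["results_2024-06-28T12-00-00.json"]},
        ]
    }
}
```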
```diff
@@ -413,20 +403,65 @@ class EvaluationTracker:
             sanitized_last_eval_date_results = re.sub(
                 r"[^\w\.]", "_", latest_task_results_datetime[config_name]
             )
-            # Ensure that all sample results files are listed in the metadata card
-            current_details_for_task = card_metadata.get(config_name, {"data_files": []})
-            current_details_for_task["data_files"].append(
-                {"split": eval_date_sanitized, "path": [str(results_filename)]}
-            )
-            card_metadata[config_name] = current_details_for_task
-            # If the samples results file is the newest, update the "latest" field in the metadata card
             if eval_date_sanitized == sanitized_last_eval_date_results:
+                # Ensure that all sample results files are listed in the metadata card
+                current_details_for_task = card_metadata.get(
+                    config_name, {"data_files": []}
+                )
+                current_details_for_task["data_files"].append(
+                    {"split": eval_date_sanitized, "path": [str(results_filename)]}
+                )
+                card_metadata[config_name] = current_details_for_task
+                # If the samples results file is the newest, update the "latest" field in the metadata card
                 card_metadata[config_name]["data_files"].append(
                     {"split": "latest", "path": [str(results_filename)]}
                 )
 
+            # Special case for MMLU with a single split covering it all
+            # We add another config with all MMLU splits results together for easy inspection
+            SPECIAL_TASKS = ["mmlu", "gpqa", "minerva_math"]
+            for special_task in SPECIAL_TASKS:
+                if special_task in config_name:
+                    special_task = f"{model_name}__{special_task}"
+                    former_entry = card_metadata.get(special_task, {"data_files": []})
+
+                    former_split = [
+                        (i, entry)
+                        for i, entry in enumerate(former_entry["data_files"])
+                        if entry.get("split", None) == eval_date_sanitized
+                    ]
+
+                    if len(former_split) == 0:
+                        former_entry["data_files"].append(
+                            {
+                                "split": eval_date_sanitized,
+                                "path": [str(results_filename)],
+                            }
+                        )
+                    else:
+                        split_index, _ = former_split[0]
+                        former_entry["data_files"][split_index]["path"].append(
+                            str(results_filename)
+                        )
+
+                    if eval_date_sanitized == sanitized_last_eval_date_results:
+                        latest_split = [
+                            (i, entry)
+                            for i, entry in enumerate(former_entry["data_files"])
+                            if entry.get("split", None) == "latest"
+                        ]
+                        if len(latest_split) == 0:
+                            former_entry["data_files"].append(
+                                {"split": "latest", "path": [str(results_filename)]}
+                            )
+                        else:
+                            latest_index, _ = latest_split[0]
+                            former_entry["data_files"][latest_index]["path"].append(
+                                str(results_filename)
+                            )
+
+                    card_metadata[special_task] = former_entry
+
         # Get latest results and extract info to update metadata card examples
         latest_datetime = max(latest_task_results_datetime.values())
         latest_model_name = max(
```
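The new `SPECIAL_TASKS` pass folds every per-subtask config whose name contains, say, `mmlu` into a single `{model_name}__mmlu` config, appending to an existing dated split rather than duplicating it. A compact sketch of that merge step with toy data (filenames hypothetical):

```python
# Toy data: one MMLU subtask file already recorded for this eval date.
former_entry = {
    "data_files": [
        {"split": "2024_06_28", "path": ["results_mmlu_abstract_algebra.json"]},
    ]
}
eval_date_sanitized = "2024_06_28"
results_filename = "results_mmlu_anatomy.json"

# Same lookup-then-append-or-extend logic as the block above.
former_split = [
    (i, entry)
    for i, entry in enumerate(former_entry["data_files"])
    if entry.get("split", None) == eval_date_sanitized
]
if len(former_split) == 0:
    former_entry["data_files"].append(
        {"split": eval_date_sanitized, "path": [results_filename]}
    )
else:
    split_index, _ = former_split[0]
    former_entry["data_files"][split_index]["path"].append(results_filename)

# The 2024_06_28 split now lists both subtask files under one config.
```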
lm_eval/models/huggingface.py (view file @ f5d763dc)
```diff
@@ -40,6 +40,7 @@ from lm_eval.models.utils import (
 eval_logger = utils.eval_logger
 
 
 def _get_accelerate_args(
     device_map_option: Optional[str] = "auto",
     max_memory_per_gpu: Optional[Union[int, str]] = None,
```