gaoqiong / lm-evaluation-harness / Commits
Commit 62df55d1, authored May 08, 2024 by Konrad

initial chat template

Parent: 885f48d6
Showing 4 changed files with 143 additions and 16 deletions.
lm_eval/__main__.py      (+7, -0)
lm_eval/api/samplers.py  (+46, -1)
lm_eval/api/task.py      (+81, -15)
lm_eval/evaluator.py     (+9, -0)
lm_eval/__main__.py
@@ -162,6 +162,12 @@ def setup_parser() -> argparse.ArgumentParser:
        default=False,
        help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
    )
    parser.add_argument(
        "--apply_chat_template",
        action="store_true",
        default=False,
        help="If True, applies the chat template to the prompt",
    )
    parser.add_argument(
        "--show_config",
        action="store_true",
        ...

@@ -357,6 +363,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
        check_integrity=args.check_integrity,
        write_out=args.write_out,
        log_samples=args.log_samples,
        apply_chat_template=args.apply_chat_template,
        gen_kwargs=args.gen_kwargs,
        task_manager=task_manager,
        verbosity=args.verbosity,
        ...
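For reference, the flag added above is a standard argparse boolean switch. The following is a minimal sketch, not part of the commit, showing that action="store_true" with default=False leaves the option off unless it is passed explicitly.

# Minimal sketch of the new flag's behaviour; not part of the commit itself.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--apply_chat_template",
    action="store_true",
    default=False,
    help="If True, applies the chat template to the prompt",
)

print(parser.parse_args([]).apply_chat_template)  # False
print(parser.parse_args(["--apply_chat_template"]).apply_chat_template)  # True

With the wiring added to cli_evaluate above, a command along the lines of `lm_eval --model hf --model_args pretrained=... --tasks hellaswag --apply_chat_template` would forward the switch into simple_evaluate.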
lm_eval/api/samplers.py
@@ -63,9 +63,54 @@ class ContextSampler:
            )
            + self.fewshot_delimiter
        )

        return labeled_examples

    def get_chat_context(
        self,
        doc,
        num_fewshot,
        chat_history: list = [],
    ):
        # draw an extra fewshot sample if using same split as evaluating on
        n_samples = (
            num_fewshot + 1
            if self.config.fewshot_split == self.config.test_split
            else num_fewshot
        )
        # draw `n_samples` docs from fewshot_docs
        fewshotex = self.sample(n_samples)

        # get rid of the doc that's the one we're evaluating, if it's in the fewshot
        # TODO: should we just stop people from using fewshot from same split as evaluating?
        selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]

        for doc in selected_docs:
            chat_history.append(
                {
                    "role": "user",
                    "content": self.doc_to_text(doc)
                    if (
                        self.config.doc_to_choice is None
                        or isinstance(self.doc_to_text(doc), str)
                    )
                    else self.doc_to_choice(doc)[self.doc_to_text(doc)],
                }
            )
            chat_history.append(
                {
                    "role": "assistant",
                    "content": str(self.doc_to_target(doc)[0])
                    if isinstance(self.doc_to_target(doc), list)
                    else self.doc_to_target(doc)
                    if (
                        self.config.doc_to_choice is None
                        or isinstance(self.doc_to_target(doc), str)
                    )
                    else str(self.doc_to_choice(doc)[self.doc_to_target(doc)]),
                }
            )

        return chat_history

    def sample(self, n):
        """
        Draw `n` samples from our fewshot docs. This method should be overridden by subclasses.
        ...
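To make the new sampler method concrete, here is an illustrative sketch of the structure get_chat_context returns for a two-shot draw. The question and answer strings are invented placeholders, not real task documents; only the alternating user/assistant message shape is taken from the code above.

# Illustrative only: the message-list shape built by get_chat_context(doc, num_fewshot=2).
# The contents below are made-up placeholders.
chat_history = [
    {"role": "user", "content": "Question: What is 2 + 2?\nAnswer:"},
    {"role": "assistant", "content": "4"},
    {"role": "user", "content": "Question: What is 3 + 5?\nAnswer:"},
    {"role": "assistant", "content": "8"},
]
# fewshot_context() (see lm_eval/api/task.py below) later appends the evaluated
# document as a final {"role": "user", ...} turn before rendering to a string.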
lm_eval/api/task.py
@@ -373,6 +373,8 @@ class Task(abc.ABC):
        world_size=None,
        cache_requests=False,
        rewrite_requests_cache=False,
        apply_chat_template=False,
        tokenizer=None,
    ) -> None:
        """Build a set of Instances for a task, and store them in task.instances"""
        ...

@@ -421,6 +423,8 @@ class Task(abc.ABC):
            fewshot_ctx = self.fewshot_context(
                doc,
                0 if self.config.num_fewshot is None else self.config.num_fewshot,
                apply_chat_template,
                tokenizer,
            )

            # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
            ...
@@ -957,8 +961,32 @@ class ConfigurableTask(Task):
            )
            return super().fewshot_docs()

    def convert_chat_history_to_string(
        self, chat_history: list, tokenizer=None
    ) -> str:
        """Returns chat history tokenized or concatenated as a string.
        :param chat_history: list
            The chat history to convert to a string.
        :param tokenizer:
            Optional tokenizer to use for applying the chat template, if None, the sampler's fewshot_delimiter is used.
        """
        if tokenizer:
            return tokenizer.apply_chat_template(
                chat_history, tokenize=False, add_generation_prompt=True
            )
        else:
            return self.sampler.fewshot_delimiter + "".join(
                f"{s['role']}: {s['content']}" + self.sampler.fewshot_delimiter
                for s in chat_history
            )

    @utils.positional_deprecated
    def fewshot_context(
        self,
        doc: str,
        num_fewshot: int,
        apply_chat_template: bool = False,
        tokenizer=None,
    ) -> str:
        """Returns a fewshot context string that is made up of a prepended description
        (if provided), the `num_fewshot` number of examples, and an appended prompt example.
        ...
@@ -966,19 +994,57 @@ class ConfigurableTask(Task):
            The document as returned from training_docs, validation_docs, or test_docs.
        :param num_fewshot: int
            The number of fewshot examples to provide in the returned context string.
        :param apply_chat_template: bool
            Whether to apply the chat template to the fewshot context.
        :param tokenizer:
            The tokenizer to use for applying the chat template.
        :returns: str
            The fewshot context.
        """
        if description := self.config.description:
            description = utils.apply_template(self.config.description, doc)

        chat_history = []
        if num_fewshot == 0:
            # always prepend the (possibly empty) task description
            if apply_chat_template:
                chat_history.append({"role": "system", "content": description})
            else:
                labeled_examples = description
        else:
            if apply_chat_template:
                chat_history = self.sampler.get_chat_context(
                    doc, num_fewshot, chat_history
                )
            else:
                labeled_examples = description + self.sampler.get_context(
                    doc, num_fewshot
                )

        example = self.doc_to_text(doc)
        if apply_chat_template:
            if not self.multiple_input:
                if isinstance(example, str):
                    chat_history.append({"role": "user", "content": example})
                elif isinstance(example, list):
                    chat_histories_list = []
                    for ex in example:
                        chat = deepcopy(chat_history)
                        chat.append({"role": "user", "content": ex})
                        chat_histories_list.append(
                            self.convert_chat_history_to_string(chat, tokenizer)
                        )
                    return chat_histories_list
                elif isinstance(example, int):
                    if self.config.doc_to_choice is not None:
                        choices = self.doc_to_choice(doc)
                        chat_history.append(
                            {"role": "user", "content": choices[example]}
                        )
                    else:
                        chat_history.append(
                            {"role": "user", "content": str(example)}
                        )
            return self.convert_chat_history_to_string(chat_history, tokenizer)
        else:
            if self.multiple_input:
                return labeled_examples
            else:
                ...
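The two rendering paths in convert_chat_history_to_string correspond to the rough sketch below. It assumes a Hugging Face tokenizer whose apply_chat_template method is available (transformers 4.34 or newer); the checkpoint name and the delimiter value are examples only, not taken from the commit.

# Sketch of the two rendering paths; checkpoint and delimiter are example values.
from transformers import AutoTokenizer

messages = [
    {"role": "user", "content": "Question: What is 2 + 2?\nAnswer:"},
    {"role": "assistant", "content": "4"},
    {"role": "user", "content": "Question: What is 3 + 5?\nAnswer:"},
]

# Path 1 (tokenizer given): the model's chat template is applied and returned as
# plain text (tokenize=False), with a generation prompt appended for the final turn.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Path 2 (no tokenizer): turns are joined with the sampler's fewshot_delimiter,
# producing e.g. "\n\nuser: ...\n\nassistant: ...\n\nuser: ...\n\n".
fewshot_delimiter = "\n\n"
fallback = fewshot_delimiter + "".join(
    f"{m['role']}: {m['content']}" + fewshot_delimiter for m in messages
)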
lm_eval/evaluator.py
@@ -55,6 +55,7 @@ def simple_evaluate(
    check_integrity: bool = False,
    write_out: bool = False,
    log_samples: bool = True,
    apply_chat_template: bool = False,
    gen_kwargs: Optional[str] = None,
    task_manager: Optional[TaskManager] = None,
    verbosity: str = "INFO",
    ...

@@ -99,6 +100,8 @@ def simple_evaluate(
        If True, write out an example document and model input for checking task integrity
    :param log_samples: bool
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :param apply_chat_template: bool
        If True, apply chat template to the prompt
    :param gen_kwargs: str
        String arguments for model generation
        Ignored for all tasks with loglikelihood output_type
        ...

@@ -262,6 +265,7 @@ def simple_evaluate(
        bootstrap_iters=bootstrap_iters,
        write_out=write_out,
        log_samples=log_samples,
        apply_chat_template=apply_chat_template,
        verbosity=verbosity,
    )
    ...

@@ -317,6 +321,7 @@ def evaluate(
    bootstrap_iters: Optional[int] = 100000,
    write_out: bool = False,
    log_samples: bool = True,
    apply_chat_template: bool = False,
    verbosity: str = "INFO",
):
    """Instantiate and evaluate a model on a list of tasks.
    ...

@@ -333,6 +338,8 @@ def evaluate(
        If True, write out an example document and model input for checking task integrity
    :param log_samples: bool
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :param apply_chat_template: bool
        If True, apply chat template to the prompt
    :return
        Dictionary of results
    """
    ...

@@ -362,6 +369,8 @@ def evaluate(
            world_size=lm.world_size,
            cache_requests=cache_requests,
            rewrite_requests_cache=rewrite_requests_cache,
            apply_chat_template=apply_chat_template,
            tokenizer=lm.tokenizer,
        )
        eval_logger.debug(
            f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}"
            ...
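For completeness, a hedged sketch of driving the new argument through the Python API as wired up in this commit. The model and task names are placeholders; the call otherwise follows the existing simple_evaluate signature.

# Sketch only: exercise the new keyword end to end. Model and task names are examples.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=HuggingFaceH4/zephyr-7b-beta",
    tasks=["hellaswag"],
    num_fewshot=2,
    apply_chat_template=True,  # new in this commit; defaults to False
)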