gaoqiong / lm-evaluation-harness · Commit cb8889cc

Authored Feb 05, 2024 by lintangsutawika
merged with latest update from main
Parents: ec05e561, 74119471

Showing 20 of 69 changed files, with 739 additions and 139 deletions.
lm_eval/models/huggingface.py                               +26   -11
lm_eval/models/optimum_lm.py                                +69    -0
lm_eval/models/vllm_causallms.py                            +15    -8
lm_eval/prompts/__init__.py                                  +1    -1
lm_eval/tasks/__init__.py                                  +251   -92
lm_eval/tasks/bbh/_generate_configs.py                       +2    -2
lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml     +1    -0
lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml             +1    -0
lm_eval/tasks/belebele/_generate_configs.py                  +4    -4
lm_eval/tasks/belebele/belebele_default.yaml                 +0    -4
lm_eval/tasks/benchmarks/flan/_held_in_template_yaml        +14    -0
lm_eval/tasks/benchmarks/flan/flan_held_in.yaml            +331    -0
lm_eval/tasks/benchmarks/flan/flan_held_out.yaml            +13    -0
lm_eval/tasks/benchmarks/multimedqa/multimedqa.yaml          +0    -6
lm_eval/tasks/bigbench/generate_tasks.py                     +1    -1
lm_eval/tasks/blimp/generate_configs.py                      +1    -1
lm_eval/tasks/ceval/_generate_configs.py                     +3    -3
lm_eval/tasks/cmmlu/_generate_configs.py                     +3    -3
lm_eval/tasks/code_x_glue/code-text/bleu.py                  +1    -1
lm_eval/tasks/csatqa/_generate_configs.py                    +2    -2
lm_eval/models/huggingface.py

@@ -108,8 +108,8 @@ class HFLM(LM):
             assert not parallelize, "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`"
             self._model = pretrained
             self._device = self._model.device
             self._config = self._model.config
             gpus = 0

             if tokenizer:
                 assert isinstance(
@@ -200,8 +200,9 @@ class HFLM(LM):
             )

         # access self._model through self.model property outside this method
-        self.model.eval()
-        self.model.tie_weights()
+        if isinstance(self.model, torch.nn.Module):
+            self.model.eval()
+            self.model.tie_weights()

         if isinstance(pretrained, str) and (gpus >= 1 or str(self.device) == "mps"):
             # TODO: can remove this whole snippet except in the mps case, perhaps?
@@ -238,6 +239,16 @@ class HFLM(LM):
             if self.config.model_type == "qwen":
                 # Qwen's trust_remote_code tokenizer does not allow for adding special tokens
                 self.tokenizer.pad_token = "<|endoftext|>"
+            elif (
+                self.tokenizer.__class__.__name__ == "RWKVWorldTokenizer"
+                or self.tokenizer.__class__.__name__ == "Rwkv5Tokenizer"
+            ):
+                # The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0)
+                # The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer
+                # ---
+                # Note that the world tokenizer class name, might change in the future for the final huggingface merge
+                # https://github.com/huggingface/transformers/pull/26963
+                assert self.tokenizer.pad_token_id == 0
             else:
                 self.tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
@@ -361,7 +372,7 @@ class HFLM(LM):
     def _get_backend(
         self,
-        config: transformers.AutoConfig,
+        config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
         backend: Optional[Literal["default", "causal", "seq2seq"]] = "default",
         trust_remote_code: Optional[bool] = False,
     ) -> None:
@@ -602,8 +613,7 @@ class HFLM(LM):
                 (batch_size, max_length), device=self.device
             ).long()
             for _ in range(5):
-                out = F.log_softmax(self._model_call(test_batch, **call_kwargs), dim=-1)
-                out = out  # Identity process so that it passes pre-commit
+                out = F.log_softmax(self._model_call(test_batch, **call_kwargs), dim=-1)  # noqa: F841

             return batch_size
@@ -705,10 +715,14 @@ class HFLM(LM):
         return self.model(inps).logits

     def _model_generate(self, context, max_length, stop, **generation_kwargs):
-        # we require users to pass do_sample=True explicitly
-        # for non-greedy gen. This should be reevaluated when considering beam search.
-        if "do_sample" not in generation_kwargs:
-            generation_kwargs["do_sample"] = False
+        # temperature = 0.0 if not set
+        # if do_sample is false and temp==0.0:
+        # remove temperature, as do_sample=False takes care of this
+        # and we don't want a warning from HF
+        generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
+        do_sample = generation_kwargs.get("do_sample", None)
+        if do_sample is False and generation_kwargs.get("temperature") == 0.0:
+            generation_kwargs.pop("temperature")
         # build stopping criteria
         stopping_criteria = stop_sequences_criteria(
             self.tokenizer, stop, context.shape[1], context.shape[0]
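As an aside, the net effect of the new kwargs normalization above is that greedy requests end up with no `temperature` key at all, which avoids the HuggingFace warning for `temperature=0.0`. A standalone sketch of the same logic (editor illustration mirroring the diff, not part of the commit):

    # Sketch: replicate the generation-kwargs normalization from _model_generate.
    gen_kwargs = {"do_sample": False}                       # caller requests greedy decoding
    gen_kwargs["temperature"] = gen_kwargs.get("temperature", 0.0)
    do_sample = gen_kwargs.get("do_sample", None)
    if do_sample is False and gen_kwargs.get("temperature") == 0.0:
        gen_kwargs.pop("temperature")                       # drop it; do_sample=False already means greedy
    print(gen_kwargs)                                       # {'do_sample': False}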
@@ -1045,6 +1059,7 @@ class HFLM(LM):
             return -len(toks), x[0]

         pbar = tqdm(total=len(requests), disable=(self.rank != 0))
+        adaptive_batch_size = None
         if self.batch_size == "auto":
             # using rolling window with maximum context
             print("Passed argument batch_size = auto. Detecting largest batch size")
@@ -1089,7 +1104,7 @@ class HFLM(LM):
                     )
             else:
                 raise ValueError(
-                    f"Expected `kwargs` to be of type `dict` but got {kwargs}"
+                    f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
                 )

             if not until:
                 until = [self.tok_decode(self.eot_token_id)]
lm_eval/models/optimum_lm.py  (new file)

from importlib.util import find_spec
from pathlib import Path

from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM


@register_model("openvino")
class OptimumLM(HFLM):
    """
    Optimum Intel provides a simple interface to optimize Transformer models and convert them to \
    OpenVINO™ Intermediate Representation (IR) format to accelerate end-to-end pipelines on \
    Intel® architectures using OpenVINO™ runtime.
    """

    def __init__(
        self,
        device="cpu",
        **kwargs,
    ) -> None:
        if "backend" in kwargs:
            # optimum currently only supports causal models
            assert (
                kwargs["backend"] == "causal"
            ), "Currently, only OVModelForCausalLM is supported."

        self.openvino_device = device

        super().__init__(
            device=self.openvino_device,
            backend=kwargs.get("backend", "causal"),
            **kwargs,
        )

    def _create_model(
        self,
        pretrained: str,
        revision="main",
        dtype="auto",
        trust_remote_code=False,
        **kwargs,
    ) -> None:
        if not find_spec("optimum"):
            raise Exception(
                "package `optimum` is not installed. Please install it via `pip install optimum[openvino]`"
            )
        else:
            from optimum.intel.openvino import OVModelForCausalLM

        model_kwargs = kwargs if kwargs else {}
        model_file = Path(pretrained) / "openvino_model.xml"
        if model_file.exists():
            export = False
        else:
            export = True
        kwargs["ov_config"] = {
            "PERFORMANCE_HINT": "LATENCY",
            "NUM_STREAMS": "1",
            "CACHE_DIR": "",
        }

        self._model = OVModelForCausalLM.from_pretrained(
            pretrained,
            revision=revision,
            trust_remote_code=trust_remote_code,
            export=export,
            device=self.openvino_device.upper(),
            **model_kwargs,
        )
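Because the class is registered under the name "openvino" (via the decorator above), it should be reachable through the harness's normal model-selection path. A hedged usage sketch, assuming the top-level simple_evaluate entry point and with a placeholder model path:

    # Requires `pip install optimum[openvino]`. If openvino_model.xml is not
    # already present under the path, _create_model sets export=True and the
    # model is converted to OpenVINO IR on the fly.
    import lm_eval

    results = lm_eval.simple_evaluate(
        model="openvino",
        model_args="pretrained=./my-openvino-model",  # hypothetical local path
        tasks=["lambada_openai"],
    )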
lm_eval/models/vllm_causallms.py

@@ -170,18 +170,12 @@ class VLLM(LM):
         stop: Optional[List[str]] = None,
         **kwargs,
     ):
-        if "do_sample" in kwargs.keys():
-            kwargs.pop("do_sample")
         if generate:
-            # hf defaults
-            kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)
-            kwargs["spaces_between_special_tokens"] = kwargs.get(
-                "spaces_between_special_tokens", False
-            )
+            kwargs = self.modify_gen_kwargs(kwargs)
             sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs)
         else:
             sampling_params = SamplingParams(
-                temperature=0, prompt_logprobs=2, max_tokens=1
+                temperature=0, prompt_logprobs=1, max_tokens=1
             )
         if self.data_parallel_size > 1:
             requests = [list(x) for x in divide(requests, self.data_parallel_size)]

@@ -438,3 +432,16 @@ class VLLM(LM):
                         break

         return continuation_logprobs, is_greedy
+
+    @staticmethod
+    def modify_gen_kwargs(kwargs: dict) -> dict:
+        # sampling_params
+        do_sample = kwargs.pop("do_sample", None)
+        if do_sample is False or "temperature" not in kwargs:
+            kwargs["temperature"] = 0.0
+        # hf defaults
+        kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)
+        kwargs["spaces_between_special_tokens"] = kwargs.get(
+            "spaces_between_special_tokens", False
+        )
+        return kwargs
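The new static helper centralizes what used to be inlined in the generate branch: it maps HF-style kwargs onto vLLM SamplingParams conventions. A quick illustrative call (editor sketch; not how the harness invokes it, but the same dict-in/dict-out behavior):

    # Greedy request with no explicit temperature:
    kwargs = VLLM.modify_gen_kwargs({"do_sample": False})
    # -> {"temperature": 0.0, "skip_special_tokens": False,
    #     "spaces_between_special_tokens": False}
    # do_sample is popped (vLLM has no such flag), temperature 0.0 encodes greedy.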
lm_eval/prompts/__init__.py

@@ -117,7 +117,7 @@ class PromptString:
         # TODO need a way to process doc_to_choice
         if "doc_to_choice" in self.prompt_string:
-            raise "Not yet implemented to accept doc_to_choice"
+            raise Exception("Not yet implemented to accept doc_to_choice")

         text_string = utils.apply_template(doc_to_text, doc)
         target_string = utils.apply_template(doc_to_target, doc)
lm_eval/tasks/__init__.py

 import os
 import abc
 import yaml
 import collections

 from functools import partial
 from typing import List, Union, Dict

 from lm_eval import utils
 from lm_eval import prompts
-from lm_eval.api.task import TaskConfig, Task, ConfigurableTask
+from lm_eval.api.task import Task, ConfigurableTask

 import logging

 # # import python tasks
 # import squadv2.task
 # import scrolls.task

 # python_tasks = {
 #     "squadv2": squadv2.task.SQuAD2,
 #     "scrolls_quality": scrolls.task.QuALITY,
 #     "scrolls_narrativeqa": scrolls.task.NarrativeQA,
 #     "scrolls_contractnli": scrolls.task.ContractNLI,
 #     "scrolls_govreport": scrolls.task.GovReport,
 #     "scrolls_summscreenfd": scrolls.task.SummScreenFD,
 #     "scrolls_qmsum": scrolls.task.QMSum,
 # }

 eval_logger = utils.eval_logger

 GROUP_KEYS = ["group", "task", "weight_by_size"]
 PYTHON_TASK_KEYS = ["task", "class"]


-class TaskManager(abc.ABC):
+class TaskManager:
     """TaskManager indexes all tasks from the default `lm_eval/tasks/`
     and an optional directory if provided.
     """

     def __init__(
         self,
         verbosity="INFO",
@@ -40,79 +24,132 @@ class TaskManager(abc.ABC):
         self.verbosity = verbosity
         self.include_path = include_path
-        self.logger = eval_logger.setLevel(getattr(logging, f"{verbosity}"))
+        self.logger = utils.eval_logger
+        self.logger.setLevel(getattr(logging, f"{verbosity}"))

-        self.ALL_TASKS = self.initialize_tasks(include_path=include_path)
+        self._task_index = self.initialize_tasks(include_path=include_path)
+        self._all_tasks = sorted(list(self._task_index.keys()))

         self.task_group_map = collections.defaultdict(list)

-    def initialize_tasks(self, include_path: str = None):
+    def initialize_tasks(self, include_path=None):
         """Creates an dictionary of tasks index.

         :param include_path: str = None
             An additional path to be searched for tasks

         :return
             Dictionary of task names as key and task metadata
         """
         all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
         if include_path is not None:
             if isinstance(include_path, str):
                 include_path = [include_path]
             all_paths.extend(include_path)

-        ALL_TASKS = {}
+        task_index = {}
         for task_dir in all_paths:
             tasks = self._get_task_and_group(task_dir)
-            ALL_TASKS = {**tasks, **ALL_TASKS}
+            task_index = {**tasks, **task_index}

-        return ALL_TASKS
+        return task_index

     @property
     def all_tasks(self):
-        return sorted(list(self.ALL_TASKS.keys()))
+        return self._all_tasks

+    @property
+    def task_index(self):
+        return self._task_index

     def match_tasks(self, task_list):
         return utils.pattern_match(task_list, self.all_tasks)

     def _name_is_registered(self, name):
-        if name in self.ALL_TASKS:
+        if name in self.all_tasks:
             return True
         return False

     def _name_is_task(self, name):
-        if self._name_is_registered(name) and ("task" in self.ALL_TASKS[name]["type"]):
+        if self._name_is_registered(name) and ("task" in self.task_index[name]["type"]):
             return True
         return False

+    def _name_is_group(self, name):
+        if self._name_is_registered(name) and (self.task_index[name]["type"] == "group"):
+            return True
+        return False

     def _name_is_python_task(self, name):
-        if self._name_is_registered(name) and (self.ALL_TASKS[name]["type"] == "python_task"):
+        if self._name_is_registered(name) and (self.task_index[name]["type"] == "python_task"):
             return True
         return False

     def _config_is_task(self, config):
-        if set(config.keys()) <= set(GROUP_KEYS):
-            return False
-        return True
+        if ("task" in config) and isinstance(config["task"], str):
+            return True
+        return False

+    def _config_is_group(self, config):
+        if ("task" in config) and isinstance(config["task"], list):
+            return True
+        return False

     def _config_is_python_task(self, config):
-        if set(config.keys()) == set(PYTHON_TASK_KEYS):
+        if "class" in config:
             return True
         return False

     def _get_yaml_path(self, name):
-        assert name in self.ALL_TASKS
-        return self.ALL_TASKS[name]["yaml_path"]
+        assert name in self.task_index
+        return self.task_index[name]["yaml_path"]

     def _get_config(self, name):
-        assert name in self.ALL_TASKS
+        assert name in self.task_index
         yaml_path = self._get_yaml_path(name)
-        return utils.load_yaml_config("full", yaml_path)
+        if yaml_path == -1:
+            return {}
+        else:
+            return utils.load_yaml_config(yaml_path, mode="full")

     def _get_tasklist(self, name):
         assert self._name_is_task(name) == False
-        return self.ALL_TASKS[name]["task"]
+        return self.task_index[name]["task"]

     def _process_alias(self, config, group=None):
         # If the group is not the same as the original
         # group which the group alias was intended for,
         # Set the group_alias to None instead.
         if ("group_alias" in config) and ("group" in config) and group is not None:
             if config["group"] != group:
                 config["group_alias"] = None
         return config

     def _load_individual_task_or_group(
         self,
         name_or_config: Union[str, dict] = None,
         parent_name: str = None,
-        update_config: dict = None
+        update_config: dict = None,
+        yaml_path: str = None,
     ) -> ConfigurableTask:
-        def load_task(config, task, group=None, is_python_class=False):
-            if is_python_class:
+        def load_task(config, task, group=None, yaml_path=None):
+            if "include" in config:
+                assert yaml_path is not None
+                config.update(
+                    utils.load_yaml_config(
+                        yaml_path,
+                        yaml_config={"include": config.pop("include")},
+                        mode="full",
+                    )
+                )
+            if self._config_is_python_task(config):
                 task_object = config["class"]()
             else:
                 config = self._process_alias(config, group=group)
                 task_object = ConfigurableTask(config=config)
             if group is not None:
                 task_object = (group, task_object)
@@ -124,15 +161,26 @@ class TaskManager(abc.ABC):
                 name_or_config = {"task": name_or_config, **update_config}
         elif self._name_is_task(name_or_config):
             task_config = self._get_config(name_or_config)
-            is_python_class = False
-            if self._name_is_python_task(name_or_config):
-                is_python_class = True
-            return load_task(task_config, task=name_or_config, group=parent_name, is_python_class=is_python_class)
+            return load_task(task_config, task=name_or_config, group=parent_name)
         else:
             group_name = name_or_config
             subtask_list = self._get_tasklist(name_or_config)
             if subtask_list == -1:
-                subtask_list = self._get_config(name_or_config)["task"]
+                group_config = self._get_config(name_or_config)
+                subtask_list = group_config["task"]

+            # This checks if we're at the root.
+            if parent_name is None:
+                group_config = self._get_config(name_or_config)
+                if set(group_config.keys()) > set(["task", "group"]):
+                    update_config = {
+                        k: v for k, v in group_config.items() if k not in ["task", "group"]
+                    }
+                yaml_path = self._get_yaml_path(group_name)

+            if (update_config is not None) and ("group_alias" in update_config):
+                group_name = update_config["group_alias"]
+                update_config.pop("group_alias")

         if isinstance(name_or_config, dict):
@@ -145,7 +193,8 @@ class TaskManager(abc.ABC):
             if self._config_is_task(name_or_config):
                 name = name_or_config["task"]
                 # If the name is registered as a group
-                if self._name_is_task(name) is False:
+                # if self._name_is_task(name) is False:
+                if self._name_is_group(name):
                     group_name = name
                     update_config = {
                         k: v for k, v in name_or_config.items() if k != "task"
                     }
                     subtask_list = self._get_tasklist(name)
@@ -154,28 +203,49 @@ class TaskManager(abc.ABC):
                 else:
                     if self._name_is_registered(name):
                         base_task_config = self._get_config(name)

+                        # Check if this is a duplicate.
+                        if parent_name is not None:
+                            name_or_config["group"] = parent_name
+                            num_duplicate = len(
+                                list(
+                                    filter(
+                                        lambda x: x.startswith(name),
+                                        self.task_group_map[parent_name],
+                                    )
+                                )
+                            )
+                            if num_duplicate > 0:
+                                name = f"{name}-{num_duplicate}"
+                            self.task_group_map[parent_name].append(name)

                         task_config = {
                             **base_task_config,
                             **name_or_config,
                         }
                     else:
                         task_config = name_or_config
-                return load_task(task_config, task=name, group=parent_name)
+                return load_task(task_config, task=name, group=parent_name, yaml_path=yaml_path)
             else:
                 group_name = name_or_config["group"]
                 subtask_list = name_or_config["task"]
-                # update_config = {k:v for k,v in name_or_config.items() if k != "task"}
+                if set(name_or_config.keys()) > set(["task", "group"]):
+                    update_config = {
+                        k: v for k, v in name_or_config.items() if k not in ["task", "group"]
+                    }

             all_subtasks = {}
-            if (parent_name is not None) and (
-                (self._name_is_registered(group_name) is False)
-                or (self._get_yaml_path(group_name) == -1)
-            ):
+            if parent_name is not None:
                 all_subtasks = {group_name: (parent_name, None)}

-            fn = partial(
-                self._load_individual_task_or_group,
-                parent_name=group_name,
-                update_config=update_config,
-            )
+            fn = partial(
+                self._load_individual_task_or_group,
+                parent_name=group_name,
+                update_config=update_config,
+                yaml_path=yaml_path,
+            )
             all_subtasks = {
                 **all_subtasks,
                 **dict(collections.ChainMap(*map(fn, subtask_list))),
             }
             return all_subtasks

     def load_task_or_group(self, task_list: Union[str, list] = None) -> dict:
         """Loads a dictionary of task objects from a list

         :param task_list: Union[str, list] = None
             Single string or list of string of task names to be loaded

         :return
             Dictionary of task objects
         """
         if isinstance(task_list, str):
             task_list = [task_list]
@@ -189,20 +259,43 @@ class TaskManager(abc.ABC):
             )
         return all_loaded_tasks

+    def load_config(self, config: Dict):
+        return self._load_individual_task_or_group(config)

     def _get_task_and_group(self, task_dir: str):
+        """Creates an dictionary of tasks index with the following metadata,
+        - `type`, that can be either `task`, `python_task`, or `group`.
+            `task` refer to regular task configs, `python_task` are special
+            yaml files that only consists of `task` and `class` parameters.
+            `group` are group configs.
+        - `yaml_path`, path to the yaml file. If the entry is a `group` that
+            was configured through a task config, the yaml_path will be -1
+            and all subtasks will be listed in `task` (see below)
+        - `task`, reserved for entries with `type` as `group`. This will list
+            all subtasks. When a group config is created (as opposed to task
+            config having `group` parameter set), this will be set to -1 to
+            avoid recursive indexing. The whole list of subtasks will be loaded
+            at evaluation.

+        :param task_dir: str
+            A directory to check for tasks

+        :return
+            Dictionary of task names as key and task metadata
+        """
         tasks_and_groups = collections.defaultdict()
         for root, _, file_list in os.walk(task_dir):
             for f in file_list:
                 if f.endswith(".yaml"):
                     yaml_path = os.path.join(root, f)
-                    config = utils.load_yaml_config("simple", yaml_path)
-                    if set(config.keys()) == set(PYTHON_TASK_KEYS):
+                    config = utils.load_yaml_config(yaml_path, mode="simple")
+                    if self._config_is_python_task(config):
                         # This is a python class config
                         tasks_and_groups[config["task"]] = {
                             "type": "python_task",
                             "yaml_path": yaml_path,
                         }
-                    elif set(config.keys()) <= set(GROUP_KEYS):
+                    elif self._config_is_group(config):
                         # This is a group config
                         tasks_and_groups[config["group"]] = {
                             "type": "group",

@@ -213,7 +306,17 @@ class TaskManager(abc.ABC):
                             # when called.
                             "yaml_path": yaml_path,
                         }

-                    else:
+                        # # Registered the level 1 tasks from a group config
+                        # for config in config["task"]:
+                        #     if isinstance(config, dict) and self._config_is_task(config):
+                        #         task = config["task"]
+                        #         tasks_and_groups[task] = {
+                        #             "type": "task",
+                        #             "yaml_path": yaml_path,
+                        #         }

+                    elif self._config_is_task(config):
                         # This is a task config
                         task = config["task"]
                         tasks_and_groups[task] = {

@@ -235,41 +338,97 @@ class TaskManager(abc.ABC):
                             }
                         else:
                             tasks_and_groups[group]["task"].append(task)
                     else:
                         self.logger.debug(f"File {f} in {root} could not be loaded")

         return tasks_and_groups
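Per the docstring above, each index entry records a `type` and a `yaml_path` (plus a `task` list or -1 sentinel for groups). An illustrative shape of the returned mapping (editor sketch; names and paths are hypothetical):

    # Sketch of a _get_task_and_group result, following the metadata description above.
    tasks_and_groups = {
        "squadv2": {"type": "python_task", "yaml_path": "/tasks/squadv2/task.yaml"},
        "arc_easy": {"type": "task", "yaml_path": "/tasks/arc/arc_easy.yaml"},
        "flan_held_in": {
            "type": "group",
            "yaml_path": "/tasks/benchmarks/flan/flan_held_in.yaml",
            "task": -1,  # group config: subtasks resolved lazily at evaluation time
        },
    }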
 # def check_prompt_config(
 #     config: Dict[str, str], yaml_path: str = None
 # ) -> List[Dict[str, str]]:
 #     all_configs = []
 #     if "use_prompt" in config:
 #         prompt_list = prompts.load_prompt_list(
 #             use_prompt=config["use_prompt"],
 #             dataset_name=config["dataset_path"],
 #             subset_name=config["dataset_name"] if "dataset_name" in config else None,
 #             yaml_path=yaml_path,
 #         )
 #         for idx, prompt_variation in enumerate(prompt_list):
 #             all_configs.append(
 #                 {
 #                     **config,
 #                     **{"use_prompt": prompt_variation},
 #                     **{
 #                         "task": "_".join(
 #                             [
 #                                 config["task"]
 #                                 if "task" in config
 #                                 else get_task_name_from_config(config),
 #                                 prompt_variation.split("/")[-1]
 #                                 if ".yaml" in prompt_variation
 #                                 else prompt_variation,
 #                             ]
 #                         )
 #                     },
 #                     **{"output_type": "generate_until"},
 #                 }
 #             )
 #     else:
 #         all_configs.append(config)
 #     return all_configs


 def include_path(task_dir):
     logger = utils.eval_logger
     logger.setLevel(getattr(logging, "INFO"))
     logger.info(
         "To still use tasks loaded from args.include_path,"
         "see an example of the new TaskManager API in https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
     )
     return 0
 def initialize_tasks(verbosity="INFO"):
     logger = utils.eval_logger
     logger.setLevel(getattr(logging, f"{verbosity}"))
     logger.info(
         "lm_eval.tasks.initialize_tasks() is deprecated and no longer necessary. "
         "It will be removed in v0.4.2 release. "
         "TaskManager will instead be used."
     )
     return 0
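For downstream callers, the replacement for both deprecated module-level functions is to construct a TaskManager directly; the constructor arguments match the class definition in this diff (verbosity, include_path). A hedged sketch:

    # Sketch of the new API, per the TaskManager definition in this file.
    from lm_eval.tasks import TaskManager

    task_manager = TaskManager(verbosity="INFO", include_path="/path/to/custom/yamls")
    print(task_manager.all_tasks[:5])           # sorted task/group names from the index
    print(task_manager.match_tasks(["mmlu*"]))  # wildcard matching via utils.pattern_match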
 def get_task_name_from_config(task_config: Dict[str, str]) -> str:
     if "task" in task_config:
         return task_config["task"]

     if "dataset_name" in task_config:
         return "{dataset_path}_{dataset_name}".format(**task_config)
     else:
         return "{dataset_path}".format(**task_config)


 def get_task_name_from_object(task_object):
     if hasattr(task_object, "config"):
         return task_object._config["task"]

     # TODO: scrap this
     # this gives a mechanism for non-registered tasks to have a custom name anyways when reporting
     return (
         task_object.EVAL_HARNESS_NAME
         if hasattr(task_object, "EVAL_HARNESS_NAME")
         else type(task_object).__name__
     )


 def get_task_dict(task_name_list: List[Union[str, Dict, Task]], task_manager: TaskManager = None):
     """Creates a dictionary of task objects from either a name of task, config, or prepared Task object.

     :param task_name_list: List[Union[str, Dict, Task]]
         Name of model or LM object, see lm_eval.models.get_model
     :param task_manager: TaskManager = None
         A TaskManager object that stores indexed tasks. If not set,
         task_manager will load one. This should be set by the user
         if there are additional paths that want to be included
         via `include_path`

     :return
         Dictionary of task objects
     """
     task_name_from_string_dict = {}
     task_name_from_config_dict = {}
     task_name_from_object_dict = {}

     if isinstance(task_name_list, str):
         task_name_list = [task_name_list]

     string_task_name_list = [task for task in task_name_list if isinstance(task, str)]
     others_task_name_list = [task for task in task_name_list if ~isinstance(task, str)]
     if len(string_task_name_list) > 0:
         if task_manager is None:
             task_manager = TaskManager()

         task_name_from_string_dict = task_manager.load_task_or_group(
             string_task_name_list
         )

     for task_element in others_task_name_list:
         if isinstance(task_element, dict):
             task_name_from_config_dict = {
                 **task_name_from_config_dict,
                 **task_manager.load_config(config=task_element),
             }

         elif isinstance(task_element, Task):
             task_name_from_object_dict = {
                 **task_name_from_object_dict,
                 get_task_name_from_object(task_element): task_element,
             }

     assert set(task_name_from_string_dict.keys()).isdisjoint(
         set(task_name_from_object_dict.keys())
     )
     return {
         **task_name_from_string_dict,
         **task_name_from_config_dict,
         **task_name_from_object_dict,
     }
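A brief usage sketch of the new entry point (editor illustration; the task names are examples). Strings resolve through the TaskManager index, dict configs go through load_config, and prepared Task objects pass through keyed by name:

    from lm_eval.tasks import TaskManager, get_task_dict

    tm = TaskManager()
    task_dict = get_task_dict(
        ["arc_easy", {"task": "arc_challenge", "num_fewshot": 5}],
        task_manager=tm,
    )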
lm_eval/tasks/bbh/_generate_configs.py

@@ -28,7 +28,7 @@ if __name__ == "__main__":
     # get filename of base_yaml so we can `"include": ` it in our other YAMLs.
     base_yaml_name = os.path.split(args.base_yaml_path)[-1]
-    with open(args.base_yaml_path) as f:
+    with open(args.base_yaml_path, encoding="utf-8") as f:
         base_yaml = yaml.full_load(f)

     base_doc_to_text = "Q: {{input}}\nA:"

@@ -70,7 +70,7 @@ if __name__ == "__main__":
         file_save_path = args.save_prefix_path + f"/{task}.yaml"
         utils.eval_logger.info(f"Saving yaml for subset {task} to {file_save_path}")
-        with open(file_save_path, "w") as yaml_file:
+        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
             yaml.dump(
                 yaml_dict,
                 yaml_file,
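The encoding="utf-8" changes in this and the sibling _generate_configs.py scripts below pin file I/O to UTF-8 instead of the platform default. A minimal illustration of why that matters (assumption on motivation; the commit itself does not state it):

    import locale

    # Without an explicit encoding, open() uses the locale's preferred encoding,
    # e.g. cp1252 on many Windows setups -- non-ASCII characters in task YAMLs
    # (Chinese subject names in cmmlu/ceval, for instance) could then be mangled.
    print(locale.getpreferredencoding(False))

    with open("subset.yaml", "w", encoding="utf-8") as f:
        f.write('task_alias: "anatomy (mmlu)"\n')  # hypothetical content, round-trips safely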
lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml

@@ -29,3 +29,4 @@ filter_list:
 num_fewshot: 0
 metadata:
   version: 2.0
+  num_fewshot: 3 # controls what is printed in n-shot
lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml

@@ -20,3 +20,4 @@ generation_kwargs:
 num_fewshot: 0
 metadata:
   version: 1.0
+  num_fewshot: 3 # will be printed in results table
lm_eval/tasks/belebele/_generate_configs.py

@@ -27,13 +27,13 @@ if __name__ == "__main__":
     # get filename of base_yaml so we can `"include": ` it in our other YAMLs.
     base_yaml_name = os.path.split(args.base_yaml_path)[-1]
-    with open(args.base_yaml_path) as f:
+    with open(args.base_yaml_path, encoding="utf-8") as f:
         base_yaml = yaml.full_load(f)

     if args.cot_prompt_path is not None:
         import json

-        with open(args.cot_prompt_path) as f:
+        with open(args.cot_prompt_path, encoding="utf-8") as f:
             cot_file = json.load(f)

     def query():

@@ -42,7 +42,7 @@ if __name__ == "__main__":
     print(query())
     languages = [split["split"] for split in query()]
-    for lang in tqdm(languages):
+    for lang in tqdm([lang for lang in languages if "default" not in lang]):
         yaml_dict = {
             "include": base_yaml_name,
             "task": f"belebele_{args.task_prefix}_{lang}"

@@ -54,7 +54,7 @@ if __name__ == "__main__":
         file_save_path = args.save_prefix_path + f"_{lang}.yaml"
         logging.info(f"Saving yaml for subset {lang} to {file_save_path}")
-        with open(file_save_path, "w") as yaml_file:
+        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
             yaml.dump(
                 yaml_dict,
                 yaml_file,
lm_eval/tasks/belebele/belebele_default.yaml  (deleted, 100644 → 0)

-"fewshot_split": "default"
-"include": "_default_template_yaml"
-"task": "belebele_default"
-"test_split": "default"
lm_eval/tasks/benchmarks/flan/_held_in_template_yaml  (new file)

output_type: generate_until
test_split: null
doc_to_choice: null
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
generation_kwargs:
  until:
    - "</s>"
  do_sample: false
  temperature: 0.0
metadata:
  version: 1.0
lm_eval/tasks/benchmarks/flan/flan_held_in.yaml  (new file)

group: flan_held_in
group_alias: Flan (Held-In)
task:
  # ANLI R1
  - group: anli_r1_flan
    group_alias: ANLI R1
    task:
      - task: anli_r1
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-7
        include: _held_in_template_yaml
        doc_to_text: "Can we draw the following hypothesis from the context (see options)?\n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-8
        include: _held_in_template_yaml
        doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  # ANLI R2
  - group: anli_r2_flan
    group_alias: ANLI R2
    task:
      - task: anli_r2
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-7
        include: _held_in_template_yaml
        doc_to_text: "Can we draw the following hypothesis from the context (see options)?\n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r2
        task_alias: prompt-8
        include: _held_in_template_yaml
        doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  # ANLI R3
  - group: anli_r3_flan
    group_alias: ANLI R3
    task:
      - task: anli_r3
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-7
        include: _held_in_template_yaml
        doc_to_text: "Can we draw the following hypothesis from the context (see options)?\n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r3
        task_alias: prompt-8
        include: _held_in_template_yaml
        doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  # Arc Easy
  - group: arc_easy_flan
    group_alias: Arc Easy
    task:
      - task: arc_easy
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  # Arc Challenge
  - group: arc_challenge_flan
    group_alias: Arc Challenge
    task:
      - task: arc_challenge
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_challenge
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_challenge
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_challenge
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_challenge
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_challenge
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_challenge
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  # BoolQ
  - group: boolq_flan
    group_alias: BoolQ
    task:
      - task: boolq
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\nCan we conclude that {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\nIs it true that {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\n{{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "Text: {{passage}}\n\nQuestion: {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\nWhat's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\nBased on the above text what's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\nAnswer this question making sure that the answer is supposed by the text: {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-7
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-8
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-9
        include: _held_in_template_yaml
        doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{passage}}\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
  # RTE
  - group: rte_flan
    group_alias: RTE
    task:
      - task: rte
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{sentence1}}\n\nQuestion with options: Based on the paragraph above can we conclude that \"{{sentence2}}\"?\n\nOPTIONS:\n- yes\n- no"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "{{sentence1}}\n\nBased on that paragraph can we conclude that the sentence below is true?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "{{sentence1}}\n\nQ with options: Can we draw the following conclusion?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "{{sentence1}}\nDoes this next sentence follow, given the preceding text?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "{{sentence1}}\nOPTIONS:\n- yes\n- no\nQuestion: Can we infer the following?\n{{sentence2}}"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "Read the following paragraph and determine if the hypothesis is true. Select from options at the end:\n\n{{sentence1}}\n\nHypothesis: {{sentence2}}\nOPTIONS:\n- yes\n- no\nThe answer is"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "Read the text and determine if the sentence is true:\n\n{{sentence1}}\n\nSentence: {{sentence2}}\nOPTIONS:\n- yes\n- no\nA:"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-7
        include: _held_in_template_yaml
        doc_to_text: "Question with options: can we draw the following hypothesis from the context?\n\nContext:\n\n{{sentence1}}\n\nHypothesis: {{sentence2}}\nOPTIONS:\n- yes\n- no\nA:"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-8
        include: _held_in_template_yaml
        doc_to_text: "Determine if the sentence is true based on the text below. Choose from options.\n{{sentence2}}\n\n{{sentence1}}\nOPTIONS:\n- yes\n- no"
        doc_to_target: "{{['yes', 'no'][label]}}"
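Each leaf task pairs `include: _held_in_template_yaml` with a per-prompt `doc_to_text`/`doc_to_target` override, so the generate-until settings and exact_match metric come from the shared template while only the prompt wording varies. A hedged sketch of what one of these Jinja templates renders to (sample document invented for illustration; the harness's own template engine behaves equivalently here):

    # Assumes jinja2 is installed.
    from jinja2 import Template

    doc = {"premise": "Cats are mammals.", "hypothesis": "Cats are animals.", "label": 0}
    doc_to_text = "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
    doc_to_target = "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"

    print(Template(doc_to_text).render(**doc))    # the prompt the model sees
    print(Template(doc_to_target).render(**doc))  # "Yes"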
lm_eval/tasks/benchmarks/flan/flan_held_out.yaml  (new file)

group: flan_held_out
task:
  # BBH
  - bbh_zeroshot
  - bbh_fewshot
  - bbh_cot_fewshot
  - bbh_cot_zeroshot
  # MMLU
  - mmlu
  - mmlu_flan_n_shot_generative
  - mmlu_flan_n_shot_loglikelihood
  - mmlu_flan_cot_zeroshot
  - mmlu_flan_cot_fewshot
lm_eval/tasks/benchmarks/multimedqa/multimedqa.yaml

@@ -5,19 +5,13 @@ task:
   - medqa_4options
   - task: mmlu_anatomy
     task_alias: "anatomy (mmlu)"
-    group_alias: null
   - task: mmlu_clinical_knowledge
     task_alias: "clinical_knowledge (mmlu)"
-    group_alias: null
   - task: mmlu_college_medicine
     task_alias: "college_medicine (mmlu)"
-    group_alias: null
   - task: mmlu_medical_genetics
     task_alias: "medical_genetics (mmlu)"
-    group_alias: null
   - task: mmlu_professional_medicine
     task_alias: "professional_medicine (mmlu)"
-    group_alias: null
   - task: mmlu_college_biology
     task_alias: "college_biology (mmlu)"
-    group_alias: null
lm_eval/tasks/bigbench/generate_tasks.py

@@ -181,7 +181,7 @@ def main() -> None:
     for task in all_subtasks:
         file_name = f"{task}.yaml"
        try:
-            with open(f"{path}/{file_name}", "w") as f:
+            with open(f"{path}/{file_name}", "w", encoding="utf-8") as f:
                 f.write("# Generated by utils.py\n")
                 yaml.dump(
                     {
lm_eval/tasks/blimp/generate_configs.py

@@ -75,7 +75,7 @@ def main() -> None:
     for task in all_subtasks:
         file_name = f"{task}.yaml"
         try:
-            with open(f"{file_name}", "w") as f:
+            with open(f"{file_name}", "w", encoding="utf-8") as f:
                 f.write("# Generated by utils.py\n")
                 yaml.dump(
                     {
lm_eval/tasks/ceval/_generate_configs.py

@@ -79,13 +79,13 @@ if __name__ == "__main__":
     # get filename of base_yaml so we can `"include": ` it in our other YAMLs.
     base_yaml_name = os.path.split(args.base_yaml_path)[-1]
-    with open(args.base_yaml_path) as f:
+    with open(args.base_yaml_path, encoding="utf-8") as f:
         base_yaml = yaml.full_load(f)

     if args.cot_prompt_path is not None:
         import json

-        with open(args.cot_prompt_path) as f:
+        with open(args.cot_prompt_path, encoding="utf-8") as f:
             cot_file = json.load(f)

     for subject_eng, subject_zh in tqdm(SUBJECTS.items()):

@@ -107,7 +107,7 @@ if __name__ == "__main__":
         file_save_path = args.save_prefix_path + f"_{subject_eng}.yaml"
         eval_logger.info(f"Saving yaml for subset {subject_eng} to {file_save_path}")
-        with open(file_save_path, "w") as yaml_file:
+        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
             yaml.dump(
                 yaml_dict,
                 yaml_file,
lm_eval/tasks/cmmlu/_generate_configs.py

@@ -94,13 +94,13 @@ if __name__ == "__main__":
     # get filename of base_yaml so we can `"include": ` it in our other YAMLs.
     base_yaml_name = os.path.split(args.base_yaml_path)[-1]
-    with open(args.base_yaml_path) as f:
+    with open(args.base_yaml_path, encoding="utf-8") as f:
         base_yaml = yaml.full_load(f)

     if args.cot_prompt_path is not None:
         import json

-        with open(args.cot_prompt_path) as f:
+        with open(args.cot_prompt_path, encoding="utf-8") as f:
             cot_file = json.load(f)

     for subject_eng, subject_zh in tqdm(SUBJECTS.items()):

@@ -122,7 +122,7 @@ if __name__ == "__main__":
         file_save_path = args.save_prefix_path + f"_{subject_eng}.yaml"
         eval_logger.info(f"Saving yaml for subset {subject_eng} to {file_save_path}")
-        with open(file_save_path, "w") as yaml_file:
+        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
             yaml.dump(
                 yaml_dict,
                 yaml_file,
lm_eval/tasks/code_x_glue/code-text/bleu.py

@@ -184,7 +184,7 @@ def splitPuncts(line):
 def computeMaps(predictions, goldfile):
     predictionMap: Dict[str, list] = {}
     goldMap: Dict[str, list] = {}
-    gf = open(goldfile, "r")
+    gf = open(goldfile, "r", encoding="utf-8")

     for row in predictions:
         cols = row.strip().split("\t")
lm_eval/tasks/csatqa/_generate_configs.py

@@ -25,7 +25,7 @@ if __name__ == "__main__":
     # get filename of base_yaml so we can `"include": ` it in our other YAMLs.
     base_yaml_name = os.path.split(args.base_yaml_path)[-1]
-    with open(args.base_yaml_path) as f:
+    with open(args.base_yaml_path, encoding="utf-8") as f:
         base_yaml = yaml.full_load(f)

     for name in tqdm(SUBSETS):

@@ -39,7 +39,7 @@ if __name__ == "__main__":
         file_save_path = args.save_prefix_path + f"_{name.lower()}.yaml"
         eval_logger.info(f"Saving yaml for subset {name} to {file_save_path}")
-        with open(file_save_path, "w") as yaml_file:
+        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
             yaml.dump(
                 yaml_dict,
                 yaml_file,