gaoqiong / lm-evaluation-harness · Commits

Commit f66fc06f, "fix merge conflicts"
Authored Feb 01, 2024 by haileyschoelkopf
Parents: b13753cd, d714fc95

Changes: 84 files. Showing 20 changed files with 821 additions and 367 deletions (+821, -367).
lm_eval/models/huggingface.py                              +26  -11
lm_eval/models/optimum_lm.py                               +69  -0
lm_eval/models/vllm_causallms.py                           +15  -8
lm_eval/prompts/__init__.py                                +1   -1
lm_eval/tasks/__init__.py                                  +366 -232
lm_eval/tasks/arc/arc_easy.yaml                            +1   -1
lm_eval/tasks/bbh/_generate_configs.py                     +2   -2
lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml   +1   -0
lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml           +1   -0
lm_eval/tasks/belebele/_generate_configs.py                +4   -4
lm_eval/tasks/belebele/belebele_default.yaml               +0   -4
lm_eval/tasks/benchmarks/flan/_held_in_template_yaml       +2   -1
lm_eval/tasks/benchmarks/flan/flan_anli.yaml               +0   -17
lm_eval/tasks/benchmarks/flan/flan_arc.yaml                +0   -14
lm_eval/tasks/benchmarks/flan/flan_boolq.yaml              +0   -7
lm_eval/tasks/benchmarks/flan/flan_cot.yaml                +0   -11
lm_eval/tasks/benchmarks/flan/flan_held_in.yaml            +329 -4
lm_eval/tasks/benchmarks/flan/flan_held_in_yaml            +0   -39
lm_eval/tasks/benchmarks/flan/flan_held_out.yaml           +4   -4
lm_eval/tasks/benchmarks/flan/flan_rte.yaml                +0   -7
lm_eval/models/huggingface.py (+26 -11)

```diff
@@ -108,8 +108,8 @@ class HFLM(LM):
             assert not parallelize, "`parallelize=True` is not compatible with passing pre-initialized model to `pretrained`"
             self._model = pretrained
             self._device = self._model.device
             self._config = self._model.config
+            gpus = 0

             if tokenizer:
                 assert isinstance(
@@ -200,8 +200,9 @@ class HFLM(LM):
             )
         # access self._model through self.model property outside this method
-        self.model.eval()
-        self.model.tie_weights()
+        if isinstance(self.model, torch.nn.Module):
+            self.model.eval()
+            self.model.tie_weights()
         if isinstance(pretrained, str) and (gpus >= 1 or str(self.device) == "mps"):
             # TODO: can remove this whole snippet except in the mps case, perhaps?
@@ -238,6 +239,16 @@ class HFLM(LM):
             if self.config.model_type == "qwen":
                 # Qwen's trust_remote_code tokenizer does not allow for adding special tokens
                 self.tokenizer.pad_token = "<|endoftext|>"
+            elif (
+                self.tokenizer.__class__.__name__ == "RWKVWorldTokenizer"
+                or self.tokenizer.__class__.__name__ == "Rwkv5Tokenizer"
+            ):
+                # The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0)
+                # The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer
+                # ---
+                # Note that the world tokenizer class name, might change in the future for the final huggingface merge
+                # https://github.com/huggingface/transformers/pull/26963
+                assert self.tokenizer.pad_token_id == 0
             else:
                 self.tokenizer.add_special_tokens({"pad_token": "<|pad|>"})
@@ -361,7 +372,7 @@ class HFLM(LM):
     def _get_backend(
         self,
-        config: transformers.AutoConfig,
+        config: Union[transformers.PretrainedConfig, transformers.AutoConfig],
         backend: Optional[Literal["default", "causal", "seq2seq"]] = "default",
         trust_remote_code: Optional[bool] = False,
     ) -> None:
@@ -602,8 +613,7 @@ class HFLM(LM):
                 (batch_size, max_length), device=self.device
             ).long()
             for _ in range(5):
-                out = F.log_softmax(self._model_call(test_batch, **call_kwargs), dim=-1)
-                out = out  # Identity process so that it passes pre-commit
+                out = F.log_softmax(self._model_call(test_batch, **call_kwargs), dim=-1)  # noqa: F841

             return batch_size
@@ -705,10 +715,14 @@ class HFLM(LM):
         return self.model(inps).logits

     def _model_generate(self, context, max_length, stop, **generation_kwargs):
-        # we require users to pass do_sample=True explicitly
-        # for non-greedy gen. This should be reevaluated when considering beam search.
-        if "do_sample" not in generation_kwargs:
-            generation_kwargs["do_sample"] = False
+        # temperature = 0.0 if not set
+        # if do_sample is false and temp==0.0:
+        # remove temperature, as do_sample=False takes care of this
+        # and we don't want a warning from HF
+        generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
+        do_sample = generation_kwargs.get("do_sample", None)
+        if do_sample is False and generation_kwargs.get("temperature") == 0.0:
+            generation_kwargs.pop("temperature")
         # build stopping criteria
         stopping_criteria = stop_sequences_criteria(
             self.tokenizer, stop, context.shape[1], context.shape[0]
```
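The `_model_generate` change above replaces "require explicit do_sample" with temperature normalization. Pulled out as a pure function, the new behavior looks like this (a sketch for illustration only; `normalize` is not a harness API):

```python
def normalize(generation_kwargs: dict) -> dict:
    """Mirror of the kwarg handling in the new _model_generate (sketch only)."""
    generation_kwargs = dict(generation_kwargs)  # avoid mutating the caller's dict
    generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
    do_sample = generation_kwargs.get("do_sample", None)
    if do_sample is False and generation_kwargs.get("temperature") == 0.0:
        # greedy decoding: drop temperature so HF generate() does not warn
        generation_kwargs.pop("temperature")
    return generation_kwargs

assert normalize({}) == {"temperature": 0.0}
assert normalize({"do_sample": False}) == {"do_sample": False}
assert normalize({"do_sample": True, "temperature": 0.8}) == {"do_sample": True, "temperature": 0.8}
```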
```diff
@@ -1045,6 +1059,7 @@ class HFLM(LM):
             return -len(toks), x[0]

         pbar = tqdm(total=len(requests), disable=(self.rank != 0))
+        adaptive_batch_size = None
         if self.batch_size == "auto":
             # using rolling window with maximum context
             print("Passed argument batch_size = auto. Detecting largest batch size")
```
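For context on the `adaptive_batch_size` line: with `batch_size="auto"`, the harness probes for the largest batch that fits before running requests, and the added variable caches that probe's result. A simplified sketch of the idea (illustrative only; the harness itself relies on `accelerate`'s `find_executable_batch_size` rather than this hand-rolled loop):

```python
import torch

def detect_batch_size(model_call, max_length: int, start: int = 1, cap: int = 1024) -> int:
    """Double the batch until a forward pass fails (or a cap is hit); return
    the largest size that succeeded. Illustrative only."""
    batch_size = start
    while batch_size < cap:
        try:
            test_batch = torch.ones((batch_size * 2, max_length), dtype=torch.long)
            model_call(test_batch)  # probe; the output is discarded
            batch_size *= 2
        except RuntimeError:  # CUDA OOM surfaces as a RuntimeError
            break
    return batch_size
```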
```diff
@@ -1089,7 +1104,7 @@ class HFLM(LM):
                 )
             else:
                 raise ValueError(
-                    f"Expected `kwargs` to be of type `dict` but got {kwargs}"
+                    f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
                 )
             if not until:
                 until = [self.tok_decode(self.eot_token_id)]
```
lm_eval/models/optimum_lm.py (new file, +69 -0)

```python
from importlib.util import find_spec
from pathlib import Path

from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM


@register_model("openvino")
class OptimumLM(HFLM):
    """
    Optimum Intel provides a simple interface to optimize Transformer models and convert them to \
    OpenVINO™ Intermediate Representation (IR) format to accelerate end-to-end pipelines on \
    Intel® architectures using OpenVINO™ runtime.
    """

    def __init__(
        self,
        device="cpu",
        **kwargs,
    ) -> None:
        if "backend" in kwargs:
            # optimum currently only supports causal models
            assert (
                kwargs["backend"] == "causal"
            ), "Currently, only OVModelForCausalLM is supported."

        self.openvino_device = device

        super().__init__(
            device=self.openvino_device,
            backend=kwargs.get("backend", "causal"),
            **kwargs,
        )

    def _create_model(
        self,
        pretrained: str,
        revision="main",
        dtype="auto",
        trust_remote_code=False,
        **kwargs,
    ) -> None:
        if not find_spec("optimum"):
            raise Exception(
                "package `optimum` is not installed. Please install it via `pip install optimum[openvino]`"
            )
        else:
            from optimum.intel.openvino import OVModelForCausalLM

        model_kwargs = kwargs if kwargs else {}
        model_file = Path(pretrained) / "openvino_model.xml"
        if model_file.exists():
            export = False
        else:
            export = True
        kwargs["ov_config"] = {
            "PERFORMANCE_HINT": "LATENCY",
            "NUM_STREAMS": "1",
            "CACHE_DIR": "",
        }

        self._model = OVModelForCausalLM.from_pretrained(
            pretrained,
            revision=revision,
            trust_remote_code=trust_remote_code,
            export=export,
            device=self.openvino_device.upper(),
            **model_kwargs,
        )
```
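With the `register_model("openvino")` decorator, the new backend is addressable by name through the model registry. A usage sketch (the local path is hypothetical, and `optimum[openvino]` must be installed):

```python
from lm_eval.api.registry import get_model

# "./ov_model" is a hypothetical directory: if it already contains
# openvino_model.xml the model loads with export=False; otherwise the
# HF checkpoint is exported to OpenVINO IR on the fly.
OptimumLM = get_model("openvino")
lm = OptimumLM(pretrained="./ov_model", device="cpu")
```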
lm_eval/models/vllm_causallms.py (+15 -8)

```diff
@@ -170,18 +170,12 @@ class VLLM(LM):
         stop: Optional[List[str]] = None,
         **kwargs,
     ):
-        if "do_sample" in kwargs.keys():
-            kwargs.pop("do_sample")
         if generate:
-            # hf defaults
-            kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)
-            kwargs["spaces_between_special_tokens"] = kwargs.get(
-                "spaces_between_special_tokens", False
-            )
+            kwargs = self.modify_gen_kwargs(kwargs)
             sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs)
         else:
             sampling_params = SamplingParams(
-                temperature=0, prompt_logprobs=2, max_tokens=1
+                temperature=0, prompt_logprobs=1, max_tokens=1
             )
         if self.data_parallel_size > 1:
             requests = [list(x) for x in divide(requests, self.data_parallel_size)]
@@ -438,3 +432,16 @@ class VLLM(LM):
                     break
         return continuation_logprobs, is_greedy
+
+    @staticmethod
+    def modify_gen_kwargs(kwargs: dict) -> dict:
+        # sampling_params
+        do_sample = kwargs.pop("do_sample", None)
+        if do_sample is False or "temperature" not in kwargs:
+            kwargs["temperature"] = 0.0
+        # hf defaults
+        kwargs["skip_special_tokens"] = kwargs.get("skip_special_tokens", False)
+        kwargs["spaces_between_special_tokens"] = kwargs.get(
+            "spaces_between_special_tokens", False
+        )
+        return kwargs
```
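Tracing the new staticmethod on two inputs shows the intent (the expected values follow directly from the code above; running this requires `vllm` installed so the import succeeds):

```python
from lm_eval.models.vllm_causallms import VLLM

# Greedy request: do_sample=False is consumed and mapped to temperature=0.0,
# and the HF-style special-token defaults are filled in.
assert VLLM.modify_gen_kwargs({"do_sample": False}) == {
    "temperature": 0.0,
    "skip_special_tokens": False,
    "spaces_between_special_tokens": False,
}

# Sampling request: an explicit temperature passes through untouched.
assert VLLM.modify_gen_kwargs({"temperature": 0.8}) == {
    "temperature": 0.8,
    "skip_special_tokens": False,
    "spaces_between_special_tokens": False,
}
```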
lm_eval/prompts/__init__.py (+1 -1)

```diff
@@ -117,7 +117,7 @@ class PromptString:
         # TODO need a way to process doc_to_choice
         if "doc_to_choice" in self.prompt_string:
-            raise "Not yet implemented to accept doc_to_choice"
+            raise Exception("Not yet implemented to accept doc_to_choice")

         text_string = utils.apply_template(doc_to_text, doc)
         target_string = utils.apply_template(doc_to_target, doc)
```
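The one-line fix matters because Python 3 refuses to raise a bare string at all; the old line would fail with a TypeError rather than surfacing the intended message. A minimal demonstration:

```python
try:
    raise "Not yet implemented to accept doc_to_choice"  # the old line
except TypeError as e:
    print(e)  # "exceptions must derive from BaseException"

# The new line raises the intended, catchable error instead:
# raise Exception("Not yet implemented to accept doc_to_choice")
```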
lm_eval/tasks/__init__.py (+366 -232)

This file is rewritten around a new `TaskManager` class that lazily indexes task YAMLs, replacing the import-time registry machinery. The registry-based functions are removed, and `include_path` / `initialize_tasks` survive only as deprecation shims.

Imports:

```diff
 import os
-import yaml
+import abc
+import collections
+from functools import partial
 from typing import List, Union, Dict

 from lm_eval import utils
-from lm_eval import prompts
-from lm_eval.api.task import TaskConfig, Task, ConfigurableTask
-from lm_eval.api.registry import (
-    register_task,
-    register_group,
-    TASK_REGISTRY,
-    GROUP_REGISTRY,
-    ALL_TASKS,
-)
+from lm_eval.api.task import Task, ConfigurableTask

 import logging

-# import python tasks
-from .squadv2.task import SQuAD2
-from .scrolls.task import (
-    QuALITY,
-    NarrativeQA,
-    ContractNLI,
-    GovReport,
-    SummScreenFD,
-    QMSum,
-)
-
-eval_logger = utils.eval_logger
```

Removed (the old registry-based implementation):

```python
def register_configurable_task(config: Dict[str, str]) -> int:
    SubClass = type(
        config["task"] + "ConfigurableTask",
        (ConfigurableTask,),
        {"CONFIG": TaskConfig(**config)},
    )

    if "task" in config:
        task_name = "{}".format(config["task"])
        register_task(task_name)(SubClass)

    if "group" in config:
        if config["group"] == config["task"]:
            raise ValueError("task and group name cannot be the same")
        elif type(config["group"]) == str:
            group_name = [config["group"]]
        else:
            group_name = config["group"]

        for group in group_name:
            register_group(group)(SubClass)

    return 0


def register_configurable_group(config: Dict[str, str], yaml_path: str = None) -> int:
    group = config["group"]
    all_task_list = config["task"]
    config_list = [task for task in all_task_list if type(task) != str]
    task_list = [task for task in all_task_list if type(task) == str]

    for task_config in config_list:
        base_config = {}
        task_name_config = {}
        if "task" in task_config:
            task_name = task_config["task"]
            if task_name in ALL_TASKS:
                task_obj = get_task_dict(task_name)[task_name]
                if type(task_obj) == tuple:
                    _, task_obj = task_obj
                if task_obj is not None:
                    base_config = task_obj._config.to_dict(keep_callable=True)
                    task_name_config["task"] = f"{group}_{task_name}"

        task_config = utils.load_yaml_config(yaml_path, task_config)
        var_configs = check_prompt_config(
            {
                **base_config,
                **task_config,
                **{"group": group},
                **task_name_config,
            },
            yaml_path=os.path.dirname(yaml_path),
        )
        for config in var_configs:
            register_configurable_task(config)

    task_names = utils.pattern_match(task_list, ALL_TASKS)
    for task in task_names:
        if (task in TASK_REGISTRY) or (task in GROUP_REGISTRY):
            if group in GROUP_REGISTRY:
                GROUP_REGISTRY[group].append(task)
            else:
                GROUP_REGISTRY[group] = [task]
                ALL_TASKS.add(group)

    return 0


def check_prompt_config(
    config: Dict[str, str], yaml_path: str = None
) -> List[Dict[str, str]]:
    all_configs = []
    if "use_prompt" in config:
        prompt_list = prompts.load_prompt_list(
            use_prompt=config["use_prompt"],
            dataset_name=config["dataset_path"],
            subset_name=config["dataset_name"] if "dataset_name" in config else None,
            yaml_path=yaml_path,
        )
        for idx, prompt_variation in enumerate(prompt_list):
            all_configs.append(
                {
                    **config,
                    **{"use_prompt": prompt_variation},
                    **{
                        "task": "_".join(
                            [
                                config["task"]
                                if "task" in config
                                else get_task_name_from_config(config),
                                prompt_variation.split("/")[-1]
                                if ".yaml" in prompt_variation
                                else prompt_variation,
                            ]
                        )
                    },
                    **{"output_type": "generate_until"},
                }
            )
    else:
        all_configs.append(config)
    return all_configs


def include_task_folder(task_dir: str, register_task: bool = True) -> None:
    """
    Calling this function
    """
    # Track whether any tasks failed during loading
    import_fail = False
    for root, subdirs, file_list in os.walk(task_dir):
        # if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
        for f in file_list:
            if f.endswith(".yaml"):
                yaml_path = os.path.join(root, f)
                try:
                    config = utils.load_yaml_config(yaml_path)

                    if "task" not in config:
                        continue

                    all_configs = check_prompt_config(
                        config, yaml_path=os.path.dirname(yaml_path)
                    )
                    for config in all_configs:
                        if register_task:
                            if type(config["task"]) == str:
                                register_configurable_task(config)
                        else:
                            if type(config["task"]) == list:
                                register_configurable_group(config, yaml_path)

                # Log this silently and show it only when
                # the user defines the appropriate verbosity.
                except (ImportError, ModuleNotFoundError) as e:
                    import_fail = True
                    eval_logger.debug(
                        f"{yaml_path}: {e}. Config will not be added to registry."
                    )
                except Exception as error:
                    import traceback

                    eval_logger.warning(
                        "Unexpected error loading config in\n"
                        f" {yaml_path}\n"
                        " Config will not be added to registry\n"
                        f" Error: {error}\n"
                        f" Traceback: {traceback.format_exc()}"
                    )
    if import_fail:
        eval_logger.warning(
            "Some tasks could not be loaded due to missing dependencies."
            " Run with `--verbosity DEBUG` for full details."
        )
    return 0


def include_path(task_dir):
    include_task_folder(task_dir)
    # Register Benchmarks after all tasks have been added
    include_task_folder(task_dir, register_task=False)
    return 0


def initialize_tasks(verbosity="INFO"):
    eval_logger.setLevel(getattr(logging, f"{verbosity}"))

    task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
    include_path(task_dir)


def get_task(task_name, config):
    try:
        return TASK_REGISTRY[task_name](config=config)
    except KeyError:
        eval_logger.info("Available tasks:")
        eval_logger.info(list(TASK_REGISTRY) + list(GROUP_REGISTRY))
        raise KeyError(f"Missing task {task_name}")
```

Added (the new `TaskManager` implementation and deprecation shims):

```python
class TaskManager:
    """TaskManager indexes all tasks from the default `lm_eval/tasks/`
    and an optional directory if provided.

    """

    def __init__(
        self,
        verbosity="INFO",
        include_path=None,
    ) -> None:
        self.verbosity = verbosity
        self.include_path = include_path
        self.logger = utils.eval_logger
        self.logger.setLevel(getattr(logging, f"{verbosity}"))

        self._task_index = self.initialize_tasks(include_path=include_path)
        self._all_tasks = sorted(list(self._task_index.keys()))

        self.task_group_map = collections.defaultdict(list)

    def initialize_tasks(self, include_path: str = None):
        """Creates an dictionary of tasks index.

        :param include_path: str = None
            An additional path to be searched for tasks

        :return
            Dictionary of task names as key and task metadata
        """
        all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"]
        if include_path is not None:
            if isinstance(include_path, str):
                include_path = [include_path]
            all_paths.extend(include_path)

        task_index = {}
        for task_dir in all_paths:
            tasks = self._get_task_and_group(task_dir)
            task_index = {**tasks, **task_index}

        return task_index

    @property
    def all_tasks(self):
        return self._all_tasks

    @property
    def task_index(self):
        return self._task_index

    def match_tasks(self, task_list):
        return utils.pattern_match(task_list, self.all_tasks)

    def _name_is_registered(self, name):
        if name in self.all_tasks:
            return True
        return False

    def _name_is_task(self, name):
        if self._name_is_registered(name) and ("task" in self.task_index[name]["type"]):
            return True
        return False

    def _name_is_group(self, name):
        if self._name_is_registered(name) and (
            self.task_index[name]["type"] == "group"
        ):
            return True
        return False

    def _name_is_python_task(self, name):
        if self._name_is_registered(name) and (
            self.task_index[name]["type"] == "python_task"
        ):
            return True
        return False

    def _config_is_task(self, config):
        if ("task" in config) and isinstance(config["task"], str):
            return True
        return False

    def _config_is_group(self, config):
        if ("task" in config) and isinstance(config["task"], list):
            return True
        return False

    def _config_is_python_task(self, config):
        if "class" in config:
            return True
        return False

    def _get_yaml_path(self, name):
        assert name in self.task_index
        return self.task_index[name]["yaml_path"]

    def _get_config(self, name):
        assert name in self.task_index
        yaml_path = self._get_yaml_path(name)
        if yaml_path == -1:
            return {}
        else:
            return utils.load_yaml_config(yaml_path, mode="full")

    def _get_tasklist(self, name):
        assert self._name_is_task(name) == False
        return self.task_index[name]["task"]

    def _process_alias(self, config, group=None):
        # If the group is not the same as the original
        # group which the group alias was intended for,
        # Set the group_alias to None instead.
        if ("group_alias" in config) and ("group" in config) and group is not None:
            if config["group"] != group:
                config["group_alias"] = None
        return config

    def _load_individual_task_or_group(
        self,
        name_or_config: Union[str, dict] = None,
        parent_name: str = None,
        update_config: dict = None,
        yaml_path: str = None,
    ) -> ConfigurableTask:
        def load_task(config, task, group=None, yaml_path=None):
            if "include" in config:
                assert yaml_path is not None
                config.update(
                    utils.load_yaml_config(
                        yaml_path,
                        yaml_config={"include": config.pop("include")},
                        mode="full",
                    )
                )
            if self._config_is_python_task(config):
                task_object = config["class"]()
            else:
                config = self._process_alias(config, group=group)
                task_object = ConfigurableTask(config=config)
            if group is not None:
                task_object = (group, task_object)
            return {task: task_object}

        if isinstance(name_or_config, str):
            if update_config is not None:
                # Process name_or_config as a dict instead
                name_or_config = {"task": name_or_config, **update_config}
            elif self._name_is_task(name_or_config):
                task_config = self._get_config(name_or_config)
                return load_task(task_config, task=name_or_config, group=parent_name)
            else:
                group_name = name_or_config
                subtask_list = self._get_tasklist(name_or_config)
                if subtask_list == -1:
                    group_config = self._get_config(name_or_config)
                    subtask_list = group_config["task"]

                # This checks if we're at the root.
                if parent_name is None:
                    group_config = self._get_config(name_or_config)
                    if set(group_config.keys()) > set(["task", "group"]):
                        update_config = {
                            k: v
                            for k, v in group_config.items()
                            if k not in ["task", "group"]
                        }
                    yaml_path = self._get_yaml_path(group_name)

                    if (update_config is not None) and (
                        "group_alias" in update_config
                    ):
                        group_name = update_config["group_alias"]
                        update_config.pop("group_alias")

        if isinstance(name_or_config, dict):
            if update_config is not None:
                name_or_config = {
                    **name_or_config,
                    **update_config,
                }

            if self._config_is_task(name_or_config):
                name = name_or_config["task"]
                # If the name is registered as a group
                # if self._name_is_task(name) is False:
                if self._name_is_group(name):
                    group_name = name
                    update_config = {
                        k: v for k, v in name_or_config.items() if k != "task"
                    }
                    subtask_list = self._get_tasklist(name)
                    if subtask_list == -1:
                        subtask_list = self._get_config(name)["task"]
                else:
                    if self._name_is_registered(name):
                        base_task_config = self._get_config(name)

                        # Check if this is a duplicate.
                        if parent_name is not None:
                            name_or_config["group"] = parent_name
                            num_duplicate = len(
                                list(
                                    filter(
                                        lambda x: x.startswith(name),
                                        self.task_group_map[parent_name],
                                    )
                                )
                            )
                            if num_duplicate > 0:
                                name = f"{name}-{num_duplicate}"
                            self.task_group_map[parent_name].append(name)

                        task_config = {
                            **base_task_config,
                            **name_or_config,
                        }
                    else:
                        task_config = name_or_config
                    return load_task(
                        task_config, task=name, group=parent_name, yaml_path=yaml_path
                    )
            else:
                group_name = name_or_config["group"]
                subtask_list = name_or_config["task"]
                # update_config = {k:v for k,v in name_or_config.items() if k != "task"}
                if set(name_or_config.keys()) > set(["task", "group"]):
                    update_config = {
                        k: v
                        for k, v in name_or_config.items()
                        if k not in ["task", "group"]
                    }

        all_subtasks = {}
        if parent_name is not None:
            all_subtasks = {group_name: (parent_name, None)}

        fn = partial(
            self._load_individual_task_or_group,
            parent_name=group_name,
            update_config=update_config,
            yaml_path=yaml_path,
        )
        all_subtasks = {
            **all_subtasks,
            **dict(collections.ChainMap(*map(fn, subtask_list))),
        }
        return all_subtasks

    def load_task_or_group(self, task_list: Union[str, list] = None) -> dict:
        """Loads a dictionary of task objects from a list

        :param task_list: Union[str, list] = None
            Single string or list of string of task names to be loaded

        :return
            Dictionary of task objects
        """
        if isinstance(task_list, str):
            task_list = [task_list]

        all_loaded_tasks = dict(
            collections.ChainMap(*map(self._load_individual_task_or_group, task_list))
        )
        return all_loaded_tasks

    def load_config(self, config: Dict):
        return self._load_individual_task_or_group(config)

    def _get_task_and_group(self, task_dir: str):
        """Creates an dictionary of tasks index with the following metadata,
        - `type`, that can be either `task`, `python_task`, or `group`.
            `task` refer to regular task configs, `python_task` are special
            yaml files that only consists of `task` and `class` parameters.
            `group` are group configs.
        - `yaml_path`, path to the yaml file. If the entry is a `group` that
            was configured through a task config, the yaml_path will be -1
            and all subtasks will be listed in `task` (see below)
        - `task`, reserved for entries with `type` as `group`. This will list
            all subtasks. When a group config is created (as opposed to task
            config having `group` parameter set), this will be set to -1 to
            avoid recursive indexing. The whole list of subtasks will be loaded
            at evaluation.

        :param task_dir: str
            A directory to check for tasks

        :return
            Dictionary of task names as key and task metadata
        """
        tasks_and_groups = collections.defaultdict()
        for root, _, file_list in os.walk(task_dir):
            for f in file_list:
                if f.endswith(".yaml"):
                    yaml_path = os.path.join(root, f)
                    config = utils.load_yaml_config(yaml_path, mode="simple")
                    if self._config_is_python_task(config):
                        # This is a python class config
                        tasks_and_groups[config["task"]] = {
                            "type": "python_task",
                            "yaml_path": yaml_path,
                        }
                    elif self._config_is_group(config):
                        # This is a group config
                        tasks_and_groups[config["group"]] = {
                            "type": "group",
                            "task": -1,  # This signals that
                            # we don't need to know
                            # the task list for indexing
                            # as it can be loaded
                            # when called.
                            "yaml_path": yaml_path,
                        }

                        # # Registered the level 1 tasks from a group config
                        # for config in config["task"]:
                        #     if isinstance(config, dict) and self._config_is_task(config):
                        #         task = config["task"]
                        #         tasks_and_groups[task] = {
                        #             "type": "task",
                        #             "yaml_path": yaml_path,
                        #         }

                    elif self._config_is_task(config):
                        # This is a task config
                        task = config["task"]
                        tasks_and_groups[task] = {
                            "type": "task",
                            "yaml_path": yaml_path,
                        }

                        if "group" in config:
                            groups = config["group"]
                            if isinstance(config["group"], str):
                                groups = [groups]

                            for group in groups:
                                if group not in tasks_and_groups:
                                    tasks_and_groups[group] = {
                                        "type": "group",
                                        "task": [task],
                                        "yaml_path": -1,
                                    }
                                else:
                                    tasks_and_groups[group]["task"].append(task)
                    else:
                        self.logger.debug(f"File {f} in {root} could not be loaded")

        return tasks_and_groups


def include_path(task_dir):
    logger = utils.eval_logger
    logger.setLevel(getattr(logging, "INFO"))
    logger.info(
        "To still use tasks loaded from args.include_path,"
        "see an example of the new TaskManager API in https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
    )
    return 0


def initialize_tasks(verbosity="INFO"):
    logger = utils.eval_logger
    logger.setLevel(getattr(logging, f"{verbosity}"))
    logger.info(
        "lm_eval.tasks.initialize_tasks() is deprecated and no longer necessary. "
        "It will be removed in v0.4.2 release. "
        "TaskManager will instead be used."
    )
    return 0
```

The module-level helpers change accordingly:

```diff
@@ -234,54 +382,40 @@ def get_task_name_from_object(task_object):
 def get_task_name_from_config(task_config: Dict[str, str]) -> str:
+    if "task" in task_config:
+        return task_config["task"]
     if "dataset_name" in task_config:
         return "{dataset_path}_{dataset_name}".format(**task_config)
     else:
         return "{dataset_path}".format(**task_config)


 def get_task_name_from_object(task_object):
-    for name, class_ in TASK_REGISTRY.items():
-        if class_ is task_object:
-            return name
+    if hasattr(task_object, "config"):
+        return task_object._config["task"]

     # TODO: scrap this
     # this gives a mechanism for non-registered tasks to have a custom name anyways when reporting
...
         else type(task_object).__name__
     )


-def get_task_dict(task_name_list: List[Union[str, Dict, Task]], **kwargs):
-    # TODO: pass num_fewshot and other cmdline overrides in a better way
-    config = {**kwargs}
-
-    task_name_from_registry_dict = {}
+def get_task_dict(
+    task_name_list: List[Union[str, Dict, Task]], task_manager: TaskManager = None
+):
+    """Creates a dictionary of task objects from either a name of task, config, or prepared Task object.
+
+    :param task_name_list: List[Union[str, Dict, Task]]
+        Name of model or LM object, see lm_eval.models.get_model
+    :param task_manager: TaskManager = None
+        A TaskManager object that stores indexed tasks. If not set,
+        task_manager will load one. This should be set by the user
+        if there are additional paths that want to be included
+        via `include_path`
+
+    :return
+        Dictionary of task objects
+    """
+    task_name_from_string_dict = {}
     task_name_from_config_dict = {}
     task_name_from_object_dict = {}

-    if type(task_name_list) != list:
+    if isinstance(task_name_list, str):
         task_name_list = [task_name_list]

-    for task_element in task_name_list:
-        if isinstance(task_element, str):
-            if task_element in GROUP_REGISTRY:
-                group_name = task_element
-                for task_name in GROUP_REGISTRY[task_element]:
-                    if task_name not in task_name_from_registry_dict:
-                        task_obj = get_task_dict(task_name)
-                        if task_name in task_obj.keys():
-                            task_dict = {
-                                task_name: (group_name, task_obj[task_name]),
-                            }
-                        else:
-                            task_dict = {
-                                task_name: (group_name, None),
-                                **task_obj,
-                            }
-
-                        task_name_from_registry_dict = {
-                            **task_name_from_registry_dict,
-                            **task_dict,
-                        }
-            else:
-                task_name = task_element
-                if task_name not in task_name_from_registry_dict:
-                    task_name_from_registry_dict = {
-                        **task_name_from_registry_dict,
-                        task_name: get_task(task_name=task_element, config=config),
-                    }
-
-        elif isinstance(task_element, dict):
-            task_element.update(config)
+    string_task_name_list = [task for task in task_name_list if isinstance(task, str)]
+    others_task_name_list = [task for task in task_name_list if ~isinstance(task, str)]
+    if len(string_task_name_list) > 0:
+        if task_manager is None:
+            task_manager = TaskManager()
+
+        task_name_from_string_dict = task_manager.load_task_or_group(
+            string_task_name_list
+        )
+
+    for task_element in others_task_name_list:
+        if isinstance(task_element, dict):
             task_name_from_config_dict = {
                 **task_name_from_config_dict,
-                get_task_name_from_config(task_element): ConfigurableTask(
-                    config=task_element
-                ),
+                **task_manager.load_config(config=task_element),
             }

         elif isinstance(task_element, Task):
...
@@ -290,11 +424,11 @@ def get_task_dict(task_name_list: List[Union[str, Dict, Task]], **kwargs):
             get_task_name_from_object(task_element): task_element,
         }

-    assert set(task_name_from_registry_dict.keys()).isdisjoint(
+    assert set(task_name_from_string_dict.keys()).isdisjoint(
         set(task_name_from_object_dict.keys())
     )
     return {
-        **task_name_from_registry_dict,
+        **task_name_from_string_dict,
         **task_name_from_config_dict,
         **task_name_from_object_dict,
     }
```
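A sketch of the loading flow this rewrite enables (the extra task directory is hypothetical):

```python
from lm_eval.tasks import TaskManager, get_task_dict

# Index the built-in tasks plus an optional user directory of YAML configs.
task_manager = TaskManager(verbosity="INFO", include_path="/path/to/my/tasks")

# Resolve task or group names to task objects in one call.
task_dict = get_task_dict(["arc_easy"], task_manager=task_manager)
```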
lm_eval/tasks/arc/arc_easy.yaml (+1 -1)

```diff
 group:
   - ai2_arc
 task: arc_easy
-dataset_path: ai2_arc
+dataset_path: allenai/ai2_arc
 dataset_name: ARC-Easy
 output_type: multiple_choice
 training_split: train
```
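The `dataset_path` change tracks the dataset's namespaced id on the Hugging Face Hub. A quick check with `datasets` (assumes network access; the legacy id `ai2_arc` still resolved at the time, but the namespaced form is canonical):

```python
from datasets import load_dataset

ds = load_dataset("allenai/ai2_arc", "ARC-Easy", split="train")
print(ds[0]["question"])
```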
lm_eval/tasks/bbh/_generate_configs.py (+2 -2)

```diff
@@ -28,7 +28,7 @@ if __name__ == "__main__":
     # get filename of base_yaml so we can `"include": ` it in our other YAMLs.
     base_yaml_name = os.path.split(args.base_yaml_path)[-1]
-    with open(args.base_yaml_path) as f:
+    with open(args.base_yaml_path, encoding="utf-8") as f:
         base_yaml = yaml.full_load(f)

     base_doc_to_text = "Q: {{input}}\nA:"
@@ -70,7 +70,7 @@ if __name__ == "__main__":
         file_save_path = args.save_prefix_path + f"/{task}.yaml"
         utils.eval_logger.info(f"Saving yaml for subset {task} to {file_save_path}")
-        with open(file_save_path, "w") as yaml_file:
+        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
             yaml.dump(
                 yaml_dict,
                 yaml_file,
```
lm_eval/tasks/bbh/cot_fewshot/_cot_fewshot_template_yaml (+1 -0)

```diff
@@ -28,3 +28,4 @@ filter_list:
 num_fewshot: 0
 metadata:
   version: 2.0
+  num_fewshot: 3 # controls what is printed in n-shot
```
lm_eval/tasks/bbh/fewshot/_fewshot_template_yaml (+1 -0)

```diff
@@ -19,3 +19,4 @@ generation_kwargs:
 num_fewshot: 0
 metadata:
   version: 1.0
+  num_fewshot: 3 # will be printed in results table
```
lm_eval/tasks/belebele/_generate_configs.py (+4 -4)

```diff
@@ -27,13 +27,13 @@ if __name__ == "__main__":
     # get filename of base_yaml so we can `"include": ` it in our other YAMLs.
     base_yaml_name = os.path.split(args.base_yaml_path)[-1]
-    with open(args.base_yaml_path) as f:
+    with open(args.base_yaml_path, encoding="utf-8") as f:
         base_yaml = yaml.full_load(f)

     if args.cot_prompt_path is not None:
         import json

-        with open(args.cot_prompt_path) as f:
+        with open(args.cot_prompt_path, encoding="utf-8") as f:
             cot_file = json.load(f)

     def query():
@@ -42,7 +42,7 @@ if __name__ == "__main__":
     print(query())
     languages = [split["split"] for split in query()]

-    for lang in tqdm(languages):
+    for lang in tqdm([lang for lang in languages if "default" not in lang]):
         yaml_dict = {
             "include": base_yaml_name,
             "task": f"belebele_{args.task_prefix}_{lang}"
@@ -54,7 +54,7 @@ if __name__ == "__main__":
         file_save_path = args.save_prefix_path + f"_{lang}.yaml"
         logging.info(f"Saving yaml for subset {lang} to {file_save_path}")
-        with open(file_save_path, "w") as yaml_file:
+        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
             yaml.dump(
                 yaml_dict,
                 yaml_file,
```
lm_eval/tasks/belebele/belebele_default.yaml (deleted, 100644 → 0)

```yaml
"fewshot_split": "default"
"include": "_default_template_yaml"
"task": "belebele_default"
"test_split": "default"
```
lm_eval/tasks/benchmarks/flan/yaml_templates/held_in_template_yaml → lm_eval/tasks/benchmarks/flan/_held_in_template_yaml (renamed, +2 -1)

```diff
 output_type: generate_until
-validation_split: validation
+test_split: null
+doc_to_choice: null
 metric_list:
   - metric: exact_match
     aggregation: mean
```
lm_eval/tasks/benchmarks/flan/flan_anli.yaml (deleted, 100644 → 0)

```yaml
group: flan_anli
task:
  - include: yaml_templates/held_in_template_yaml
    task: anli_r1
    dataset_path: anli
    use_prompt: prompt_templates/anli.yaml:*
    validation_split: dev_r1
  - include: yaml_templates/held_in_template_yaml
    task: anli_r2
    dataset_path: anli
    use_prompt: prompt_templates/anli.yaml:*
    validation_split: dev_r2
  - include: yaml_templates/held_in_template_yaml
    task: anli_r3
    dataset_path: anli
    use_prompt: prompt_templates/anli.yaml:*
    validation_split: dev_r3
```
lm_eval/tasks/benchmarks/flan/flan_arc.yaml (deleted, 100644 → 0)

```yaml
group: flan_arc
task:
  - include: yaml_templates/held_in_template_yaml
    task: arc_easy
    dataset_path: ai2_arc
    dataset_name: ARC-Easy
    use_prompt: prompt_templates/arc.yaml:*
    validation_split: validation
  - include: yaml_templates/held_in_template_yaml
    task: arc_challenge
    dataset_path: ai2_arc
    dataset_name: ARC-Challenge
    use_prompt: prompt_templates/arc.yaml:*
    validation_split: validation
```
lm_eval/tasks/benchmarks/flan/flan_boolq.yaml (deleted, 100644 → 0)

```yaml
group: flan_boolq
task:
  - include: yaml_templates/held_in_template_yaml
    dataset_path: super_glue
    dataset_name: boolq
    use_prompt: prompt_templates/boolq.yaml:*
    validation_split: validation
```
lm_eval/tasks/benchmarks/flan/flan_cot.yaml (deleted, 100644 → 0)

```yaml
group: flan_cot
task:
  - include: yaml_templates/cot_template_yaml
    dataset_path: gsmk
    dataset_name: boolq
    use_prompt: promptsource:*
    validation_split: validation
  - include: yaml_templates/cot_template_yaml
    dataset_path: EleutherAI/asdiv
    use_prompt: promptsource:*
    validation_split: validation
```
lm_eval/tasks/benchmarks/flan/flan_held_in.yaml (+329 -4)

The old group body aggregated four include-based groups, all deleted in this commit (`- flan_boolq`, `- flan_rte`, `- flan_anli`, `- flan_arc`). It now inlines one subgroup per dataset, with one subtask per FLAN prompt template:

```yaml
group: flan_held_in
group_alias: Flan (Held-In)
task:
  # ANLI R1
  - group: anli_r1_flan
    group_alias: ANLI R1
    task:
      - task: anli_r1
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-7
        include: _held_in_template_yaml
        doc_to_text: "Can we draw the following hypothesis from the context (see options)?\n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
      - task: anli_r1
        task_alias: prompt-8
        include: _held_in_template_yaml
        doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
        doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
```

The `anli_r2_flan` (group_alias "ANLI R2") and `anli_r3_flan` ("ANLI R3") subgroups follow, repeating the same nine prompts verbatim with `task: anli_r2` and `task: anli_r3` respectively. Then:

```yaml
  # Arc Easy
  - group: arc_easy_flan
    group_alias: Arc Easy
    task:
      - task: arc_easy
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
      - task: arc_easy
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
        doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
```

An `arc_challenge_flan` ("Arc Challenge") subgroup repeats the same seven prompts with `task: arc_challenge`, followed by:

```yaml
  # BoolQ
  - group: boolq_flan
    group_alias: BoolQ
    task:
      - task: boolq
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\nCan we conclude that {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\nIs it true that {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\n{{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "Text: {{passage}}\n\nQuestion: {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\nWhat's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\nBased on the above text what's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\nAnswer this question making sure that the answer is supposed by the text: {{question}}?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-7
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-8
        include: _held_in_template_yaml
        doc_to_text: "{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
      - task: boolq
        task_alias: prompt-9
        include: _held_in_template_yaml
        doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{passage}}\n\nOPTIONS:\n- no\n- yes"
        doc_to_target: "{{['no', 'yes'][label]}}"
  # RTE
  - group: rte_flan
    group_alias: RTE
    task:
      - task: rte
        task_alias: prompt-0
        include: _held_in_template_yaml
        doc_to_text: "{{sentence1}}\n\nQuestion with options: Based on the paragraph above can we conclude that \"{{sentence2}}\"?\n\nOPTIONS:\n- yes\n- no"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-1
        include: _held_in_template_yaml
        doc_to_text: "{{sentence1}}\n\nBased on that paragraph can we conclude that the sentence below is true?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-2
        include: _held_in_template_yaml
        doc_to_text: "{{sentence1}}\n\nQ with options: Can we draw the following conclusion?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-3
        include: _held_in_template_yaml
        doc_to_text: "{{sentence1}}\nDoes this next sentence follow, given the preceding text?\n{{sentence2}}\n\nOPTIONS:\n- yes\n- no"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-4
        include: _held_in_template_yaml
        doc_to_text: "{{sentence1}}\nOPTIONS:\n- yes\n- no\nQuestion: Can we infer the following?\n{{sentence2}}"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-5
        include: _held_in_template_yaml
        doc_to_text: "Read the following paragraph and determine if the hypothesis is true. Select from options at the end:\n\n{{sentence1}}\n\nHypothesis: {{sentence2}}\nOPTIONS:\n- yes\n- no\nThe answer is"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-6
        include: _held_in_template_yaml
        doc_to_text: "Read the text and determine if the sentence is true:\n\n{{sentence1}}\n\nSentence: {{sentence2}}\nOPTIONS:\n- yes\n- no\nA:"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-7
        include: _held_in_template_yaml
        doc_to_text: "Question with options: can we draw the following hypothesis from the context?\n\nContext:\n\n{{sentence1}}\n\nHypothesis: {{sentence2}}\nOPTIONS:\n- yes\n- no\nA:"
        doc_to_target: "{{['yes', 'no'][label]}}"
      - task: rte
        task_alias: prompt-8
        include: _held_in_template_yaml
        doc_to_text: "Determine if the sentence is true based on the text below. Choose from options.\n{{sentence2}}\n\n{{sentence1}}\nOPTIONS:\n- yes\n- no"
        doc_to_target: "{{['yes', 'no'][label]}}"
```
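To see what one of the inlined prompts produces, the `doc_to_text` / `doc_to_target` pair from `arc_easy` prompt-0 can be rendered with `jinja2` directly (the harness applies these templates internally; the sample doc below is made up):

```python
from jinja2 import Template

doc = {
    "question": "Which gas do plants absorb from the air?",
    "choices": {"text": ["Oxygen", "Carbon dioxide"], "label": ["A", "B"]},
    "answerKey": "B",
}

doc_to_text = "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
doc_to_target = "{{choices.text[choices.label.index(answerKey)]}}"

print(Template(doc_to_text).render(**doc))    # the question plus a bulleted OPTIONS list
print(Template(doc_to_target).render(**doc))  # Carbon dioxide
```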
lm_eval/tasks/benchmarks/flan/flan_held_in_yaml (deleted, 100644 → 0)

```yaml
group: flan_held_in
task:
  - include: flan/yaml_templates/held_in_template_yaml
    dataset_path: super_glue
    dataset_name: boolq
    use_prompt: flan/prompt_templates/boolq.yaml:*
    validation_split: validation
  - include: flan/yaml_templates/held_in_template_yaml
    dataset_path: super_glue
    dataset_name: rte
    use_prompt: flan/prompt_templates/rte.yaml:*
    validation_split: validation
  - include: flan/yaml_templates/held_in_template_yaml
    task: anli_r1
    dataset_path: anli
    use_prompt: flan/prompt_templates/anli.yaml:*
    validation_split: dev_r1
  - include: flan/yaml_templates/held_in_template_yaml
    task: anli_r2
    dataset_path: anli
    use_prompt: flan/prompt_templates/anli.yaml:*
    validation_split: dev_r2
  - include: flan/yaml_templates/held_in_template_yaml
    task: anli_r3
    dataset_path: anli
    use_prompt: flan/prompt_templates/anli.yaml:*
    validation_split: dev_r3
  - include: flan/yaml_templates/held_in_template_yaml
    task: arc_easy
    dataset_path: ai2_arc
    dataset_name: ARC-Easy
    use_prompt: flan/prompt_templates/arc.yaml:*
    validation_split: validation
  - include: flan/yaml_templates/held_in_template_yaml
    task: arc_challenge
    dataset_path: ai2_arc
    dataset_name: ARC-Challenge
    use_prompt: flan/prompt_templates/arc.yaml:*
    validation_split: validation
```
lm_eval/tasks/benchmarks/flan/flan_held_out.yaml (+4 -4)

```diff
 group: flan_held_out
 task:
   # BBH
-  - bbh_flan_zeroshot
-  - bbh_flan_fewshot
-  - bbh_flan_cot_fewshot
-  - bbh_flan_cot_zeroshot
+  - bbh_zeroshot
+  - bbh_fewshot
+  - bbh_cot_fewshot
+  - bbh_cot_zeroshot
   # MMLU
   - mmlu
   - mmlu_flan_n_shot_generative
```
lm_eval/tasks/benchmarks/flan/flan_rte.yaml (deleted, 100644 → 0)

```yaml
group: flan_rte
task:
  - include: yaml_templates/held_in_template_yaml
    dataset_path: super_glue
    dataset_name: rte
    use_prompt: prompt_templates/rte.yaml:*
    validation_split: validation
```