Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
17396935
Commit
17396935
authored
Aug 05, 2024
by
lintangsutawika
Browse files
merged
parents
cd8642e7
458342e2
Changes
23
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
235 additions
and
139 deletions
+235
-139
lm_eval/api/samplers.py
lm_eval/api/samplers.py
+46
-3
lm_eval/api/task.py
lm_eval/api/task.py
+9
-3
lm_eval/evaluator.py
lm_eval/evaluator.py
+72
-0
lm_eval/tasks/mmlu_pro/README.md
lm_eval/tasks/mmlu_pro/README.md
+15
-10
lm_eval/tasks/mmlu_pro/_default_template_yaml
lm_eval/tasks/mmlu_pro/_default_template_yaml
+33
-0
lm_eval/tasks/mmlu_pro/_generate_configs.py
lm_eval/tasks/mmlu_pro/_generate_configs.py
+0
-114
lm_eval/tasks/mmlu_pro/default/mmlu_pro_health.yaml
lm_eval/tasks/mmlu_pro/default/mmlu_pro_health.yaml
+0
-8
lm_eval/tasks/mmlu_pro/flan_cot_fewshot/_cot_prompts.json
lm_eval/tasks/mmlu_pro/flan_cot_fewshot/_cot_prompts.json
+0
-1
lm_eval/tasks/mmlu_pro/mmlu_pro_biology.yaml
lm_eval/tasks/mmlu_pro/mmlu_pro_biology.yaml
+5
-0
lm_eval/tasks/mmlu_pro/mmlu_pro_business.yaml
lm_eval/tasks/mmlu_pro/mmlu_pro_business.yaml
+5
-0
lm_eval/tasks/mmlu_pro/mmlu_pro_chemistry.yaml
lm_eval/tasks/mmlu_pro/mmlu_pro_chemistry.yaml
+5
-0
lm_eval/tasks/mmlu_pro/mmlu_pro_computer_science.yaml
lm_eval/tasks/mmlu_pro/mmlu_pro_computer_science.yaml
+5
-0
lm_eval/tasks/mmlu_pro/mmlu_pro_economics.yaml
lm_eval/tasks/mmlu_pro/mmlu_pro_economics.yaml
+5
-0
lm_eval/tasks/mmlu_pro/mmlu_pro_engineering.yaml
lm_eval/tasks/mmlu_pro/mmlu_pro_engineering.yaml
+5
-0
lm_eval/tasks/mmlu_pro/mmlu_pro_health.yaml
lm_eval/tasks/mmlu_pro/mmlu_pro_health.yaml
+5
-0
lm_eval/tasks/mmlu_pro/mmlu_pro_history.yaml
lm_eval/tasks/mmlu_pro/mmlu_pro_history.yaml
+5
-0
lm_eval/tasks/mmlu_pro/mmlu_pro_law.yaml
lm_eval/tasks/mmlu_pro/mmlu_pro_law.yaml
+5
-0
lm_eval/tasks/mmlu_pro/mmlu_pro_math.yaml
lm_eval/tasks/mmlu_pro/mmlu_pro_math.yaml
+5
-0
lm_eval/tasks/mmlu_pro/mmlu_pro_other.yaml
lm_eval/tasks/mmlu_pro/mmlu_pro_other.yaml
+5
-0
lm_eval/tasks/mmlu_pro/mmlu_pro_philosophy.yaml
lm_eval/tasks/mmlu_pro/mmlu_pro_philosophy.yaml
+5
-0
No files found.
lm_eval/api/samplers.py
View file @
17396935
from
functools
import
partial
import
datasets
import
datasets
...
@@ -15,8 +17,37 @@ class ContextSampler:
...
@@ -15,8 +17,37 @@ class ContextSampler:
self
.
target_delimiter
=
self
.
config
.
target_delimiter
self
.
target_delimiter
=
self
.
config
.
target_delimiter
self
.
fewshot_delimiter
=
self
.
config
.
fewshot_delimiter
self
.
fewshot_delimiter
=
self
.
config
.
fewshot_delimiter
if
(
self
.
config
.
fewshot_config
is
not
None
and
self
.
config
.
fewshot_config
.
get
(
"doc_to_text"
,
None
)
is
not
None
):
self
.
doc_to_text
=
partial
(
self
.
task
.
doc_to_text
,
doc_to_text
=
self
.
config
.
fewshot_config
.
get
(
"doc_to_text"
,
None
),
)
else
:
self
.
doc_to_text
=
self
.
task
.
doc_to_text
self
.
doc_to_text
=
self
.
task
.
doc_to_text
if
(
self
.
config
.
fewshot_config
is
not
None
and
self
.
config
.
fewshot_config
.
get
(
"doc_to_target"
,
None
)
is
not
None
):
self
.
doc_to_target
=
partial
(
self
.
task
.
doc_to_target
,
doc_to_target
=
self
.
config
.
fewshot_config
.
get
(
"doc_to_target"
,
None
),
)
else
:
self
.
doc_to_target
=
self
.
task
.
doc_to_target
self
.
doc_to_target
=
self
.
task
.
doc_to_target
if
(
self
.
config
.
fewshot_config
is
not
None
and
self
.
config
.
fewshot_config
.
get
(
"doc_to_choice"
,
None
)
is
not
None
):
self
.
doc_to_choice
=
partial
(
self
.
task
.
doc_to_choice
,
doc_to_choice
=
self
.
config
.
fewshot_config
.
get
(
"doc_to_choice"
,
None
),
)
else
:
self
.
doc_to_choice
=
self
.
task
.
doc_to_choice
self
.
doc_to_choice
=
self
.
task
.
doc_to_choice
self
.
docs
=
docs
# HF dataset split, provided by task._fewshot_docs()
self
.
docs
=
docs
# HF dataset split, provided by task._fewshot_docs()
...
@@ -52,6 +83,7 @@ class ContextSampler:
...
@@ -52,6 +83,7 @@ class ContextSampler:
else
self
.
doc_to_choice
(
doc
)[
doc_content
]
else
self
.
doc_to_choice
(
doc
)[
doc_content
]
)
)
labeled_examples
+=
self
.
target_delimiter
labeled_examples
+=
self
.
target_delimiter
<<<<<<<
HEAD
labeled_examples
+=
(
labeled_examples
+=
(
str
(
doc_target
[
0
])
str
(
doc_target
[
0
])
if
isinstance
(
doc_target
,
list
)
if
isinstance
(
doc_target
,
list
)
...
@@ -60,6 +92,17 @@ class ContextSampler:
...
@@ -60,6 +92,17 @@ class ContextSampler:
else
str
(
self
.
doc_to_choice
(
doc
)[
doc_target
])
else
str
(
self
.
doc_to_choice
(
doc
)[
doc_target
])
)
)
labeled_examples
+=
self
.
fewshot_delimiter
labeled_examples
+=
self
.
fewshot_delimiter
=======
if
doc_target
!=
""
:
labeled_examples
+=
(
str
(
doc_target
[
0
])
if
isinstance
(
doc_target
,
list
)
else
doc_target
if
self
.
config
.
doc_to_choice
is
None
or
isinstance
(
doc_target
,
str
)
else
str
(
self
.
doc_to_choice
(
doc
)[
doc_target
])
)
labeled_examples
+=
self
.
fewshot_delimiter
>>>>>>>
mmlu
-
pro
-
changes
return
labeled_examples
return
labeled_examples
...
...
lm_eval/api/task.py
View file @
17396935
...
@@ -1171,9 +1171,11 @@ class ConfigurableTask(Task):
...
@@ -1171,9 +1171,11 @@ class ConfigurableTask(Task):
"""
"""
return
doc
return
doc
def
doc_to_text
(
self
,
doc
):
def
doc_to_text
(
self
,
doc
,
doc_to_text
=
None
):
if
self
.
prompt
is
not
None
:
if
self
.
prompt
is
not
None
:
doc_to_text
=
self
.
prompt
doc_to_text
=
self
.
prompt
elif
doc_to_text
is
not
None
:
doc_to_text
=
doc_to_text
else
:
else
:
doc_to_text
=
self
.
config
.
doc_to_text
doc_to_text
=
self
.
config
.
doc_to_text
...
@@ -1205,9 +1207,11 @@ class ConfigurableTask(Task):
...
@@ -1205,9 +1207,11 @@ class ConfigurableTask(Task):
print
(
type
(
doc_to_text
))
print
(
type
(
doc_to_text
))
raise
TypeError
raise
TypeError
def
doc_to_target
(
self
,
doc
:
Mapping
)
->
Union
[
int
,
str
,
list
]:
def
doc_to_target
(
self
,
doc
:
Mapping
,
doc_to_target
=
None
)
->
Union
[
int
,
str
,
list
]:
if
self
.
prompt
is
not
None
:
if
self
.
prompt
is
not
None
:
doc_to_target
=
self
.
prompt
doc_to_target
=
self
.
prompt
elif
doc_to_target
is
not
None
:
doc_to_target
=
doc_to_target
else
:
else
:
doc_to_target
=
self
.
config
.
doc_to_target
doc_to_target
=
self
.
config
.
doc_to_target
...
@@ -1249,9 +1253,11 @@ class ConfigurableTask(Task):
...
@@ -1249,9 +1253,11 @@ class ConfigurableTask(Task):
else
:
else
:
raise
TypeError
raise
TypeError
def
doc_to_choice
(
self
,
doc
:
Any
)
->
List
[
str
]:
def
doc_to_choice
(
self
,
doc
:
Any
,
doc_to_choice
=
None
)
->
List
[
str
]:
if
self
.
prompt
is
not
None
:
if
self
.
prompt
is
not
None
:
doc_to_choice
=
self
.
prompt
doc_to_choice
=
self
.
prompt
elif
doc_to_choice
is
not
None
:
doc_to_choice
=
doc_to_choice
elif
self
.
config
.
doc_to_choice
is
None
:
elif
self
.
config
.
doc_to_choice
is
None
:
eval_logger
.
error
(
"doc_to_choice was called but not set in config"
)
eval_logger
.
error
(
"doc_to_choice was called but not set in config"
)
else
:
else
:
...
...
lm_eval/evaluator.py
View file @
17396935
...
@@ -607,6 +607,78 @@ def evaluate(
...
@@ -607,6 +607,78 @@ def evaluate(
_higher_is_better
[
m
]
=
None
_higher_is_better
[
m
]
=
None
higher_is_better
[
group
]
=
_higher_is_better
higher_is_better
[
group
]
=
_higher_is_better
<<<<<<<
HEAD
=======
# collect all metric keys used by a subtask in the group.
metric_list
=
list
(
{
key
for
task
in
task_list
for
key
in
results
[
task
].
keys
()
if
"_stderr"
not
in
key
and
key
not
in
[
"alias"
,
"samples"
]
}
)
for
metric
in
metric_list
:
stderr
=
"_stderr,"
.
join
(
metric
.
split
(
","
))
# gather metrics, sizes, and stderrs from subtasks
metrics
=
[
results
[
task
][
metric
]
for
task
in
task_list
if
metric
in
results
[
task
]
]
# TODO: copy?
stderrs
=
[
results
[
task
][
stderr
]
for
task
in
task_list
if
stderr
in
results
[
task
]
]
sizes
=
[
results
[
task
][
"samples"
]
for
task
in
task_list
if
metric
in
results
[
task
]
]
# compute group's pooled metric and stderr
results
[
group
][
metric
]
=
(
lm_eval
.
api
.
metrics
.
aggregate_subtask_metrics
(
metrics
,
sizes
)
)
# TODO: calculate grouped metric using aggregation fn
if
"N/A"
in
stderrs
:
results
[
group
][
stderr
]
=
"N/A"
else
:
results
[
group
][
stderr
]
=
(
lm_eval
.
api
.
metrics
.
pooled_sample_stderr
(
stderrs
,
sizes
)
)
# TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
# To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
# results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
results
[
group
][
"samples"
]
=
sum
(
sizes
)
results_agg
=
defaultdict
(
dict
)
groups_agg
=
defaultdict
(
dict
)
all_tasks_list
=
list
(
task_hierarchy
.
keys
())
while
True
:
add_tasks_list
=
list
(
k
for
k
in
results_agg
.
keys
())
left_tasks_list
=
sorted
(
list
(
set
(
all_tasks_list
)
-
set
(
add_tasks_list
)))
if
len
(
left_tasks_list
)
==
0
:
break
_task_hierarchy
=
{
k
:
v
for
k
,
v
in
task_hierarchy
.
items
()
if
k
in
left_tasks_list
}
_results_agg
,
_groups_agg
=
prepare_print_tasks
(
_task_hierarchy
,
results
)
results_agg
=
{
**
results_agg
,
**
_results_agg
}
groups_agg
=
{
**
groups_agg
,
**
_groups_agg
}
for
group_name
,
task_list
in
task_hierarchy
.
items
():
if
task_list
:
num_fewshot
[
group_name
]
=
num_fewshot
[
task_list
[
0
]
]
# TODO: validate this
>>>>>>>
mmlu
-
pro
-
changes
results_dict
=
{
results_dict
=
{
"results"
:
dict
(
results_agg
.
items
()),
"results"
:
dict
(
results_agg
.
items
()),
**
(
**
(
...
...
lm_eval/tasks/mmlu_pro/README.md
View file @
17396935
...
@@ -31,19 +31,24 @@ Homepage (preprocessed): https://huggingface.co/datasets/sjyuxyz/MMLU-Pro-with-s
...
@@ -31,19 +31,24 @@ Homepage (preprocessed): https://huggingface.co/datasets/sjyuxyz/MMLU-Pro-with-s
#### Groups
#### Groups
*
`mmlu_pro`
: 'All 14 subjects of the mmlu_pro dataset, evaluated following the methodology in mmlu's original implementation'
*
`mmlu_pro`
: 'All 14 subjects of the mmlu_pro dataset, evaluated following the methodology in mmlu's original implementation'
*
`mmlu_pro_flan_cot_fewshot`
: 'mmlu_pro_flan_cot_fewshot includes 5-shot of exemplars for chain-of-thought approach'
*
`mmlu_pro_flan_cot_zeroshot`
: 'mmlu_pro_flan_cot_zeroshot evaluates using zero-shot chain-of-thought approach'
*
`mmlu_pro_generative`
: 'mmlu_pro_generative solves questions of mmlu_pro using direct (generative) approach'
*
`mmlu_pro_continuation`
: 'mmlu_pro_continuation evaluates the ability to continue and complete a given text'
#### Tasks
#### Tasks
The following tasks evaluate subjects in the mmlu_pro dataset
The following tasks evaluate subjects in the mmlu_pro dataset
-
`mmlu_pro_{subject_english}`
-
`mmlu_pro_biology`
-
`mmlu_pro_flan_cot_fewshot_{subject_english}`
-
`mmlu_pro_business`
-
`mmlu_pro_flan_cot_zeroshot_{subject_english}`
-
`mmlu_pro_chemistry`
-
`mmlu_pro_generative_{subject_english}`
-
`mmlu_pro_computer_science`
-
`mmlu_pro_continuation_{subject_english}`
-
`mmlu_pro_economics`
-
`mmlu_pro_engineering`
-
`mmlu_pro_health`
-
`mmlu_pro_history`
-
`mmlu_pro_law`
-
`mmlu_pro_math`
-
`mmlu_pro_other`
-
`mmlu_pro_philosophy`
-
`mmlu_pro_physics`
-
`mmlu_pro_psychology`
### Checklist
### Checklist
...
...
lm_eval/tasks/mmlu_pro/_default_template_yaml
0 → 100644
View file @
17396935
dataset_path: TIGER-Lab/MMLU-Pro
test_split: test
fewshot_split: validation
fewshot_config:
sampler: first_n
doc_to_text: !function utils.fewshot_to_text
doc_to_target: ""
output_type: generate_until
doc_to_text: !function utils.doc_to_text
doc_to_target: answer
filter_list:
- name: "custom-extract"
filter:
- function: "regex"
regex_pattern: r"answer is \(?([ABCDEFGHIJ])\)?"
# regex_pattern: r".*[aA]nswer:\s*([A-J])",
- function: "take_first"
generation_kwargs:
until:
- "</s>"
- "Q:"
- "<|im_end|>"
do_sample: false
temperature: 0.0
num_fewshot: 5
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
metadata:
version: 0.0
lm_eval/tasks/mmlu_pro/_generate_configs.py
deleted
100644 → 0
View file @
cd8642e7
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import
argparse
import
logging
import
os
import
yaml
from
tqdm
import
tqdm
eval_logger
=
logging
.
getLogger
(
"lm-eval"
)
SUBJECTS
=
{
"business"
:
"other"
,
"law"
:
"humanities"
,
"psychology"
:
"social_sciences"
,
"biology"
:
"stem"
,
"chemistry"
:
"stem"
,
"history"
:
"humanities"
,
"other"
:
"other"
,
"health"
:
"other"
,
"economics"
:
"social_sciences"
,
"math"
:
"stem"
,
"physics"
:
"stem"
,
"computer_science"
:
"stem"
,
"philosophy"
:
"humanities"
,
"engineering"
:
"stem"
}
def
parse_args
():
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
"--base_yaml_path"
,
required
=
True
)
parser
.
add_argument
(
"--save_prefix_path"
,
default
=
"mmlu_pro"
)
parser
.
add_argument
(
"--cot_prompt_path"
,
default
=
None
)
parser
.
add_argument
(
"--task_prefix"
,
default
=
""
)
parser
.
add_argument
(
"--group_prefix"
,
default
=
""
)
return
parser
.
parse_args
()
if
__name__
==
"__main__"
:
args
=
parse_args
()
# get filename of base_yaml so we can `"include": ` it in our "other" YAMLs.
base_yaml_name
=
os
.
path
.
split
(
args
.
base_yaml_path
)[
-
1
]
with
open
(
args
.
base_yaml_path
,
encoding
=
"utf-8"
)
as
f
:
base_yaml
=
yaml
.
full_load
(
f
)
if
args
.
cot_prompt_path
is
not
None
:
import
json
with
open
(
args
.
cot_prompt_path
,
encoding
=
"utf-8"
)
as
f
:
cot_file
=
json
.
load
(
f
)
ALL_CATEGORIES
=
[]
for
subject
,
category
in
tqdm
(
SUBJECTS
.
items
()):
if
category
not
in
ALL_CATEGORIES
:
ALL_CATEGORIES
.
append
(
category
)
if
args
.
cot_prompt_path
is
not
None
:
description
=
cot_file
[
subject
]
else
:
description
=
f
"The following are multiple choice questions (with answers) about
{
' '
.
join
(
subject
.
split
(
'_'
))
}
.
\n\n
"
yaml_dict
=
{
"include"
:
base_yaml_name
,
"group"
:
f
"mmlu_pro_
{
args
.
task_prefix
}
_
{
category
}
"
if
args
.
task_prefix
!=
""
else
f
"mmlu_pro_
{
category
}
"
,
"group_alias"
:
category
.
replace
(
"_"
,
" "
),
"task"
:
f
"mmlu_pro_
{
args
.
task_prefix
}
_
{
subject
}
"
if
args
.
task_prefix
!=
""
else
f
"mmlu_pro_
{
subject
}
"
,
"task_alias"
:
subject
.
replace
(
"_"
,
" "
),
"dataset_name"
:
subject
,
"description"
:
description
,
}
file_save_path
=
args
.
save_prefix_path
+
f
"_
{
subject
}
.yaml"
eval_logger
.
info
(
f
"Saving yaml for subset
{
subject
}
to
{
file_save_path
}
"
)
with
open
(
file_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
yaml_file
:
yaml
.
dump
(
yaml_dict
,
yaml_file
,
allow_unicode
=
True
,
default_style
=
'"'
,
)
if
args
.
task_prefix
!=
""
:
mmlu_pro_subcategories
=
[
f
"mmlu_pro_
{
args
.
task_prefix
}
_
{
category
}
"
for
category
in
ALL_CATEGORIES
]
else
:
mmlu_pro_subcategories
=
[
f
"mmlu_pro_
{
category
}
"
for
category
in
ALL_CATEGORIES
]
if
args
.
group_prefix
!=
""
:
file_save_path
=
args
.
group_prefix
+
".yaml"
else
:
file_save_path
=
args
.
save_prefix_path
+
".yaml"
eval_logger
.
info
(
f
"Saving benchmark config to
{
file_save_path
}
"
)
with
open
(
file_save_path
,
"w"
,
encoding
=
"utf-8"
)
as
yaml_file
:
yaml
.
dump
(
{
"group"
:
f
"mmlu_pro_
{
args
.
task_prefix
}
"
if
args
.
task_prefix
!=
""
else
"mmlu_pro"
,
"task"
:
mmlu_pro_subcategories
,
},
yaml_file
,
indent
=
4
,
default_flow_style
=
False
,
)
lm_eval/tasks/mmlu_pro/default/mmlu_pro_health.yaml
deleted
100644 → 0
View file @
cd8642e7
"
dataset_name"
:
"
health"
"
description"
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
health.
\n\
\n
"
"
group"
:
"
mmlu_pro_other"
"
group_alias"
:
"
other"
"
include"
:
"
_default_template_yaml"
"
task"
:
"
mmlu_pro_health"
"
task_alias"
:
"
health"
lm_eval/tasks/mmlu_pro/flan_cot_fewshot/_cot_prompts.json
deleted
100644 → 0
View file @
cd8642e7
This diff is collapsed.
Click to expand it.
lm_eval/tasks/mmlu_pro/mmlu_pro_biology.yaml
0 → 100644
View file @
17396935
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
biology.
Think
step
by
step
and
then
finish
your
answer
with
\"
the
answer
is
(X)
\"
where
X
is
the
correct
letter
choice."
include
:
"
_default_template_yaml"
task
:
"
mmlu_pro_biology"
task_alias
:
"
biology"
process_docs
:
!function
utils.process_biology
lm_eval/tasks/mmlu_pro/mmlu_pro_business.yaml
0 → 100644
View file @
17396935
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
business.
Think
step
by
step
and
then
finish
your
answer
with
\"
the
answer
is
(X)
\"
where
X
is
the
correct
letter
choice."
include
:
"
_default_template_yaml"
task
:
"
mmlu_pro_business"
task_alias
:
"
business"
process_docs
:
!function
utils.process_business
lm_eval/tasks/mmlu_pro/mmlu_pro_chemistry.yaml
0 → 100644
View file @
17396935
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
chemistry.
Think
step
by
step
and
then
finish
your
answer
with
\"
the
answer
is
(X)
\"
where
X
is
the
correct
letter
choice."
include
:
"
_default_template_yaml"
task
:
"
mmlu_pro_chemistry"
task_alias
:
"
chemistry"
process_docs
:
!function
utils.process_chemistry
lm_eval/tasks/mmlu_pro/mmlu_pro_computer_science.yaml
0 → 100644
View file @
17396935
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
computer
science.
Think
step
by
step
and
then
finish
your
answer
with
\"
the
answer
is
(X)
\"
where
X
is
the
correct
letter
choice."
include
:
"
_default_template_yaml"
task
:
"
mmlu_pro_computer_science"
task_alias
:
"
computer_science"
process_docs
:
!function
utils.process_computer_science
lm_eval/tasks/mmlu_pro/mmlu_pro_economics.yaml
0 → 100644
View file @
17396935
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
economics.
Think
step
by
step
and
then
finish
your
answer
with
\"
the
answer
is
(X)
\"
where
X
is
the
correct
letter
choice."
include
:
"
_default_template_yaml"
task
:
"
mmlu_pro_economics"
task_alias
:
"
economics"
process_docs
:
!function
utils.process_economics
lm_eval/tasks/mmlu_pro/mmlu_pro_engineering.yaml
0 → 100644
View file @
17396935
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
engineering.
Think
step
by
step
and
then
finish
your
answer
with
\"
the
answer
is
(X)
\"
where
X
is
the
correct
letter
choice."
include
:
"
_default_template_yaml"
task
:
"
mmlu_pro_engineering"
task_alias
:
"
engineering"
process_docs
:
!function
utils.process_engineering
lm_eval/tasks/mmlu_pro/mmlu_pro_health.yaml
0 → 100644
View file @
17396935
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
health.
Think
step
by
step
and
then
finish
your
answer
with
\"
the
answer
is
(X)
\"
where
X
is
the
correct
letter
choice."
include
:
"
_default_template_yaml"
task
:
"
mmlu_pro_health"
task_alias
:
"
health"
process_docs
:
!function
utils.process_health
lm_eval/tasks/mmlu_pro/mmlu_pro_history.yaml
0 → 100644
View file @
17396935
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
history.
Think
step
by
step
and
then
finish
your
answer
with
\"
the
answer
is
(X)
\"
where
X
is
the
correct
letter
choice."
include
:
"
_default_template_yaml"
task
:
"
mmlu_pro_history"
task_alias
:
"
history"
process_docs
:
!function
utils.process_history
lm_eval/tasks/mmlu_pro/mmlu_pro_law.yaml
0 → 100644
View file @
17396935
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
law.
Think
step
by
step
and
then
finish
your
answer
with
\"
the
answer
is
(X)
\"
where
X
is
the
correct
letter
choice."
include
:
"
_default_template_yaml"
task
:
"
mmlu_pro_law"
task_alias
:
"
law"
process_docs
:
!function
utils.process_law
lm_eval/tasks/mmlu_pro/mmlu_pro_math.yaml
0 → 100644
View file @
17396935
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
math.
Think
step
by
step
and
then
finish
your
answer
with
\"
the
answer
is
(X)
\"
where
X
is
the
correct
letter
choice."
include
:
"
_default_template_yaml"
task
:
"
mmlu_pro_math"
task_alias
:
"
math"
process_docs
:
!function
utils.process_math
lm_eval/tasks/mmlu_pro/mmlu_pro_other.yaml
0 → 100644
View file @
17396935
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
other.
Think
step
by
step
and
then
finish
your
answer
with
\"
the
answer
is
(X)
\"
where
X
is
the
correct
letter
choice."
include
:
"
_default_template_yaml"
task
:
"
mmlu_pro_other"
task_alias
:
"
other"
process_docs
:
!function
utils.process_other
lm_eval/tasks/mmlu_pro/mmlu_pro_philosophy.yaml
0 → 100644
View file @
17396935
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
philosophy.
Think
step
by
step
and
then
finish
your
answer
with
\"
the
answer
is
(X)
\"
where
X
is
the
correct
letter
choice."
include
:
"
_default_template_yaml"
task
:
"
mmlu_pro_philosophy"
task_alias
:
"
philosophy"
process_docs
:
!function
utils.process_philosophy
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment