Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
cd441ab1
Commit
cd441ab1
authored
Jul 26, 2024
by
Yu Shi Jie
Browse files
mmlu-pro: update yaml content in line with mmlu
parent
5bae76d6
Changes
22
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
145 additions
and
34 deletions
+145
-34
lm_eval/tasks/mmlu_pro/continuation/_continuation_template_yaml
...l/tasks/mmlu_pro/continuation/_continuation_template_yaml
+3
-0
lm_eval/tasks/mmlu_pro/continuation/_mmlu_pro.yaml
lm_eval/tasks/mmlu_pro/continuation/_mmlu_pro.yaml
+30
-4
lm_eval/tasks/mmlu_pro/continuation/mmlu_pro_biology.yaml
lm_eval/tasks/mmlu_pro/continuation/mmlu_pro_biology.yaml
+2
-2
lm_eval/tasks/mmlu_pro/continuation/mmlu_pro_chemistry.yaml
lm_eval/tasks/mmlu_pro/continuation/mmlu_pro_chemistry.yaml
+2
-2
lm_eval/tasks/mmlu_pro/continuation/mmlu_pro_psychology.yaml
lm_eval/tasks/mmlu_pro/continuation/mmlu_pro_psychology.yaml
+2
-2
lm_eval/tasks/mmlu_pro/flan_cot_fewshot/_mmlu_pro.yaml
lm_eval/tasks/mmlu_pro/flan_cot_fewshot/_mmlu_pro.yaml
+30
-4
lm_eval/tasks/mmlu_pro/flan_cot_fewshot/_mmlu_pro_flan_cot_fewshot_template_yaml
...flan_cot_fewshot/_mmlu_pro_flan_cot_fewshot_template_yaml
+2
-2
lm_eval/tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_math.yaml
lm_eval/tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_math.yaml
+1
-1
lm_eval/tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_miscellaneous.yaml
...sks/mmlu_pro/flan_cot_fewshot/mmlu_pro_miscellaneous.yaml
+1
-1
lm_eval/tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_philosophy.yaml
.../tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_philosophy.yaml
+1
-1
lm_eval/tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_physics.yaml
...val/tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_physics.yaml
+1
-1
lm_eval/tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_psychology.yaml
.../tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_psychology.yaml
+1
-1
lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/_mmlu_pro.yaml
lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/_mmlu_pro.yaml
+30
-4
lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/_mmlu_pro_flan_cot_zeroshot_template_yaml
...an_cot_zeroshot/_mmlu_pro_flan_cot_zeroshot_template_yaml
+3
-1
lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/mmlu_pro_biology.yaml
...al/tasks/mmlu_pro/flan_cot_zeroshot/mmlu_pro_biology.yaml
+1
-1
lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/mmlu_pro_business.yaml
...l/tasks/mmlu_pro/flan_cot_zeroshot/mmlu_pro_business.yaml
+1
-1
lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/mmlu_pro_math.yaml
lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/mmlu_pro_math.yaml
+1
-1
lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/mmlu_pro_psychology.yaml
...tasks/mmlu_pro/flan_cot_zeroshot/mmlu_pro_psychology.yaml
+1
-1
lm_eval/tasks/mmlu_pro/generative/_default_template_yaml
lm_eval/tasks/mmlu_pro/generative/_default_template_yaml
+2
-0
lm_eval/tasks/mmlu_pro/generative/_mmlu_pro.yaml
lm_eval/tasks/mmlu_pro/generative/_mmlu_pro.yaml
+30
-4
No files found.
lm_eval/tasks/mmlu_pro/continuation/_continuation_template_yaml
View file @
cd441ab1
dataset_path: sjyuxyz/MMLU-Pro-with-subset
output_type: multiple_choice
test_split: test
fewshot_split: dev
fewshot_config:
...
...
@@ -8,3 +9,5 @@ doc_to_choice: "{{options}}"
doc_to_target: "{{answer_index}}"
metadata:
version: 0.0
dataset_kwargs:
trust_remote_code: true
lm_eval/tasks/mmlu_pro/continuation/_mmlu_pro.yaml
View file @
cd441ab1
group
:
mmlu_pro_continuation
group_alias
:
mmlu-pro (continuation)
task
:
-
mmlu_pro_continuation_stem
-
mmlu_pro_continuation_other
-
mmlu_pro_continuation_social_sciences
-
mmlu_pro_continuation_humanities
-
group
:
stem
task
:
-
mmlu_pro_continuation_stem
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
other
task
:
-
mmlu_pro_continuation_other
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
social sciences
task
:
-
mmlu_pro_continuation_social_sciences
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
humanities
task
:
-
mmlu_pro_continuation_humanities
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
metadata
:
version
:
1
lm_eval/tasks/mmlu_pro/continuation/mmlu_pro_biology.yaml
View file @
cd441ab1
"
dataset_name"
:
"
biology"
"
description"
:
"
The
following
are
questions
(with
answers)
about
biology.
\n\
\n
"
"
group
"
:
"
mmlu_continuation_stem"
"
tag
"
:
"
mmlu_
pro_
continuation_stem"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_continuation_biology"
"
task"
:
"
mmlu_
pro_
continuation_biology"
lm_eval/tasks/mmlu_pro/continuation/mmlu_pro_chemistry.yaml
View file @
cd441ab1
"
dataset_name"
:
"
math"
"
description"
:
"
The
following
are
questions
(with
answers)
about
math.
\n\
\n
"
"
group
"
:
"
mmlu_continuation_stem"
"
tag
"
:
"
mmlu_
pro_
continuation_stem"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_continuation_chemistry"
\ No newline at end of file
"
task"
:
"
mmlu_pro_continuation_chemistry"
\ No newline at end of file
lm_eval/tasks/mmlu_pro/continuation/mmlu_pro_psychology.yaml
View file @
cd441ab1
"
dataset_name"
:
"
psychology"
"
description"
:
"
The
following
are
questions
(with
answers)
about
psychology.
\n\
\n
"
"
group
"
:
"
mmlu_continuation_social_sciences"
"
tag
"
:
"
mmlu_
pro_
continuation_social_sciences"
"
include"
:
"
_continuation_template_yaml"
"
task"
:
"
mmlu_continuation_psychology"
"
task"
:
"
mmlu_
pro_
continuation_psychology"
lm_eval/tasks/mmlu_pro/flan_cot_fewshot/_mmlu_pro.yaml
View file @
cd441ab1
group
:
mmlu_pro_flan_cot_fewshot
group_alias
:
mmlu-pro (flan style, fewshot cot)
task
:
-
mmlu_pro_flan_cot_fewshot_stem
-
mmlu_pro_flan_cot_fewshot_other
-
mmlu_pro_flan_cot_fewshot_social_sciences
-
mmlu_pro_flan_cot_fewshot_humanities
-
group
:
stem
task
:
-
mmlu_pro_flan_cot_fewshot_stem
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
other
task
:
-
mmlu_pro_flan_cot_fewshot_other
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
social sciences
task
:
-
mmlu_pro_flan_cot_fewshot_social_sciences
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
humanities
task
:
-
mmlu_pro_flan_cot_fewshot_humanities
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
metadata
:
version
:
1
lm_eval/tasks/mmlu_pro/flan_cot_fewshot/_mmlu_pro_flan_cot_fewshot_template_yaml
View file @
cd441ab1
...
...
@@ -27,5 +27,5 @@ metric_list:
ignore_punctuation: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
lm_eval/tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_math.yaml
View file @
cd441ab1
...
...
@@ -28,6 +28,6 @@ fewshot_config:
-
question
:
"
A
total
of
30
players
will
play
basketball
at
a
park.
There
will
be
exactly
5
players
on
each
team.
Which
statement
correctly
explains
how
to
find
the
number
of
teams
needed?
(A)
Multiply
5
by
5
to
find
25
teams.
(B)
Divide
30
by
5
to
find
6
teams.
(C)
Add
5
to
30
to
find
35
teams.
(D)
Subtract
30
from
5
to
find
-25
teams.
(E)
Divide
5
by
30
to
find
0.1667
teams.
(F)
Add
5
to
30
then
divide
by
2
to
find
17.5
teams.
(G)
N/A
(H)
N/A
(I)
N/A
(J)
N/A"
target
:
"
Let's
think
step
by
step.
We
want
to
find
the
number
of
teams.
We
know
that
there
are
5
players/team,
and
30
players.
Thus
to
get
the
number
of
teams
we
divide
players
by
players/team,
so
30
players
/
5
players/team
=
6
teams.
The
answer
is
(B)."
group
:
mmlu_pro_flan_cot_fewshot_stem
tag
:
mmlu_pro_flan_cot_fewshot_stem
include
:
_mmlu_pro_flan_cot_fewshot_template_yaml
task
:
mmlu_pro_flan_cot_fewshot_math
lm_eval/tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_miscellaneous.yaml
View file @
cd441ab1
...
...
@@ -18,6 +18,6 @@ fewshot_config:
-
question
:
"
What
place
is
named
in
the
title
of
the
1979
live
album
by
rock
legends
Cheap
Trick?
(A)
Brooklyn
(B)
Beijing
(C)
Budapest
(D)
Boston
(E)
Bhutan
(F)
Barcelona
(G)
Britain
(H)
Brisbane
(I)
Bruges
(J)
Budokan"
target
:
"
Let's
think
step
by
step.
We
refer
to
Wikipedia
for
help.
Nippon
Budokan
is
an
indoor
arena
in
Tokyo,
Japan
renowned
for
hosting
rock
music
concerts
including
Cheap
Trick
in
1978.
'Cheap
Trick
at
Budokan'
became
the
name
of
their
album.
The
answer
is
(J)."
group
:
mmlu_pro_flan_cot_fewshot_other
tag
:
mmlu_pro_flan_cot_fewshot_other
include
:
_mmlu_pro_flan_cot_fewshot_template_yaml
task
:
mmlu_pro_flan_cot_fewshot_miscellaneous
lm_eval/tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_philosophy.yaml
View file @
cd441ab1
...
...
@@ -24,6 +24,6 @@ fewshot_config:
-
question
:
"
What
is
the
sign
of
the
covenant
for
Jewish
males?
(A)
Fasting
on
Yom
Kippur
(B)
Lighting
Shabbat
candles
(C)
The
rainbow
(D)
Circumcision
(E)
The
Torah
(F)
Bar
mitzvah
(G)
Keeping
kosher
(H)
Wearing
a
kippah
(I)
A
son
(J)
The
Star
of
David"
target
:
"
Let's
think
step
by
step.
We
refer
to
Wikipedia
articles
on
world
religions
for
help.
In
Judaism,
the
most
distinctive
sign
of
the
covenant
is
circumcision
(brit
milah).
The
answer
is
(D)."
group
:
mmlu_pro_flan_cot_fewshot_humanities
tag
:
mmlu_pro_flan_cot_fewshot_humanities
include
:
_mmlu_pro_flan_cot_fewshot_template_yaml
task
:
mmlu_pro_flan_cot_fewshot_philosophy
lm_eval/tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_physics.yaml
View file @
cd441ab1
...
...
@@ -18,6 +18,6 @@ fewshot_config:
-
question
:
"
A
microwave
oven
is
connected
to
an
outlet,
120
V,
and
draws
a
current
of
2
amps.
At
what
rate
is
energy
being
used
by
the
microwave
oven?
(A)
240
W
(B)
120
W
(C)
10
W
(D)
480
W
(E)
360
W
(F)
200
W
(G)
30
W
(H)
150
W
(I)
60
W
(J)
300
W"
target
:
"
Let's
think
step
by
step.
Rate
of
energy
usage
is
known
as
power;
in
an
dissipative
electrical
circuit,
power
is
given
by
voltage
times
current.
So
in
our
case,
the
power
is
120
V
times
2
amps,
or
240
W.
The
answer
is
(A)."
group
:
mmlu_pro_flan_cot_fewshot_stem
tag
:
mmlu_pro_flan_cot_fewshot_stem
include
:
_mmlu_pro_flan_cot_fewshot_template_yaml
task
:
mmlu_pro_flan_cot_fewshot_physics
lm_eval/tasks/mmlu_pro/flan_cot_fewshot/mmlu_pro_psychology.yaml
View file @
cd441ab1
...
...
@@ -18,6 +18,6 @@ fewshot_config:
-
question
:
"
In
terms
of
Hofstede’s
(1980)
five
cultural
dimensions,
the
United
States
scores
at
the
top
of
the
scale
on:
(A)
individualism
and
long-term
orientation.
(B)
individualism
and
power
distance.
(C)
uncertainty
avoidance.
(D)
long-term
orientation.
(E)
individualism.
(F)
individualism
and
masculinity.
(G)
long-term
orientation
and
uncertainty
avoidance.
(H)
power
distance.
(I)
power
distance
and
masculinity.
(J)
N/A"
target
:
"
Let's
think
step
by
step.
We
refer
to
Wikipedia
articles
on
psychology
for
help.
The
US
scores
highest
on
individualism
among
the
five
cultural
dimensions.
The
answer
is
(E)."
group
:
mmlu_pro_flan_cot_fewshot_social_sciences
tag
:
mmlu_pro_flan_cot_fewshot_social_sciences
include
:
_mmlu_pro_flan_cot_fewshot_template_yaml
task
:
mmlu_pro_flan_cot_fewshot_psychology
lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/_mmlu_pro.yaml
View file @
cd441ab1
group
:
mmlu_pro_flan_cot_zeroshot
group_alias
:
mmlu-pro (flan style, zeroshot cot)
task
:
-
mmlu_pro_flan_cot_zeroshot_stem
-
mmlu_pro_flan_cot_zeroshot_other
-
mmlu_pro_flan_cot_zeroshot_social_sciences
-
mmlu_pro_flan_cot_zeroshot_humanities
-
group
:
stem
task
:
-
mmlu_pro_flan_cot_zeroshot_stem
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
other
task
:
-
mmlu_pro_flan_cot_zeroshot_other
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
social sciences
task
:
-
mmlu_pro_flan_cot_zeroshot_social_sciences
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
humanities
task
:
-
mmlu_pro_flan_cot_zeroshot_humanities
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
metadata
:
version
:
1
lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/_mmlu_pro_flan_cot_zeroshot_template_yaml
View file @
cd441ab1
...
...
@@ -33,4 +33,6 @@ metric_list:
ignore_case: true
ignore_punctuation: true
metadata:
version: 2.0
version: 1.0
dataset_kwargs:
trust_remote_code: true
\ No newline at end of file
lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/mmlu_pro_biology.yaml
View file @
cd441ab1
"
dataset_name"
:
"
biology"
"
description"
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
biology.
\n\
\n
"
"
group
"
:
"
mmlu_pro_flan_cot_zeroshot_stem"
"
tag
"
:
"
mmlu_pro_flan_cot_zeroshot_stem"
"
include"
:
"
_mmlu_pro_flan_cot_zeroshot_template_yaml"
"
task"
:
"
mmlu_pro_flan_cot_zeroshot_biology"
lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/mmlu_pro_business.yaml
View file @
cd441ab1
"
dataset_name"
:
"
business"
"
description"
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
business.
\n\
\n
"
"
group
"
:
"
mmlu_pro_flan_cot_zeroshot_other"
"
tag
"
:
"
mmlu_pro_flan_cot_zeroshot_other"
"
include"
:
"
_mmlu_pro_flan_cot_zeroshot_template_yaml"
"
task"
:
"
mmlu_pro_flan_cot_zeroshot_business"
lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/mmlu_pro_math.yaml
View file @
cd441ab1
"
dataset_name"
:
"
math"
"
description"
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
math.
\n\
\n
"
"
group
"
:
"
mmlu_pro_flan_cot_zeroshot_stem"
"
tag
"
:
"
mmlu_pro_flan_cot_zeroshot_stem"
"
include"
:
"
_mmlu_pro_flan_cot_zeroshot_template_yaml"
"
task"
:
"
mmlu_pro_flan_cot_zeroshot_math"
lm_eval/tasks/mmlu_pro/flan_cot_zeroshot/mmlu_pro_psychology.yaml
View file @
cd441ab1
"
dataset_name"
:
"
psychology"
"
description"
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
psychology.
\n\
\n
"
"
group
"
:
"
mmlu_pro_flan_cot_zeroshot_social_sciences"
"
tag
"
:
"
mmlu_pro_flan_cot_zeroshot_social_sciences"
"
include"
:
"
_mmlu_pro_flan_cot_zeroshot_template_yaml"
"
task"
:
"
mmlu_pro_flan_cot_zeroshot_psychology"
lm_eval/tasks/mmlu_pro/generative/_default_template_yaml
View file @
cd441ab1
...
...
@@ -16,3 +16,5 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
lm_eval/tasks/mmlu_pro/generative/_mmlu_pro.yaml
View file @
cd441ab1
group
:
mmlu_pro_generative
group_alias
:
mmlu-pro (generative)
task
:
-
mmlu_pro_stem_generative
-
mmlu_pro_other_generative
-
mmlu_pro_social_sciences_generative
-
mmlu_pro_humanities_generative
-
group
:
stem
task
:
-
mmlu_pro_stem_generative
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
other
task
:
-
mmlu_pro_other_generative
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
social sciences
task
:
-
mmlu_pro_social_sciences_generative
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
-
group
:
humanities
task
:
-
mmlu_pro_humanities_generative
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
aggregate_metric_list
:
-
metric
:
acc
weight_by_size
:
True
metadata
:
version
:
1
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment