Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
109ed1c7
Commit
109ed1c7
authored
Oct 17, 2023
by
lintangsutawika
Browse files
added subgroups for other mmlu variants
parent
93a45962
Changes
238
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
602 additions
and
733 deletions
+602
-733
lm_eval/tasks/mmlu/_generate_configs.py
lm_eval/tasks/mmlu/_generate_configs.py
+21
-8
lm_eval/tasks/mmlu/default/_mmlu.yaml
lm_eval/tasks/mmlu/default/_mmlu.yaml
+0
-0
lm_eval/tasks/mmlu/flan_cot_fewshot/_cot_prompts.json
lm_eval/tasks/mmlu/flan_cot_fewshot/_cot_prompts.json
+0
-0
lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml
lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml
+6
-0
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_abstract_algebra.yaml
...al/tasks/mmlu/flan_cot_fewshot/mmlu_abstract_algebra.yaml
+5
-4
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_anatomy.yaml
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_anatomy.yaml
+41
-41
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_astronomy.yaml
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_astronomy.yaml
+39
-38
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_business_ethics.yaml
...val/tasks/mmlu/flan_cot_fewshot/mmlu_business_ethics.yaml
+26
-25
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_clinical_knowledge.yaml
.../tasks/mmlu/flan_cot_fewshot/mmlu_clinical_knowledge.yaml
+35
-58
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_biology.yaml
...val/tasks/mmlu/flan_cot_fewshot/mmlu_college_biology.yaml
+6
-5
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_chemistry.yaml
...l/tasks/mmlu/flan_cot_fewshot/mmlu_college_chemistry.yaml
+38
-37
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_computer_science.yaml
.../mmlu/flan_cot_fewshot/mmlu_college_computer_science.yaml
+79
-189
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_mathematics.yaml
...tasks/mmlu/flan_cot_fewshot/mmlu_college_mathematics.yaml
+46
-45
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_medicine.yaml
...al/tasks/mmlu/flan_cot_fewshot/mmlu_college_medicine.yaml
+43
-43
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_physics.yaml
...val/tasks/mmlu/flan_cot_fewshot/mmlu_college_physics.yaml
+44
-70
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_computer_security.yaml
...l/tasks/mmlu/flan_cot_fewshot/mmlu_computer_security.yaml
+32
-31
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_conceptual_physics.yaml
.../tasks/mmlu/flan_cot_fewshot/mmlu_conceptual_physics.yaml
+33
-32
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_econometrics.yaml
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_econometrics.yaml
+57
-57
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_electrical_engineering.yaml
...ks/mmlu/flan_cot_fewshot/mmlu_electrical_engineering.yaml
+32
-32
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_elementary_mathematics.yaml
...ks/mmlu/flan_cot_fewshot/mmlu_elementary_mathematics.yaml
+19
-18
No files found.
lm_eval/tasks/mmlu/_generate_configs.py
View file @
109ed1c7
...
...
@@ -74,7 +74,7 @@ SUBJECTS = {
def
parse_args
():
parser
=
argparse
.
ArgumentParser
()
parser
.
add_argument
(
"--base_yaml_path"
,
required
=
True
)
parser
.
add_argument
(
"--save_prefix_path"
,
default
=
"
flan
"
)
parser
.
add_argument
(
"--save_prefix_path"
,
default
=
"
mmlu
"
)
parser
.
add_argument
(
"--cot_prompt_path"
,
default
=
None
)
parser
.
add_argument
(
"--task_prefix"
,
default
=
""
)
parser
.
add_argument
(
"--group_prefix"
,
default
=
""
)
...
...
@@ -109,7 +109,9 @@ if __name__ == "__main__":
yaml_dict
=
{
"include"
:
base_yaml_name
,
"group"
:
f
"mmlu_
{
category
}
"
,
"group"
:
f
"mmlu_
{
args
.
task_prefix
}
_
{
category
}
"
if
args
.
task_prefix
!=
""
else
f
"mmlu_
{
category
}
"
,
"task"
:
f
"mmlu_
{
args
.
task_prefix
}
_
{
subject
}
"
if
args
.
task_prefix
!=
""
else
f
"mmlu_
{
subject
}
"
,
...
...
@@ -123,22 +125,33 @@ if __name__ == "__main__":
yaml
.
dump
(
yaml_dict
,
yaml_file
,
width
=
float
(
"inf"
),
#
width=float("inf"),
allow_unicode
=
True
,
default_style
=
'"'
,
)
if
args
.
group_prefix
==
""
:
file_save_path
=
args
.
save_prefix_path
+
".yaml"
if
args
.
task_prefix
!=
""
:
mmlu_subcategories
=
[
f
"mmlu_
{
args
.
task_prefix
}
_
{
category
}
"
for
category
in
ALL_CATEGORIES
]
else
:
mmlu_subcategories
=
[
f
"mmlu_
{
category
}
"
for
category
in
ALL_CATEGORIES
]
if
args
.
group_prefix
!=
""
:
file_save_path
=
args
.
group_prefix
+
".yaml"
else
:
file_save_path
=
args
.
save_prefix_path
+
f
"_
{
args
.
group_prefix
}
.yaml"
file_save_path
=
args
.
save_prefix_path
+
".yaml"
eval_logger
.
info
(
f
"Saving benchmark config to
{
file_save_path
}
"
)
with
open
(
file_save_path
,
"w"
)
as
yaml_file
:
yaml
.
dump
(
{
"group"
:
f
"mmlu_
{
args
.
group_prefix
}
"
,
"task"
:
[
f
"mmlu_
{
category
}
"
for
category
in
ALL_CATEGORIES
],
"group"
:
f
"mmlu_
{
args
.
task_prefix
}
"
if
args
.
task_prefix
!=
""
else
"mmlu"
,
"task"
:
mmlu_subcategories
,
},
yaml_file
,
indent
=
4
,
default_flow_style
=
False
,
)
lm_eval/tasks/mmlu/default/mmlu.yaml
→
lm_eval/tasks/mmlu/default/
_
mmlu.yaml
View file @
109ed1c7
File moved
lm_eval/tasks/mmlu/_cot_prompts.json
→
lm_eval/tasks/mmlu/
flan_cot_fewshot/
_cot_prompts.json
View file @
109ed1c7
File moved
lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu.yaml
0 → 100644
View file @
109ed1c7
group
:
mmlu_flan_cot_fewshot
task
:
-
mmlu_flan_cot_fewshot_stem
-
mmlu_flan_cot_fewshot_other
-
mmlu_flan_cot_fewshot_social_sciences
-
mmlu_flan_cot_fewshot_humanities
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_abstract_algebra.yaml
View file @
109ed1c7
dataset_name
:
abstract_algebra
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
abstract
\
"
dataset_name
"
:
"
abstract_algebra
"
"
description
"
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
abstract
\
\
algebra.
\n\n
Q:
Statement
1
|
Every
element
of
a
group
generates
a
cyclic
subgroup
\
\
of
the
group.
Statement
2
|
The
symmetric
group
S_10
has
10
elements.
\n
(A)
True,
\
\
True
(B)
False,
False
(C)
True,
False
(D)
False,
True
\n
A:
Let's
think
step
by
\
...
...
@@ -36,5 +36,6 @@ description: "The following are multiple choice questions (with answers) about a
\
x
=
2,
hence
x^2
+
1
does
not
have
any
roots.
For
c
=
2
the
polynomial
x^2
+
2
\
\
has
two
roots
at
x
=
1
and
x
=
2.
Hence
Z_3[x]/(x^2
+
c)
is
a
field
if
and
only
\
\
if
c
=
1.
The
answer
is
(B)."
include
:
_mmlu_flan_cot_fewshot_template_yaml
task
:
mmlu_flan_cot_fewshot_abstract_algebra
"
group"
:
"
mmlu_flan_cot_fewshot_stem"
"
include"
:
"
_mmlu_flan_cot_fewshot_template_yaml"
"
task"
:
"
mmlu_flan_cot_fewshot_abstract_algebra"
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_anatomy.yaml
View file @
109ed1c7
dataset_name
:
anatomy
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
anatomy.
\n\
"
dataset_name
"
:
"
anatomy
"
"
description
"
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
anatomy.
\n\
\n
Q:
Which
of
the
following
is
the
body
cavity
that
contains
the
pituitary
gland?
\n\
(A)
Abdominal
(B)
Cranial
(C)
Pleural
(D)
Spinal
\n
A:
Let's
think
step
by
step.
We
\
\
refer
to
Wikipedia
articles
on
anatomy
for
help.
Let
\u2019
s
solve
this
problem
\
\
step
by
step.
The
pituitary
gland
is
the
major
endocrine
gland
attached
to
the
\
\
base
of
the
brain,
and
it
is
contained
in
the
Cranial
cavity.
The
answer
is
(B).
\n\
\n
Q:
Which
of
these
branches
of
the
trigeminal
nerve
contain
somatic
motor
processes?
\n\
\
refer
to
Wikipedia
articles
on
anatomy
for
help.
Let
’
s
solve
this
problem
step
\
\
by
step.
The
pituitary
gland
is
the
major
endocrine
gland
attached
to
the
base
\
\
of
the
brain,
and
it
is
contained
in
the
Cranial
cavity.
The
answer
is
(B).
\n\
n\
Q:
Which
of
these
branches
of
the
trigeminal
nerve
contain
somatic
motor
processes?
\n\
(A)
The
supraorbital
nerve
(B)
The
infraorbital
nerve
(C)
The
mental
nerve
(D)
None
\
\
of
the
above
\n
A:
Let's
think
step
by
step.
We
refer
to
Wikipedia
articles
on
anatomy
\
\
for
help.
Let
\u2019
s
solve
this
problem
step
by
step.
\n
We
know
the
following:
\
\
(A)
The
supraorbital
nerve
(also
known
as
the
frontal
nerve)
is
the
largest
branch
\
\
for
help.
Let
’
s
solve
this
problem
step
by
step.
\n
We
know
the
following:
(A)
\
\
The
supraorbital
nerve
(also
known
as
the
frontal
nerve)
is
the
largest
branch
\
\
of
the
ophthalmic
nerve
and
branch
of
ophthalmic
division
of
the
trigeminal
nerve.
\
\
(B)
The
infraorbital
nerve
is
a
branch
of
the
maxillary
division
of
the
trigeminal
\
\
nerve.
(C)
The
mental
nerve
is
a
branch
of
the
mandibular
division
of
the
trigeminal
\
...
...
@@ -19,39 +19,39 @@ description: "The following are multiple choice questions (with answers) about a
(A)
excess
overbite
of
the
upper
lateral
incisors.
(B)
negative
overjet
of
the
upper
\
\
central
incisors.
(C)
excess
overjet
of
the
upper
lateral
incisors.
(D)
excess
\
\
overjet
of
the
upper
central
incisors.
\n
A:
Let's
think
step
by
step.
We
refer
\
\
to
Wikipedia
articles
on
anatomy
for
help.
Let
\u2019
s
solve
this
problem
step
\
\
by
step.
This
is
a
question
related
to
anatomy
and
orthodontics.
Excess
overjet
\
\
is
associated
with
Class
II
occlusions;
therefore,
we
can
safely
eliminate
(B)
\
\
from
the
list,
as
negative
overjet
is
often
associated
with
Class
III
occlusions.
\
\
Now,
we
need
to
determine
the
location
of
the
excess
overjet,
and
that
would
be
\
\
the
upper
(maxillary)
lateral
incisors.
Only
(C)
has
the
correct
information.
\
\
The
answer
is
(C).
\n\n
Q:
The
pleura
\n
(A)
have
no
sensory
innervation.
(B)
are
\
\
separated
by
a
2
mm
space.
(C)
extend
into
the
neck.
(D)
are
composed
of
respiratory
\
\
epithelium.
\n
A:
Let's
think
step
by
step.
We
refer
to
Wikipedia
articles
on
anatomy
\
\
for
help.
Let
\u2019
s
solve
this
problem
step
by
step.
First,
recall
that
the
pleura
\
\
refers
to
the
thin
layer
of
tissue
that
covers
the
lungs
and
lines
the
interior
\
\
wall
of
the
chest
cavity.
Now,
let
\u2019
s
look
at
each
option:
\n
Option
(A):
\u201C\
The
pleura
have
no
sensory
innervation.
\u201D
This
information
is
not
correct.
The
\
\
pleura
do
have
a
sensory
innervation.
\n
Option
(B):
\u201C
The
pleura
are
separated
\
\
by
a
2
mm
space.
\u201D
This
information
is
not
correct.
There
is
a
very
thin
\u201C\
potential
\u201D
space
between
the
layers
of
the
pleura;
however,
it
is
typically
\
\
filled
with
serous
pleural
fluid.
\n
Option
(C):
\u201C
The
pleura
extend
into
the
\
\
neck.
\u201D
This
information
is
actuakky
true.
The
cervical
pleura,
also
known
\
\
as
the
dome
of
the
pleuradome
of
the
pleura,
lines
the
extendsiton
of
the
pleural
\
\
cavity
into
the
neck.
\n
Option
(D):
\u201C
The
pleura
are
composed
of
respiratory
\
\
epithelium.
\u201D
This
information
is
not
correct.
The
pleaura
are
composed
of
\
\
connective
tissue
(CT).
\n
Because
(A),
(B),
and
(D)
are
all
incorrect,
(D)
is
the
\
\
only
correct
answer.
The
answer
is
(C).
\n\n
Q:
What
is
the
embryological
origin
\
\
to
Wikipedia
articles
on
anatomy
for
help.
Let’s
solve
this
problem
step
by
step.
\
\
This
is
a
question
related
to
anatomy
and
orthodontics.
Excess
overjet
is
associated
\
\
with
Class
II
occlusions;
therefore,
we
can
safely
eliminate
(B)
from
the
list,
\
\
as
negative
overjet
is
often
associated
with
Class
III
occlusions.
Now,
we
need
\
\
to
determine
the
location
of
the
excess
overjet,
and
that
would
be
the
upper
(maxillary)
\
\
lateral
incisors.
Only
(C)
has
the
correct
information.
The
answer
is
(C).
\n\n\
Q:
The
pleura
\n
(A)
have
no
sensory
innervation.
(B)
are
separated
by
a
2
mm
space.
\
\
(C)
extend
into
the
neck.
(D)
are
composed
of
respiratory
epithelium.
\n
A:
Let's
\
\
think
step
by
step.
We
refer
to
Wikipedia
articles
on
anatomy
for
help.
Let’s
\
\
solve
this
problem
step
by
step.
First,
recall
that
the
pleura
refers
to
the
thin
\
\
layer
of
tissue
that
covers
the
lungs
and
lines
the
interior
wall
of
the
chest
\
\
cavity.
Now,
let’s
look
at
each
option:
\n
Option
(A):
“The
pleura
have
no
sensory
\
\
innervation.”
This
information
is
not
correct.
The
pleura
do
have
a
sensory
innervation.
\n\
Option
(B):
“The
pleura
are
separated
by
a
2
mm
space.”
This
information
is
not
\
\
correct.
There
is
a
very
thin
“potential”
space
between
the
layers
of
the
pleura;
\
\
however,
it
is
typically
filled
with
serous
pleural
fluid.
\n
Option
(C):
“The
\
\
pleura
extend
into
the
neck.”
This
information
is
actuakky
true.
The
cervical
\
\
pleura,
also
known
as
the
dome
of
the
pleuradome
of
the
pleura,
lines
the
extendsiton
\
\
of
the
pleural
cavity
into
the
neck.
\n
Option
(D):
“The
pleura
are
composed
of
\
\
respiratory
epithelium.”
This
information
is
not
correct.
The
pleaura
are
composed
\
\
of
connective
tissue
(CT).
\n
Because
(A),
(B),
and
(D)
are
all
incorrect,
(D)
is
\
\
the
only
correct
answer.
The
answer
is
(C).
\n\n
Q:
What
is
the
embryological
origin
\
\
of
the
hyoid
bone?
\n
(A)
The
first
pharyngeal
arch
(B)
The
first
and
second
pharyngeal
\
\
arches
(C)
The
second
pharyngeal
arch
(D)
The
second
and
third
pharyngeal
arches
\n\
A:
Let's
think
step
by
step.
We
refer
to
Wikipedia
articles
on
anatomy
for
help.
\
\
Let
\u2019
s
solve
this
problem
step
by
step.
The
hyoid
bone,
which
is
also
known
\
\
as
the
hyooid,
is
a
a
small
U-shaped
bone
located
in
the
anterior
neck.
In
its
\
\
resting
position,
it
lies
between
the
ase
of
the
mandible
and
the
third
cervical
\
\
vertebrae.
We
know
that
the
second
and
the
third
pharyngeal
arches
give
rise
to
\
\
the
horns
of
the
hyoid
bone;
therefore,
the
embryological
origin
of
the
hyoid
\
\
bone
are
the
second
and
the
third
pharyngeal
arches
\u2014
this
information
is
covered
\
\
in
the
last
option
(D).
Therefore,
we
conclude
that
(D)
must
be
the
correct
answer.
\
\
The
answer
is
(D)."
include
:
_mmlu_flan_cot_fewshot_template_yaml
task
:
mmlu_flan_cot_fewshot_anatomy
\
Let’s
solve
this
problem
step
by
step.
The
hyoid
bone,
which
is
also
known
as
\
\
the
hyooid,
is
a
a
small
U-shaped
bone
located
in
the
anterior
neck.
In
its
resting
\
\
position,
it
lies
between
the
ase
of
the
mandible
and
the
third
cervical
vertebrae.
\
\
We
know
that
the
second
and
the
third
pharyngeal
arches
give
rise
to
the
horns
\
\
of
the
hyoid
bone;
therefore,
the
embryological
origin
of
the
hyoid
bone
are
the
\
\
second
and
the
third
pharyngeal
arches—this
information
is
covered
in
the
last
\
\
option
(D).
Therefore,
we
conclude
that
(D)
must
be
the
correct
answer.
The
answer
\
\
is
(D)."
"
group"
:
"
mmlu_flan_cot_fewshot_stem"
"
include"
:
"
_mmlu_flan_cot_fewshot_template_yaml"
"
task"
:
"
mmlu_flan_cot_fewshot_anatomy"
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_astronomy.yaml
View file @
109ed1c7
dataset_name
:
astronomy
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
astronomy.
\n\
"
dataset_name
"
:
"
astronomy
"
"
description
"
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
astronomy.
\n\
\n
Q:
Where
do
most
short-period
comets
come
from
and
how
do
we
know?
\n
(A)
The
Kuiper
\
\
belt;
short
period
comets
tend
to
be
in
the
plane
of
the
solar
system
just
like
\
\
the
Kuiper
belt.
(B)
The
Kuiper
belt;
short
period
comets
tend
to
come
from
random
\
...
...
@@ -16,39 +16,40 @@ description: "The following are multiple choice questions (with answers) about a
\
lighter
on
Mars.
(C)
It
would
be
harder
since
the
truck
is
lighter
on
Mars.
(D)
\
\
It
would
be
the
same
no
matter
where
you
are.
\n
A:
Let's
think
step
by
step.
If
\
\
we
assume
that
there
is
no
friction,
the
force
needed
to
accelerate
the
truck
\
\
is
by
Newton
\u2019
s
second
law
only
dependent
on
the
mass
of
the
truck.
Hence
\
\
(A),
(B)
and
(C)
are
incorrect
since
it
doesn
\u2019
t
matter
that
it
\u2019
s
on
\
\
Mars,
and
(D)
is
the
correct
answer.
The
answer
is
(D).
\n\n
Q:
Say
the
pupil
of
\
\
your
eye
has
a
diameter
of
5
mm
and
you
have
a
telescope
with
an
aperture
of
50
\
\
cm.
How
much
more
light
can
the
telescope
gather
than
your
eye?
\n
(A)
10000
times
\
\
more
(B)
100
times
more
(C)
1000
times
more
(D)
10
times
more
\n
A:
Let's
think
\
\
step
by
step.
The
amount
of
light
is
proportional
to
the
aperture
area
$A
=
\\\
pi
D^2/4$
for
a
lens
with
diameter
$D$,
so
the
relative
amounts
of
light
between
\
\
the
eye
with
diameter
5mm
and
the
telescope
with
diameter
50mm
is
$(50
cm)^2/(5mm)^2
\
\
=
10000$.
The
answer
is
(A).
\n\n
Q:
Why
isn't
there
a
planet
where
the
asteroid
\
\
belt
is
located?
\n
(A)
A
planet
once
formed
here
but
it
was
broken
apart
by
a
catastrophic
\
\
collision.
(B)
There
was
not
enough
material
in
this
part
of
the
solar
nebula
\
\
to
form
a
planet.
(C)
There
was
too
much
rocky
material
to
form
a
terrestrial
\
\
planet
but
not
enough
gaseous
material
to
form
a
jovian
planet.
(D)
Resonance
\
\
with
Jupiter
prevented
material
from
collecting
together
to
form
a
planet.
\n
A:
\
\
Let's
think
step
by
step.
The
asteroid
belt
is
a
stellar
disc
consisting
of
a
\
\
large
number
of
asteroids
between
Mars
and
Jupiter's
orbits.
The
asteroids
in
\
\
this
belt
are
affected
by
the
gravitational
pull
from
both
other
asteroids
and
\
\
nearby
planets.
Due
to
the
strong
gravitational
force
of
Jupiter
there
are
resonances
\
\
that
give
rise
to
low
density
regions
of
asteroids
known
as
the
Kirkwood
gap.
\
\
So
(B)
and
(C)
are
not
correct
since
it
\u2019
s
not
a
lack
of
material
that
prevents
\
\
a
planet
from
being
formed,
and
(A)
is
incorrect
because
the
Kirkwood
gap
would
\
\
have
prevented
a
planet
from
forming
in
the
first
place,
and
(D)
is
the
correct
\
\
option.
The
answer
is
(D).
\n\n
Q:
Why
is
Mars
red?
\n
(A)
Because
the
surface
is
\
\
covered
with
heavily
oxidized
(
\"
rusted
\"
)
minerals.
(B)
Because
the
atmosphere
\
\
scatters
more
light
at
bluer
wavelengths
transmitting
mostly
red
light.
(C)
Because
\
\
Mars
is
covered
with
ancient
lava
flows
which
are
red
in
color.
(D)
Because
flowing
\
\
water
on
Mars's
surface
altered
the
surface
minerals
several
billion
years
ago.
\n\
A:
Let's
think
step
by
step.
Option
(B)
is
not
correct
because
if
the
red
color
\
\
was
caused
by
the
scattering
off
the
atmosphere,
then
the
earth
with
a
much
thicker
\
\
atmosphere
would
also
look
red.
Options
(C)
and
(D)
are
not
specific
enough
about
\
\
why
the
color
of
the
surface
would
be
red,
while
(A)
is
correct
because
it
explains
\
\
that
the
surface
is
red
due
to
the
rusted
materials
on
the
surface
and
the
red
\
\
color
comes
from
the
rust.
So
the
correct
option
is
(A).
The
answer
is
(A)."
include
:
_mmlu_flan_cot_fewshot_template_yaml
task
:
mmlu_flan_cot_fewshot_astronomy
\
is
by
Newton’s
second
law
only
dependent
on
the
mass
of
the
truck.
Hence
(A),
\
\
(B)
and
(C)
are
incorrect
since
it
doesn’t
matter
that
it’s
on
Mars,
and
(D)
is
\
\
the
correct
answer.
The
answer
is
(D).
\n\n
Q:
Say
the
pupil
of
your
eye
has
a
diameter
\
\
of
5
mm
and
you
have
a
telescope
with
an
aperture
of
50
cm.
How
much
more
light
\
\
can
the
telescope
gather
than
your
eye?
\n
(A)
10000
times
more
(B)
100
times
more
\
\
(C)
1000
times
more
(D)
10
times
more
\n
A:
Let's
think
step
by
step.
The
amount
\
\
of
light
is
proportional
to
the
aperture
area
$A
=
\\
pi
D^2/4$
for
a
lens
with
\
\
diameter
$D$,
so
the
relative
amounts
of
light
between
the
eye
with
diameter
5mm
\
\
and
the
telescope
with
diameter
50mm
is
$(50
cm)^2/(5mm)^2
=
10000$.
The
answer
\
\
is
(A).
\n\n
Q:
Why
isn't
there
a
planet
where
the
asteroid
belt
is
located?
\n
(A)
\
\
A
planet
once
formed
here
but
it
was
broken
apart
by
a
catastrophic
collision.
\
\
(B)
There
was
not
enough
material
in
this
part
of
the
solar
nebula
to
form
a
planet.
\
\
(C)
There
was
too
much
rocky
material
to
form
a
terrestrial
planet
but
not
enough
\
\
gaseous
material
to
form
a
jovian
planet.
(D)
Resonance
with
Jupiter
prevented
\
\
material
from
collecting
together
to
form
a
planet.
\n
A:
Let's
think
step
by
step.
\
\
The
asteroid
belt
is
a
stellar
disc
consisting
of
a
large
number
of
asteroids
\
\
between
Mars
and
Jupiter's
orbits.
The
asteroids
in
this
belt
are
affected
by
\
\
the
gravitational
pull
from
both
other
asteroids
and
nearby
planets.
Due
to
the
\
\
strong
gravitational
force
of
Jupiter
there
are
resonances
that
give
rise
to
low
\
\
density
regions
of
asteroids
known
as
the
Kirkwood
gap.
So
(B)
and
(C)
are
not
\
\
correct
since
it’s
not
a
lack
of
material
that
prevents
a
planet
from
being
formed,
\
\
and
(A)
is
incorrect
because
the
Kirkwood
gap
would
have
prevented
a
planet
from
\
\
forming
in
the
first
place,
and
(D)
is
the
correct
option.
The
answer
is
(D).
\n\
\n
Q:
Why
is
Mars
red?
\n
(A)
Because
the
surface
is
covered
with
heavily
oxidized
\
\
(
\"
rusted
\"
)
minerals.
(B)
Because
the
atmosphere
scatters
more
light
at
bluer
\
\
wavelengths
transmitting
mostly
red
light.
(C)
Because
Mars
is
covered
with
ancient
\
\
lava
flows
which
are
red
in
color.
(D)
Because
flowing
water
on
Mars's
surface
\
\
altered
the
surface
minerals
several
billion
years
ago.
\n
A:
Let's
think
step
by
\
\
step.
Option
(B)
is
not
correct
because
if
the
red
color
was
caused
by
the
scattering
\
\
off
the
atmosphere,
then
the
earth
with
a
much
thicker
atmosphere
would
also
look
\
\
red.
Options
(C)
and
(D)
are
not
specific
enough
about
why
the
color
of
the
surface
\
\
would
be
red,
while
(A)
is
correct
because
it
explains
that
the
surface
is
red
\
\
due
to
the
rusted
materials
on
the
surface
and
the
red
color
comes
from
the
rust.
\
\
So
the
correct
option
is
(A).
The
answer
is
(A)."
"
group"
:
"
mmlu_flan_cot_fewshot_stem"
"
include"
:
"
_mmlu_flan_cot_fewshot_template_yaml"
"
task"
:
"
mmlu_flan_cot_fewshot_astronomy"
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_business_ethics.yaml
View file @
109ed1c7
dataset_name
:
business_ethics
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
business
\
"
dataset_name
"
:
"
business_ethics
"
"
description
"
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
business
\
\
ethics.
\n\n
Q:
In
contrast
to
_______,
_______
aim
to
reward
favourable
behaviour
\
\
by
companies.
The
success
of
such
campaigns
have
been
heightened
through
the
use
\
\
of
___________,
which
allow
campaigns
to
facilitate
the
company
in
achieving
_________
\
...
...
@@ -7,12 +7,12 @@ description: "The following are multiple choice questions (with answers) about b
\
Boycotts,
Digital
technology,
Increased
Sales
(C)
Boycotts,
Buyalls,
Blockchain
\
\
technology,
Charitable
donations
(D)
Boycotts,
Buycotts,
Digital
technology,
Increased
\
\
Sales
\n
A:
Let's
think
step
by
step.
We
refer
to
Wikipedia
articles
on
business
\
\
ethics
for
help.
The
sentence
that
best
uses
the
possible
options
above
is
\u201C
\
In
contrast
to
*boycotts*,
*buycotts*
aim
to
reward
favourable
behavior
by
companies.
\
\
ethics
for
help.
The
sentence
that
best
uses
the
possible
options
above
is
“In
\
\
contrast
to
*boycotts*,
*buycotts*
aim
to
reward
favourable
behavior
by
companies.
\
\
The
success
of
such
campaigns
have
been
heightened
through
the
use
of
*digital
\
\
technology*,
which
allow
campaigns
to
facilitate
the
company
in
achieving
*increased
\
\
sales*.
\u201D
The
answer
is
(D).
\n\n
Q:
_______
is
the
direct
attempt
to
formally
\
\
or
informally
manage
ethical
issues
or
problems,
through
specific
policies,
practices
\
\
sales*.
”
The
answer
is
(D).
\n\n
Q:
_______
is
the
direct
attempt
to
formally
or
\
\
informally
manage
ethical
issues
or
problems,
through
specific
policies,
practices
\
\
and
programmes.
\n
(A)
Corporate
social
responsibility
(B)
Business
ethics
management
\
\
(C)
Sustainability
(D)
Environmental
management
\n
A:
Let's
think
step
by
step.
\
\
We
refer
to
Wikipedia
articles
on
business
ethics
for
help.
The
direct
attempt
\
...
...
@@ -26,30 +26,31 @@ description: "The following are multiple choice questions (with answers) about b
\
action,
Violent
direct
action,
Non-violent
direct-action
Boycott
(D)
Non-violent
\
\
direct
action,
Instrumental
action,
Indirect
action,
Information
campaign
\n
A:
\
\
Let's
think
step
by
step.
We
refer
to
Wikipedia
articles
on
business
ethics
for
\
\
help.
The
sentence
that
best
uses
the
possible
options
above
is
\u201C
Three
contrasting
\
\
help.
The
sentence
that
best
uses
the
possible
options
above
is
“
Three
contrasting
\
\
tactics
that
CSO's
can
engage
in
to
meet
their
aims
are
*indirect
action*,
which
\
\
typically
involves
research
and
communication,
*violent
direct
action*,
which
\
\
may
involve
physically
attacking
a
company's
operations
or
*non-violent
direct
\
\
action*,
often
involving
some
form
of
*boycott*.
\u201D
The
answer
is
(C).
\n\n\
Q:
To
ensure
the
independence
of
the
non-executive
board
members,
there
are
a
number
\
\
action*,
often
involving
some
form
of
*boycott*.
”
The
answer
is
(C).
\n\n
Q:
To
\
\
ensure
the
independence
of
the
non-executive
board
members,
there
are
a
number
\
\
of
steps
which
can
be
taken,
which
include
non-executives
being
drawn
from
_______
\
\
the
company,
being
appointed
for
a
_________
time
period
as
well
as
being
appointed
\
\
_________.
\n
(A)
Outside,
Limited,
Independently
(B)
Inside,
Limited,
Intermittently
\
\
(C)
Outside,
Unlimited,
Intermittently
(D)
Inside,
Unlimited,
Independently
\n\
A:
Let's
think
step
by
step.
We
refer
to
Wikipedia
articles
on
business
ethics
for
\
\
help.
The
sentence
that
best
uses
the
possible
options
above
is
\u201C
To
ensure
\
\
the
independence
of
the
non-executive
board
members,
there
are
a
number
of
steps
\
\
which
can
be
taken,
which
include
non-executives
being
draw
from
*outside*
the
\
\
company,
being
appointed
for
a
*limited*
time
period
as
well
as
being
imported
\
\
*independently*.
The
answer
is
(A).
\n\n
Q:
Beyond
the
business
case
for
engaging
\
\
in
CSR
there
are
a
number
of
moral
arguments
relating
to:
negative
_______,
the
\
\
_______that
corporations
possess
and
the
________
of
business
and
society.
\n
(A)
\
\
Externalities,
Power,
Independence
(B)
Publicity,
Insubstantial
resources,
Mutual
\
\
dependence
(C)
Publicity,
Power,
Independence
(D)
Externalities,
Power,
Mutual
\
\
dependence
\n
A:
Let's
think
step
by
step.
We
refer
to
Wikipedia
articles
on
business
\
\
ethics
for
help.
The
sentence
that
best
uses
the
possible
options
above
is
\u201C\
Beyond
the
business
case
for
engaging
the
CSR
there
are
a
number
of
moral
arguments
\
\
relating
to:
negative
*externalities*,
the
*power*
that
corporations
possess
and
\
\
the
*mutual
independence*
of
business
and
society.
The
answer
is
(D)."
include
:
_mmlu_flan_cot_fewshot_template_yaml
task
:
mmlu_flan_cot_fewshot_business_ethics
\
help.
The
sentence
that
best
uses
the
possible
options
above
is
“To
ensure
the
\
\
independence
of
the
non-executive
board
members,
there
are
a
number
of
steps
which
\
\
can
be
taken,
which
include
non-executives
being
draw
from
*outside*
the
company,
\
\
being
appointed
for
a
*limited*
time
period
as
well
as
being
imported
*independently*.
\
\
The
answer
is
(A).
\n\n
Q:
Beyond
the
business
case
for
engaging
in
CSR
there
are
\
\
a
number
of
moral
arguments
relating
to:
negative
_______,
the
_______that
corporations
\
\
possess
and
the
________
of
business
and
society.
\n
(A)
Externalities,
Power,
Independence
\
\
(B)
Publicity,
Insubstantial
resources,
Mutual
dependence
(C)
Publicity,
Power,
\
\
Independence
(D)
Externalities,
Power,
Mutual
dependence
\n
A:
Let's
think
step
\
\
by
step.
We
refer
to
Wikipedia
articles
on
business
ethics
for
help.
The
sentence
\
\
that
best
uses
the
possible
options
above
is
“Beyond
the
business
case
for
engaging
\
\
the
CSR
there
are
a
number
of
moral
arguments
relating
to:
negative
*externalities*,
\
\
the
*power*
that
corporations
possess
and
the
*mutual
independence*
of
business
\
\
and
society.
The
answer
is
(D)."
"
group"
:
"
mmlu_flan_cot_fewshot_other"
"
include"
:
"
_mmlu_flan_cot_fewshot_template_yaml"
"
task"
:
"
mmlu_flan_cot_fewshot_business_ethics"
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_clinical_knowledge.yaml
View file @
109ed1c7
dataset_name
:
clinical_knowledge
description
:
'
The
following
are
multiple
choice
questions
(with
answers)
about
clinical
knowledge.
Q:
Glycolysis
is
the
name
given
to
the
pathway
involving
the
conversion
of:
(A)
glycogen
to
glucose-1-phosphate.
(B)
glycogen
or
glucose
to
fructose.
(C)
glycogen
or
glucose
to
pyruvate
or
lactate.
(D)
glycogen
or
glucose
to
pyruvate
or
acetyl
CoA.
A:
Let'
'
s
think
step
by
step.
We
refer
to
Wikipedia
articles
on
clinical
knowledge
for
help.
Glycolysis
is
the
name
given
to
the
pathway
involving
conversion
of
glycogen
or
glucose
to
pyruvate
or
lactate.
The
answer
is
(C).
Q:
What
is
the
difference
between
a
male
and
a
female
catheter?
(A)
Male
and
female
catheters
are
different
colours.
(B)
Male
catheters
are
longer
than
female
catheters.
(C)
Male
catheters
are
bigger
than
female
catheters.
(D)
Female
catheters
are
longer
than
male
catheters.
A:
Let'
'
s
think
step
by
step.
We
refer
to
Wikipedia
articles
on
clinical
knowledge
for
help.
The
difference
between
a
male
and
female
catheter
is
that
male
catheters
tend
to
be
longer
than
female
catheters.
The
answer
is
(B).
Q:
How
many
attempts
should
you
make
to
cannulate
a
patient
before
passing
the
job
on
to
a
senior
colleague,
according
to
the
medical
knowledge
of
2020?
(A)
4
(B)
3
(C)
2
(D)
1
A:
Let'
'
s
think
step
by
step.
We
refer
to
Wikipedia
articles
on
clinical
knowledge
for
help.
According
to
the
medical
protocol
as
of
2020,
you
should
make
two
attempts
to
cannulate
a
patient
before
passing
the
job
on
to
a
more-senior
practitioner.
The
answer
is
(C).
Q:
In
the
assessment
of
the
hand
function
which
of
the
following
is
true?
(A)
Abduction
of
the
thumb
is
supplied
by
spinal
root
T2
(B)
Opposition
of
the
thumb
by
opponens
policis
is
supplied
by
spinal
root
T1
(C)
Finger
adduction
is
supplied
by
the
median
nerve
(D)
Finger
abduction
is
mediated
by
the
palmar
interossei
A:
Let'
'
s
think
step
by
step.
We
refer
to
Wikipedia
articles
on
clinical
knowledge
for
help.
Of
all
the
options,
it
is
only
true
that
the
opposition
of
the
thumb
by
opponens
pollicis
is
supplied
by
spinal
root
T1.
The
answer
is
(B).
Q:
The
energy
for
all
forms
of
muscle
contraction
is
provided
by:
(A)
ATP.
(B)
ADP.
(C)
phosphocreatine.
(D)
oxidative
phosphorylation.
A:
Let'
'
s
think
step
by
step.
We
refer
to
Wikipedia
articles
on
clinical
knowledge
for
help.
The
energy
for
muscular
contraction
is
provided
by
ATP
(adenosine
triphosphate),
which
is
the
powerhouse
of
the
cell.
The
answer
is
(A).'
include
:
_mmlu_flan_cot_fewshot_template_yaml
task
:
mmlu_flan_cot_fewshot_clinical_knowledge
"
dataset_name"
:
"
clinical_knowledge"
"
description"
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
clinical
\
\
knowledge.
\n\n
Q:
Glycolysis
is
the
name
given
to
the
pathway
involving
the
conversion
\
\
of:
\n
(A)
glycogen
to
glucose-1-phosphate.
(B)
glycogen
or
glucose
to
fructose.
\
\
(C)
glycogen
or
glucose
to
pyruvate
or
lactate.
(D)
glycogen
or
glucose
to
pyruvate
\
\
or
acetyl
CoA.
\n
A:
Let's
think
step
by
step.
We
refer
to
Wikipedia
articles
on
\
\
clinical
knowledge
for
help.
Glycolysis
is
the
name
given
to
the
pathway
involving
\
\
conversion
of
glycogen
or
glucose
to
pyruvate
or
lactate.
The
answer
is
(C).
\n\
\n
Q:
What
is
the
difference
between
a
male
and
a
female
catheter?
\n
(A)
Male
and
\
\
female
catheters
are
different
colours.
(B)
Male
catheters
are
longer
than
female
\
\
catheters.
(C)
Male
catheters
are
bigger
than
female
catheters.
(D)
Female
catheters
\
\
are
longer
than
male
catheters.
\n
A:
Let's
think
step
by
step.
We
refer
to
Wikipedia
\
\
articles
on
clinical
knowledge
for
help.
The
difference
between
a
male
and
female
\
\
catheter
is
that
male
catheters
tend
to
be
longer
than
female
catheters.
The
answer
\
\
is
(B).
\n\n
Q:
How
many
attempts
should
you
make
to
cannulate
a
patient
before
\
\
passing
the
job
on
to
a
senior
colleague,
according
to
the
medical
knowledge
of
\
\
2020?
\n
(A)
4
(B)
3
(C)
2
(D)
1
\n
A:
Let's
think
step
by
step.
We
refer
to
Wikipedia
\
\
articles
on
clinical
knowledge
for
help.
According
to
the
medical
protocol
as
\
\
of
2020,
you
should
make
two
attempts
to
cannulate
a
patient
before
passing
the
\
\
job
on
to
a
more-senior
practitioner.
The
answer
is
(C).
\n\n
Q:
In
the
assessment
\
\
of
the
hand
function
which
of
the
following
is
true?
\n
(A)
Abduction
of
the
thumb
\
\
is
supplied
by
spinal
root
T2
(B)
Opposition
of
the
thumb
by
opponens
policis
\
\
is
supplied
by
spinal
root
T1
(C)
Finger
adduction
is
supplied
by
the
median
nerve
\
\
(D)
Finger
abduction
is
mediated
by
the
palmar
interossei
\n
A:
Let's
think
step
\
\
by
step.
We
refer
to
Wikipedia
articles
on
clinical
knowledge
for
help.
Of
all
\
\
the
options,
it
is
only
true
that
the
opposition
of
the
thumb
by
opponens
pollicis
\
\
is
supplied
by
spinal
root
T1.
The
answer
is
(B).
\n\n
Q:
The
energy
for
all
forms
\
\
of
muscle
contraction
is
provided
by:
\n
(A)
ATP.
(B)
ADP.
(C)
phosphocreatine.
\
\
(D)
oxidative
phosphorylation.
\n
A:
Let's
think
step
by
step.
We
refer
to
Wikipedia
\
\
articles
on
clinical
knowledge
for
help.
The
energy
for
muscular
contraction
is
\
\
provided
by
ATP
(adenosine
triphosphate),
which
is
the
powerhouse
of
the
cell.
\
\
The
answer
is
(A)."
"
group"
:
"
mmlu_flan_cot_fewshot_other"
"
include"
:
"
_mmlu_flan_cot_fewshot_template_yaml"
"
task"
:
"
mmlu_flan_cot_fewshot_clinical_knowledge"
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_biology.yaml
View file @
109ed1c7
dataset_name
:
college_biology
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
college
\
"
dataset_name
"
:
"
college_biology
"
"
description
"
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
college
\
\
biology.
\n\n
Q:
Which
of
the
following
represents
an
accurate
statement
concerning
\
\
arthropods?
\n
(A)
They
possess
an
exoskeleton
composed
primarily
of
peptidoglycan.
\
\
(B)
They
possess
an
open
circulatory
system
with
a
dorsal
heart.
(C)
They
are
\
...
...
@@ -19,7 +19,7 @@ description: "The following are multiple choice questions (with answers) about c
\
Law,
$p^2
+
2
p
q
+
q^2
=
1$,
and
$p
+
q
=
1$
where
$p$
is
the
frequency
of
the
\
\
dominant
allele,
$q$
is
the
frequency
of
the
recessive
allele,
and
$p^2$,
$q^2$,
\
\
and
$2pq$
are
the
frequencies
of
dominant
homozygous,
recessive
homozygous,
and
\
\
heterozygous
individuals,
respectively.
\u200B
The
frequency
of
the
recessive
allele
\
\
heterozygous
individuals,
respectively.
The
frequency
of
the
recessive
allele
\
\
(q)
is
$
\\
sqrt{
\f
rac{1}{400}}
=
0.05$.
We
have
$p
=
1
-
q
=
0.95$.
The
frequency
\
\
of
heterozygous
individuals
is
$2pq
=
2
\\
cdot
0.05
\\
cdot
0.95
=
0.095$.
The
\
\
number
of
heterozygous
individuals
is
equal
to
the
frequency
of
heterozygous
individuals
\
...
...
@@ -56,5 +56,6 @@ description: "The following are multiple choice questions (with answers) about c
\
the
human
and
bird
forearms,
which
rules
out
(D).
Humans
and
birds
do
belong
to
\
\
the
same
clade
-
a
group
of
organisms
composed
of
a
common
ancestor.
The
answer
\
\
is
(C)."
include
:
_mmlu_flan_cot_fewshot_template_yaml
task
:
mmlu_flan_cot_fewshot_college_biology
"
group"
:
"
mmlu_flan_cot_fewshot_stem"
"
include"
:
"
_mmlu_flan_cot_fewshot_template_yaml"
"
task"
:
"
mmlu_flan_cot_fewshot_college_biology"
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_chemistry.yaml
View file @
109ed1c7
dataset_name
:
college_chemistry
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
college
\
\
chemistry.
\n\n
Q:
3
Cl
\u2212
(aq)
+
4
CrO_4^2
\u2212
(aq)
+
23
H+(aq)
\u2192
3
HClO2(aq)
\
\
+
4
Cr3+(aq)
+
10
H2O(l).
In
the
reaction
shown
above,
Cl
\u2212
(aq)
behaves
as
\n\
(A)
an
acid
(B)
a
base
(C)
a
catalyst
(D)
a
reducing
agent
\n
A:
Let's
think
step
\
\
by
step.
A
molecule
that
behaves
as
a
base
accepts
an
H+
ion
(or
proton)
from
\
\
another
molecule,
whereas
a
molecule
that
behaves
as
an
acid
donates
an
H+
ion
\
\
(or
proton)
to
another
molecule.
Neither
of
these
is
the
case
for
Cl
in
this
reaction,
\
\
which
rules
out
(A)
and
(B).
A
catalyst
is
a
substance
that
only
accelerates
a
\
\
reaction
without
itself
undergoing
chemical
change,
which
is
not
the
case
here.
\
\
This
rules
out
(C).
Instead,
the
$Cl^{-}
molecules
carry
a
negative
charge,
which
\
\
they
donate
in
the
reaction
to
form
3
HClO2.
This
is
the
behavior
of
a
reducing
\
\
agent,
or
(D).
The
answer
is
(D).
\n\n
Q:
Which
of
the
following
statements
about
\
\
the
lanthanide
elements
is
NOT
true?
\n
(A)
The
most
common
oxidation
state
for
\
\
the
lanthanide
elements
is
+3.
(B)
Lanthanide
complexes
often
have
high
coordination
\
\
numbers
(>
6).
(C)
All
of
the
lanthanide
elements
react
with
aqueous
acid
to
liberate
\
\
hydrogen.
(D)
The
atomic
radii
of
the
lanthanide
elements
increase
across
the
\
\
period
from
La
to
Lu.
\n
A:
Let's
think
step
by
step.
The
atomic
radii
of
the
lanthanide
\
\
elements
in
fact
decrease
across
the
period
from
La
to
Lu.
Options
(A),
(B),
and
\
\
(C)
are
all
true.
This
means
that
only
(D)
is
NOT
true.
The
answer
is
(D).
\n\n\
Q:
Which
of
the
following
lists
the
hydrides
of
group-14
elements
in
order
of
thermal
\
\
stability,
from
lowest
to
highest?
\n
(A)
PbH4
<
SnH4
<
GeH4
<
SiH4
<
CH4
(B)
PbH4
\
\
<
SnH4
<
CH4
<
GeH4
<
SiH4
(C)
CH4
<
SiH4
<
GeH4
<
SnH4
<
PbH4
(D)
CH4
<
PbH4
\
\
<
GeH4
<
SnH4
<
SiH4
\n
A:
Let's
think
step
by
step.
The
thermal
stability
of
group-14
\
\
hydrides
decreases
as
we
move
from
the
top
of
group
14
to
the
bottom.
The
order
\
\
of
elements
in
the
group
from
top
to
bottom
is
C,
Si,
Ge,
Sn,
Pb.
Therefore
in
\
\
order
of
increasing
thermal
stability
we
have
PbH4,
SnH4,
GeH4,
SiH4,
and
CH4,
\
\
or
answer
(A).
The
answer
is
(A).
\n\n
Q:
Predict
the
number
of
lines
in
the
EPR
\
\
spectrum
of
a
solution
of
13C-labelled
methyl
radical
(13CH3
\u2022
),
assuming
\
\
the
lines
do
not
overlap.
\n
(A)
4
(B)
3
(C)
6
(D)
24
(E)
8
\n
A:
Let's
think
step
\
\
by
step.
The
electron
paramagnetic
resonance
spectrum
will
be
split
by
two
forms
\
\
of
interactions.
The
first
is
the
hyperfine
interaction
with
the
13C
(nuclear
\
\
spin
$I
=
\n
rac{1}{2}$)
which
will
split
the
spectrum
into
2
lines.
This
will
\
\
be
further
split
into
4
lines
by
the
interaction
with
three
equivalent
1H
nuclei.
\
\
The
total
number
of
lines
is
therefore
$2
\\
cdot
4
=
8$.
The
answer
is
(E)."
include
:
_mmlu_flan_cot_fewshot_template_yaml
task
:
mmlu_flan_cot_fewshot_college_chemistry
"
dataset_name"
:
"
college_chemistry"
"
description"
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
college
\
\
chemistry.
\n\n
Q:
3
Cl−(aq)
+
4
CrO_4^2−(aq)
+
23
H+(aq)
→
3
HClO2(aq)
+
4
Cr3+(aq)
\
\
+
10
H2O(l).
In
the
reaction
shown
above,
Cl−(aq)
behaves
as
\n
(A)
an
acid
(B)
\
\
a
base
(C)
a
catalyst
(D)
a
reducing
agent
\n
A:
Let's
think
step
by
step.
A
molecule
\
\
that
behaves
as
a
base
accepts
an
H+
ion
(or
proton)
from
another
molecule,
whereas
\
\
a
molecule
that
behaves
as
an
acid
donates
an
H+
ion
(or
proton)
to
another
molecule.
\
\
Neither
of
these
is
the
case
for
Cl
in
this
reaction,
which
rules
out
(A)
and
\
\
(B).
A
catalyst
is
a
substance
that
only
accelerates
a
reaction
without
itself
\
\
undergoing
chemical
change,
which
is
not
the
case
here.
This
rules
out
(C).
Instead,
\
\
the
$Cl^{-}
molecules
carry
a
negative
charge,
which
they
donate
in
the
reaction
\
\
to
form
3
HClO2.
This
is
the
behavior
of
a
reducing
agent,
or
(D).
The
answer
\
\
is
(D).
\n\n
Q:
Which
of
the
following
statements
about
the
lanthanide
elements
\
\
is
NOT
true?
\n
(A)
The
most
common
oxidation
state
for
the
lanthanide
elements
\
\
is
+3.
(B)
Lanthanide
complexes
often
have
high
coordination
numbers
(>
6).
(C)
\
\
All
of
the
lanthanide
elements
react
with
aqueous
acid
to
liberate
hydrogen.
(D)
\
\
The
atomic
radii
of
the
lanthanide
elements
increase
across
the
period
from
La
\
\
to
Lu.
\n
A:
Let's
think
step
by
step.
The
atomic
radii
of
the
lanthanide
elements
\
\
in
fact
decrease
across
the
period
from
La
to
Lu.
Options
(A),
(B),
and
(C)
are
\
\
all
true.
This
means
that
only
(D)
is
NOT
true.
The
answer
is
(D).
\n\n
Q:
Which
\
\
of
the
following
lists
the
hydrides
of
group-14
elements
in
order
of
thermal
stability,
\
\
from
lowest
to
highest?
\n
(A)
PbH4
<
SnH4
<
GeH4
<
SiH4
<
CH4
(B)
PbH4
<
SnH4
<
\
\
CH4
<
GeH4
<
SiH4
(C)
CH4
<
SiH4
<
GeH4
<
SnH4
<
PbH4
(D)
CH4
<
PbH4
<
GeH4
<
\
\
SnH4
<
SiH4
\n
A:
Let's
think
step
by
step.
The
thermal
stability
of
group-14
hydrides
\
\
decreases
as
we
move
from
the
top
of
group
14
to
the
bottom.
The
order
of
elements
\
\
in
the
group
from
top
to
bottom
is
C,
Si,
Ge,
Sn,
Pb.
Therefore
in
order
of
increasing
\
\
thermal
stability
we
have
PbH4,
SnH4,
GeH4,
SiH4,
and
CH4,
or
answer
(A).
The
\
\
answer
is
(A).
\n\n
Q:
Predict
the
number
of
lines
in
the
EPR
spectrum
of
a
solution
\
\
of
13C-labelled
methyl
radical
(13CH3•),
assuming
the
lines
do
not
overlap.
\n\
(A)
4
(B)
3
(C)
6
(D)
24
(E)
8
\n
A:
Let's
think
step
by
step.
The
electron
paramagnetic
\
\
resonance
spectrum
will
be
split
by
two
forms
of
interactions.
The
first
is
the
\
\
hyperfine
interaction
with
the
13C
(nuclear
spin
$I
=
\n
rac{1}{2}$)
which
will
\
\
split
the
spectrum
into
2
lines.
This
will
be
further
split
into
4
lines
by
the
\
\
interaction
with
three
equivalent
1H
nuclei.
The
total
number
of
lines
is
therefore
\
\
$2
\\
cdot
4
=
8$.
The
answer
is
(E)."
"
group"
:
"
mmlu_flan_cot_fewshot_stem"
"
include"
:
"
_mmlu_flan_cot_fewshot_template_yaml"
"
task"
:
"
mmlu_flan_cot_fewshot_college_chemistry"
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_computer_science.yaml
View file @
109ed1c7
dataset_name
:
college_computer_science
description
:
'
The
following
are
multiple
choice
questions
(with
answers)
about
college
computer
science.
Q:
Which
of
the
following
regular
expressions
is
equivalent
to
(describes
the
same
set
of
strings
as)
(a*
+
b)*(c
+
d)?
(A)
a*(c
+
d)+
b(c
+
d)
(B)
a*(c
+
d)*
+
b(c
+
d)*
(C)
a*(c
+
d)+
b*(c
+
d)
(D)
(a
+
b)*c
+(a
+
b)*d
A:
Let'
'
s
think
step
by
step.
We
know
that:
1.
(X*
+
Y)*
=
(X
+
Y)*
2.
X(Y
+
Z)?
=
XY
+
XZ
Using
equation
1
we
can
rewrite
(a*
+
b)*(c
+
d)?
as:
3.
(a
+
b)*(c
+
d)?
Using
equation
2
we
can
rewrite
equation
3
as:
(a
+
b)*c
+
(a
+
b)*d
The
answer
is
(D).
Q:
The
Singleton
design
pattern
is
used
to
guarantee
that
only
a
single
instance
of
a
class
may
be
instantiated.
Which
of
the
following
is
(are)
true
of
this
design
pattern?
I.
The
Singleton
class
has
a
static
factory
method
to
provide
its
instance.
II.
The
Singleton
class
can
be
a
subclass
of
another
class.
III.
The
Singleton
class
has
a
private
constructor.
(A)
I
only
(B)
II
only
(C)
III
only
(D)
I,
II,
and
III
A:
Let'
'
s
think
step
by
step.
Statement
I
is
a
correct
statement
about
a
Singleton,
because
a
Singleton
restricts
instantiation
to
a
single,
static
method.
Statement
II
is
also
correct,
because
there
is
no
inherent
restriction
regarding
the
inheritance
of
a
Singleton.
Statement
III
is
also
correct,
because
a
Singletons
must
be
instantiated
only
once,
so
its
constructor
is
made
private
to
prevent
any
construction
except
via
its
static
factory
method.
Given
these
facts,
statements
I,
II,
and
III
are
all
correct.
The
answer
is
(D).
Q:
A
certain
pipelined
RISC
machine
has
8
general-purpose
registers
R0,
R1,
.
.
.
,
R7
and
supports
the
following
operations:
ADD
Rs1,
Rs2,
Rd
(Add
Rs1
to
Rs2
and
put
the
sum
in
Rd)
MUL
Rs1,
Rs2,
Rd
(Multiply
Rs1
by
Rs2
and
put
the
product
in
Rd)
An
operation
normally
takes
one
cycle;
however,
an
operation
takes
two
cycles
if
it
produces
a
result
required
by
the
immediately
following
operation
in
an
operation
sequence.
Consider
the
expression
AB
+
ABC
+
BC,
where
variables
A,
B,
C
are
located
in
registers
R0,
R1,
R2.
If
the
contents
of
these
three
registers
must
not
be
modified,
what
is
the
minimum
number
of
clock
cycles
required
for
an
operation
sequence
that
computes
the
value
of
AB
+
ABC
+
BC?
(A)
5
(B)
6
(C)
7
(D)
8
A:
Let'
'
s
think
step
by
step.
First,
we
are
given
that
A
is
in
R0,
B
is
in
R1,
and
C
is
in
R2.
Next,
we
can
see
that
we
must
compute
three
multiplies
(AB,
BC,
and
ABC)
and
two
adds
(AB
+
ABC,
(AB
+
ABC)
+
BC)
to
compute
our
final
answer,
resulting
in
a
minimum
of
five
clock
cycles.
Next,
we
can
see
that
there
is
no
way
to
avoid
at
least
one
pipeline
stall
when
computing
our
final
answer,
because
to
compute
our
final
sum
we
must
wait
at
least
one
cycle
for
the
results
from
the
previous
stage
to
be
ready.
Thus,
our
minimum
number
of
cycles
must
be
6.
We
can
verify
that
we
can
create
a
solution
that
requires
only
six
cycles
as
follows:
compute
AB:
MUL
R0,
R1,
R3
compute
BC:
MUL
R1,
R2,
R4
compute
ABC:
MUL
R3,
R4,
R5
compute
AB
+
BC:
ADD
R3,
R4,
R6
STALL
compute
AB
+
ABC
+
BC:
ADD
R5,
R6,
R7
So
there
are
6
cycles.
The
answer
is
(B).
Q:
A
compiler
generates
code
for
the
following
assignment
statement.
G
:=
(A
+
B)
*
C
-
(D
+
E)
*
F
The
target
machine
has
a
single
accumulator
and
a
single-address
instruction
set
consisting
of
instructions
load,
store,
add,
subtract,
and
multiply.
For
the
arithmetic
operations,
the
left
operand
is
taken
from
the
accumulator
and
the
result
appears
in
the
accumulator.
The
smallest
possible
number
of
instructions
in
the
resulting
code
is
(A)
5
(B)
6
(C)
7
(D)
9
A:
Let'
'
s
think
step
by
step.
We
can
compute
the
final
answer
with
the
following
sequence
of
operations:
1.
LOAD
D
(accumulator
=
D)
2.
ADD
E
(accumulator
=
D+E)
3.
MUL
F
(accumulator
=
(D+E)*F)
4.
STORE
X
(X
=
(D+E)*F)
5.
LOAD
A
(accumulator
=
A)
6.
ADD
B
(accumulator
=
A+B)
7.
MUL
C
(accumulator
=
(A+B)*C)
8.
SUB
X
(accumulator
=
(A+B)*C
-
(D+E)*F)
9.
STORE
G
(G
=
(A+B)*C
-
(D+E)*F)
This
sequence
takes
9
instructions.
The
answer
is
(D).
Q:
Consider
a
computer
design
in
which
multiple
processors,
each
with
a
private
cache
memory,
share
global
memory
using
a
single
bus.
This
bus
is
the
critical
system
resource.
Each
processor
can
execute
one
instruction
every
500
nanoseconds
as
long
as
memory
references
are
satisfied
by
its
local
cache.
When
a
cache
miss
occurs,
the
processor
is
delayed
for
an
additional
2,000
nanoseconds.
During
half
of
this
additional
delay,
the
bus
is
dedicated
to
serving
the
cache
miss.
During
the
other
half,
the
processor
cannot
continue,
but
the
bus
is
free
to
service
requests
from
other
processors.
On
average,
each
instruction
requires
2
memory
references.
On
average,
cache
misses
occur
on
1
percent
of
references.
What
proportion
of
the
capacity
of
the
bus
would
a
single
processor
consume,
ignoring
delays
due
to
competition
from
other
processors?
(A)
1/50
(B)
1/27
(C)
1/25
(D)
2/27
A:
Let'
'
s
think
step
by
step.
We
know
that
each
instruction
requires
two
memory
references
per
instruction,
and
that
there
is
an
average
cache
miss
rate
of
one
percent.
Thus
a
given
processor
has:
(1
cache
miss
/
100
references)
*
(2
references
/
instruction)
=
(2
cache
misses
/
100
instructions),
so:
misses_per_instruction
=
1
cache
miss
/
50
instructions.
Next,
we
know
that
each
instruction
requires
500
nanoseconds
when
there
is
no
cache
miss,
and
500
+
2000
=
2500
nanoseconds
when
there
is
a
cache
miss.
Thus:
50
instructions
/
(49
*
500)
+
(1
*
2500)
nanoseconds,
so:
instructions_per_ns
=
50
instructions
/
27000
nanoseconds.
Now,
we
know
that
each
cache
miss
locks
the
bus
for
half
of
the
2000
nanosecond
cache
miss
delay,
or
1000
nanoseconds,
so:
lock_ns_per_miss
=
1000
nanoseconds
/
cache
miss.
Thus
we
can
see
that
on
average
a
single
processor
will
lock
the
bus
for:
lock_ns_per_miss
*
misses_per_instruction
*
instructions_per_ns
=
(1000
nanoseconds
/
cache
miss)
*
(1
cache
miss
/
50
instructions)
*
(50
instructions
/
27000
nanoseconds)
=
1000
*
(1/50)
*
(50/27000)
=
1000/27000
=
1/27.
The
answer
is
(B).'
include
:
_mmlu_flan_cot_fewshot_template_yaml
task
:
mmlu_flan_cot_fewshot_college_computer_science
"
dataset_name"
:
"
college_computer_science"
"
description"
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
college
\
\
computer
science.
\n\n
Q:
Which
of
the
following
regular
expressions
is
equivalent
\
\
to
(describes
the
same
set
of
strings
as)
(a*
+
b)*(c
+
d)?
\n
(A)
a*(c
+
d)+
b(c
\
\
+
d)
\n
(B)
a*(c
+
d)*
+
b(c
+
d)*
\n
(C)
a*(c
+
d)+
b*(c
+
d)
\n
(D)
(a
+
b)*c
+(a
\
\
+
b)*d
\n
A:
Let's
think
step
by
step.
We
know
that:
\n
1.
(X*
+
Y)*
=
(X
+
Y)*
\n\
2.
X(Y
+
Z)?
=
XY
+
XZ
\n
Using
equation
1
we
can
rewrite
(a*
+
b)*(c
+
d)?
as:
\n\
3.
(a
+
b)*(c
+
d)?
\n
Using
equation
2
we
can
rewrite
equation
3
as:
\n
(a
+
b)*c
+
\
\
(a
+
b)*d
The
answer
is
(D).
\n\n
Q:
The
Singleton
design
pattern
is
used
to
guarantee
\
\
that
only
a
single
instance
of
a
class
may
be
instantiated.
Which
of
the
following
\
\
is
(are)
true
of
this
design
pattern?
\n
I.
The
Singleton
class
has
a
static
factory
\
\
method
to
provide
its
instance.
\n
II.
The
Singleton
class
can
be
a
subclass
of
\
\
another
class.
\n
III.
The
Singleton
class
has
a
private
constructor.
\n
(A)
I
only
\n\
(B)
II
only
\n
(C)
III
only
\n
(D)
I,
II,
and
III
\n
A:
Let's
think
step
by
step.
Statement
\
\
I
is
a
correct
statement
about
a
Singleton,
because
a
Singleton
restricts
instantiation
\
\
to
a
single,
static
method.
Statement
II
is
also
correct,
because
there
is
no
\
\
inherent
restriction
regarding
the
inheritance
of
a
Singleton.
Statement
III
is
\
\
also
correct,
because
a
Singletons
must
be
instantiated
only
once,
so
its
constructor
\
\
is
made
private
to
prevent
any
construction
except
via
its
static
factory
method.
\n\
Given
these
facts,
statements
I,
II,
and
III
are
all
correct.
The
answer
is
(D).
\n\
\n
Q:
A
certain
pipelined
RISC
machine
has
8
general-purpose
registers
R0,
R1,
.
\
\
.
.
,
R7
and
supports
the
following
operations:
\n
ADD
Rs1,
Rs2,
Rd
(Add
Rs1
to
\
\
Rs2
and
put
the
sum
in
Rd)
\n
MUL
Rs1,
Rs2,
Rd
(Multiply
Rs1
by
Rs2
and
put
the
\
\
product
in
Rd)
\n
An
operation
normally
takes
one
cycle;
however,
an
operation
takes
\
\
two
cycles
if
it
produces
a
result
required
by
the
immediately
following
operation
\
\
in
an
operation
sequence.
\n
Consider
the
expression
AB
+
ABC
+
BC,
where
variables
\
\
A,
B,
C
are
located
in
registers
R0,
R1,
R2.
If
the
contents
of
these
three
registers
\
\
must
not
be
modified,
what
is
the
minimum
number
of
clock
cycles
required
for
\
\
an
operation
sequence
that
computes
the
value
of
AB
+
ABC
+
BC?
\n
(A)
5
(B)
6
(C)
\
\
7
(D)
8
\n
A:
Let's
think
step
by
step.
First,
we
are
given
that
A
is
in
R0,
B
is
\
\
in
R1,
and
C
is
in
R2.
\n
Next,
we
can
see
that
we
must
compute
three
multiplies
\
\
(AB,
BC,
and
ABC)
and
two
adds
(AB
+
ABC,
(AB
+
ABC)
+
BC)
to
compute
our
final
\
\
answer,
resulting
in
a
minimum
of
five
clock
cycles.
\n
Next,
we
can
see
that
there
\
\
is
no
way
to
avoid
at
least
one
pipeline
stall
when
computing
our
final
answer,
\
\
because
to
compute
our
final
sum
we
must
wait
at
least
one
cycle
for
the
results
\
\
from
the
previous
stage
to
be
ready.
Thus,
our
minimum
number
of
cycles
must
be
\
\
6.
\n
We
can
verify
that
we
can
create
a
solution
that
requires
only
six
cycles
\
\
as
follows:
\n
compute
AB:
MUL
R0,
R1,
R3
\n
compute
BC:
MUL
R1,
R2,
R4
\n
compute
ABC:
\
\
MUL
R3,
R4,
R5
\n
compute
AB
+
BC:
ADD
R3,
R4,
R6
\n
STALL
\n
compute
AB
+
ABC
+
BC:
\
\
ADD
R5,
R6,
R7
\n
So
there
are
6
cycles.
The
answer
is
(B).
\n\n
Q:
A
compiler
generates
\
\
code
for
the
following
assignment
statement.
\n
G
:=
(A
+
B)
*
C
-
(D
+
E)
*
F
\n\
The
target
machine
has
a
single
accumulator
and
a
single-address
instruction
set
\
\
consisting
of
instructions
load,
store,
add,
subtract,
and
multiply.
For
the
arithmetic
\
\
operations,
the
left
operand
is
taken
from
the
accumulator
and
the
result
appears
\
\
in
the
accumulator.
The
smallest
possible
number
of
instructions
in
the
resulting
\
\
code
is
\n
(A)
5
(B)
6
(C)
7
(D)
9
\n
A:
Let's
think
step
by
step.
We
can
compute
\
\
the
final
answer
with
the
following
sequence
of
operations:
\n
1.
LOAD
D
(accumulator
\
\
=
D)
\n
2.
ADD
E
(accumulator
=
D+E)
\n
3.
MUL
F
(accumulator
=
(D+E)*F)
\n
4.
STORE
\
\
X
(X
=
(D+E)*F)
\n
5.
LOAD
A
(accumulator
=
A)
\n
6.
ADD
B
(accumulator
=
A+B)
\n\
7.
MUL
C
(accumulator
=
(A+B)*C)
\n
8.
SUB
X
(accumulator
=
(A+B)*C
-
(D+E)*F)
\n\
9.
STORE
G
(G
=
(A+B)*C
-
(D+E)*F)
\n
This
sequence
takes
9
instructions.
The
answer
\
\
is
(D).
\n\n
Q:
Consider
a
computer
design
in
which
multiple
processors,
each
with
\
\
a
private
cache
memory,
share
global
memory
using
a
single
bus.
This
bus
is
the
\
\
critical
system
resource.
Each
processor
can
execute
one
instruction
every
500
\
\
nanoseconds
as
long
as
memory
references
are
satisfied
by
its
local
cache.
When
\
\
a
cache
miss
occurs,
the
processor
is
delayed
for
an
additional
2,000
nanoseconds.
\
\
During
half
of
this
additional
delay,
the
bus
is
dedicated
to
serving
the
cache
\
\
miss.
During
the
other
half,
the
processor
cannot
continue,
but
the
bus
is
free
\
\
to
service
requests
from
other
processors.
On
average,
each
instruction
requires
\
\
2
memory
references.
On
average,
cache
misses
occur
on
1
percent
of
references.
\
\
What
proportion
of
the
capacity
of
the
bus
would
a
single
processor
consume,
ignoring
\
\
delays
due
to
competition
from
other
processors?
\n
(A)
1/50
(B)
1/27
(C)
1/25
(D)
\
\
2/27
\n
A:
Let's
think
step
by
step.
We
know
that
each
instruction
requires
two
\
\
memory
references
per
instruction,
and
that
there
is
an
average
cache
miss
rate
\
\
of
one
percent.
\n
Thus
a
given
processor
has:
\n
(1
cache
miss
/
100
references)
\
\
*
(2
references
/
instruction)
=
\n
(2
cache
misses
/
100
instructions),
so:
\n
misses_per_instruction
\
\
=
1
cache
miss
/
50
instructions.
\n
Next,
we
know
that
each
instruction
requires
\
\
500
nanoseconds
when
there
is
no
cache
miss,
and
500
+
2000
=
2500
nanoseconds
\
\
when
there
is
a
cache
miss.
Thus:
\n
50
instructions
/
(49
*
500)
+
(1
*
2500)
nanoseconds,
\
\
so:
\n
instructions_per_ns
=
50
instructions
/
27000
nanoseconds.
\n
Now,
we
know
\
\
that
each
cache
miss
locks
the
bus
for
half
of
the
2000
nanosecond
cache
miss
\
\
delay,
or
1000
nanoseconds,
so:
\n
lock_ns_per_miss
=
1000
nanoseconds
/
cache
miss.
\n\
Thus
we
can
see
that
on
average
a
single
processor
will
lock
the
bus
for:
\n
lock_ns_per_miss
\
\
*
misses_per_instruction
*
instructions_per_ns
=
\n
(1000
nanoseconds
/
cache
miss)
\
\
*
(1
cache
miss
/
50
instructions)
*
(50
instructions
/
27000
nanoseconds)
=
1000
\
\
*
(1/50)
*
(50/27000)
=
1000/27000
=
1/27.
The
answer
is
(B)."
"
group"
:
"
mmlu_flan_cot_fewshot_stem"
"
include"
:
"
_mmlu_flan_cot_fewshot_template_yaml"
"
task"
:
"
mmlu_flan_cot_fewshot_college_computer_science"
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_mathematics.yaml
View file @
109ed1c7
dataset_name
:
college_mathematics
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
college
\
"
dataset_name
"
:
"
college_mathematics
"
"
description
"
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
college
\
\
mathematics.
\n\n
Q:
Let
V
be
the
set
of
all
real
polynomials
p(x).
Let
transformations
\
\
T,
S
be
defined
on
V
by
T:p(x)
->
xp(x)
and
S:p(x)
->
p'(x)
=
d/dx
p(x),
and
interpret
\
\
(ST)(p(x))
as
S(T(p(x))).
Which
of
the
following
is
true?
\n
(A)
ST
=
0
(B)
ST
=
\
\
T
(C)
ST
=
TS
(D)
ST
-
TS
is
the
identity
map
of
V
onto
itself.
\n
A:
Let's
think
\
\
step
by
step.
For
a
given
polynomial
$p$
we
have
\n\\
[ST(p)
=
(xp(x))
\u2019
=
p(x)
\
\
+
xp
\u2019
(x)
\\
]
\n
and
\n\\
[TS(p)
=
xp
\u2019
(x).
\\
]
\n
Hence
\\
[ST(p)
-
TS(p)
=
p(x)
\
\
+
xp
\u2019
(x)
-
xp
\u2019
(x).
\\
]
The
answer
is
(D).
\n\n
Q:
Suppose
that
f(1
+
x)
\
\
=
f(x)
for
all
real
x.
If
f
is
a
polynomial
and
f(5)
=
11,
then
f(15/2)
\n
(A)
-11
\
\
(B)
0
(C)
11
(D)
33/2
\n
A:
Let's
think
step
by
step.
The
only
polynomial
so
that
\
\
$f(1
+
x)
=
f(x)$
is
a
constant
polynomial.
Hence
$f(5)
=
11
=
f(15/2)$.
The
answer
\
\
is
(C).
\n\n
Q:
Let
A
be
a
real
2x2
matrix.
Which
of
the
following
statements
must
\
\
be
true?
\n
I.
All
of
the
entries
of
A^2
are
nonnegative.
\n
II.
The
determinant
of
\
\
A^2
is
nonnegative.
\n
III.
If
A
has
two
distinct
eigenvalues,
then
A^2
has
two
\
\
distinct
eigenvalues.
\n
(A)
I
only
(B)
II
only
(C)
III
only
(D)
II
and
III
only
\n\
A:
Let's
think
step
by
step.
We
have
\\
[
det(A^2)
=
(det(A))^2
\\
geq
0,
\\
]
hence
\
\
II
holds.
\n
III
is
false:
as
a
counterexample
take
a
diagonal
matrix
with
-1
and
\
\
1
on
the
diagonal.
Then
$A^2$
is
the
identity
matrix.
The
answer
is
(B).
\n\n
Q:
\
\
Let
A
be
the
set
of
all
ordered
pairs
of
integers
(m,
n)
such
that
7m
+
12n
=
\
\
22.
What
is
the
greatest
negative
number
in
the
set
B
=
{m
+
n
:
(m,
n)
\\
in
A}?
\n\
(A)
-5
(B)
-4
(C)
-3
(D)
-2
\n
A:
Let's
think
step
by
step.
We
have
12n
=
22
-
7m
\
\
and
one
of
the
solutions
is
$m
=
-2$,
$n
=
3$.
Then
$m
+
n
=
1$,
hence
we
need
\
\
to
look
for
smaller
$m$
in
order
to
make
$m
+
n$
negative.
The
next
solution
is
\
\
$m
=
-14$
and
$n
=
10$.
For
smaller
$m$
we
have
$m
+
n$
smaller
than
$-4$.
The
\
\
answer
is
(B).
\n\n
Q:
A
tank
initially
contains
a
salt
solution
of
3
grams
of
salt
\
\
dissolved
in
100
liters
of
water.
A
salt
solution
containing
0.02
grams
of
salt
\
\
per
liter
of
water
is
sprayed
into
the
tank
at
a
rate
of
4
liters
per
minute.
\
\
The
sprayed
solution
is
continually
mixed
with
the
salt
solution
in
the
tank,
\
\
and
the
mixture
flows
out
of
the
tank
at
a
rate
of
4
liters
per
minute.
If
the
\
\
mixing
is
instantaneous,
how
many
grams
of
salt
are
in
the
tank
after
100
minutes
\
\
have
elapsed?
\n
(A)
2
(B)
2
-
e^-2
(C)
2
+
e^-2
(D)
2
+
e^-4
\n
A:
Let's
think
step
\
\
by
step.
For
all
$t
\\
in
\\
mathbb{R}$,
let
$s(t)$
denote
the
number
grams
of
salt
\
\
in
the
tank
at
the
$t$
minute
mark.
Then
$s(0)
=
3$.
\n
We
use
$s$
and
$s(t)$
interchangeably.
\
\
We
also
use
$s^{
\\
prime}$
and
$s^{
\\
prime}(t)$
interchangeably.
The
solution
sprayed
\
\
into
the
tank
adds
$(0.02)
4=2
/
25$
grams
of
salt
per
minute.
There
are
always
\
\
100
liters
of
liquid
in
the
tank,
containing
$s$
grams
of
salt.
So
the
density
\
\
of
salt
in
the
tank
is
$s
/
100$
grams
per
liter.
The
flow
of
water
out
of
the
\
\
tank
therefore
subtracts
$4(s
/
100)=s
/
25$
grams
of
salt
per
minute.
Then,
for
\
\
all
$t
\\
in
\\
mathbb{R}$,
we
have
$s^{
\\
prime}(t)=(2
/
25)-(s
/
25)=(2-s)
/
25$,
\
\
and
so
$[s(t)=2]
\\
Rightarrow
\\
left[s^{
\\
prime}(t)=0
\r
ight]$.
For
all
$t
\\
in
\
\ \\
mathbb{R}$,
\n
$$
\n\f
rac{d}{d
t}[
\\
ln
(s-2)]=
\f
rac{s^{
\\
prime}}{s-2}=
\f
rac{-1}{25}=
\f\
rac{d}{d
t}
\\
left[-
\f
rac{t}{25}
\r
ight]
.
\n
$$
\n
Choose
$C
\\
in
\\
mathbb{R}$
such
that,
\
\
for
all
$t
\\
in
\\
mathbb{R},
\\
ln
((s(t)-2))=-[t
/
25]+C$.
Let
$K:=e^{C}$.
Then,
\
\
for
all
$t
\\
in
\\
mathbb{R}$,
we
have
$(s(t))-2=K
e^{-t
/
25}$,
and
so
$s(t)=2+K
\
\
e^{-t
/
25}$.
Then
$3=s(0)=2+K
e^{0}=2+K$,
so
$K=1$.
Then
$s(100)=2+K
e^{-100
\
\
/
25}=2+1
\\
cdot
e^{-4}=2+e^{-4}$.
The
answer
is
(D)."
include
:
_mmlu_flan_cot_fewshot_template_yaml
task
:
mmlu_flan_cot_fewshot_college_mathematics
\
step
by
step.
For
a
given
polynomial
$p$
we
have
\n\\
[ST(p)
=
(xp(x))’
=
p(x)
+
\
\
xp’(x)
\\
]
\n
and
\n\\
[TS(p)
=
xp’(x).
\\
]
\n
Hence
\\
[ST(p)
-
TS(p)
=
p(x)
+
xp’(x)
\
\
-
xp’(x).
\\
]
The
answer
is
(D).
\n\n
Q:
Suppose
that
f(1
+
x)
=
f(x)
for
all
real
\
\
x.
If
f
is
a
polynomial
and
f(5)
=
11,
then
f(15/2)
\n
(A)
-11
(B)
0
(C)
11
(D)
\
\
33/2
\n
A:
Let's
think
step
by
step.
The
only
polynomial
so
that
$f(1
+
x)
=
f(x)$
\
\
is
a
constant
polynomial.
Hence
$f(5)
=
11
=
f(15/2)$.
The
answer
is
(C).
\n\n\
Q:
Let
A
be
a
real
2x2
matrix.
Which
of
the
following
statements
must
be
true?
\n\
I.
All
of
the
entries
of
A^2
are
nonnegative.
\n
II.
The
determinant
of
A^2
is
nonnegative.
\n\
III.
If
A
has
two
distinct
eigenvalues,
then
A^2
has
two
distinct
eigenvalues.
\n\
(A)
I
only
(B)
II
only
(C)
III
only
(D)
II
and
III
only
\n
A:
Let's
think
step
by
\
\
step.
We
have
\\
[
det(A^2)
=
(det(A))^2
\\
geq
0,
\\
]
hence
II
holds.
\n
III
is
false:
\
\
as
a
counterexample
take
a
diagonal
matrix
with
-1
and
1
on
the
diagonal.
Then
\
\
$A^2$
is
the
identity
matrix.
The
answer
is
(B).
\n\n
Q:
Let
A
be
the
set
of
all
\
\
ordered
pairs
of
integers
(m,
n)
such
that
7m
+
12n
=
22.
What
is
the
greatest
\
\
negative
number
in
the
set
B
=
{m
+
n
:
(m,
n)
\\
in
A}?
\n
(A)
-5
(B)
-4
(C)
-3
\
\
(D)
-2
\n
A:
Let's
think
step
by
step.
We
have
12n
=
22
-
7m
and
one
of
the
solutions
\
\
is
$m
=
-2$,
$n
=
3$.
Then
$m
+
n
=
1$,
hence
we
need
to
look
for
smaller
$m$
\
\
in
order
to
make
$m
+
n$
negative.
The
next
solution
is
$m
=
-14$
and
$n
=
10$.
\
\
For
smaller
$m$
we
have
$m
+
n$
smaller
than
$-4$.
The
answer
is
(B).
\n\n
Q:
A
\
\
tank
initially
contains
a
salt
solution
of
3
grams
of
salt
dissolved
in
100
liters
\
\
of
water.
A
salt
solution
containing
0.02
grams
of
salt
per
liter
of
water
is
\
\
sprayed
into
the
tank
at
a
rate
of
4
liters
per
minute.
The
sprayed
solution
is
\
\
continually
mixed
with
the
salt
solution
in
the
tank,
and
the
mixture
flows
out
\
\
of
the
tank
at
a
rate
of
4
liters
per
minute.
If
the
mixing
is
instantaneous,
\
\
how
many
grams
of
salt
are
in
the
tank
after
100
minutes
have
elapsed?
\n
(A)
2
\
\
(B)
2
-
e^-2
(C)
2
+
e^-2
(D)
2
+
e^-4
\n
A:
Let's
think
step
by
step.
For
all
$t
\
\ \\
in
\\
mathbb{R}$,
let
$s(t)$
denote
the
number
grams
of
salt
in
the
tank
at
the
\
\
$t$
minute
mark.
Then
$s(0)
=
3$.
\n
We
use
$s$
and
$s(t)$
interchangeably.
We
also
\
\
use
$s^{
\\
prime}$
and
$s^{
\\
prime}(t)$
interchangeably.
The
solution
sprayed
into
\
\
the
tank
adds
$(0.02)
4=2
/
25$
grams
of
salt
per
minute.
There
are
always
100
\
\
liters
of
liquid
in
the
tank,
containing
$s$
grams
of
salt.
So
the
density
of
\
\
salt
in
the
tank
is
$s
/
100$
grams
per
liter.
The
flow
of
water
out
of
the
tank
\
\
therefore
subtracts
$4(s
/
100)=s
/
25$
grams
of
salt
per
minute.
Then,
for
all
\
\
$t
\\
in
\\
mathbb{R}$,
we
have
$s^{
\\
prime}(t)=(2
/
25)-(s
/
25)=(2-s)
/
25$,
and
\
\
so
$[s(t)=2]
\\
Rightarrow
\\
left[s^{
\\
prime}(t)=0
\r
ight]$.
For
all
$t
\\
in
\\
mathbb{R}$,
\n\
$$
\n\f
rac{d}{d
t}[
\\
ln
(s-2)]=
\f
rac{s^{
\\
prime}}{s-2}=
\f
rac{-1}{25}=
\f
rac{d}{d
t}
\\\
left[-
\f
rac{t}{25}
\r
ight]
.
\n
$$
\n
Choose
$C
\\
in
\\
mathbb{R}$
such
that,
for
all
\
\
$t
\\
in
\\
mathbb{R},
\\
ln
((s(t)-2))=-[t
/
25]+C$.
Let
$K:=e^{C}$.
Then,
for
all
\
\
$t
\\
in
\\
mathbb{R}$,
we
have
$(s(t))-2=K
e^{-t
/
25}$,
and
so
$s(t)=2+K
e^{-t
\
\
/
25}$.
Then
$3=s(0)=2+K
e^{0}=2+K$,
so
$K=1$.
Then
$s(100)=2+K
e^{-100
/
25}=2+1
\
\ \\
cdot
e^{-4}=2+e^{-4}$.
The
answer
is
(D)."
"
group"
:
"
mmlu_flan_cot_fewshot_stem"
"
include"
:
"
_mmlu_flan_cot_fewshot_template_yaml"
"
task"
:
"
mmlu_flan_cot_fewshot_college_mathematics"
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_medicine.yaml
View file @
109ed1c7
dataset_name
:
college_medicine
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
college
\
"
dataset_name
"
:
"
college_medicine
"
"
description
"
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
college
\
\
medicine.
\n\n
Q:
An
expected
side
effect
of
creatine
supplementation
is:
\n
(A)
muscle
\
\
weakness.
(B)
gain
in
body
mass.
(C)
muscle
cramps.
(D)
loss
of
electrolytes.
\n\
A:
Let's
think
step
by
step.
We
refer
to
Wikipedia
articles
on
medicine
for
help.
\
...
...
@@ -9,44 +9,44 @@ description: "The following are multiple choice questions (with answers) about c
\
endurance
runners
have
a
high
proportion
of
Type
I
fibres
in
their
leg
muscles
\
\
(C)
Liver
glycogen
is
important
in
the
maintenance
of
the
blood
glucose
concentration
\
\
(D)
Insulin
promotes
glucose
uptake
by
all
tissues
in
the
body
\n
A:
Let's
think
\
\
step
by
step.
We
refer
to
Wikipedia
articles
on
medicine
for
help.
Let
\u2019
s
\
\
solve
this
step
by
step
and
go
over
each
choice:
\n
(A)
\u201C
Muscle
glycogen
is
\
\
broken
down
enzymatically
to
glucose-1-phosphate
\u201D
:
This
is
a
correct
statement.
\n\
(B)
\u201C
Elite
endurance
runners
have
a
high
proportion
of
Type
I
fibres
in
their
\
\
leg
muscles
\u201D
:
This
is
a
correct
statement.
\n
(C)
\u201C
Liver
glycogen
is
important
\
\
in
the
maintenance
of
the
blood
glucose
concentration
\u201D
:
This
is
a
correct
\
\
statement.
\n
(D)
\u201C
Insulin
promotes
glucose
uptake
by
all
tissues
in
the
body
\u201D
\
:
This
is
not
a
correct
statement,
because
insulin
promotes
glucose
uptake
by
the
\
\
liver,
adipose
tissue,
and
muscle,
but
not
all
tissues.
For
instance,
the
tissues
\
\
in
the
brain
and
red
blood
cells
are
not
affected
by
insulin.
The
answer
is
(D).
\n\
\n
Q:
A
high
school
science
teacher
fills
a
1
liter
bottle
with
pure
nitrogen
and
\
\
seals
the
lid.
The
pressure
is
1.70
atm,
and
the
room
temperature
is
25
\xB0
C.
\
\
Which
two
variables
will
both
increase
the
pressure
of
the
system,
if
all
other
\
\
variables
are
held
constant?
\n
(A)
Increasing
temperature,
increasing
moles
of
\
\
gas
(B)
Increasing
temperature,
increasing
volume
(C)
Decreasing
volume,
decreasing
\
\
temperature
(D)
Decreasing
moles
of
gas,
increasing
volume
\n
A:
Let's
think
step
\
\
by
step.
We
refer
to
Wikipedia
articles
on
medicine
for
help.
The
relevant
equation
\
\
for
this
is
the
ideal
gas
law:
PV=nRT.
To
increase
the
pressure
of
the
system
\
\
(P),
then
either
n
(number
of
moles
of
the
gas)
or
T
(temperature)
have
to
increase.
\
\
The
answer
is
(A).
\n\n
Q:
In
a
genetic
test
of
a
newborn,
a
rare
genetic
disorder
\
\
is
found
that
has
X-linked
recessive
transmission.
Which
of
the
following
statements
\
\
is
likely
true
regarding
the
pedigree
of
this
disorder?
\n
(A)
All
descendants
on
\
\
the
maternal
side
will
have
the
disorder.
(B)
Females
will
be
approximately
twice
\
\
as
affected
as
males
in
this
family.
(C)
All
daughters
of
an
affected
male
will
\
\
be
affected.
(D)
There
will
be
equal
distribution
of
males
and
females
affected.
\n
\
A:
Let's
think
step
by
step.
We
refer
to
Wikipedia
articles
on
medicine
for
help.
\
\
Let
\u2019
s
solve
this
step
by
step.
Let's
recall
first
that
females
have
two
X
\
\
chromosomes,
while
males
have
one
X
and
one
Y
chromosome.
This
is
an
important
\
\
fact
we
need
to
know
before
answering
this
question.
\n
Because
a
male
can
only
\
\
pass
his
only
one
X
chromosome
to
a
daughter,
if
he
is
affected
by
this
rare
genetic
\
\
disorder,
then
we
know
for
sure
that
he
will
pass
this
rare
genetic
disorder
to
\
\
all
his
future-born
daughters.
Therefore,
\u201C
(C):
All
daughters
of
an
affected
\
\
male
will
be
affected
\u201D
is
a
correct
statement.
The
answer
is
(C).
\n\n
Q:
Glucose
\
\
is
transported
into
the
muscle
cell:
\n
(A)
via
protein
transporters
called
GLUT4.
\
\
(B)
only
in
the
presence
of
insulin.
(C)
via
hexokinase.
(D)
via
monocarbylic
\
\
acid
transporters.
\n
A:
Let's
think
step
by
step.
We
refer
to
Wikipedia
articles
\
\
on
medicine
for
help.
Glucose
(also
known
as
the
blood
sugar)
is
the
main
sugar
\
\
found
in
the
human
body.
It
is
transported
into
the
muscle
cell
via
diffusion
\
\
through
protein
transporters
called
GLUT4.
The
answer
is
(A).
"
include
:
_mmlu_flan_cot_fewshot_template_yaml
task
:
mmlu_flan_cot_fewshot_college_medicine
\
step
by
step.
We
refer
to
Wikipedia
articles
on
medicine
for
help.
Let
’s
solve
\
\
this
step
by
step
and
go
over
each
choice:
\n
(A)
“
Muscle
glycogen
is
broken
down
\
\
enzymatically
to
glucose-1-phosphate
”
:
This
is
a
correct
statement.
\n
(B)
“Elite
\
\
endurance
runners
have
a
high
proportion
of
Type
I
fibres
in
their
leg
muscles”:
\
\
This
is
a
correct
statement.
\n
(C)
“
Liver
glycogen
is
important
in
the
maintenance
\
\
of
the
blood
glucose
concentration
”
:
This
is
a
correct
statement.
\n
(D)
“Insulin
\
\
promotes
glucose
uptake
by
all
tissues
in
the
body
”:
This
is
not
a
correct
statement,
\
\
because
insulin
promotes
glucose
uptake
by
the
liver,
adipose
tissue,
and
muscle,
\
\
but
not
all
tissues.
For
instance,
the
tissues
in
the
brain
and
red
blood
cells
\
\
are
not
affected
by
insulin.
The
answer
is
(D).
\n\n
Q:
A
high
school
science
teacher
\
\
fills
a
1
liter
bottle
with
pure
nitrogen
and
seals
the
lid.
The
pressure
is
1.70
\
\
atm,
and
the
room
temperature
is
25°C.
Which
two
variables
will
both
increase
\
\
the
pressure
of
the
system,
if
all
other
variables
are
held
constant?
\n
(A)
Increasing
\
\
temperature,
increasing
moles
of
gas
(B)
Increasing
temperature,
increasing
volume
\
\
(C)
Decreasing
volume,
decreasing
temperature
(D)
Decreasing
moles
of
gas,
increasing
\
\
volume
\n
A:
Let's
think
step
by
step.
We
refer
to
Wikipedia
articles
on
medicine
\
\
for
help.
The
relevant
equation
for
this
is
the
ideal
gas
law:
PV=nRT.
To
increase
\
\
the
pressure
of
the
system
(P),
then
either
n
(number
of
moles
of
the
gas)
or
\
\
T
(temperature)
have
to
increase.
The
answer
is
(A).
\n\n
Q:
In
a
genetic
test
of
\
\
a
newborn,
a
rare
genetic
disorder
is
found
that
has
X-linked
recessive
transmission.
\
\
Which
of
the
following
statements
is
likely
true
regarding
the
pedigree
of
this
\
\
disorder?
\n
(A)
All
descendants
on
the
maternal
side
will
have
the
disorder.
(B)
\
\
Females
will
be
approximately
twice
as
affected
as
males
in
this
family.
(C)
All
\
\
daughters
of
an
affected
male
will
be
affected.
(D)
There
will
be
equal
distribution
\
\
of
males
and
females
affected.
\n
A:
Let's
think
step
by
step.
We
refer
to
Wikipedia
\
\
articles
on
medicine
for
help.
Let’s
solve
this
step
by
step.
Let's
recall
first
\
\
that
females
have
two
X
chromosomes,
while
males
have
one
X
and
one
Y
chromosome.
\
\
This
is
an
important
fact
we
need
to
know
before
answering
this
question.
\n
Because
\
\
a
male
can
only
pass
his
only
one
X
chromosome
to
a
daughter,
if
he
is
affected
\
\
by
this
rare
genetic
disorder,
then
we
know
for
sure
that
he
will
pass
this
rare
\
\
genetic
disorder
to
all
his
future-born
daughters.
Therefore,
“(C):
All
daughters
\
\
of
an
affected
male
will
be
affected”
is
a
correct
statement.
The
answer
is
(C).
\n
\
\n
Q:
Glucose
is
transported
into
the
muscle
cell:
\n
(A)
via
protein
transporters
\
\
called
GLUT4.
(B)
only
in
the
presence
of
insulin.
(C)
via
hexokinase.
(D)
via
\
\
monocarboxylic
acid
transporters.
\n
A:
Let's
think
step
by
step.
We
refer
to
Wikipedia
\
\
articles
on
medicine
for
help.
Glucose
(also
known
as
the
blood
sugar)
is
the
\
\
main
sugar
found
in
the
human
body.
It
is
transported
into
the
muscle
cell
via
\
\
diffusion
through
protein
transporters
called
GLUT4.
The
answer
is
(A)."
"
group"
:
"
mmlu_flan_cot_fewshot_other
"
"
include
"
:
"
_mmlu_flan_cot_fewshot_template_yaml
"
"
task
"
:
"
mmlu_flan_cot_fewshot_college_medicine
"
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_college_physics.yaml
View file @
109ed1c7
dataset_name
:
college_physics
description
:
'
The
following
are
multiple
choice
questions
(with
answers)
about
college
physics.
Q:
A
refracting
telescope
consists
of
two
converging
lenses
separated
by
100
cm.
The
eye-piece
lens
has
a
focal
length
of
20
cm.
The
angular
magnification
of
the
telescope
is
(A)
4
(B)
5
(C)
6
(D)
20
A:
Let'
'
s
think
step
by
step.
In
a
refracting
telescope,
if
both
lenses
are
converging,
the
focus
of
both
lenses
must
be
between
the
two
lenses,
and
thus
the
focal
lengths
of
the
two
lenses
must
add
up
to
their
separation.
Since
the
focal
length
of
one
lens
is
20
cm,
the
focal
length
of
the
other
must
be
80
cm.
The
magnification
is
the
ratio
of
these
two
focal
lengths,
or
4.
The
answer
is
(A).
Q:
The
muon
decays
with
a
characteristic
lifetime
of
about
10^-6
second
into
an
electron,
a
muon
neutrino,
and
an
electron
antineutrino.
The
muon
is
forbidden
from
decaying
into
an
electron
and
just
a
single
neutrino
by
the
law
of
conservation
of
(A)
charge
(B)
mass
(C)
energy
and
momentum
(D)
lepton
number
A:
Let'
'
s
think
step
by
step.
Lepton
number
must
be
conserved,
meaning
the
total
number
of
leptons
minus
the
number
of
antileptons.
If
a
muon
decays
into
an
electron
and
a
single
neutrino,
the
total
lepton
number
would
go
from
one
to
two,
violating
lepton
number
conservation.
The
answer
is
(D).
Q:
One
end
of
a
Nichrome
wire
of
length
2L
and
cross-sectional
area
A
is
attached
to
an
end
of
another
Nichrome
wire
of
length
L
and
cross-sectional
area
2A.
If
the
free
end
of
the
longer
wire
is
at
an
electric
potential
of
8.0
volts,
and
the
free
end
of
the
shorter
wire
is
at
an
electric
potential
of
1.0
volt,
the
potential
at
the
junction
of
the
two
wires
is
most
nearly
equal
to
(A)
2.4
V
(B)
3.3
V
(C)
4.5
V
(D)
5.7
V
A:
Let'
'
s
think
step
by
step.
This
is
a
simple
voltage
divider
problem,
where
the
longer
wire
has
a
resistance
four
times
that
of
the
shorter
end.
So
the
voltage
divider
ratio
is
1
/
5,
meaning
that
the
potential
in
the
middle
is
1.0
V
+
(8.0
V
-
1.0
V)
*
1/5
=
2.4
V.
The
answer
is
(A).
Q:
A
refracting
telescope
consists
of
two
converging
lenses
separated
by
100
cm.
The
eye-piece
lens
has
a
focal
length
of
20
cm.
The
angular
magnification
of
the
telescope
is
(A)
4
(B)
5
(C)
6
(D)
20
A:
Let'
'
s
think
step
by
step.
In
a
refracting
telescope,
if
both
lenses
are
converging,
the
focus
of
both
lenses
must
be
between
the
two
lenses,
and
thus
the
focal
lengths
of
the
two
lenses
must
add
up
to
their
separation.
Since
the
focal
length
of
one
lens
is
20
cm,
the
focal
length
of
the
other
must
be
80
cm.
The
magnification
is
the
ratio
of
these
two
focal
lengths,
or
4.
The
answer
is
(A).
Q:
For
which
of
the
following
thermodynamic
processes
is
the
increase
in
the
internal
energy
of
an
ideal
gas
equal
to
the
heat
added
to
the
gas?
(A)
Constant
temperature
(B)
Constant
volume
(C)
Constant
pressure
(D)
Adiabatic
A:
Let'
'
s
think
step
by
step.
Heat
added
to
the
gas
can
go
into
the
gases
internal
energy
or
work
done
against
an
external
force.
However,
if
the
volume
of
the
gas
container
is
constant,
no
work
will
be
done
(since
work
is
pressure
times
change
in
volume).
So,
at
constant
volume,
all
of
the
heat
goes
into
the
internal
energy.
The
answer
is
(B).'
include
:
_mmlu_flan_cot_fewshot_template_yaml
task
:
mmlu_flan_cot_fewshot_college_physics
"
dataset_name"
:
"
college_physics"
"
description"
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
college
\
\
physics.
\n\n
Q:
A
refracting
telescope
consists
of
two
converging
lenses
separated
\
\
by
100
cm.
The
eye-piece
lens
has
a
focal
length
of
20
cm.
The
angular
magnification
\
\
of
the
telescope
is
\n
(A)
4
(B)
5
(C)
6
(D)
20
\n
A:
Let's
think
step
by
step.
In
\
\
a
refracting
telescope,
if
both
lenses
are
converging,
the
focus
of
both
lenses
\
\
must
be
between
the
two
lenses,
and
thus
the
focal
lengths
of
the
two
lenses
must
\
\
add
up
to
their
separation.
Since
the
focal
length
of
one
lens
is
20
cm,
the
focal
\
\
length
of
the
other
must
be
80
cm.
The
magnification
is
the
ratio
of
these
two
\
\
focal
lengths,
or
4.
The
answer
is
(A).
\n\n
Q:
The
muon
decays
with
a
characteristic
\
\
lifetime
of
about
10^-6
second
into
an
electron,
a
muon
neutrino,
and
an
electron
\
\
antineutrino.
The
muon
is
forbidden
from
decaying
into
an
electron
and
just
a
\
\
single
neutrino
by
the
law
of
conservation
of
\n
(A)
charge
(B)
mass
(C)
energy
\
\
and
momentum
(D)
lepton
number
\n
A:
Let's
think
step
by
step.
Lepton
number
must
\
\
be
conserved,
meaning
the
total
number
of
leptons
minus
the
number
of
antileptons.
\
\
If
a
muon
decays
into
an
electron
and
a
single
neutrino,
the
total
lepton
number
\
\
would
go
from
one
to
two,
violating
lepton
number
conservation.
The
answer
is
\
\
(D).
\n\n
Q:
One
end
of
a
Nichrome
wire
of
length
2L
and
cross-sectional
area
A
\
\
is
attached
to
an
end
of
another
Nichrome
wire
of
length
L
and
cross-sectional
\
\
area
2A.
If
the
free
end
of
the
longer
wire
is
at
an
electric
potential
of
8.0
\
\
volts,
and
the
free
end
of
the
shorter
wire
is
at
an
electric
potential
of
1.0
\
\
volt,
the
potential
at
the
junction
of
the
two
wires
is
most
nearly
equal
to
\n\
(A)
2.4
V
(B)
3.3
V
(C)
4.5
V
(D)
5.7
V
\n
A:
Let's
think
step
by
step.
This
is
a
\
\
simple
voltage
divider
problem,
where
the
longer
wire
has
a
resistance
four
times
\
\
that
of
the
shorter
end.
So
the
voltage
divider
ratio
is
1
/
5,
meaning
that
the
\
\
potential
in
the
middle
is
1.0
V
+
(8.0
V
-
1.0
V)
*
1/5
=
2.4
V.
The
answer
is
\
\
(A).
\n\n
Q:
A
refracting
telescope
consists
of
two
converging
lenses
separated
\
\
by
100
cm.
The
eye-piece
lens
has
a
focal
length
of
20
cm.
The
angular
magnification
\
\
of
the
telescope
is
\n
(A)
4
(B)
5
(C)
6
(D)
20
\n
A:
Let's
think
step
by
step.
In
\
\
a
refracting
telescope,
if
both
lenses
are
converging,
the
focus
of
both
lenses
\
\
must
be
between
the
two
lenses,
and
thus
the
focal
lengths
of
the
two
lenses
must
\
\
add
up
to
their
separation.
Since
the
focal
length
of
one
lens
is
20
cm,
the
focal
\
\
length
of
the
other
must
be
80
cm.
The
magnification
is
the
ratio
of
these
two
\
\
focal
lengths,
or
4.
The
answer
is
(A).
\n\n
Q:
For
which
of
the
following
thermodynamic
\
\
processes
is
the
increase
in
the
internal
energy
of
an
ideal
gas
equal
to
the
\
\
heat
added
to
the
gas?
\n
(A)
Constant
temperature
(B)
Constant
volume
(C)
Constant
\
\
pressure
(D)
Adiabatic
\n
A:
Let's
think
step
by
step.
Heat
added
to
the
gas
can
\
\
go
into
the
gas's
internal
energy
or
work
done
against
an
external
force.
However,
\
\
if
the
volume
of
the
gas
container
is
constant,
no
work
will
be
done
(since
work
\
\
is
pressure
times
change
in
volume).
So,
at
constant
volume,
all
of
the
heat
goes
\
\
into
the
internal
energy.
The
answer
is
(B)."
"
group"
:
"
mmlu_flan_cot_fewshot_stem"
"
include"
:
"
_mmlu_flan_cot_fewshot_template_yaml"
"
task"
:
"
mmlu_flan_cot_fewshot_college_physics"
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_computer_security.yaml
View file @
109ed1c7
dataset_name
:
computer_security
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
computer
\
"
dataset_name
"
:
"
computer_security
"
"
description
"
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
computer
\
\
security.
\n\n
Q:
SHA-1
has
a
message
digest
of
\n
(A)
160
bits
(B)
512
bits
(C)
628
\
\
bits
(D)
820
bits
\n
A:
Let's
think
step
by
step.
Since
SHA-1
is
a
hash
function
\
\
which
takes
an
input
and
produces
a
160-bit
(20-byte)
hash
value,
its
message
\
\
digest
is
160
bits.
The
answer
is
(A).
\n\n
Q:
_____________
can
modify
data
on
\
\
your
system
\u2013
so
that
your
system
doesn
\u2019
t
run
correctly
or
you
can
no
\
\
longer
access
specific
data,
or
it
may
even
ask
for
ransom
in
order
to
give
you
\
\
access.
\n
(A)
IM
\u2013
Trojans
(B)
Backdoor
Trojans
(C)
Trojan-Downloader
(D)
\
\
Ransom
Trojan
\n
A:
Let's
think
step
by
step.
The
system
is
asking
for
trojans,
\
\
which
are
for
ransom,
which
means
ransom
trojan.
The
answer
is
(D).
\n\n
Q:
What
\
\
is
ethical
hacking?
\n
(A)
\"
Hacking
\"
ethics
so
they
justify
unintended
selfish
\
\
behavior
(B)
Hacking
systems
(e.g.,
during
penetration
testing)
to
expose
vulnerabilities
\
\
so
they
can
be
fixed,
rather
than
exploited
(C)
Hacking
into
systems
run
by
those
\
\
whose
ethics
you
disagree
with
(D)
A
slang
term
for
rapid
software
development,
\
\
e.g.,
as
part
of
hackathons
\n
A:
Let's
think
step
by
step.
Ethical
hacking
is
a
\
\
process
of
detecting
vulnerabilities
in
an
application,
system,
or
organization's
\
\
infrastructure
that
an
attacker
can
use
to
exploit
an
individual
or
organization.
\
\
They
use
this
process
to
prevent
cyberattacks
and
security
breaches
by
lawfully
\
\
hacking
into
the
systems
and
looking
for
weak
points.
The
answer
is
(B).
\n\n
Q:
\
\
The
____________
is
anything
which
your
search
engine
cannot
search.
\n
(A)
Haunted
\
\
web
(B)
World
Wide
Web
(C)
Surface
web
(D)
Deep
Web
\n
A:
Let's
think
step
by
step.
\
\
The
search
engine
searches
on
the
Surface
Web,
which
is
the
portion
of
the
world
\
\
wide
web
which
is
visible
so
(B,C)
are
wrong.
The
Haunted
Web
doesn
\u2019
t
correspond
\
\
to
an
internet
concept.
The
Deep
Web
is
the
part
of
the
World
Wide
Web
which
is
\
\
not
indexed.
The
answer
is
(D).
\n\n
Q:
Exploitation
of
the
Heartbleed
bug
permits
\n\
(A)
overwriting
cryptographic
keys
in
memory
(B)
a
kind
of
code
injection
(C)
a
\
\
read
outside
bounds
of
a
buffer
(D)
a
format
string
attack
\n
A:
Let's
think
step
\
\
by
step.
The
Heartbleed
Bug
is
a
serious
vulnerability
in
the
popular
OpenSSL
\
\
cryptographic
software
library.
Heartbleed
resulted
from
improper
input
validation
\
\
(due
to
a
missing
bounds
check)
in
the
implementation
of
the
TLS
heartbeat
extension.
\
\
The
vulnerability
was
classified
as
a
buffer
over-read,
a
situation
where
more
\
\
data
can
be
read
than
should
be
allowed.
The
answer
is
(C)."
include
:
_mmlu_flan_cot_fewshot_template_yaml
task
:
mmlu_flan_cot_fewshot_computer_security
\
your
system
–
so
that
your
system
doesn’t
run
correctly
or
you
can
no
longer
access
\
\
specific
data,
or
it
may
even
ask
for
ransom
in
order
to
give
you
access.
\n
(A)
\
\
IM
–
Trojans
(B)
Backdoor
Trojans
(C)
Trojan-Downloader
(D)
Ransom
Trojan
\n
A:
\
\
Let's
think
step
by
step.
The
system
is
asking
for
trojans,
which
are
for
ransom,
\
\
which
means
ransom
trojan.
The
answer
is
(D).
\n\n
Q:
What
is
ethical
hacking?
\n\
(A)
\"
Hacking
\"
ethics
so
they
justify
unintended
selfish
behavior
(B)
Hacking
systems
\
\
(e.g.,
during
penetration
testing)
to
expose
vulnerabilities
so
they
can
be
fixed,
\
\
rather
than
exploited
(C)
Hacking
into
systems
run
by
those
whose
ethics
you
disagree
\
\
with
(D)
A
slang
term
for
rapid
software
development,
e.g.,
as
part
of
hackathons
\n\
A:
Let's
think
step
by
step.
Ethical
hacking
is
a
process
of
detecting
vulnerabilities
\
\
in
an
application,
system,
or
organization's
infrastructure
that
an
attacker
can
\
\
use
to
exploit
an
individual
or
organization.
They
use
this
process
to
prevent
\
\
cyberattacks
and
security
breaches
by
lawfully
hacking
into
the
systems
and
looking
\
\
for
weak
points.
The
answer
is
(B).
\n\n
Q:
The
____________
is
anything
which
your
\
\
search
engine
cannot
search.
\n
(A)
Haunted
web
(B)
World
Wide
Web
(C)
Surface
web
\
\
(D)
Deep
Web
\n
A:
Let's
think
step
by
step.
The
search
engine
searches
on
the
Surface
\
\
Web,
which
is
the
portion
of
the
world
wide
web
which
is
visible
so
(B,C)
are
\
\
wrong.
The
Haunted
Web
doesn’t
correspond
to
an
internet
concept.
The
Deep
Web
\
\
is
the
part
of
the
World
Wide
Web
which
is
not
indexed.
The
answer
is
(D).
\n\n\
Q:
Exploitation
of
the
Heartbleed
bug
permits
\n
(A)
overwriting
cryptographic
keys
\
\
in
memory
(B)
a
kind
of
code
injection
(C)
a
read
outside
bounds
of
a
buffer
(D)
\
\
a
format
string
attack
\n
A:
Let's
think
step
by
step.
The
Heartbleed
Bug
is
a
serious
\
\
vulnerability
in
the
popular
OpenSSL
cryptographic
software
library.
Heartbleed
\
\
resulted
from
improper
input
validation
(due
to
a
missing
bounds
check)
in
the
\
\
implementation
of
the
TLS
heartbeat
extension.
The
vulnerability
was
classified
\
\
as
a
buffer
over-read,
a
situation
where
more
data
can
be
read
than
should
be
\
\
allowed.
The
answer
is
(C)."
"
group"
:
"
mmlu_flan_cot_fewshot_stem"
"
include"
:
"
_mmlu_flan_cot_fewshot_template_yaml"
"
task"
:
"
mmlu_flan_cot_fewshot_computer_security"
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_conceptual_physics.yaml
View file @
109ed1c7
dataset_name
:
conceptual_physics
description
:
"
\n
The
following
are
multiple
choice
questions
(with
answers)
about
conceptual
\
\
physics.
\n\n
Q:
Colors
in
a
soap
bubble
result
from
light
\n
(A)
converted
to
a
different
\
\
frequency
(B)
deflection
(C)
interference
(D)
polarization
\n
A:
Let's
think
step
\
\
by
step.
In
a
soap
bubble
film,
the
light
bounces
between
the
two
soap-air
interfaces
\
\
many
times,
interfering
with
itself
constructively
or
destructively
depending
\
\
on
the
width
of
the
film.
This
results
in
different
colors
being
visible.
The
\
\
answer
is
(C).
\n\n
Q:
Compared
with
the
mass
of
a
uranium
atom
undergoing
fission,
\
\
the
combined
masses
of
the
products
after
fission
are
\n
(A)
less
(B)
more
(C)
the
\
\
same
(D)
zero
\n
A:
Let's
think
step
by
step.
Fission
releases
energy,
which
comes
\
\
from
the
rest
mass
of
its
initial
nucleus.
Thus
the
mass
of
the
products
is
less
\
\
than
the
mass
of
the
reactant
uranium
nucleus.
The
answer
is
(A).
\n\n
Q:
Things
\
\
that
are
equivalent
according
to
the
equivalence
principle
are
\n
(A)
space
and
\
\
time.
(B)
a
traveling
twin
and
a
stay-at-home
twin.
(C)
gravity
and
acceleration.
\
\
(D)
mass
and
energy.
\n
A:
Let's
think
step
by
step.
Einstein
\u2019
s
famous
equivalence
\
\
principle
states
that
gravity
and
acceleration
are
equivalent.
The
answer
is
(C).
\n\
\n
Q:
Which
of
these
three
elements
has
the
most
mass
per
nucleon?
\n
(A)
Hydrogen
\
\
(B)
Iron
(C)
Uranium
(D)
Same
in
each
\n
A:
Let's
think
step
by
step.
Due
to
nuclear
\
\
binding
energy,
the
mass
of
an
atomic
nucleus
is
less
than
the
sum
of
individual
\
\
masses
of
the
free
constituent
protons
and
neutrons;
this
is
known
as
the
mass
\
\
defect.
Hydrogen
has
no
mass
defect
because
it
has
only
a
single
nucleon,
so
it
\
\
will
have
the
most
mass
per
nucleon.
The
answer
is
(A).
\n\n
Q:
A
model
airplane
\
\
flies
slower
when
flying
into
the
wind
and
faster
with
wind
at
its
back.
When
\
\
launched
at
right
angles
to
the
wind
a
cross
wind
its
groundspeed
compared
with
\
\
flying
in
still
air
is
\n
(A)
the
same
(B)
greater
(C)
less
(D)
either
greater
or
\
\
less
depending
on
wind
speed
\n
A:
Let's
think
step
by
step.
The
plane
\u2019
s
speed
\
\
in
the
direction
of
the
wind
is
greater
than
it
would
be
in
the
absence
of
wind,
\
\
and
its
direction
orthogonal
to
the
wind
is
the
same
as
it
would
be
in
the
absence
\
\
of
the
wind.
The
total
speed,
which
is
these
two
components
added
in
quadrature,
\
\
is
thus
greater
than
the
speed
in
still
air.
The
answer
is
(B)."
include
:
_mmlu_flan_cot_fewshot_template_yaml
task
:
mmlu_flan_cot_fewshot_conceptual_physics
"
dataset_name"
:
"
conceptual_physics"
"
description"
:
"
\n
The
following
are
multiple
choice
questions
(with
answers)
about
\
\
conceptual
physics.
\n\n
Q:
Colors
in
a
soap
bubble
result
from
light
\n
(A)
converted
\
\
to
a
different
frequency
(B)
deflection
(C)
interference
(D)
polarization
\n
A:
\
\
Let's
think
step
by
step.
In
a
soap
bubble
film,
the
light
bounces
between
the
\
\
two
soap-air
interfaces
many
times,
interfering
with
itself
constructively
or
\
\
destructively
depending
on
the
width
of
the
film.
This
results
in
different
colors
\
\
being
visible.
The
answer
is
(C).
\n\n
Q:
Compared
with
the
mass
of
a
uranium
atom
\
\
undergoing
fission,
the
combined
masses
of
the
products
after
fission
are
\n
(A)
\
\
less
(B)
more
(C)
the
same
(D)
zero
\n
A:
Let's
think
step
by
step.
Fission
releases
\
\
energy,
which
comes
from
the
rest
mass
of
its
initial
nucleus.
Thus
the
mass
of
\
\
the
products
is
less
than
the
mass
of
the
reactant
uranium
nucleus.
The
answer
\
\
is
(A).
\n\n
Q:
Things
that
are
equivalent
according
to
the
equivalence
principle
\
\
are
\n
(A)
space
and
time.
(B)
a
traveling
twin
and
a
stay-at-home
twin.
(C)
gravity
\
\
and
acceleration.
(D)
mass
and
energy.
\n
A:
Let's
think
step
by
step.
Einstein’s
\
\
famous
equivalence
principle
states
that
gravity
and
acceleration
are
equivalent.
\
\
The
answer
is
(C).
\n\n
Q:
Which
of
these
three
elements
has
the
most
mass
per
nucleon?
\n\
(A)
Hydrogen
(B)
Iron
(C)
Uranium
(D)
Same
in
each
\n
A:
Let's
think
step
by
step.
\
\
Due
to
nuclear
binding
energy,
the
mass
of
an
atomic
nucleus
is
less
than
the
\
\
sum
of
individual
masses
of
the
free
constituent
protons
and
neutrons;
this
is
\
\
known
as
the
mass
defect.
Hydrogen
has
no
mass
defect
because
it
has
only
a
single
\
\
nucleon,
so
it
will
have
the
most
mass
per
nucleon.
The
answer
is
(A).
\n\n
Q:
A
\
\
model
airplane
flies
slower
when
flying
into
the
wind
and
faster
with
wind
at
\
\
its
back.
When
launched
at
right
angles
to
the
wind
a
cross
wind
its
groundspeed
\
\
compared
with
flying
in
still
air
is
\n
(A)
the
same
(B)
greater
(C)
less
(D)
either
\
\
greater
or
less
depending
on
wind
speed
\n
A:
Let's
think
step
by
step.
The
plane’s
\
\
speed
in
the
direction
of
the
wind
is
greater
than
it
would
be
in
the
absence
\
\
of
wind,
and
its
direction
orthogonal
to
the
wind
is
the
same
as
it
would
be
in
\
\
the
absence
of
the
wind.
The
total
speed,
which
is
these
two
components
added
\
\
in
quadrature,
is
thus
greater
than
the
speed
in
still
air.
The
answer
is
(B)."
"
group"
:
"
mmlu_flan_cot_fewshot_stem"
"
include"
:
"
_mmlu_flan_cot_fewshot_template_yaml"
"
task"
:
"
mmlu_flan_cot_fewshot_conceptual_physics"
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_econometrics.yaml
View file @
109ed1c7
dataset_name
:
econometrics
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
econometrics.
\n\
"
dataset_name
"
:
"
econometrics
"
"
description
"
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
econometrics.
\n\
\n
Q:
Suppose
now
that
a
researcher
wishes
to
use
information
criteria
to
determine
\
\
the
optimal
lag
length
for
a
VAR.
500
observations
are
available
for
the
bi-variate
\
\
VAR,
and
the
values
of
the
determinant
of
the
variance-covariance
matrix
of
residuals
\
\
are
0.0336,
0.0169,
0.0084,
and
0.0062
for
1,
2,
3,
and
4
lags
respectively.
What
\
\
is
the
optimal
model
order
according
to
Akaike's
information
criterion?
\n
(A)
1
\
\
lag
(B)
2
lags
(C)
3
lags
(D)
4
lags
\n
A:
Let's
think
step
by
step.
We
refer
to
\
\
Wikipedia
articles
on
econometrics
for
help.
Let
\u2019
s
solve
this
problem
step
\
\
by
step.
First
of
all,
let
\u2019
s
recall
that
for
a
given
set
of
data,
Akaike's
\
\
information
criterion
(AIC)
allows
us
to
measure
how
well
a
statistical
model
\
\
fits
the
data;
it
is
an
estimator
of
prediction
error.
Here
in
this
problem
we
\
\
will
need
to
use
the
formula
ln(det(sigma_hat))
+
(2
*
k
/
T)
to
determine
the
\
\
values
of
Akaike
\u2019
s
criterion,
where
ln
denotes
the
natural
log
function,
\
\
det
the
determinant
function,
k
the
total
number
of
parameters
in
total
(across
\
\
both
equations),
and
T
the
number
of
observations
(which,
in
this
case,
is
equal
\
\
to
500).
For
1
lag,
the
number
of
parameters
in
total
is
equal
to
6;
for
2
lags,
\
\
it
is
10;
for
3
lags,
it
is
14;
and
for
4
lags,
it
is
18.
Now,
let
\u2019
s
calculate
\
\
the
values
of
the
criterion
for
each
lag:
\n
(A)
1
lag:
ln(0.0336)
+
(2
*
6
/
500)
\
\
=
ln(0.0336)
+
(12
/
500)
=
-3.369
\n
(B)
2
lags:
ln(0.0169)
+
(2
*
10
/
500)
=
\
\
ln(0.0169)
+
(20
/
500)
=
-4.040
\n
(C)
3
lags:
ln(0.0084)
+
(2
*
14
/
500)
=
ln(0.0084)
\
\
+
(28
/
500)
=-4.724
\n
(D)
4
lags:
ln(0.0062)
+
(2
*
18
/
500)
=
ln(0.0062)
+
(36
\
\
/
500)
=-5.011
\n
Because
the
optimal
model
order
according
to
AIC
minimizes
the
\
\
information
criterion,
the
answer
should
be
the
one
with
the
lowest
value.
In
\
\
this
case,
(D)
has
the
lowest
value.
The
answer
is
(D).
\n\n
Q:
Consider
the
following
\
\
AR(1)
model
with
the
disturbances
having
zero
mean
and
unit
variance
\n
yt
=
0.2
\
\
+
0.4
yt-1
+
ut
\n
The
(unconditional)
mean
of
y
will
be
given
by
\n
(A)
0.2
(B)
0.4
\
\
(C)
0.5
(D)
0.33
\n
A:
Let's
think
step
by
step.
We
refer
to
Wikipedia
articles
\
\
on
econometrics
for
help.
Let
\u2019
s
solve
this
problem
step
by
step.
If
we
have
\
\
an
AR(1)
model
with
the
disturbances
having
zero
mean
and
unit
variance,
then
\
\
the
unconditional
mean
of
y
is
equal
to
the
following:
\n
unconditional
mean
of
\
\
y
=
(the
intercept
term)
/
(1
-
autoregressive
coefficient)
\n
We
know
that
the
\
\
intercept
term
is
0.2
and
the
autoregressive
coefficient
is
0.4;
thus,
we
have:
\n\
unconditional
mean
of
y
=
(0.2)
/
(1
-
0.4)
=
(0.2)
/
(0.6)
=
2
/
6
=
1
/
3,
which
\
\
is
approximately
0.33.
That
means
that
the
answer
should
be
(D)
0.33.
The
answer
\
\
is
(D).
\n\n
Q:
What
would
be
the
consequences
for
the
OLS
estimator
if
heteroscedasticity
\
\
is
present
in
a
regression
model
but
ignored?
\n
(A)
It
will
be
biased
(B)
It
will
\
\
be
inconsistent
(C)
It
will
be
inefficient
(D)
All
of
(a),
(b)
and
(c)
will
be
\
\
true.
\n
A:
Let's
think
step
by
step.
We
refer
to
Wikipedia
articles
on
econometrics
\
\
for
help.
Heteroscedasticity
refers
to
the
condition
where
the
variance
of
the
\
\
error
terms
is
not
constant
across
multiple
observations.
If
heteroscedasticity
\
\
is
present
in
a
regression
model,
then
the
coefficient
estimates
in
the
OLS
estimator
\
\
will
be
not
only
unbiased
and
consistent
but
also
inefficient.
Because
(A)
and
\
\
(B)
are
incorrect
choices
and
(C)
is
a
correct
choice,
(D)
cannot
be
the
right
\
\
answer.
Ultimately,
(C)
is
the
only
true
choice.
The
answer
is
(C).
\n\n
Q:
Suppose
\
\
that
a
test
statistic
has
associated
with
it
a
p-value
of
0.08.
Which
one
of
the
\
\
following
statements
is
true?
\n
(i)
If
the
size
of
the
test
were
exactly
8%,
we
\
\
would
be
indifferent
between
rejecting
and
not
rejecting
the
null
hypothesis
\n\
(ii)
The
null
would
be
rejected
if
a
10%
size
of
test
were
used
\n
(iii)
The
null
\
\
would
not
be
rejected
if
a
1%
size
of
test
were
used
\n
(iv)
The
null
would
be
rejected
\
\
if
a
5%
size
of
test
were
used.
\n
(A)
(ii)
and
(iv)
only
(B)
(i)
and
(iii)
only
\
\
(C)
(i),
(ii),
and
(iii)
only
(D)
(i),
(ii),
(iii),
and
(iv).
\n
A:
Let's
think
\
\
step
by
step.
We
refer
to
Wikipedia
articles
on
econometrics
for
help.
Let
\u2019
\
s
reason
about
each
of
the
options
.
\n
(i)
is
a
true
statement.
\n
(ii)
is
a
true
statement.
\n\
(iii)
is
a
true
statement.
\n
(iv)
is
not
a
true
statement.
Thus,
(i),
(ii),
and
(iii)
\
\
are
true.
The
answer
is
(C).
\n\n
Q:
For
a
stationary
autoregressive
process,
shocks
\
\
will
\n
(A)
Eventually
die
away
(B)
Persist
indefinitely
(C)
Grow
exponentially
\
\
(D)
Never
occur
\n
A:
Let's
think
step
by
step.
We
refer
to
Wikipedia
articles
on
\
\
econometrics
for
help.
This
is
a
formal
logic
problem
about
a
stationary
process.
\
\
For
a
stationary
autoregressive
process,
shocks
will
eventually
die
away.
The
\
\
answer
is
(A).
"
include
:
_mmlu_flan_cot_fewshot_template_yaml
task
:
mmlu_flan_cot_fewshot_econometrics
\
Wikipedia
articles
on
econometrics
for
help.
Let
’
s
solve
this
problem
step
by
\
\
step.
First
of
all,
let
’
s
recall
that
for
a
given
set
of
data,
Akaike's
information
\
\
criterion
(AIC)
allows
us
to
measure
how
well
a
statistical
model
fits
the
data;
\
\
it
is
an
estimator
of
prediction
error.
Here
in
this
problem
we
will
need
to
use
\
\
the
formula
ln(det(sigma_hat))
+
(2
*
k
/
T)
to
determine
the
values
of
Akaike’s
\
\
criterion,
where
ln
denotes
the
natural
log
function,
det
the
determinant
function,
\
\
k
the
total
number
of
parameters
in
total
(across
both
equations),
and
T
the
number
\
\
of
observations
(which,
in
this
case,
is
equal
to
500).
For
1
lag,
the
number
\
\
of
parameters
in
total
is
equal
to
6;
for
2
lags,
it
is
10;
for
3
lags,
it
is
\
\
14;
and
for
4
lags,
it
is
18.
Now,
let
’
s
calculate
the
values
of
the
criterion
\
\
for
each
lag:
\n
(A)
1
lag:
ln(0.0336)
+
(2
*
6
/
500)
=
ln(0.0336)
+
(12
/
500)
\
\
=
-3.369
\n
(B)
2
lags:
ln(0.0169)
+
(2
*
10
/
500)
=
ln(0.0169)
+
(20
/
500)
=
\
\
-4.040
\n
(C)
3
lags:
ln(0.0084)
+
(2
*
14
/
500)
=
ln(0.0084)
+
(28
/
500)
=-4.724
\n
\
(D)
4
lags:
ln(0.0062)
+
(2
*
18
/
500)
=
ln(0.0062)
+
(36
/
500)
=-5.011
\n
Because
\
\
the
optimal
model
order
according
to
AIC
minimizes
the
information
criterion,
\
\
the
answer
should
be
the
one
with
the
lowest
value.
In
this
case,
(D)
has
the
\
\
lowest
value.
The
answer
is
(D).
\n\n
Q:
Consider
the
following
AR(1)
model
with
\
\
the
disturbances
having
zero
mean
and
unit
variance
\n
yt
=
0.2
+
0.4
yt-1
+
ut
\n
\
The
(unconditional)
mean
of
y
will
be
given
by
\n
(A)
0.2
(B)
0.4
(C)
0.5
(D)
0.33
\n
\
A:
Let's
think
step
by
step.
We
refer
to
Wikipedia
articles
on
econometrics
for
\
\
help.
Let’s
solve
this
problem
step
by
step.
If
we
have
an
AR(1)
model
with
\
\
the
disturbances
having
zero
mean
and
unit
variance,
then
the
unconditional
mean
\
\
of
y
is
equal
to
the
following:
\n
unconditional
mean
of
y
=
(the
intercept
term)
\
\
/
(1
-
autoregressive
coefficient)
\n
We
know
that
the
intercept
term
is
0.2
and
\
\
the
autoregressive
coefficient
is
0.4;
thus,
we
have:
\n
unconditional
mean
of
y
\
\
=
(0.2)
/
(1
-
0.4)
=
(0.2)
/
(0.6)
=
2
/
6
=
1
/
3,
which
is
approximately
0.33.
\
\
That
means
that
the
answer
should
be
(D)
0.33.
The
answer
is
(D).
\n\n
Q:
What
would
\
\
be
the
consequences
for
the
OLS
estimator
if
heteroscedasticity
is
present
in
\
\
a
regression
model
but
ignored?
\n
(A)
It
will
be
biased
(B)
It
will
be
inconsistent
\
\
(C)
It
will
be
inefficient
(D)
All
of
(a),
(b)
and
(c)
will
be
true.
\n
A:
Let's
\
\
think
step
by
step.
We
refer
to
Wikipedia
articles
on
econometrics
for
help.
Heteroscedasticity
\
\
refers
to
the
condition
where
the
variance
of
the
error
terms
is
not
constant
\
\
across
multiple
observations.
If
heteroscedasticity
is
present
in
a
regression
\
\
model,
then
the
coefficient
estimates
in
the
OLS
estimator
will
be
not
only
unbiased
\
\
and
consistent
but
also
inefficient.
Because
(A)
and
(B)
are
incorrect
choices
\
\
and
(C)
is
a
correct
choice,
(D)
cannot
be
the
right
answer.
Ultimately,
(C)
is
\
\
the
only
true
choice.
The
answer
is
(C).
\n\n
Q:
Suppose
that
a
test
statistic
has
\
\
associated
with
it
a
p-value
of
0.08.
Which
one
of
the
following
statements
is
\
\
true?
\n
(i)
If
the
size
of
the
test
were
exactly
8%,
we
would
be
indifferent
between
\
\
rejecting
and
not
rejecting
the
null
hypothesis
\n
(ii)
The
null
would
be
rejected
\
\
if
a
10%
size
of
test
were
used
\n
(iii)
The
null
would
not
be
rejected
if
a
1%
\
\
size
of
test
were
used
\n
(iv)
The
null
would
be
rejected
if
a
5%
size
of
test
were
\
\
used.
\n
(A)
(ii)
and
(iv)
only
(B)
(i)
and
(iii)
only
(C)
(i),
(ii),
and
(iii)
\
\
only
(D)
(i),
(ii),
(iii),
and
(iv).
\n
A:
Let's
think
step
by
step.
We
refer
to
\
\
Wikipedia
articles
on
econometrics
for
help.
Let’s
reason
about
each
of
the
options.
\n
\
(i)
is
a
true
statement.
\n
(ii)
is
a
true
statement.
\n
(iii)
is
a
true
statement.
\n\
(iv)
is
not
a
true
statement.
Thus,
(i),
(ii),
and
(iii)
are
true.
The
answer
is
\
\
(C).
\n\n
Q:
For
a
stationary
autoregressive
process,
shocks
will
\n
(A)
Eventually
\
\
die
away
(B)
Persist
indefinitely
(C)
Grow
exponentially
(D)
Never
occur
\n
A:
Let's
\
\
think
step
by
step.
We
refer
to
Wikipedia
articles
on
econometrics
for
help.
This
\
\
is
a
formal
logic
problem
about
a
stationary
process.
For
a
stationary
autoregressive
\
\
process,
shocks
will
eventually
die
away.
The
answer
is
(A)."
"
group"
:
"
mmlu_flan_cot_fewshot_social_sciences
"
"
include
"
:
"
_mmlu_flan_cot_fewshot_template_yaml
"
"
task
"
:
"
mmlu_flan_cot_fewshot_econometrics
"
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_electrical_engineering.yaml
View file @
109ed1c7
dataset_name
:
electrical_engineering
description
:
"
\n
The
following
are
multiple
choice
questions
(with
answers)
about
electrical
\
\
engineering.
\n\n
Q:
A
point
pole
has
a
strength
of
4
\u03C0
*
10^-4
weber.
The
force
\
\
in
newtons
on
a
point
pole
of
4
\u03C0
*
1.5
*
10^-4
weber
placed
at
a
distance
\
"
dataset_name
"
:
"
electrical_engineering
"
"
description
"
:
"
\n
The
following
are
multiple
choice
questions
(with
answers)
about
\
\
electrical
engineering.
\n\n
Q:
A
point
pole
has
a
strength
of
4
π
*
10^-4
weber.
\
\
The
force
in
newtons
on
a
point
pole
of
4
π
*
1.5
*
10^-4
weber
placed
at
a
distance
\
\
of
10
cm
from
it
will
be
\n
(A)
15
N.
(B)
20
N.
(C)
7.5
N.
(D)
3.75
N.
\n
A:
Let's
\
\
think
step
by
step.
The
force
between
two
point
poles
is
given
by
m_1m_2/(mu_0
\
\
4
\\
pi
r^2),
in
analogy
to
Coulomb\u2019s
law.
Plugging
in
the
values
given
in
\
\
the
question,
we
calculate
that
the
force
is
approximately
15
N.
The
answer
is
\
\
(A).
\n\n
Q:
The
coil
of
a
moving
coil
meter
has
100
turns,
is
40
mm
long
and
30
\
\
mm
wide.
The
control
torque
is
240*10-6
N-m
on
full
scale.
If
magnetic
flux
density
\
\
is
1Wb/m2
range
of
meter
is
\n
(A)
1
mA.
(B)
2
mA.
(C)
3
mA.
(D)
4
mA.
\n
A:
Let's
\
\
think
step
by
step.
The
torque
on
a
coil
in
a
uniform
magnetic
field
is
given
\
\
by
BANI,
where
B
is
the
magnetic
flux
density,
A
is
the
area
of
the
coil,
N
is
\
\
the
number
of
turns,
and
I
is
the
current.
So
we
have
that
I
=
(Torque)/(BAN),
\
\
or
240e-6/(1200e-6
*
100
*
1)
=
2e-3.
The
answer
is
(B).
\n\n
Q:
In
an
SR
latch
\
\
built
from
NOR
gates,
which
condition
is
not
allowed
\n
(A)
S=0,
R=0
(B)
S=0,
R=1
\
\
(C)
S=1,
R=0
(D)
S=1,
R=1
\n
A:
Let's
think
step
by
step.
An
SR
latch
is
a
set-reset
\
\
latch;
in
the
case
where
S=1
and
R=1,
the
circuit
has
no
stable
state;
instead
\
\
a
race
condition
will
be
produced
within
the
circuit,
so
the
device
will
be
in
\
\
an
undefined
state.
So
S=1,
R=1
is
an
illegal
input.
The
answer
is
(D).
\n\n
Q:
\
\
Two
long
parallel
conductors
carry
100
A.
If
the
conductors
are
separated
by
20
\
\
mm,
the
force
per
meter
of
length
of
each
conductor
will
be
\n
(A)
100
N.
(B)
0.1
\
\
N.
(C)
1
N.
(D)
0.01
N.
\n
A:
Let's
think
step
by
step.
The
magnetic
force-per-length
\
\
between
two
current-carrying
conductors
is
given
by
\\
mu_0
I_1
I_2
/
(2
\\
pi
r),
\
\
where
$r$
is
the
separation
distance
and
I_1
and
I_2
are
the
currents.
Plugging
\
\
in
100
A
for
I_1
and
I_2,
and
20
mm
for
r,
gives
0.1
N.
The
answer
is
(B).
\n\n\
Q:
In
a
2
pole
lap
winding
dc
machine
,
the
resistance
of
one
conductor
is
2
\u03A9
\
\
and
total
number
of
conductors
is
100.
Find
the
total
resistance
\n
(A)
200
\u03A9
\
\
(B)
100
\u03A9
(C)
50
\u03A9
(D)
10
\u03A9\n
A:
Let's
think
step
by
step.
In
lap
winding,
\
\
effectively
two
resistors
are
connected
in
parallel,
so
the
actual
resistance
\
\
of
each
pair
is
1
Ohm.
Since
we
have
50
pairs,
we
get
a
total
resistance
of
50
\
\
Ohms.
The
answer
is
(C).
"
include
:
_mmlu_flan_cot_fewshot_template_yaml
task
:
mmlu_flan_cot_fewshot_electrical_engineering
\
4
\\
pi
r^2),
in
analogy
to
Coulomb’s
law.
Plugging
in
the
values
given
in
the
\
\
question,
we
calculate
that
the
force
is
approximately
15
N.
The
answer
is
(A).
\n
\
\n
Q:
The
coil
of
a
moving
coil
meter
has
100
turns,
is
40
mm
long
and
30
mm
wide.
\
\
The
control
torque
is
240*10-6
N-m
on
full
scale.
If
magnetic
flux
density
is
\
\
1Wb/m2
range
of
meter
is
\n
(A)
1
mA.
(B)
2
mA.
(C)
3
mA.
(D)
4
mA.
\n
A:
Let's
think
\
\
step
by
step.
The
torque
on
a
coil
in
a
uniform
magnetic
field
is
given
by
BANI,
\
\
where
B
is
the
magnetic
flux
density,
A
is
the
area
of
the
coil,
N
is
the
number
\
\
of
turns,
and
I
is
the
current.
So
we
have
that
I
=
(Torque)/(BAN),
or
240e-6/(1200e-6
\
\
*
100
*
1)
=
2e-3.
The
answer
is
(B).
\n\n
Q:
In
an
SR
latch
built
from
NOR
gates,
\
\
which
condition
is
not
allowed
\n
(A)
S=0,
R=0
(B)
S=0,
R=1
(C)
S=1,
R=0
(D)
S=1,
\
\
R=1
\n
A:
Let's
think
step
by
step.
An
SR
latch
is
a
set-reset
latch;
in
the
case
\
\
where
S=1
and
R=1,
the
circuit
has
no
stable
state;
instead
a
race
condition
will
\
\
be
produced
within
the
circuit,
so
the
device
will
be
in
an
undefined
state.
So
\
\
S=1,
R=1
is
an
illegal
input.
The
answer
is
(D).
\n\n
Q:
Two
long
parallel
conductors
\
\
carry
100
A.
If
the
conductors
are
separated
by
20
mm,
the
force
per
meter
of
\
\
length
of
each
conductor
will
be
\n
(A)
100
N.
(B)
0.1
N.
(C)
1
N.
(D)
0.01
N.
\n
\
A:
Let's
think
step
by
step.
The
magnetic
force-per-length
between
two
current-carrying
\
\
conductors
is
given
by
\\
mu_0
I_1
I_2
/
(2
\\
pi
r),
where
$r$
is
the
separation
\
\
distance
and
I_1
and
I_2
are
the
currents.
Plugging
in
100
A
for
I_1
and
I_2,
\
\
and
20
mm
for
r,
gives
0.1
N.
The
answer
is
(B).
\n\n
Q:
In
a
2
pole
lap
winding
\
\
dc
machine
,
the
resistance
of
one
conductor
is
2
Ω
and
total
number
of
conductors
\
\
is
100.
Find
the
total
resistance
\n
(A)
200
Ω
(B)
100Ω
(C)
50Ω
(D)
10Ω
\n
A:
Let's
\
\
think
step
by
step.
In
lap
winding,
effectively
two
resistors
are
connected
in
\
\
parallel,
so
the
actual
resistance
of
each
pair
is
1
Ohm.
Since
we
have
50
pairs,
\
\
we
get
a
total
resistance
of
50
Ohms.
The
answer
is
(C)."
"
group"
:
"
mmlu_flan_cot_fewshot_stem
"
"
include
"
:
"
_mmlu_flan_cot_fewshot_template_yaml
"
"
task
"
:
"
mmlu_flan_cot_fewshot_electrical_engineering
"
lm_eval/tasks/mmlu/flan_cot_fewshot/mmlu_elementary_mathematics.yaml
View file @
109ed1c7
dataset_name
:
elementary_mathematics
description
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
elementary
\
"
dataset_name
"
:
"
elementary_mathematics
"
"
description
"
:
"
The
following
are
multiple
choice
questions
(with
answers)
about
elementary
\
\
mathematics.
\n\n
Q:
Olivia
used
the
rule
\"
Add
11
\"
to
create
the
number
pattern
\
\
shown
below.
10,
21,
32,
43,
54.
Which
statement
about
the
number
pattern
is
true?
\n\
(A)
The
10th
number
in
the
pattern
will
be
an
even
number.
\n
(B)
The
number
pattern
\
...
...
@@ -22,19 +22,20 @@ description: "The following are multiple choice questions (with answers) about e
\
the
other
choices
are
incorrect.
The
answer
is
(A).
\n\n
Q:
A
store
sells
107
different
\
\
colors
of
paint.
They
have
25
cans
of
each
color
in
storage.
The
number
of
cans
\
\
of
paint
the
store
has
in
storage
can
be
found
using
the
expression
below.
107
\
\ \xD7
25.
How
many
cans
of
paint
does
the
store
have
in
storage?
\n
(A)
749
\n
(B)
\
\
2,675
\n
(C)
2,945
\n
(D)
4,250
\n
A:
Let's
think
step
by
step.
We
can
calculate
107
\
\
x
25
=
(100
x
25)
+
(7
x
25)
=
2500
+
175
=
2675.
The
answer
is
(B).
\n\n
Q:
A
total
\
\
of
30
players
will
play
basketball
at
a
park.
There
will
be
exactly
5
players
\
\
on
each
team.
Which
statement
correctly
explains
how
to
find
the
number
of
teams
\
\
needed?
\n
(A)
Add
5
to
30
to
find
35
teams.
\n
(B)
Divide
30
by
5
to
find
6
teams.
\n\
(C)
Multiply
30
and
5
to
find
150
teams.
\n
(D)
Subtract
5
from
30
to
find
25
teams.
\n\
A:
Let's
think
step
by
step.
We
want
to
find
the
number
of
teams.
We
know
that
there
\
\
are
5
players/team,
and
30
players.
Thus
to
get
the
number
of
teams
we
divide
\
\
players
by
players/team,
so
30
players
/
5
players/team
=
6
teams.
The
answer
\
\
is
(B).
\n\n
Q:
Which
expression
is
equivalent
to
5
x
9?
\n
(A)
(5
x
4)
x
(6
x
5)
\n\
(B)
(5
x
5)
+
(5
x
4)
\n
(C)
(5
x
5)
+
(5
x
9)
\n
(D)
(5
x
9)
x
(6
x
9)
\n
A:
Let's
think
\
\
step
by
step.
We
know
that
9
=
(5
+
4),
so
5
x
9
=
5
x
(5
+
4)
=
(5
x
5)
+
(5
\
\
x
4).
The
answer
is
(B)."
include
:
_mmlu_flan_cot_fewshot_template_yaml
task
:
mmlu_flan_cot_fewshot_elementary_mathematics
\
×
25.
How
many
cans
of
paint
does
the
store
have
in
storage?
\n
(A)
749
\n
(B)
2,675
\n\
(C)
2,945
\n
(D)
4,250
\n
A:
Let's
think
step
by
step.
We
can
calculate
107
x
25
=
(100
\
\
x
25)
+
(7
x
25)
=
2500
+
175
=
2675.
The
answer
is
(B).
\n\n
Q:
A
total
of
30
players
\
\
will
play
basketball
at
a
park.
There
will
be
exactly
5
players
on
each
team.
\
\
Which
statement
correctly
explains
how
to
find
the
number
of
teams
needed?
\n
(A)
\
\
Add
5
to
30
to
find
35
teams.
\n
(B)
Divide
30
by
5
to
find
6
teams.
\n
(C)
Multiply
\
\
30
and
5
to
find
150
teams.
\n
(D)
Subtract
5
from
30
to
find
25
teams.
\n
A:
Let's
\
\
think
step
by
step.
We
want
to
find
the
number
of
teams.
We
know
that
there
are
\
\
5
players/team,
and
30
players.
Thus
to
get
the
number
of
teams
we
divide
players
\
\
by
players/team,
so
30
players
/
5
players/team
=
6
teams.
The
answer
is
(B).
\n\
\n
Q:
Which
expression
is
equivalent
to
5
x
9?
\n
(A)
(5
x
4)
x
(6
x
5)
\n
(B)
(5
x
5)
\
\
+
(5
x
4)
\n
(C)
(5
x
5)
+
(5
x
9)
\n
(D)
(5
x
9)
x
(6
x
9)
\n
A:
Let's
think
step
by
\
\
step.
We
know
that
9
=
(5
+
4),
so
5
x
9
=
5
x
(5
+
4)
=
(5
x
5)
+
(5
x
4).
The
\
\
answer
is
(B)."
"
group"
:
"
mmlu_flan_cot_fewshot_stem"
"
include"
:
"
_mmlu_flan_cot_fewshot_template_yaml"
"
task"
:
"
mmlu_flan_cot_fewshot_elementary_mathematics"
Prev
1
2
3
4
5
…
12
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment