Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
e2909b33
Commit
e2909b33
authored
Feb 26, 2024
by
haileyschoelkopf
Browse files
transfer zeroshot improvements on bbh to fewshot
parent
96d185fa
Changes
56
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
399 additions
and
0 deletions
+399
-0
lm_eval/tasks/bbh/fewshot/logical_deduction_three_objects.yaml
...al/tasks/bbh/fewshot/logical_deduction_three_objects.yaml
+12
-0
lm_eval/tasks/bbh/fewshot/movie_recommendation.yaml
lm_eval/tasks/bbh/fewshot/movie_recommendation.yaml
+12
-0
lm_eval/tasks/bbh/fewshot/multistep_arithmetic_two.yaml
lm_eval/tasks/bbh/fewshot/multistep_arithmetic_two.yaml
+10
-0
lm_eval/tasks/bbh/fewshot/navigate.yaml
lm_eval/tasks/bbh/fewshot/navigate.yaml
+10
-0
lm_eval/tasks/bbh/fewshot/object_counting.yaml
lm_eval/tasks/bbh/fewshot/object_counting.yaml
+10
-0
lm_eval/tasks/bbh/fewshot/penguins_in_a_table.yaml
lm_eval/tasks/bbh/fewshot/penguins_in_a_table.yaml
+12
-0
lm_eval/tasks/bbh/fewshot/reasoning_about_colored_objects.yaml
...al/tasks/bbh/fewshot/reasoning_about_colored_objects.yaml
+12
-0
lm_eval/tasks/bbh/fewshot/ruin_names.yaml
lm_eval/tasks/bbh/fewshot/ruin_names.yaml
+12
-0
lm_eval/tasks/bbh/fewshot/salient_translation_error_detection.yaml
...asks/bbh/fewshot/salient_translation_error_detection.yaml
+12
-0
lm_eval/tasks/bbh/fewshot/snarks.yaml
lm_eval/tasks/bbh/fewshot/snarks.yaml
+12
-0
lm_eval/tasks/bbh/fewshot/sports_understanding.yaml
lm_eval/tasks/bbh/fewshot/sports_understanding.yaml
+13
-0
lm_eval/tasks/bbh/fewshot/temporal_sequences.yaml
lm_eval/tasks/bbh/fewshot/temporal_sequences.yaml
+12
-0
lm_eval/tasks/bbh/fewshot/tracking_shuffled_objects_five_objects.yaml
...s/bbh/fewshot/tracking_shuffled_objects_five_objects.yaml
+12
-0
lm_eval/tasks/bbh/fewshot/tracking_shuffled_objects_seven_objects.yaml
.../bbh/fewshot/tracking_shuffled_objects_seven_objects.yaml
+12
-0
lm_eval/tasks/bbh/fewshot/tracking_shuffled_objects_three_objects.yaml
.../bbh/fewshot/tracking_shuffled_objects_three_objects.yaml
+12
-0
lm_eval/tasks/bbh/fewshot/utils.py
lm_eval/tasks/bbh/fewshot/utils.py
+224
-0
No files found.
lm_eval/tasks/bbh/fewshot/logical_deduction_three_objects.yaml
View file @
e2909b33
...
...
@@ -3,3 +3,15 @@
"
doc_to_text"
:
"
Q:
The
following
paragraphs
each
describe
a
set
of
three
objects
arranged
in
a
fixed
order.
The
statements
are
logically
consistent
within
each
paragraph.
In
a
golf
tournament,
there
were
three
golfers:
Amy,
Eli,
and
Eve.
Eve
finished
above
Amy.
Eli
finished
below
Amy.
\n
Options:
\n
(A)
Amy
finished
last
\n
(B)
Eli
finished
last
\n
(C)
Eve
finished
last
\n
A:
(B)
\n\n
Q:
The
following
paragraphs
each
describe
a
set
of
three
objects
arranged
in
a
fixed
order.
The
statements
are
logically
consistent
within
each
paragraph.
On
a
shelf,
there
are
three
books:
a
white
book,
a
green
book,
and
an
orange
book.
The
green
book
is
to
the
right
of
the
white
book.
The
orange
book
is
the
rightmost.
\n
Options:
\n
(A)
The
white
book
is
the
leftmost
\n
(B)
The
green
book
is
the
leftmost
\n
(C)
The
orange
book
is
the
leftmost
\n
A:
(A)
\n\n
Q:
The
following
paragraphs
each
describe
a
set
of
three
objects
arranged
in
a
fixed
order.
The
statements
are
logically
consistent
within
each
paragraph.
On
a
shelf,
there
are
three
books:
a
red
book,
a
gray
book,
and
a
white
book.
The
white
book
is
to
the
left
of
the
gray
book.
The
red
book
is
the
second
from
the
left.
\n
Options:
\n
(A)
The
red
book
is
the
leftmost
\n
(B)
The
gray
book
is
the
leftmost
\n
(C)
The
white
book
is
the
leftmost
\n
A:
(C)
\n\n
Q:
{{input}}
\n
A:"
"
include"
:
"
_fewshot_template_yaml"
"
task"
:
"
bbh_fewshot_logical_deduction_three_objects"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.MultiChoiceRegexFilter
group_select
:
0
ignore_case
:
true
ignore_punctuation
:
true
regex_pattern
:
"
(
\\
([A-Z]
\\
))"
-
function
:
"
take_first"
lm_eval/tasks/bbh/fewshot/movie_recommendation.yaml
View file @
e2909b33
...
...
@@ -3,3 +3,15 @@
"
doc_to_text"
:
"
Q:
Find
a
movie
similar
to
Star
Wars
Episode
IV
-
A
New
Hope,
Indiana
Jones
and
the
Last
Crusade,
Star
Wars
Episode
V
-
The
Empire
Strikes
Back,
The
Big
Lebowski:
\n
Options:
\n
(A)
Tetsuo
\n
(B)
the
Ironman
\n
(C)
The
Princess
Bride
\n
(D)
The
Barkley
Marathons
The
Race
That
Eats
Its
Young
\n
(E)
Bug
\n
A:
(C)
\n\n
Q:
Find
a
movie
similar
to
Twister,
The
Silence
of
the
Lambs,
Independence
Day,
Braveheart:
\n
Options:
\n
(A)
They
Shoot
Horses
\n
(B)
Don't
They
\n
(C)
Forrest
Gump
\n
(D)
The
Salton
Sea
\n
(E)
Extreme
Days
\n
A:
(C)
\n\n
Q:
Find
a
movie
similar
to
Minority
Report,
Total
Recall,
Inside
Out,
Forrest
Gump:
\n
Options:
\n
(A)
Phenomena
\n
(B)
Lilting
\n
(C)
Catwoman
\n
(D)
Edge
of
Tomorrow
\n
A:
(D)
\n\n
Q:
{{input}}
\n
A:"
"
include"
:
"
_fewshot_template_yaml"
"
task"
:
"
bbh_fewshot_movie_recommendation"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.MultiChoiceRegexFilter
group_select
:
0
ignore_case
:
true
ignore_punctuation
:
true
regex_pattern
:
"
(
\\
([A-Z]
\\
))"
-
function
:
"
take_first"
lm_eval/tasks/bbh/fewshot/multistep_arithmetic_two.yaml
View file @
e2909b33
...
...
@@ -3,3 +3,13 @@
"
doc_to_text"
:
"
Q:
((-5
+
9
*
-4
-
0)
*
(4
+
-7
+
0
*
-5))
=
\n
A:
123
\n\n
Q:
((-9
*
7
*
7
*
-9)
+
(4
*
-9
-
8
-
-4))
=
\n
A:
3929
\n\n
Q:
((-3
+
5
*
8
*
-4)
-
(9
-
8
*
-7
+
-9))
=
\n
A:
-219
\n\n
Q:
{{input}}
\n
A:"
"
include"
:
"
_fewshot_template_yaml"
"
task"
:
"
bbh_fewshot_multistep_arithmetic_two"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.NumberParseRegexFilter
group_select
:
0
regex_pattern
:
"
([-0-9]+)"
-
function
:
"
take_first"
lm_eval/tasks/bbh/fewshot/navigate.yaml
View file @
e2909b33
...
...
@@ -3,3 +3,13 @@
"
doc_to_text"
:
"
Q:
If
you
follow
these
instructions,
do
you
return
to
the
starting
point?
Turn
left.
Turn
around.
Turn
left.
Take
7
steps.
Take
2
steps.
Take
4
steps.
Take
8
steps.
\n
Options:
\n
-
Yes
\n
-
No
\n
A:
No
\n\n
Q:
If
you
follow
these
instructions,
do
you
return
to
the
starting
point?
Turn
around.
Take
1
step.
Take
6
steps.
Turn
around.
Take
6
steps.
Take
9
steps.
Take
1
step.
\n
Options:
\n
-
Yes
\n
-
No
\n
A:
No
\n\n
Q:
If
you
follow
these
instructions,
do
you
return
to
the
starting
point?
Always
face
forward.
Take
2
steps
right.
Take
9
steps
left.
Take
7
steps
right.
\n
Options:
\n
-
Yes
\n
-
No
\n
A:
Yes
\n\n
Q:
{{input}}
\n
A:"
"
include"
:
"
_fewshot_template_yaml"
"
task"
:
"
bbh_fewshot_navigate"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
"
regex"
group_select
:
0
regex_pattern
:
"
\\
b(Yes|No|yes|no)
\\
b"
-
function
:
"
take_first"
lm_eval/tasks/bbh/fewshot/object_counting.yaml
View file @
e2909b33
...
...
@@ -3,3 +3,13 @@
"
doc_to_text"
:
"
Q:
I
have
a
blackberry,
a
clarinet,
a
nectarine,
a
plum,
a
strawberry,
a
banana,
a
flute,
an
orange,
and
a
violin.
How
many
fruits
do
I
have?
\n
A:
6
\n\n
Q:
I
have
an
orange,
a
raspberry,
two
peaches,
a
blackberry,
an
apple,
a
grape,
a
nectarine,
and
three
plums.
How
many
fruits
do
I
have?
\n
A:
11
\n\n
Q:
I
have
a
lettuce
head,
a
head
of
broccoli,
an
onion,
a
stalk
of
celery,
two
carrots,
a
garlic,
and
a
yam.
How
many
vegetables
do
I
have?
\n
A:
8
\n\n
Q:
{{input}}
\n
A:"
"
include"
:
"
_fewshot_template_yaml"
"
task"
:
"
bbh_fewshot_object_counting"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.NumberParseRegexFilter
group_select
:
0
regex_pattern
:
"
([-0-9]+)"
-
function
:
"
take_first"
lm_eval/tasks/bbh/fewshot/penguins_in_a_table.yaml
View file @
e2909b33
...
...
@@ -3,3 +3,15 @@
"
doc_to_text"
:
"
Q:
Here
is
a
table
where
the
first
line
is
a
header
and
each
subsequent
line
is
a
penguin:
name,
age,
height
(cm),
weight
(kg)
Louis,
7,
50,
11
Bernard,
5,
80,
13
Vincent,
9,
60,
11
Gwen,
8,
70,
15
For
example:
the
age
of
Louis
is
7,
the
weight
of
Gwen
is
15
kg,
the
height
of
Bernard
is
80
cm.
We
now
add
a
penguin
to
the
table:
\n
James,
12,
90,
12
\n
How
many
penguins
are
less
than
8
years
old?
\n
Options:
\n
(A)
1
\n
(B)
2
\n
(C)
3
\n
(D)
4
\n
(E)
5
\n
A:
(B)
\n\n
Q:
Here
is
a
table
where
the
first
line
is
a
header
and
each
subsequent
line
is
a
penguin:
name,
age,
height
(cm),
weight
(kg)
Louis,
7,
50,
11
Bernard,
5,
80,
13
Vincent,
9,
60,
11
Gwen,
8,
70,
15
For
example:
the
age
of
Louis
is
7,
the
weight
of
Gwen
is
15
kg,
the
height
of
Bernard
is
80
cm.
Which
is
the
youngest
penguin?
\n
Options:
\n
(A)
Louis
\n
(B)
Bernard
\n
(C)
Vincent
\n
(D)
Gwen
\n
(E)
James
\n
A:
(B)
\n\n
Q:
Here
is
a
table
where
the
first
line
is
a
header
and
each
subsequent
line
is
a
penguin:
name,
age,
height
(cm),
weight
(kg)
Louis,
7,
50,
11
Bernard,
5,
80,
13
Vincent,
9,
60,
11
Gwen,
8,
70,
15
For
example:
the
age
of
Louis
is
7,
the
weight
of
Gwen
is
15
kg,
the
height
of
Bernard
is
80
cm.
What
is
the
name
of
the
second
penguin
sorted
by
alphabetic
order?
\n
Options:
\n
(A)
Louis
\n
(B)
Bernard
\n
(C)
Vincent
\n
(D)
Gwen
\n
(E)
James
\n
A:
(D)
\n\n
Q:
{{input}}
\n
A:"
"
include"
:
"
_fewshot_template_yaml"
"
task"
:
"
bbh_fewshot_penguins_in_a_table"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.MultiChoiceRegexFilter
group_select
:
0
ignore_case
:
true
ignore_punctuation
:
true
regex_pattern
:
"
(
\\
([A-Z]
\\
))"
-
function
:
"
take_first"
lm_eval/tasks/bbh/fewshot/reasoning_about_colored_objects.yaml
View file @
e2909b33
...
...
@@ -3,3 +3,15 @@
"
doc_to_text"
:
"
Q:
On
the
nightstand,
there
is
a
red
pencil,
a
purple
mug,
a
burgundy
keychain,
a
fuchsia
teddy
bear,
a
black
plate,
and
a
blue
stress
ball.
What
color
is
the
stress
ball?
\n
Options:
\n
(A)
red
\n
(B)
orange
\n
(C)
yellow
\n
(D)
green
\n
(E)
blue
\n
(F)
brown
\n
(G)
magenta
\n
(H)
fuchsia
\n
(I)
mauve
\n
(J)
teal
\n
(K)
turquoise
\n
(L)
burgundy
\n
(M)
silver
\n
(N)
gold
\n
(O)
black
\n
(P)
grey
\n
(Q)
purple
\n
(R)
pink
\n
A:
(E)
\n\n
Q:
On
the
table,
you
see
a
bunch
of
objects
arranged
in
a
row:
a
purple
paperclip,
a
pink
stress
ball,
a
brown
keychain,
a
green
scrunchiephone
charger,
a
mauve
fidget
spinner,
and
a
burgundy
pen.
What
is
the
color
of
the
object
directly
to
the
right
of
the
stress
ball?
\n
Options:
\n
(A)
red
\n
(B)
orange
\n
(C)
yellow
\n
(D)
green
\n
(E)
blue
\n
(F)
brown
\n
(G)
magenta
\n
(H)
fuchsia
\n
(I)
mauve
\n
(J)
teal
\n
(K)
turquoise
\n
(L)
burgundy
\n
(M)
silver
\n
(N)
gold
\n
(O)
black
\n
(P)
grey
\n
(Q)
purple
\n
(R)
pink
\n
A:
(F)
\n\n
Q:
On
the
nightstand,
you
see
the
following
items
arranged
in
a
row:
a
teal
plate,
a
burgundy
keychain,
a
yellow
scrunchiephone
charger,
an
orange
mug,
a
pink
notebook,
and
a
grey
cup.
How
many
non-orange
items
do
you
see
to
the
left
of
the
teal
item?
\n
Options:
\n
(A)
zero
\n
(B)
one
\n
(C)
two
\n
(D)
three
\n
(E)
four
\n
(F)
five
\n
(G)
six
\n
A:
(A)
\n\n
Q:
{{input}}
\n
A:"
"
include"
:
"
_fewshot_template_yaml"
"
task"
:
"
bbh_fewshot_reasoning_about_colored_objects"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.MultiChoiceRegexFilter
group_select
:
0
ignore_case
:
true
ignore_punctuation
:
true
regex_pattern
:
"
(
\\
([A-Z]
\\
))"
-
function
:
"
take_first"
lm_eval/tasks/bbh/fewshot/ruin_names.yaml
View file @
e2909b33
...
...
@@ -3,3 +3,15 @@
"
doc_to_text"
:
"
Q:
Which
of
the
following
is
a
humorous
edit
of
this
artist
or
movie
name:
'whitesnake'?
\n
Options:
\n
(A)
whitesnape
\n
(B)
whitesnapke
\n
(C)
whitesnuake
\n
(D)
mwhitesnake
\n
A:
(A)
\n\n
Q:
Which
of
the
following
is
a
humorous
edit
of
this
artist
or
movie
name:
'one
of
our
dinosaurs
is
missing'?
\n
Options:
\n
(A)
ofne
of
our
dinosaurs
is
missing
\n
(B)
one
af
our
dinosaurs
is
missing
\n
(C)
one
of
our
dinosaurs
is
pissing
\n
(D)
one
of
our
dinosaur
is
missing
\n
A:
(C)
\n\n
Q:
Which
of
the
following
is
a
humorous
edit
of
this
artist
or
movie
name:
'counting
crows'?
\n
Options:
\n
(A)
countingy
crows
\n
(B)
counting
cows
\n
(C)
courting
crows
\n
(D)
coutnting
crows
\n
A:
(B)
\n\n
Q:
{{input}}
\n
A:"
"
include"
:
"
_fewshot_template_yaml"
"
task"
:
"
bbh_fewshot_ruin_names"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.MultiChoiceRegexFilter
group_select
:
0
ignore_case
:
true
ignore_punctuation
:
true
regex_pattern
:
"
(
\\
([A-Z]
\\
))"
-
function
:
"
take_first"
lm_eval/tasks/bbh/fewshot/salient_translation_error_detection.yaml
View file @
e2909b33
...
...
@@ -3,3 +3,15 @@
"
doc_to_text"
:
"
Q:
The
following
translations
from
German
to
English
contain
a
particular
error.
That
error
will
be
one
of
the
following
types:
Named
Entities:
An
entity
(names,
places,
locations,
etc.)
is
changed
to
a
different
entity.
Numerical
Values:
Numerical
values
(ordinals
or
cardinals),
dates,
and/or
units
are
changed.
Modifiers
or
Adjectives:
The
modifiers
and
adjectives
pertaining
to
a
noun
are
changed.
Negation
or
Antonyms:
Introduce
or
remove
a
negation
or
change
comparatives
to
their
antonyms.
Facts:
Trivial
factual
errors
not
pertaining
to
the
above
classes
are
introduced
in
the
translations.
Dropped
Content:
A
significant
clause
in
the
translation
is
removed.
Please
identify
that
error.
Source:
In
der
Liste
der
Baudenkmale
in
Lenzen
(Elbe)
sind
alle
Baudenkmale
der
brandenburgischen
Stadt
Lenzen
(Elbe)
und
ihrer
Ortsteile
aufgelistet.
\n
Translation:
In
the
list
of
architectural
monuments
in
Lenzen
all
architectural
monuments
of
the
Brandenburg
city
of
Lenzen
and
its
districts
are
listed.
\n
The
translation
contains
an
error
pertaining
to
\n
Options:
\n
(A)
Modifiers
or
Adjectives
\n
(B)
Numerical
Values
\n
(C)
Negation
or
Antonyms
\n
(D)
Named
Entities
\n
(E)
Dropped
Content
\n
(F)
Facts
\n
A:
(D)
\n\n
Q:
The
following
translations
from
German
to
English
contain
a
particular
error.
That
error
will
be
one
of
the
following
types:
Named
Entities:
An
entity
(names,
places,
locations,
etc.)
is
changed
to
a
different
entity.
Numerical
Values:
Numerical
values
(ordinals
or
cardinals),
dates,
and/or
units
are
changed.
Modifiers
or
Adjectives:
The
modifiers
and
adjectives
pertaining
to
a
noun
are
changed.
Negation
or
Antonyms:
Introduce
or
remove
a
negation
or
change
comparatives
to
their
antonyms.
Facts:
Trivial
factual
errors
not
pertaining
to
the
above
classes
are
introduced
in
the
translations.
Dropped
Content:
A
significant
clause
in
the
translation
is
removed.
Please
identify
that
error.
Source:
Auf
dieser
Seite
sind
die
Baudenkmäler
der
oberbayerischen
Großen
Kreisstadt
Landsberg
am
Lech
zusammengestellt.
\n
Translation:
On
this
page
are
compiled
the
architectural
monuments
of
the
town
of
Landsberg
am
Lech.
\n
The
translation
contains
an
error
pertaining
to
\n
Options:
\n
(A)
Modifiers
or
Adjectives
\n
(B)
Numerical
Values
\n
(C)
Negation
or
Antonyms
\n
(D)
Named
Entities
\n
(E)
Dropped
Content
\n
(F)
Facts
\n
A:
(E)
\n\n
Q:
The
following
translations
from
German
to
English
contain
a
particular
error.
That
error
will
be
one
of
the
following
types:
Named
Entities:
An
entity
(names,
places,
locations,
etc.)
is
changed
to
a
different
entity.
Numerical
Values:
Numerical
values
(ordinals
or
cardinals),
dates,
and/or
units
are
changed.
Modifiers
or
Adjectives:
The
modifiers
and
adjectives
pertaining
to
a
noun
are
changed.
Negation
or
Antonyms:
Introduce
or
remove
a
negation
or
change
comparatives
to
their
antonyms.
Facts:
Trivial
factual
errors
not
pertaining
to
the
above
classes
are
introduced
in
the
translations.
Dropped
Content:
A
significant
clause
in
the
translation
is
removed.
Please
identify
that
error.
Source:
Łeba
ist
eine
Kleinstadt
und
ein
Badeort
im
Powiat
Lęborski
der
polnischen
Woiwodschaft
Pommern.
\n
Translation:
Eba
is
not
a
small
town
and
seaside
resort
in
the
Powiat
Léborski
county
of
the
Pomeranian
Voivodeship
of
Poland.
\n
The
translation
contains
an
error
pertaining
to
\n
Options:
\n
(A)
Modifiers
or
Adjectives
\n
(B)
Numerical
Values
\n
(C)
Negation
or
Antonyms
\n
(D)
Named
Entities
\n
(E)
Dropped
Content
\n
(F)
Facts
\n
A:
(C)
\n\n
Q:
{{input}}
\n
A:"
"
include"
:
"
_fewshot_template_yaml"
"
task"
:
"
bbh_fewshot_salient_translation_error_detection"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.MultiChoiceRegexFilter
group_select
:
0
ignore_case
:
true
ignore_punctuation
:
true
regex_pattern
:
"
(
\\
([A-Z]
\\
))"
-
function
:
"
take_first"
lm_eval/tasks/bbh/fewshot/snarks.yaml
View file @
e2909b33
...
...
@@ -3,3 +3,15 @@
"
doc_to_text"
:
"
Q:
Which
statement
is
sarcastic?
\n
Options:
\n
(A)
Yes,
because
having
interests
and
actively
researching
them
is
a
huge
waste
\n
(B)
Yes,
because
having
interests
and
actively
researching
them
is
a
huge
deal
\n
A:
(A)
\n\n
Q:
Which
statement
is
sarcastic?
\n
Options:
\n
(A)
No
one
is
going
to
disagree
with
you
on
this.
Avoiding
ad
hominem
attacks
really
help
your
case
\n
(B)
No
one
is
going
to
disagree
with
you
on
this.
Ad
hominem
attacks
really
help
your
case
\n
A:
(B)
\n\n
Q:
Which
statement
is
sarcastic?
\n
Options:
\n
(A)
Consistency
in
the
league's
punishments?
What
do
you
think
this
is
supposed
to
be,
politics?
\n
(B)
Consistency
in
the
league's
punishments?
What
do
you
think
this
is
supposed
to
be,
moral?
\n
A:
(A)
\n\n
Q:
{{input}}
\n
A:"
"
include"
:
"
_fewshot_template_yaml"
"
task"
:
"
bbh_fewshot_snarks"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.MultiChoiceRegexFilter
group_select
:
0
ignore_case
:
true
ignore_punctuation
:
true
regex_pattern
:
"
(
\\
([A-Z]
\\
))"
-
function
:
"
take_first"
lm_eval/tasks/bbh/fewshot/sports_understanding.yaml
View file @
e2909b33
...
...
@@ -3,3 +3,16 @@
"
doc_to_text"
:
"
Q:
Is
the
following
sentence
plausible?
\"
Bam
Adebayo
scored
a
reverse
layup
in
the
Western
Conference
Finals.
\"\n
A:
yes
\n\n
Q:
Is
the
following
sentence
plausible?
\"
Santi
Cazorla
scored
a
touchdown.
\"\n
A:
no
\n\n
Q:
Is
the
following
sentence
plausible?
\"
DeMar
DeRozan
was
called
for
the
goal
tend.
\"\n
A:
yes
\n\n
Q:
{{input}}
\n
A:"
"
include"
:
"
_fewshot_template_yaml"
"
task"
:
"
bbh_fewshot_sports_understanding"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.MapRegexFilter
group_select
:
0
ignore_case
:
true
regex_pattern_to_value
:
\b(no|not plausible)\b
:
"
no"
\b(yes|plausible)\b
:
"
yes"
-
function
:
"
take_first"
lm_eval/tasks/bbh/fewshot/temporal_sequences.yaml
View file @
e2909b33
...
...
@@ -3,3 +3,15 @@
"
doc_to_text"
:
"
Q:
Today,
Emily
went
to
the
museum.
Between
what
times
could
they
have
gone?
\n
We
know
that:
\n
Emily
woke
up
at
1pm.
\n
Elizabeth
saw
Emily
reading
at
the
library
from
2pm
to
4pm.
\n
Jessica
saw
Emily
watching
a
movie
at
the
theater
from
4pm
to
5pm.
\n
Leslie
saw
Emily
waiting
at
the
airport
from
5pm
to
6pm.
\n
William
saw
Emily
buying
clothes
at
the
mall
from
6pm
to
7pm.
\n
The
museum
was
closed
after
7pm.
\n
Between
what
times
could
Emily
have
gone
to
the
museum?
\n
Options:
\n
(A)
1pm
to
2pm
\n
(B)
6pm
to
7pm
\n
(C)
5pm
to
6pm
\n
(D)
2pm
to
4pm
\n
A:
(A)
\n\n
Q:
Today,
Elizabeth
went
to
the
amusement
park.
Between
what
times
could
they
have
gone?
\n
We
know
that:
\n
Elizabeth
woke
up
at
7am.
\n
David
saw
Elizabeth
fixing
their
computer
at
the
electronic
store
from
1pm
to
2pm.
\n
Sarah
saw
Elizabeth
playing
tennis
at
the
tennis
court
from
2pm
to
3pm.
\n
Susan
saw
Elizabeth
walking
towards
the
Statue
of
Liberty
from
3pm
to
6pm.
\n
Andrew
saw
Elizabeth
taking
photos
near
the
Eiffel
Tower
from
6pm
to
9pm.
\n
Emily
saw
Elizabeth
getting
a
coffee
at
the
cafe
from
9pm
to
10pm.
\n
The
amusement
park
was
closed
after
10pm.
\n
Between
what
times
could
Elizabeth
have
gone
to
the
amusement
park?
\n
Options:
\n
(A)
7am
to
1pm
\n
(B)
9pm
to
10pm
\n
(C)
1pm
to
2pm
\n
(D)
3pm
to
6pm
\n
A:
(A)
\n\n
Q:
Today,
Tiffany
went
to
the
beach.
Between
what
times
could
they
have
gone?
\n
We
know
that:
\n
Tiffany
woke
up
at
5am.
\n
Betty
saw
Tiffany
getting
a
coffee
at
the
cafe
from
5am
to
6am.
\n
Jessica
saw
Tiffany
working
at
the
office
from
6am
to
9am.
\n
John
saw
Tiffany
stretching
at
a
yoga
studio
from
9am
to
12pm.
\n
Sean
saw
Tiffany
sitting
on
a
rooftop
from
12pm
to
2pm.
\n
Sarah
saw
Tiffany
playing
tennis
at
the
tennis
court
from
2pm
to
3pm.
\n
The
beach
was
closed
after
4pm.
\n
Between
what
times
could
Tiffany
have
gone
to
the
beach?
\n
Options:
\n
(A)
9am
to
12pm
\n
(B)
12pm
to
2pm
\n
(C)
5am
to
6am
\n
(D)
3pm
to
4pm
\n
A:
(D)
\n\n
Q:
{{input}}
\n
A:"
"
include"
:
"
_fewshot_template_yaml"
"
task"
:
"
bbh_fewshot_temporal_sequences"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.MultiChoiceRegexFilter
group_select
:
0
ignore_case
:
true
ignore_punctuation
:
true
regex_pattern
:
"
(
\\
([A-Z]
\\
))"
-
function
:
"
take_first"
lm_eval/tasks/bbh/fewshot/tracking_shuffled_objects_five_objects.yaml
View file @
e2909b33
...
...
@@ -3,3 +3,15 @@
"
doc_to_text"
:
"
Q:
Alice,
Bob,
and
Claire
are
playing
a
game.
At
the
start
of
the
game,
they
are
each
holding
a
ball:
Alice
has
a
yellow
ball,
Bob
has
a
blue
ball,
and
Claire
has
a
pink
ball.
\n
As
the
game
progresses,
pairs
of
players
trade
balls.
First,
Claire
and
Alice
swap
balls.
Then,
Alice
and
Bob
swap
balls.
Finally,
Claire
and
Bob
swap
balls.
At
the
end
of
the
game,
Bob
has
the
\n
Options:
\n
(A)
yellow
ball
\n
(B)
blue
ball
\n
(C)
pink
ball
\n
A:
(A)
\n\n
Q:
Alice,
Bob,
and
Claire
are
playing
a
game.
At
the
start
of
the
game,
they
are
each
holding
a
ball:
Alice
has
a
white
ball,
Bob
has
a
purple
ball,
and
Claire
has
a
pink
ball.
\n
As
the
game
progresses,
pairs
of
players
trade
balls.
First,
Bob
and
Alice
swap
balls.
Then,
Bob
and
Claire
swap
balls.
Finally,
Bob
and
Alice
swap
balls.
At
the
end
of
the
game,
Alice
has
the
\n
Options:
\n
(A)
white
ball
\n
(B)
purple
ball
\n
(C)
pink
ball
\n
A:
(C)
\n\n
Q:
Alice,
Bob,
and
Claire
are
dancers
at
a
square
dance.
At
the
start
of
a
song,
they
each
have
a
partner:
Alice
is
dancing
with
Lola,
Bob
is
dancing
with
Rodrigo,
and
Claire
is
dancing
with
Patrick.
\n
Throughout
the
song,
the
dancers
often
trade
partners.
First,
Alice
and
Bob
switch
partners.
Then,
Claire
and
Bob
switch
partners.
Finally,
Bob
and
Alice
switch
partners.
At
the
end
of
the
dance,
Alice
is
dancing
with
\n
Options:
\n
(A)
Lola
\n
(B)
Rodrigo
\n
(C)
Patrick
\n
A:
(C)
\n\n
Q:
{{input}}
\n
A:"
"
include"
:
"
_fewshot_template_yaml"
"
task"
:
"
bbh_fewshot_tracking_shuffled_objects_five_objects"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.MultiChoiceRegexFilter
group_select
:
0
ignore_case
:
true
ignore_punctuation
:
true
regex_pattern
:
"
(
\\
([A-Z]
\\
))"
-
function
:
"
take_first"
lm_eval/tasks/bbh/fewshot/tracking_shuffled_objects_seven_objects.yaml
View file @
e2909b33
...
...
@@ -3,3 +3,15 @@
"
doc_to_text"
:
"
Q:
Alice,
Bob,
and
Claire
are
playing
a
game.
At
the
start
of
the
game,
they
are
each
holding
a
ball:
Alice
has
a
yellow
ball,
Bob
has
a
blue
ball,
and
Claire
has
a
pink
ball.
\n
As
the
game
progresses,
pairs
of
players
trade
balls.
First,
Claire
and
Alice
swap
balls.
Then,
Alice
and
Bob
swap
balls.
Finally,
Claire
and
Bob
swap
balls.
At
the
end
of
the
game,
Bob
has
the
\n
Options:
\n
(A)
yellow
ball
\n
(B)
blue
ball
\n
(C)
pink
ball
\n
A:
(A)
\n\n
Q:
Alice,
Bob,
and
Claire
are
playing
a
game.
At
the
start
of
the
game,
they
are
each
holding
a
ball:
Alice
has
a
white
ball,
Bob
has
a
purple
ball,
and
Claire
has
a
pink
ball.
\n
As
the
game
progresses,
pairs
of
players
trade
balls.
First,
Bob
and
Alice
swap
balls.
Then,
Bob
and
Claire
swap
balls.
Finally,
Bob
and
Alice
swap
balls.
At
the
end
of
the
game,
Alice
has
the
\n
Options:
\n
(A)
white
ball
\n
(B)
purple
ball
\n
(C)
pink
ball
\n
A:
(C)
\n\n
Q:
Alice,
Bob,
and
Claire
are
dancers
at
a
square
dance.
At
the
start
of
a
song,
they
each
have
a
partner:
Alice
is
dancing
with
Lola,
Bob
is
dancing
with
Rodrigo,
and
Claire
is
dancing
with
Patrick.
\n
Throughout
the
song,
the
dancers
often
trade
partners.
First,
Alice
and
Bob
switch
partners.
Then,
Claire
and
Bob
switch
partners.
Finally,
Bob
and
Alice
switch
partners.
At
the
end
of
the
dance,
Alice
is
dancing
with
\n
Options:
\n
(A)
Lola
\n
(B)
Rodrigo
\n
(C)
Patrick
\n
A:
(C)
\n\n
Q:
{{input}}
\n
A:"
"
include"
:
"
_fewshot_template_yaml"
"
task"
:
"
bbh_fewshot_tracking_shuffled_objects_seven_objects"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.MultiChoiceRegexFilter
group_select
:
0
ignore_case
:
true
ignore_punctuation
:
true
regex_pattern
:
"
(
\\
([A-Z]
\\
))"
-
function
:
"
take_first"
lm_eval/tasks/bbh/fewshot/tracking_shuffled_objects_three_objects.yaml
View file @
e2909b33
...
...
@@ -3,3 +3,15 @@
"
doc_to_text"
:
"
Q:
Alice,
Bob,
and
Claire
are
playing
a
game.
At
the
start
of
the
game,
they
are
each
holding
a
ball:
Alice
has
a
yellow
ball,
Bob
has
a
blue
ball,
and
Claire
has
a
pink
ball.
\n
As
the
game
progresses,
pairs
of
players
trade
balls.
First,
Claire
and
Alice
swap
balls.
Then,
Alice
and
Bob
swap
balls.
Finally,
Claire
and
Bob
swap
balls.
At
the
end
of
the
game,
Bob
has
the
\n
Options:
\n
(A)
yellow
ball
\n
(B)
blue
ball
\n
(C)
pink
ball
\n
A:
(A)
\n\n
Q:
Alice,
Bob,
and
Claire
are
playing
a
game.
At
the
start
of
the
game,
they
are
each
holding
a
ball:
Alice
has
a
white
ball,
Bob
has
a
purple
ball,
and
Claire
has
a
pink
ball.
\n
As
the
game
progresses,
pairs
of
players
trade
balls.
First,
Bob
and
Alice
swap
balls.
Then,
Bob
and
Claire
swap
balls.
Finally,
Bob
and
Alice
swap
balls.
At
the
end
of
the
game,
Alice
has
the
\n
Options:
\n
(A)
white
ball
\n
(B)
purple
ball
\n
(C)
pink
ball
\n
A:
(C)
\n\n
Q:
Alice,
Bob,
and
Claire
are
dancers
at
a
square
dance.
At
the
start
of
a
song,
they
each
have
a
partner:
Alice
is
dancing
with
Lola,
Bob
is
dancing
with
Rodrigo,
and
Claire
is
dancing
with
Patrick.
\n
Throughout
the
song,
the
dancers
often
trade
partners.
First,
Alice
and
Bob
switch
partners.
Then,
Claire
and
Bob
switch
partners.
Finally,
Bob
and
Alice
switch
partners.
At
the
end
of
the
dance,
Alice
is
dancing
with
\n
Options:
\n
(A)
Lola
\n
(B)
Rodrigo
\n
(C)
Patrick
\n
A:
(C)
\n\n
Q:
{{input}}
\n
A:"
"
include"
:
"
_fewshot_template_yaml"
"
task"
:
"
bbh_fewshot_tracking_shuffled_objects_three_objects"
filter_list
:
-
name
:
"
strict-match"
filter
:
-
function
:
"
take_first"
-
name
:
"
flexible-extract"
filter
:
-
function
:
!function
utils.MultiChoiceRegexFilter
group_select
:
0
ignore_case
:
true
ignore_punctuation
:
true
regex_pattern
:
"
(
\\
([A-Z]
\\
))"
-
function
:
"
take_first"
lm_eval/tasks/bbh/fewshot/utils.py
0 → 100644
View file @
e2909b33
import
collections
import
re
import
sys
import
unicodedata
from
lm_eval.filters.extraction
import
Filter
,
RegexFilter
class ExtendedRegexFilter(RegexFilter):
    """Regex answer extractor with optional normalisation of the scanned text.

    On top of ``RegexFilter`` this class provides two helpers that the other
    filters in this file build on:

    * ``filter_ignores`` - removes configured regexes, lower-cases and/or
      strips punctuation from a string, depending on the constructor flags.
    * ``find_match`` - runs a compiled regex over a string and reduces the
      ``findall`` result to a single (optionally remapped) string.

    NOTE(review): ``self.regex``, ``self.group_select`` and ``self.fallback``
    are read in this file but never assigned here; presumably
    ``RegexFilter.__init__`` sets them - confirm against the base class.
    """

    # Table for ``str.translate``: every code point whose Unicode general
    # category starts with "P" (punctuation) maps to None, i.e. is deleted.
    # Built once when the class is created and shared by all instances.
    # ``sys.maxunicode`` is itself a valid code point, hence the ``+ 1``.
    punct_tbl = dict.fromkeys(
        i
        for i in range(sys.maxunicode + 1)
        if unicodedata.category(chr(i)).startswith("P")
    )

    def __init__(
        self,
        regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
        group_select=0,
        fallback: str = "[invalid]",
        ignore_case=False,
        ignore_punctuation=False,
        regexes_to_ignore=None,
    ) -> None:
        """Store the pattern and the normalisation switches.

        Args:
            regex_pattern: Pattern forwarded to ``RegexFilter.__init__``.
            group_select: Forwarded to ``RegexFilter.__init__``; ``find_match``
                uses ``self.group_select`` as an index into ``findall`` output.
            fallback: Forwarded to ``RegexFilter.__init__``; subclasses emit
                ``self.fallback`` when nothing can be extracted.
            ignore_case: If true, ``filter_ignores`` lower-cases its input.
            ignore_punctuation: If true, ``filter_ignores`` deletes every
                character listed in ``punct_tbl``.
            regexes_to_ignore: Optional iterable of patterns whose matches
                ``filter_ignores`` removes before the other steps.
        """
        super().__init__(regex_pattern, group_select, fallback)
        self.ignore_case = ignore_case
        self.ignore_punctuation = ignore_punctuation
        self.regexes_to_ignore = regexes_to_ignore

    def filter_ignores(self, st):
        """Return ``st`` normalised according to the constructor flags.

        The steps run in a fixed order: configured regexes are removed first,
        then the text is lower-cased, then punctuation is stripped.
        """
        if self.regexes_to_ignore is not None:
            for pattern in self.regexes_to_ignore:
                st = re.sub(pattern, "", st)

        if self.ignore_case:
            st = st.lower()

        if self.ignore_punctuation:
            # https://stackoverflow.com/a/266162
            st = st.translate(self.punct_tbl)
        return st

    def find_match(self, regex, resp, convert_dict=None):
        """Return the selected match of ``regex`` in ``resp`` as a string.

        Args:
            regex: Compiled pattern (any object exposing ``findall``).
            resp: Text to search.
            convert_dict: Optional mapping; when the stripped match is one of
                its keys, the mapped value is returned instead of the match.

        Returns:
            The (empty) ``findall`` list when the pattern does not match at
            all; an empty string when ``self.group_select`` is out of range
            for this response or the selected match has no non-empty group;
            otherwise the stripped match or its ``convert_dict`` replacement.
            Every "nothing found" outcome is falsy, so callers can simply test
            the result with ``not`` and substitute their fallback.
        """
        matches = regex.findall(resp)
        if not matches:
            return matches

        try:
            selected = matches[self.group_select]
        except IndexError:
            # This response has fewer matches than ``group_select`` asks for.
            # Report "no match" so the caller falls back instead of letting a
            # single short response abort the whole filtering pass.
            return ""

        if isinstance(selected, tuple):
            # Patterns with several groups yield one tuple per match; use the
            # first group that actually captured text ("" if none did).
            selected = next((group for group in selected if group), "")

        selected = selected.strip()
        if selected and convert_dict and selected in convert_dict:
            selected = convert_dict[selected]
        return selected
class MapRegexFilter(ExtendedRegexFilter):
    """Extract an answer with one of several regexes and map it to a fixed value."""

    def __init__(
        self,
        regex_pattern_to_value: dict = None,
        group_select=0,
        fallback: str = "[invalid]",
        ignore_case=False,
        ignore_punctuation=False,
        regexes_to_ignore=None,
    ) -> None:
        """
        regex_pattern_to_value: Match the regex pattern and change the result into the value.
            ``None`` (the default) is treated as an empty mapping.
        group_select: Selects the (group_select)th match from the findall result. We use the whole regex_patterns, concatenated by |
        ignore_case: Lowers the case of response before matching with the given regex
        ignore_punctuation: Remove the punctuation before matching with the given regex
        regexes_to_ignore: Remove these regexes before matching with the given regex
        """
        # Avoid a shared mutable default: build a fresh mapping per instance.
        if regex_pattern_to_value is None:
            regex_pattern_to_value = {}

        # The base pattern is the alternation of every key, so one findall
        # pass locates the first occurrence of *any* configured pattern.
        super().__init__(
            "|".join(regex_pattern_to_value),
            group_select,
            fallback,
            ignore_case,
            ignore_punctuation,
            regexes_to_ignore,
        )
        # Individually compiled patterns, used afterwards to decide which key
        # produced the match and therefore which value to emit.
        self.regex_to_value = {
            re.compile(pattern): value
            for pattern, value in regex_pattern_to_value.items()
        }

    def apply(self, resps, docs):
        """Map every response to the value of the first pattern that matches it.

        Args:
            resps: Iterable of response lists (one inner list per document).
            docs: Unused here; kept for the filter interface.

        Returns:
            A list of lists mirroring ``resps``. Each entry is the mapped value
            of the first pattern (in ``regex_to_value`` order) that matches
            the text selected by the combined pattern, or ``self.fallback``
            when nothing matches.
        """
        filtered_resps = []

        for r in resps:
            filtered = []

            for resp in r:
                # Start from the fallback so the result is always bound and a
                # legitimately falsy mapped value (0, "") is not mistaken for
                # "no match" later on.
                answer = self.fallback

                whole_match = self.find_match(self.regex, self.filter_ignores(resp))
                if whole_match:
                    # Loop-invariant: normalise the selected text only once.
                    candidate = self.filter_ignores(whole_match)
                    for regex, mapped_value in self.regex_to_value.items():
                        if self.find_match(regex, candidate):
                            answer = mapped_value
                            break

                filtered.append(answer)
            filtered_resps.append(filtered)

        return filtered_resps
class NumberParseRegexFilter(ExtendedRegexFilter):
    """Extract a number from a response, accepting digits or English number words."""

    def apply(self, resps, docs):
        """Reduce every response to a number string, or to ``self.fallback``.

        For each response the configured ``self.regex`` is tried first. If it
        finds nothing, the lower-cased response is searched for a number
        written in English words, which is converted with
        ``w2n.word_to_num`` and returned as ``str``.

        Args:
            resps: List in which each element is the list of model responses
                for one input/target pair; each inner list is processed
                independently and stays a list.
            docs: Unused here; kept for the filter interface.

        Returns:
            A list of lists mirroring ``resps``.
        """
        # Imported inside the method, as in the original code (presumably so
        # these third-party packages are only required when this filter is
        # actually used - confirm before moving them to module level).
        import regex
        from word2number import w2n

        # https://www.reddit.com/r/regex/comments/11a38uk/parsing_numbers_written_out_as_english_words
        # The literal spells the backslash of ``\S`` explicitly so Python does
        # not flag an invalid escape sequence; the resulting pattern text is
        # unchanged (the CR and LF inside the class stay literal characters).
        # NOTE(review): the pattern mixes ``(?|teen|ty)`` and ``(?:|teen|ty)``.
        # In the ``regex`` module ``(?|...)`` is a branch-reset group, so in
        # those positions the suffix is mandatory rather than optional. It
        # looks like some ``(?:|`` lost their colon - confirm against the
        # linked source before changing anything; kept verbatim here.
        english_number_regex = regex.compile(
            "((?:(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?:|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion)"
            "(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?:|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion|[^\\S\r\n]|,|and|&)+)?"
            "(?:zero|one|two|three|four|five|(?:twen|thir|for|fif|six|seven|nine)(?|teen|ty)|eight(?|een|y)|ten|eleven|twelve|fourteen|hundred|thousand|(?:m|b|tr)illion))"
        )

        filtered_resps = []
        for r in resps:
            filtered = []
            for resp in r:
                match = self.find_match(self.regex, resp)
                if not match:
                    words = self.find_match(english_number_regex, resp.lower())
                    if words:
                        try:
                            match = str(w2n.word_to_num(words))
                        except ValueError:
                            # The word pattern is looser than what
                            # word_to_num can interpret; treat an
                            # unparseable phrase as "no answer" instead of
                            # aborting the whole evaluation run.
                            match = None
                if not match:
                    match = self.fallback
                filtered.append(match)
            filtered_resps.append(filtered)

        return filtered_resps
class WordSortFilter(Filter):
    """Reduce each response to the task's own words, in the order it lists them.

    The candidate words are read from the document's ``"input"`` field: the
    whitespace-separated tokens that follow the ``"List:"`` marker.
    """

    def apply(self, resps, docs):
        """Keep only the task words found in each response.

        Every occurrence of a task word (matched as a whole word) is collected
        from the response. Duplicates are collapsed so that each word appears
        once, positioned at its *last* occurrence, and the result is joined
        with single spaces.

        Args:
            resps: Iterable of response lists, aligned with ``docs``.
            docs: Iterable of documents; each must provide ``doc["input"]``
                containing ``"List:"``.

        Returns:
            A list of lists of strings mirroring ``resps``.

        Raises:
            IndexError: If a document's input does not contain ``"List:"``.
        """
        filtered_resps = []

        for r, doc in zip(resps, docs):
            words = doc["input"].split("List:")[1].strip().split()
            # Escape each word so characters that are special in a regex
            # (".", "+", "(" ...) are matched literally instead of breaking
            # or silently changing the pattern.
            word_regex = re.compile(
                "|".join(rf"\b{re.escape(w)}\b" for w in words)
            )

            filtered = []
            for resp in r:
                found = word_regex.findall(resp)
                # De-duplicate from the back so that, once the order is
                # restored, each word sits where it was mentioned last.
                found.reverse()
                ordered_words = reversed(collections.OrderedDict.fromkeys(found))
                filtered.append(" ".join(ordered_words))
            filtered_resps.append(filtered)

        return filtered_resps
class MultiChoiceRegexFilter(ExtendedRegexFilter):
    """Extract a "(X)" multiple-choice answer, with two fallback strategies."""

    # Captures the text that follows each "(X)" option marker in the prompt,
    # up to (not including) the next newline, caret or opening parenthesis.
    # Constant, so it is compiled once instead of once per document.
    _CHOICE_TEXT_REGEX = re.compile(r"\([A-Z]\)([^\n^(]*)")

    def __init__(self, *args, **kwargs):
        r"""
        regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
                        - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
                        - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
        group_select: Selects the (group_select)th match from the findall result.
        ignore_case: Ignores the case during step 1 matching
        ignore_punctuation: Remove the punctuation during step 1 matching
        regexes_to_ignore: Remove these regexes during step 1 matching
        """
        super().__init__(*args, **kwargs)

    def apply(self, resps, docs):
        """Reduce every response to an option label such as ``"(B)"``.

        Per response, three attempts are made in order and the first truthy
        result wins:

        1. ``self.regex`` on the raw response.
        2. The normalised text of any option found in the normalised
           response, translated to that option's label.
        3. A bare letter after a colon (``": B"``), translated to ``"(B)"``.

        If all three fail, ``self.fallback`` is emitted.

        Labels are assigned by position: the first option marker found in
        ``doc["input"]`` is treated as "(A)", the second as "(B)", and so on,
        regardless of the letter actually written in the prompt.

        Args:
            resps: List in which each element is the list of model responses
                for one input/target pair; each inner list is processed
                independently and stays a list.
            docs: Documents aligned with ``resps``; each must provide
                ``doc["input"]`` containing the options.

        Returns:
            A list of lists of strings mirroring ``resps``.
        """
        filtered_resps = []

        for r, doc in zip(resps, docs):
            fallback_regexes = []
            choice_to_alpha = {}
            next_alpha = "A"

            without_paren_fallback_regexes = []
            without_paren_to_target = {}

            for m in self._CHOICE_TEXT_REGEX.findall(doc["input"]):
                # Strip again after normalising: removing punctuation can
                # expose trailing whitespace, and find_match() strips the
                # text it returns before looking it up in choice_to_alpha.
                m = self.filter_ignores(m.strip()).strip()
                if m:
                    # An empty alternative would match at position 0 of any
                    # response and hide every real choice, so skip it.
                    fallback_regexes.append(re.escape(m))
                    choice_to_alpha[m] = f"({next_alpha})"

                without_paren_fallback_regexes.append(next_alpha)
                without_paren_to_target[next_alpha] = f"({next_alpha})"

                next_alpha = chr(ord(next_alpha) + 1)

            fallback_regex = re.compile("|".join(fallback_regexes))
            letters = "|".join(without_paren_fallback_regexes)
            # Raw f-string: keeps ``\s`` as a regex escape without relying
            # on Python tolerating an invalid string escape sequence.
            without_paren_fallback_regex = re.compile(rf":[\s]*({letters})")

            filtered = []
            for resp in r:
                match = self.find_match(self.regex, resp)
                if not match:
                    match = self.find_match(
                        fallback_regex, self.filter_ignores(resp), choice_to_alpha
                    )
                    if not match:
                        match = self.find_match(
                            without_paren_fallback_regex, resp, without_paren_to_target
                        )
                if not match:
                    match = self.fallback
                filtered.append(match)
            filtered_resps.append(filtered)

        return filtered_resps
Prev
1
2
3
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment