Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
c3f724cf
"092/benchmark_serving.py" did not exist on "77765e10b1637256b4ae6d4badeb22d2b5c51c26"
Commit
c3f724cf
authored
Feb 08, 2021
by
Leo Gao
Browse files
Change glue and superglue prompts
parent
1050109b
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
41 additions
and
31 deletions
+41
-31
lm_eval/tasks/glue.py
lm_eval/tasks/glue.py
+17
-17
lm_eval/tasks/superglue.py
lm_eval/tasks/superglue.py
+12
-12
lm_eval/utils.py
lm_eval/utils.py
+11
-1
main.py
main.py
+1
-1
No files found.
lm_eval/tasks/glue.py
View file @
c3f724cf
...
@@ -3,7 +3,7 @@ from lm_eval.base import rf, mean, f1_score, matthews_corrcoef
...
@@ -3,7 +3,7 @@ from lm_eval.base import rf, mean, f1_score, matthews_corrcoef
from
scipy.stats
import
pearsonr
,
spearmanr
from
scipy.stats
import
pearsonr
,
spearmanr
from
tqdm
import
auto
as
tqdm_lib
from
tqdm
import
auto
as
tqdm_lib
from
.
common
import
HFTask
,
yesno
from
.
common
import
HFTask
,
yesno
from
..utils
import
general_detokenize
# Single-Sentence Tasks
# Single-Sentence Tasks
...
@@ -22,10 +22,10 @@ class CoLA(HFTask):
...
@@ -22,10 +22,10 @@ class CoLA(HFTask):
return
True
return
True
def
fewshot_description
(
self
):
def
fewshot_description
(
self
):
return
"Does this sentence make sense?
:
\t
True or False
?
"
return
"Does this sentence make sense?
(
True or False
)
"
def
doc_to_text
(
self
,
doc
):
def
doc_to_text
(
self
,
doc
):
return
"
Sentence: {}
\n
Answer:"
.
format
(
doc
[
"sentence"
])
return
"
{}
\n
Question: Does this sentence make sense?
\n
Answer:"
.
format
(
doc
[
"sentence"
])
def
doc_to_target
(
self
,
doc
):
def
doc_to_target
(
self
,
doc
):
return
" {}"
.
format
({
1
:
"True"
,
0
:
"False"
}[
doc
[
"label"
]])
return
" {}"
.
format
({
1
:
"True"
,
0
:
"False"
}[
doc
[
"label"
]])
...
@@ -71,8 +71,8 @@ class SST(HFTask):
...
@@ -71,8 +71,8 @@ class SST(HFTask):
return
"Indicate if each sentence is Positive or Negative."
return
"Indicate if each sentence is Positive or Negative."
def
doc_to_text
(
self
,
doc
):
def
doc_to_text
(
self
,
doc
):
return
"
sentence:
\t
{}
\t
\n
a
nswer:"
.
format
(
return
"
{}
\n
Question: Is this sentence Positive or Negative?
\n
A
nswer:"
.
format
(
doc
[
"sentence"
],
general_detokenize
(
doc
[
"sentence"
]
)
,
)
)
def
doc_to_target
(
self
,
doc
):
def
doc_to_target
(
self
,
doc
):
...
@@ -127,9 +127,9 @@ class MNLI(HFTask):
...
@@ -127,9 +127,9 @@ class MNLI(HFTask):
return
self
.
data
[
"test_matched"
]
return
self
.
data
[
"test_matched"
]
def
doc_to_text
(
self
,
doc
):
def
doc_to_text
(
self
,
doc
):
return
"{}
\n
q
uestion:
\t
{}
\t
True, False or Neither?
\n
a
nswer:"
.
format
(
return
"{}
\n
Q
uestion:
{}
True, False or Neither?
\n
A
nswer:"
.
format
(
doc
[
"premise"
],
doc
[
"premise"
],
doc
[
"hypothesis"
],
doc
[
"hypothesis"
]
+
(
''
if
doc
[
"hypothesis"
].
endswith
(
'.'
)
else
'.'
)
,
)
)
def
doc_to_target
(
self
,
doc
):
def
doc_to_target
(
self
,
doc
):
...
@@ -187,7 +187,7 @@ class QNLI(HFTask):
...
@@ -187,7 +187,7 @@ class QNLI(HFTask):
return
True
return
True
def
doc_to_text
(
self
,
doc
):
def
doc_to_text
(
self
,
doc
):
return
"
question:
\t
{}
\n
response:
\t
{}
\n
Does this answer the question
, Yes or No?
:"
.
format
(
return
"
{}
\n
{}
\n
Question:
Does this
response
answer the question
?
\n
Answer
:"
.
format
(
doc
[
"question"
],
doc
[
"question"
],
doc
[
"sentence"
],
doc
[
"sentence"
],
)
)
...
@@ -235,7 +235,7 @@ class WNLI(HFTask):
...
@@ -235,7 +235,7 @@ class WNLI(HFTask):
return
True
return
True
def
doc_to_text
(
self
,
doc
):
def
doc_to_text
(
self
,
doc
):
return
"{}
\n
q
uestion:
\t
{}
\t
True, False or Neither?
\n
a
nswer:"
.
format
(
return
"{}
\n
Q
uestion:
{}
True, False or Neither?
\n
A
nswer:"
.
format
(
doc
[
"sentence1"
],
doc
[
"sentence1"
],
doc
[
"sentence2"
],
doc
[
"sentence2"
],
)
)
...
@@ -284,7 +284,7 @@ class RTE(HFTask):
...
@@ -284,7 +284,7 @@ class RTE(HFTask):
return
True
return
True
def
doc_to_text
(
self
,
doc
):
def
doc_to_text
(
self
,
doc
):
return
"{}
\n
q
uestion:
\t
{}
\t
True or False?
\n
a
nswer:"
.
format
(
return
"{}
\n
Q
uestion:
{}
True or False?
\n
A
nswer:"
.
format
(
doc
[
"sentence1"
],
doc
[
"sentence1"
],
doc
[
"sentence2"
],
doc
[
"sentence2"
],
)
)
...
@@ -338,17 +338,17 @@ class MRPC(HFTask):
...
@@ -338,17 +338,17 @@ class MRPC(HFTask):
return
"Indicate if both sentences mean the same thing."
return
"Indicate if both sentences mean the same thing."
def
doc_to_text
(
self
,
doc
):
def
doc_to_text
(
self
,
doc
):
return
"
s
entence 1:
\t
{}
\n
s
entence 2:
\t
{}
\n
a
nswer:"
.
format
(
return
"
S
entence 1:
{}
\n
S
entence 2:
{}
\n
Question: Do both sentences mean the same thing?
\n
A
nswer:"
.
format
(
doc
[
"sentence1"
],
general_detokenize
(
doc
[
"sentence1"
]
)
,
doc
[
"sentence2"
],
general_detokenize
(
doc
[
"sentence2"
]
)
,
)
)
def
doc_to_target
(
self
,
doc
):
def
doc_to_target
(
self
,
doc
):
return
" {}"
.
format
(
yesno
(
doc
[
"label"
]))
return
" {}"
.
format
(
yesno
(
doc
[
"label"
]))
def
construct_requests
(
self
,
doc
,
ctx
):
def
construct_requests
(
self
,
doc
,
ctx
):
ll_yes
,
_
=
rf
.
loglikelihood
(
ctx
,
"
y
es"
)
ll_yes
,
_
=
rf
.
loglikelihood
(
ctx
,
"
Y
es"
)
ll_no
,
_
=
rf
.
loglikelihood
(
ctx
,
"
n
o"
)
ll_no
,
_
=
rf
.
loglikelihood
(
ctx
,
"
N
o"
)
return
ll_yes
,
ll_no
return
ll_yes
,
ll_no
def
process_results
(
self
,
doc
,
results
):
def
process_results
(
self
,
doc
,
results
):
...
@@ -390,7 +390,7 @@ class QQP(HFTask):
...
@@ -390,7 +390,7 @@ class QQP(HFTask):
return
"Indicate if both questions ask the same thing."
return
"Indicate if both questions ask the same thing."
def
doc_to_text
(
self
,
doc
):
def
doc_to_text
(
self
,
doc
):
return
"
q
uestion 1:
\t
{}
\n
q
uestion 2:
\t
{}
\n
a
nswer:"
.
format
(
return
"
Q
uestion 1:
{}
\n
Q
uestion 2:
{}
\n
Question: Do both questions ask the same thing?
\n
A
nswer:"
.
format
(
doc
[
"question1"
],
doc
[
"question1"
],
doc
[
"question2"
],
doc
[
"question2"
],
)
)
...
@@ -443,7 +443,7 @@ class STSB(HFTask):
...
@@ -443,7 +443,7 @@ class STSB(HFTask):
"where 5 means identical and 0 means unrelated."
"where 5 means identical and 0 means unrelated."
def
doc_to_text
(
self
,
doc
):
def
doc_to_text
(
self
,
doc
):
return
"sentence 1:
\t
{}
\n
sentence 2:
\t
{}
\n
a
nswer:"
.
format
(
return
"sentence 1:
{}
\n
sentence 2:
{}
\n
A
nswer:"
.
format
(
doc
[
"sentence1"
],
doc
[
"sentence1"
],
doc
[
"sentence2"
],
doc
[
"sentence2"
],
)
)
...
...
lm_eval/tasks/superglue.py
View file @
c3f724cf
...
@@ -28,7 +28,7 @@ class BoolQ(HFTask):
...
@@ -28,7 +28,7 @@ class BoolQ(HFTask):
return
"Read the following passages and answer each question with a yes or a no."
return
"Read the following passages and answer each question with a yes or a no."
def
doc_to_text
(
self
,
doc
):
def
doc_to_text
(
self
,
doc
):
return
f
"
{
doc
[
'passage'
]
}
\n
q
uestion:
{
doc
[
'question'
]
}
\n
a
nswer:"
return
f
"
{
doc
[
'passage'
]
}
\n
Q
uestion:
{
doc
[
'question'
]
}
\n
A
nswer:"
def
doc_to_target
(
self
,
doc
):
def
doc_to_target
(
self
,
doc
):
return
" "
+
yesno
(
doc
[
'label'
])
return
" "
+
yesno
(
doc
[
'label'
])
...
@@ -80,7 +80,7 @@ class CommitmentBank(HFTask):
...
@@ -80,7 +80,7 @@ class CommitmentBank(HFTask):
"to the truth of the hypothesis. The three possible labels are true, false or neither."
"to the truth of the hypothesis. The three possible labels are true, false or neither."
def
doc_to_text
(
self
,
doc
):
def
doc_to_text
(
self
,
doc
):
return
"{}
\n
q
uestion: {}
t
rue,
f
alse or
n
either?
\n
a
nswer:"
.
format
(
return
"{}
\n
Q
uestion: {}
. T
rue,
F
alse or
N
either?
\n
A
nswer:"
.
format
(
doc
[
"premise"
],
doc
[
"premise"
],
doc
[
"hypothesis"
],
doc
[
"hypothesis"
],
)
)
...
@@ -89,12 +89,12 @@ class CommitmentBank(HFTask):
...
@@ -89,12 +89,12 @@ class CommitmentBank(HFTask):
# True = entailment
# True = entailment
# False = contradiction
# False = contradiction
# Neither = neutral
# Neither = neutral
return
" {}"
.
format
({
0
:
"
t
rue"
,
1
:
"
n
either"
,
2
:
"
f
alse"
}[
doc
[
"label"
]])
return
" {}"
.
format
({
0
:
"
T
rue"
,
1
:
"
N
either"
,
2
:
"
F
alse"
}[
doc
[
"label"
]])
def
construct_requests
(
self
,
doc
,
ctx
):
def
construct_requests
(
self
,
doc
,
ctx
):
ll_true
,
_
=
rf
.
loglikelihood
(
ctx
,
'
t
rue'
)
ll_true
,
_
=
rf
.
loglikelihood
(
ctx
,
'
T
rue'
)
ll_neither
,
_
=
rf
.
loglikelihood
(
ctx
,
'
n
either'
)
ll_neither
,
_
=
rf
.
loglikelihood
(
ctx
,
'
N
either'
)
ll_false
,
_
=
rf
.
loglikelihood
(
ctx
,
'
f
alse'
)
ll_false
,
_
=
rf
.
loglikelihood
(
ctx
,
'
F
alse'
)
return
ll_true
,
ll_neither
,
ll_false
return
ll_true
,
ll_neither
,
ll_false
...
@@ -214,15 +214,15 @@ class MultiRC(HFTask):
...
@@ -214,15 +214,15 @@ class MultiRC(HFTask):
return
"READING COMPREHENSION ANSWER KEY"
return
"READING COMPREHENSION ANSWER KEY"
def
doc_to_text
(
self
,
doc
):
def
doc_to_text
(
self
,
doc
):
return
f
"
{
doc
[
'paragraph'
]
}
\n
\n
{
doc
[
'question'
]
}
\n
"
return
f
"
{
doc
[
'paragraph'
]
}
\n
Question:
{
doc
[
'question'
]
}
\n
Answer:
"
def
doc_to_target
(
self
,
doc
):
def
doc_to_target
(
self
,
doc
):
return
self
.
format_answer
(
answer
=
doc
[
"answer"
],
label
=
doc
[
"label"
])
return
self
.
format_answer
(
answer
=
doc
[
"answer"
],
label
=
doc
[
"label"
])
@
staticmethod
@
staticmethod
def
format_answer
(
answer
,
label
):
def
format_answer
(
answer
,
label
):
label_str
=
"
True
"
if
label
else
"
False
"
label_str
=
"
Yes
"
if
label
else
"
No
"
return
f
"
[
{
label_str
}
]
{
answer
}
"
return
f
"
{
label_str
}
,
{
answer
}
"
def
construct_requests
(
self
,
doc
,
ctx
):
def
construct_requests
(
self
,
doc
,
ctx
):
true_choice
=
self
.
format_answer
(
answer
=
doc
[
"answer"
],
label
=
True
)
true_choice
=
self
.
format_answer
(
answer
=
doc
[
"answer"
],
label
=
True
)
...
@@ -364,8 +364,8 @@ class WordsInContext(HFTask):
...
@@ -364,8 +364,8 @@ class WordsInContext(HFTask):
return
""
return
""
def
doc_to_text
(
self
,
doc
):
def
doc_to_text
(
self
,
doc
):
return
"
{}
\n
{}
\n
Question: Is the word '{}' used in the same way in the"
\
return
"
Sentence 1: {}
\n
Sentence 2:
{}
\n
Question: Is the word '{}' used in the same way in the"
\
" two sentences above?
\n
a
nswer:"
.
format
(
" two sentences above?
\n
A
nswer:"
.
format
(
doc
[
"sentence1"
],
doc
[
"sentence1"
],
doc
[
"sentence2"
],
doc
[
"sentence2"
],
doc
[
"sentence1"
][
doc
[
"start1"
]:
doc
[
"end1"
]],
doc
[
"sentence1"
][
doc
[
"start1"
]:
doc
[
"end1"
]],
...
@@ -438,7 +438,7 @@ class SGWinogradSchemaChallenge(HFTask):
...
@@ -438,7 +438,7 @@ class SGWinogradSchemaChallenge(HFTask):
# NOTE: HuggingFace span indices are word-based not character-based.
# NOTE: HuggingFace span indices are word-based not character-based.
pre
=
" "
.
join
(
raw_passage
.
split
()[:
doc
[
"span2_index"
]])
pre
=
" "
.
join
(
raw_passage
.
split
()[:
doc
[
"span2_index"
]])
post
=
raw_passage
[
len
(
pre
)
+
len
(
doc
[
"span2_text"
])
+
1
:]
post
=
raw_passage
[
len
(
pre
)
+
len
(
doc
[
"span2_text"
])
+
1
:]
passage
=
pre
+
" *{}*"
.
format
(
doc
[
'span2_text'
])
+
post
passage
=
general_detokenize
(
pre
+
" *{}*"
.
format
(
doc
[
'span2_text'
])
+
post
)
noun
=
doc
[
"span1_text"
]
noun
=
doc
[
"span1_text"
]
pronoun
=
doc
[
"span2_text"
]
pronoun
=
doc
[
"span2_text"
]
text
=
(
text
=
(
...
...
lm_eval/utils.py
View file @
c3f724cf
import
os
import
os
import
re
class
ExitCodeError
(
Exception
):
class
ExitCodeError
(
Exception
):
...
@@ -39,4 +40,13 @@ def chunks(iter, n):
...
@@ -39,4 +40,13 @@ def chunks(iter, n):
yield
arr
yield
arr
arr
=
[]
arr
=
[]
if
arr
:
yield
arr
if
arr
:
yield
arr
\ No newline at end of file
def general_detokenize(string):
    """Remove common tokenizer spacing artifacts from *string*.

    Applies a fixed sequence of textual clean-ups: re-attaches a detached
    "n't", removes the space just inside parentheses and double quotes, and
    removes a single space in front of an apostrophe, period, or comma.

    Args:
        string: The text to clean up.

    Returns:
        The cleaned-up text.
    """
    string = string.replace(" n't", "n't")
    string = string.replace(" )", ")")
    string = string.replace("( ", "(")
    # NOTE(review): whitespace inside the next two quote patterns was lost in
    # the extracted view; reconstructed as quote+space and space+quote
    # (otherwise both calls would be no-ops) -- confirm against the repository.
    string = string.replace("\" ", "\"")
    string = string.replace(" \"", "\"")
    # re.sub requires the subject string as its third argument; without it
    # the call raises TypeError before any substitution happens.
    string = re.sub(r" (['.,])", r"\1", string)
    return string
\ No newline at end of file
main.py
View file @
c3f724cf
...
@@ -16,7 +16,7 @@ def parse_args():
...
@@ -16,7 +16,7 @@ def parse_args():
parser
.
add_argument
(
'--model_args'
,
default
=
""
)
parser
.
add_argument
(
'--model_args'
,
default
=
""
)
parser
.
add_argument
(
'--tasks'
,
default
=
"all_tasks"
)
parser
.
add_argument
(
'--tasks'
,
default
=
"all_tasks"
)
parser
.
add_argument
(
'--provide_description'
,
action
=
"store_true"
)
parser
.
add_argument
(
'--provide_description'
,
action
=
"store_true"
)
parser
.
add_argument
(
'--num_fewshot'
,
type
=
int
,
default
=
1
)
parser
.
add_argument
(
'--num_fewshot'
,
type
=
int
,
default
=
0
)
parser
.
add_argument
(
'--seed'
,
type
=
int
,
default
=
1234
)
parser
.
add_argument
(
'--seed'
,
type
=
int
,
default
=
1234
)
parser
.
add_argument
(
'--output_path'
,
default
=
None
)
parser
.
add_argument
(
'--output_path'
,
default
=
None
)
parser
.
add_argument
(
'--limit'
,
type
=
int
,
default
=
None
)
parser
.
add_argument
(
'--limit'
,
type
=
int
,
default
=
None
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment