gaoqiong / lm-evaluation-harness · Commits

Commit c64bf9a9
authored Oct 17, 2023 by lintangsutawika

change all mentions of `greedy_until` to `generate_until`

parent 04ca5671
Changes: 236 in total. This page shows 20 changed files with 54 additions and 122 deletions (+54 -122); the remaining files are listed on pages 2-12.
lm_eval/api/task.py  +9 -9
lm_eval/benchmarks/__init__.py  +0 -76
lm_eval/models/anthropic_llms.py  +3 -3
lm_eval/models/dummy.py  +1 -1
lm_eval/models/huggingface.py  +2 -2
lm_eval/models/openai_completions.py  +3 -3
lm_eval/models/textsynth.py  +8 -8
lm_eval/tasks/__init__.py  +1 -1
lm_eval/tasks/babi/babi.yaml  +1 -1
lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml  +1 -1
lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml  +1 -1
lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml  +1 -1
lm_eval/tasks/bbh/flan_zeroshot/_flan_zeroshot_template_yaml  +1 -1
lm_eval/tasks/benchmarks/flan/yaml_templates/cot_template_yaml  +1 -1
lm_eval/tasks/benchmarks/flan/yaml_templates/held_in_template_yaml  +1 -1
lm_eval/tasks/benchmarks/minerva_math.yaml  +0 -0
lm_eval/tasks/benchmarks/t0_eval.yaml  +10 -10
lm_eval/tasks/bigbench/generate_tasks.py  +2 -2
lm_eval/tasks/bigbench/generate_until/abstract_narrative_understanding.yaml  +4 -0
lm_eval/tasks/bigbench/generate_until/anachronisms.yaml  +4 -0
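The change is mechanical: the generation entry point on `LM` subclasses, the partial-cache key, and the `output_type` value in task configs are renamed from `greedy_until` to `generate_until`, with signatures and behavior left intact. As a rough illustration of what this means for a third-party model wrapper (not part of the commit; the `lm_eval.api.model` import path and the `MyAPILM` class are assumptions made for this sketch):

from typing import List

from lm_eval.api.model import LM  # assumed base-class location; not shown in this diff


class MyAPILM(LM):
    """Hypothetical wrapper, shown only to illustrate the renamed hook."""

    # This method was named `greedy_until` before this commit; the signature is unchanged.
    def generate_until(self, requests) -> List[str]:
        res = []
        for context, gen_kwargs in requests:
            # A real backend would generate text from `context` until one of the
            # stop sequences in gen_kwargs["until"] is produced; we return a stub.
            res.append("")
        return res

    def loglikelihood(self, requests):
        raise NotImplementedError()

    def loglikelihood_rolling(self, requests):
        raise NotImplementedError()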
lm_eval/api/task.py

@@ -44,7 +44,7 @@ ALL_OUTPUT_TYPES = [
     "loglikelihood",
     "multiple_choice",
     "loglikelihood_rolling",
-    "greedy_until",
+    "generate_until",
 ]

@@ -80,7 +80,7 @@ class TaskConfig(dict):
     num_fewshot: int = 0
     # scoring options
     metric_list: list = None
-    output_type: str = "greedy_until"
+    output_type: str = "generate_until"
     generation_kwargs: dict = None
     repeats: int = 1
     filter_list: Union[str, list] = None

@@ -97,11 +97,11 @@ class TaskConfig(dict):
             self.dataset_path = inspect.getfile(import_module(self.dataset_path))
         if self.generation_kwargs is not None:
-            if self.output_type != "greedy_until":
+            if self.output_type != "generate_until":
                 eval_logger.warning(
-                    f"[{self.task}] passed `generation_kwargs`, but not using `output_type: greedy_until`!"
+                    f"[{self.task}] passed `generation_kwargs`, but not using `output_type: generate_until`!"
                 )
-                assert self.output_type != "greedy_until"
+                assert self.output_type != "generate_until"
             if "temperature" in self.generation_kwargs:
                 self.generation_kwargs["temperature"] = float(

@@ -111,7 +111,7 @@ class TaskConfig(dict):
             if "until" not in self.generation_kwargs:
                 self.generation_kwargs["until"] = [self.fewshot_delimiter]
         else:
-            if self.output_type == "greedy_until":
+            if self.output_type == "generate_until":
                 # ensure that we greedily generate in absence of explicit arguments otherwise
                 self.generation_kwargs = {
                     "until": None

@@ -958,7 +958,7 @@ class ConfigurableTask(Task):
             )
             return request_list
-        elif self.OUTPUT_TYPE == "greedy_until":
+        elif self.OUTPUT_TYPE == "generate_until":
             arguments = (ctx, self.config.generation_kwargs)
         return Instance(

@@ -1070,7 +1070,7 @@ class ConfigurableTask(Task):
                 acc_mutual_info = 1.0 if np.argmax(lls_mutual_info) == gold else 0.0
                 result_dict["acc_mutual_info"] = acc_mutual_info
-        elif self.OUTPUT_TYPE == "greedy_until":
+        elif self.OUTPUT_TYPE == "generate_until":
             gold = self.doc_to_target(doc)
             result = results[0]
             if self.config.doc_to_choice is not None:

@@ -1134,7 +1134,7 @@ class ConfigurableTask(Task):
         else:
             raise ValueError(
                 f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
-                "'loglikelihood', 'loglikelihood_rolling', 'greedy_until' or 'multiple_choice'",
+                "'loglikelihood', 'loglikelihood_rolling', 'generate_until' or 'multiple_choice'",
             )
         return result_dict
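For task authors, the practical effect of these hunks is that YAML or dict configs must now declare `output_type: generate_until`, and `generation_kwargs` remain tied to that output type. A minimal sketch of such a config as a plain Python dict, using the TaskConfig field names visible above (the task name and concrete values are illustrative, not taken from the repository):

# Sketch of a task config as a plain dict, mirroring the TaskConfig fields above.
config = {
    "task": "my_generation_task",      # hypothetical task name
    "output_type": "generate_until",   # formerly "greedy_until"
    "generation_kwargs": {
        "until": ["\n\n"],             # defaults to [fewshot_delimiter] when omitted
        "temperature": 0.0,            # coerced to float in TaskConfig's post-init
    },
    "metric_list": [{"metric": "exact_match", "aggregation": "mean"}],
    "repeats": 1,
    "num_fewshot": 0,
}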
lm_eval/benchmarks/__init__.py (deleted, file mode 100644 → 0)

The file is removed in its entirety; its former contents were:

import os
import yaml

from lm_eval import utils
from lm_eval.tasks import register_configurable_task, check_prompt_config
from lm_eval.logger import eval_logger
from lm_eval.api.registry import (
    TASK_REGISTRY,
    GROUP_REGISTRY,
    ALL_TASKS,
)


def include_benchmarks(task_dir: str) -> None:
    for root, subdirs, file_list in os.walk(task_dir):
        if (subdirs == [] or "__pycache__" in subdirs) and (len(file_list) > 0):
            for f in file_list:
                if f.endswith(".yaml"):
                    try:
                        benchmark_path = os.path.join(root, f)
                        with open(benchmark_path, "rb") as file:
                            yaml_config = yaml.full_load(file)

                        if "prompts" in yaml_config:
                            continue  # Skip it

                        assert "group" in yaml_config
                        group = yaml_config["group"]
                        all_task_list = yaml_config["task"]
                        config_list = [task for task in all_task_list if type(task) != str]
                        task_list = [task for task in all_task_list if type(task) == str]

                        for task_config in config_list:
                            yaml_dir = os.path.dirname(benchmark_path)
                            task_config = utils.load_yaml_config(
                                yaml_config=task_config, yaml_dir=yaml_dir
                            )
                            if "use_prompt" in task_config:
                                if "yaml" in task_config["use_prompt"]:
                                    task_config["use_prompt"] = os.path.join(
                                        root, task_config["use_prompt"]
                                    )
                            var_configs = check_prompt_config(
                                {
                                    **task_config,
                                    **{"group": group},
                                }
                            )
                            for config in var_configs:
                                register_configurable_task(config)

                        task_names = utils.pattern_match(task_list, ALL_TASKS)
                        for task in task_names:
                            if task in TASK_REGISTRY:
                                if group in GROUP_REGISTRY:
                                    GROUP_REGISTRY[group].append(task)
                                else:
                                    GROUP_REGISTRY[group] = [task]
                                    ALL_TASKS.add(group)
                    except Exception as error:
                        eval_logger.warning(
                            "Failed to load benchmark in\n"
                            f"{benchmark_path}\n"
                            " Benchmark will not be added to registry\n"
                            f" Error: {error}"
                        )


task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_benchmarks(task_dir)
lm_eval/models/anthropic_llms.py

@@ -138,7 +138,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
     def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
         raise NotImplementedError("No support for logits.")

-    def greedy_until(self, requests) -> List[str]:
+    def generate_until(self, requests) -> List[str]:
         if not requests:
             return []

@@ -164,7 +164,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
                 )
                 res.append(response)

-                self.cache_hook.add_partial("greedy_until", request, response)
+                self.cache_hook.add_partial("generate_until", request, response)
             except anthropic.APIConnectionError as e:  # type: ignore # noqa: F821
                 eval_logger.critical(f"Server unreachable: {e.__cause__}")
                 break

@@ -179,7 +179,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
         raise NotImplementedError()

     def _model_generate(self, context, max_length, eos_token_id):
-        # Isn't used because we override greedy_until
+        # Isn't used because we override generate_until
         raise NotImplementedError()

     def loglikelihood(self, requests):
lm_eval/models/dummy.py

@@ -20,7 +20,7 @@ class DummyLM(LM):
         return res

-    def greedy_until(self, requests):
+    def generate_until(self, requests):
         res = []

         for ctx, _ in requests:
lm_eval/models/huggingface.py

@@ -813,7 +813,7 @@ class HFLM(LM):
         return re_ord.get_original(res)

-    def greedy_until(self, requests):
+    def generate_until(self, requests):
         res = defaultdict(list)

         re_ords = {}

@@ -930,7 +930,7 @@ class HFLM(LM):
                 res[key].append(s)

                 self.cache_hook.add_partial(
-                    "greedy_until", (context, gen_kwargs), s
+                    "generate_until", (context, gen_kwargs), s
                 )
                 pbar.update(1)
             # reorder this group of results back to original unsorted form
lm_eval/models/openai_completions.py

@@ -203,7 +203,7 @@ class OpenaiCompletionsLM(LM):
             self.cache_hook.add_partial("loglikelihood", cache_key, answer)
         return re_ord.get_original(res)

-    def greedy_until(self, requests) -> List[str]:
+    def generate_until(self, requests) -> List[str]:
         if not requests:
             return []
         res = []

@@ -260,7 +260,7 @@ class OpenaiCompletionsLM(LM):
                 # partial caching
                 self.cache_hook.add_partial(
-                    "greedy_until", (context, {"until": until_}), s
+                    "generate_until", (context, {"until": until_}), s
                 )
                 res.append(s)

@@ -271,7 +271,7 @@ class OpenaiCompletionsLM(LM):
         raise NotImplementedError()

     def _model_generate(self, context, max_length, eos_token_id):
-        # Isn't used because we override greedy_until
+        # Isn't used because we override generate_until
         raise NotImplementedError()

     def loglikelihood_rolling(self, requests) -> List[float]:
lm_eval/models/textsynth.py

@@ -58,7 +58,7 @@ class TextSynthLM(LM):
     @property
     def eot_token_id(self):
-        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
         raise NotImplementedError()

     @property

@@ -72,20 +72,20 @@ class TextSynthLM(LM):
     @property
     def batch_size(self):
-        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
         raise NotImplementedError()

     @property
     def device(self):
-        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
         raise NotImplementedError()

     def tok_encode(self, string: str):
-        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
         raise NotImplementedError()

     def tok_decode(self, tokens):
-        # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
+        # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
         raise NotImplementedError()

     def loglikelihood(self, requests):

@@ -122,7 +122,7 @@ class TextSynthLM(LM):
             "input tokenization support from TextSynth."
         )

-    def greedy_until(self, requests):
+    def generate_until(self, requests):
         if not requests:
             return []

@@ -146,7 +146,7 @@ class TextSynthLM(LM):
                 s = resp["text"]
                 res.append(s)

-                self.cache_hook.add_partial("greedy_until", (inp, request_args), s)
+                self.cache_hook.add_partial("generate_until", (inp, request_args), s)
             else:
                 logger.error(
                     f"The following response does not contain generated `text`. "

@@ -160,5 +160,5 @@ class TextSynthLM(LM):
         raise NotImplementedError()

     def _model_generate(self, context, max_length, eos_token_id):
-        # Isn't used because we override greedy_until
+        # Isn't used because we override generate_until
         raise NotImplementedError()
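Across all of the model backends above (anthropic_llms.py, huggingface.py, openai_completions.py, textsynth.py), the first argument to `cache_hook.add_partial` also changes from "greedy_until" to "generate_until". Since that string forms part of the cached-request key, results cached under the old name would presumably not be reused after upgrading; this is an inference from the call sites, not something stated in the commit.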
lm_eval/tasks/__init__.py

@@ -98,7 +98,7 @@ def check_prompt_config(
                         ]
                     )
                 },
-                **{"output_type": "greedy_until"},
+                **{"output_type": "generate_until"},
             }
         )
     else:
lm_eval/tasks/babi/babi.yaml

 task: babi
 dataset_path: Muennighoff/babi
 dataset_name: null
-output_type: greedy_until
+output_type: generate_until
 training_split: train
 validation_split: valid
 test_split: test
lm_eval/tasks/bbh/flan_cot_fewshot/_flan_cot_fewshot_template_yaml

 group: bbh_flan_cot_fewshot
 dataset_path: lukaemon/bbh
-output_type: greedy_until
+output_type: generate_until
 test_split: test
 doc_to_target: "{{target}}"
 metric_list:
lm_eval/tasks/bbh/flan_cot_zeroshot/_flan_cot_zeroshot_template_yaml

 group: bbh_flan_cot_zeroshot
 dataset_path: lukaemon/bbh
-output_type: greedy_until
+output_type: generate_until
 test_split: test
 doc_to_target: "{{target}}"
 metric_list:
lm_eval/tasks/bbh/flan_fewshot/_flan_fewshot_template_yaml

 group: bbh_flan_fewshot
 dataset_path: lukaemon/bbh
-output_type: greedy_until
+output_type: generate_until
 test_split: test
 doc_to_target: "{{target}}"
 metric_list:
lm_eval/tasks/bbh/flan_zeroshot/_flan_zeroshot_template_yaml

 group: bbh_flan_zeroshot
 dataset_path: lukaemon/bbh
-output_type: greedy_until
+output_type: generate_until
 test_split: test
 doc_to_target: "{{target}}"
 metric_list:
lm_eval/tasks/benchmarks/flan/yaml_templates/cot_template_yaml

 group: flan-cot
-output_type: greedy_until
+output_type: generate_until
 validation_split: validation
 doc_to_target: "{{answer}}"
 metric_list:
lm_eval/tasks/benchmarks/flan/yaml_templates/held_in_template_yaml

-output_type: greedy_until
+output_type: generate_until
 validation_split: validation
 metric_list:
   - metric: exact_match
lm_eval/benchmarks/minerva_math.yaml → lm_eval/tasks/benchmarks/minerva_math.yaml (file moved, no content changes)
lm_eval/tasks/benchmarks/t0_eval.yaml

@@ -6,7 +6,7 @@ task:
     use_prompt: promptsource:*
     training_split: train
     validation_split: validation
-    output_type: greedy_until
+    output_type: generate_until
     metric_list:
       - metric: exact_match
         aggregation: mean

@@ -19,7 +19,7 @@ task:
     use_prompt: promptsource:*
     training_split: train
     validation_split: validation
-    output_type: greedy_until
+    output_type: generate_until
     metric_list:
       - metric: exact_match
         aggregation: mean

@@ -32,7 +32,7 @@ task:
     use_prompt: promptsource:*
     training_split: train
    validation_split: validation
-    output_type: greedy_until
+    output_type: generate_until
     metric_list:
       - metric: exact_match
         aggregation: mean

@@ -44,7 +44,7 @@ task:
     use_prompt: promptsource:*
     training_split: train
     validation_split: validation
-    output_type: greedy_until
+    output_type: generate_until
     metric_list:
       - metric: exact_match
         aggregation: mean

@@ -56,7 +56,7 @@ task:
     use_prompt: promptsource:*
     training_split: train_r1
     validation_split: dev_r1
-    output_type: greedy_until
+    output_type: generate_until
     metric_list:
       - metric: exact_match
         aggregation: mean

@@ -68,7 +68,7 @@ task:
     use_prompt: promptsource:*
     training_split: train_r2
     validation_split: dev_r2
-    output_type: greedy_until
+    output_type: generate_until
     metric_list:
       - metric: exact_match
         aggregation: mean

@@ -80,7 +80,7 @@ task:
     use_prompt: promptsource:*
     training_split: train_r3
     validation_split: dev_r3
-    output_type: greedy_until
+    output_type: generate_until
     metric_list:
       - metric: exact_match
         aggregation: mean

@@ -93,7 +93,7 @@ task:
     use_prompt: promptsource:*
     training_split: train
     validation_split: validation
-    output_type: greedy_until
+    output_type: generate_until
     metric_list:
       - metric: exact_match
         aggregation: mean

@@ -105,7 +105,7 @@ task:
     use_prompt: promptsource:*
     training_split: train
     validation_split: validation
-    output_type: greedy_until
+    output_type: generate_until
     metric_list:
       - metric: exact_match
         aggregation: mean

@@ -118,7 +118,7 @@ task:
     use_prompt: promptsource:*
     training_split: train
     validation_split: validation
-    output_type: greedy_until
+    output_type: generate_until
     metric_list:
       - metric: exact_match
         aggregation: mean
lm_eval/tasks/bigbench/generate_tasks.py

@@ -175,8 +175,8 @@ all_subtasks = [
 def main() -> None:
     for path, task_type in zip(
-        ["multiple_choice", "greedy_until"],
-        ["multiple_choice_template_yaml", "greedy_until_template_yaml"],
+        ["multiple_choice", "generate_until"],
+        ["multiple_choice_template_yaml", "generate_until_template_yaml"],
     ):
         os.makedirs(path, exist_ok=True)
         for task in all_subtasks:
lm_eval/tasks/bigbench/greedy_until/abstract_narrative_understanding.yaml → lm_eval/tasks/bigbench/generate_until/abstract_narrative_understanding.yaml

 # Generated by utils.py
 dataset_name: abstract_narrative_understanding_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_abstract_narrative_understanding_greedy_until
+include: ../generate_until_template_yaml
+task: bigbench_abstract_narrative_understanding_generate_until
lm_eval/tasks/bigbench/greedy_until/anachronisms.yaml → lm_eval/tasks/bigbench/generate_until/anachronisms.yaml

 # Generated by utils.py
 dataset_name: anachronisms_zero_shot
-include: ../greedy_until_template_yaml
-task: bigbench_anachronisms_greedy_until
+include: ../generate_until_template_yaml
+task: bigbench_anachronisms_generate_until