gaoqiong / lm-evaluation-harness · Commit 3d1b8f43 (Unverified)

Merge branch 'main' into group-agg-rework

Authored Jul 03, 2024 by Lintang Sutawika; committed by GitHub on Jul 03, 2024
Parents: e200c24e, d855d0ba

Changes: 317 · this page shows 17 changed files with 312 additions and 18 deletions (+312 / -18)
lm_eval/tasks/tmmluplus/default/_generate_configs.py  +1 -0
lm_eval/utils.py  +54 -1
pyproject.toml  +1 -1
scripts/clean_training_data/README.md  +1 -1
scripts/make_table_results.py  +1 -0
scripts/make_table_tasks.py  +1 -0
scripts/write_out.py  +5 -0
scripts/zeno_visualize.py  +35 -12
tests/models/test_neuralmagic.py  +1 -0
tests/models/test_vllm.py  +2 -3
tests/test_evaluator.py  +74 -0
tests/test_include_path.py  +93 -0
tests/testconfigs/arc_test.yaml  +21 -0
tests/testdata/ai2_arc_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt  +8 -0
tests/testdata/lambada_openai_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt  +4 -0
tests/testdata/mmlu_abstract_algebra-mmlu_global_facts-mmlu_public_relations_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt  +5 -0
tests/testdata/wikitext_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt  +5 -0

lm_eval/tasks/tmmluplus/default/_generate_configs.py

 """
 Take in a YAML, and output all "other" splits with this YAML
 """
 import argparse
 import os
 ...

lm_eval/utils.py

 ...
 @@ -152,6 +152,55 @@ def general_detokenize(string):
     return string
 
 
+def get_file_task_name(filename: str) -> str:
+    """
+    Given the sample results filenames, extracts and returns the task name.
+    """
+    return filename[filename.find("_") + 1 : filename.rfind("_")]
+
+
+def get_file_datetime(filename: str) -> str:
+    """
+    Given the results and sample results filenames, extracts and returns the datetime.
+    """
+    return filename[filename.rfind("_") + 1 :].replace(".json", "")
+
+
+def sanitize_model_name(model_name: str) -> str:
+    """
+    Given the model name, returns a sanitized version of it.
+    """
+    return re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", model_name)
+
+
+def sanitize_task_name(task_name: str) -> str:
+    """
+    Given the task name, returns a sanitized version of it.
+    """
+    return re.sub(r"\W", "_", task_name)
+
+
+def get_latest_filename(filenames: List[str]) -> str:
+    """
+    Given a list of filenames, returns the filename with the latest datetime.
+    """
+    return max(filenames, key=lambda f: get_file_datetime(f))
+
+
+def get_results_filenames(filenames: List[str]) -> List[str]:
+    """
+    Extracts filenames that correspond to aggregated results.
+    """
+    return [f for f in filenames if "/results_" in f and ".json" in f]
+
+
+def get_sample_results_filenames(filenames: List[str]) -> List[str]:
+    """
+    Extracts filenames that correspond to sample results.
+    """
+    return [f for f in filenames if "/samples_" in f and ".json" in f]
+
+
 def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
     """
     - context_len allows for a rolling window context, allowing each prediction window to potentially
 ...
 @@ -300,7 +349,11 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False)
         if "alias" in dic:
             k = dic.pop("alias")
 
-        for (mf), v in dic.items():
+        metric_items = dic.items()
+        if sort_results:
+            metric_items = sorted(metric_items)
+
+        for (mf), v in metric_items:
             m, _, f = mf.partition(",")
             if m.endswith("_stderr"):
                 continue
 ...
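
A minimal usage sketch of the new filename helpers (illustration only, not part of the commit; the directory name and datetime strings below are made up, assuming the results_<datetime>.json / samples_<task>_<datetime>.jsonl naming that scripts/zeno_visualize.py relies on):

from lm_eval.utils import (
    get_file_datetime,
    get_file_task_name,
    get_latest_filename,
    get_results_filenames,
    get_sample_results_filenames,
)

# Hypothetical directory listing for one model's output folder.
files = [
    "output/pythia-14m/results_2024-07-01T10-00-00.json",
    "output/pythia-14m/results_2024-07-03T09-30-00.json",
    "output/pythia-14m/samples_arc_easy_2024-07-03T09-30-00.jsonl",
]

# Split the listing into aggregated-results files and per-sample files.
results = get_results_filenames(files)         # the two ".../results_*.json" paths
samples = get_sample_results_filenames(files)  # the one ".../samples_*.jsonl" path

# Task name is the text between the first and the last underscore.
print(get_file_task_name("samples_arc_easy_2024-07-03T09-30-00.jsonl"))  # arc_easy

# Datetime is everything after the last underscore, with ".json" stripped.
print(get_file_datetime("results_2024-07-03T09-30-00.json"))  # 2024-07-03T09-30-00

# "Latest" is simply the max over those datetime strings.
print(get_latest_filename(results))  # output/pythia-14m/results_2024-07-03T09-30-00.json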

pyproject.toml

 ...
 @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "lm_eval"
-version = "0.4.2"
+version = "0.4.3"
 authors = [
     {name="EleutherAI", email="contact@eleuther.ai"}
 ]
 ...

scripts/clean_training_data/README.md

 ...
 @@ -10,7 +10,7 @@ It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.1
 the match, splitting the training data into chunks
 3) Any chunks less than `minimum_slice_length` are removed
 4) Training data sets split into more than `too_dirty_cutoff` are considered
-completey contaminated and removed
+completely contaminated and removed
 
 OpenAI used:
 ```
 ...

scripts/make_table_results.py

 ...
 @@ -2,6 +2,7 @@
 Usage:
    python make_table_tasks.py --output <markdown_filename>
 """
 import json
 import logging
 import os
 ...

scripts/make_table_tasks.py

 ...
 @@ -2,6 +2,7 @@
 Usage:
    python make_table_tasks.py --output <markdown_filename>
 """
 import argparse
 import logging
 ...

scripts/write_out.py

 ...
 @@ -70,6 +70,11 @@ def main():
             if docs is not None:
                 iters.append(docs)
 
+        if len(iters) == 0:
+            raise ValueError(
+                f"Passed --sets '{args.sets}' but this task has no splits which match. Please specify a different --sets value."
+            )
+
         docs = join_iters(iters)
         with open(
 ...

scripts/zeno_visualize.py

 ...
 @@ -7,7 +7,12 @@ from pathlib import Path
 import pandas as pd
 from zeno_client import ZenoClient, ZenoMetric
 
-from lm_eval.utils import eval_logger
+from lm_eval.utils import (
+    eval_logger,
+    get_latest_filename,
+    get_results_filenames,
+    get_sample_results_filenames,
+)
 
 
 def parse_args():
 ...
 @@ -45,13 +50,15 @@ def main():
     assert len(models) > 0, "No model directories found in the data_path."
 
+    # Get the tasks from the latest results file of the first model.
     tasks = set(tasks_for_model(models[0], args.data_path))
-    for model in models:
-        # Make sure that all models have the same tasks.
+    # Get tasks names from the latest results file for each model
+    # Get intersection of tasks for all models
+    for model in models:
         old_tasks = tasks.copy()
         task_count = len(tasks)
-        model_tasks = set(tasks_for_model(model, args.data_path))
+        model_tasks = tasks_for_model(model, args.data_path)
         tasks.intersection(set(model_tasks))
 
         if task_count != len(tasks):
 ...
 @@ -66,22 +73,36 @@ def main():
     for task in tasks:
         # Upload data for all models
         for model_index, model in enumerate(models):
+            # Get latest results and sample results for a model
+            model_dir = Path(args.data_path, model)
+            model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
+            model_results_filenames = get_results_filenames(model_files)
+            model_sample_filenames = get_sample_results_filenames(model_files)
+            latest_results = get_latest_filename(
+                [Path(f).name for f in model_results_filenames]
+            )
+            latest_sample_results = get_latest_filename(
+                [Path(f).name for f in model_sample_filenames if task in f]
+            )
             model_args = re.sub(
                 r"[\"<>:/\|\\?\*\[\]]+",
                 "__",
                 json.load(
-                    open(Path(args.data_path, model, "results.json"), encoding="utf-8")
+                    open(Path(args.data_path, model, latest_results), encoding="utf-8")
                 )["config"]["model_args"],
             )
+            print(model_args)
+            data = []
             with open(
-                Path(args.data_path, model, f"{model_args}_{task}.jsonl"),
+                Path(args.data_path, model, latest_sample_results),
                 "r",
                 encoding="utf-8",
             ) as file:
-                data = json.loads(file.read())
+                for line in file:
+                    data.append(json.loads(line.strip()))
             configs = json.load(
-                open(Path(args.data_path, model, "results.json"), encoding="utf-8")
+                open(Path(args.data_path, model, latest_results), encoding="utf-8")
             )["configs"]
             config = configs[task]
 ...
 @@ -125,10 +146,12 @@ def tasks_for_model(model: str, data_path: str):
     Returns:
         list: A list of tasks for the model.
     """
-    dir_path = Path(data_path, model)
-    config = (
-        json.load(open(Path(dir_path, "results.json"), encoding="utf-8"))["configs"],
-    )
+    # get latest model results for a given name
+    model_dir = Path(data_path, model)
+    model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
+    model_results_filenames = get_results_filenames(model_files)
+    latest_results = get_latest_filename(model_results_filenames)
+    config = (json.load(open(latest_results, encoding="utf-8"))["configs"],)
     return list(config[0].keys())
 ...
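
One of the changes above swaps json.loads(file.read()) for per-line parsing; a tiny standalone illustration of why (not part of the commit; the record contents are made up), since the samples files are JSON Lines with one object per line:

import json

# Two records in JSON Lines form, as in a samples_*.jsonl file.
jsonl_text = '{"doc_id": 0}\n{"doc_id": 1}\n'

# json.loads(jsonl_text) would raise json.JSONDecodeError ("Extra data") here,
# which is why the script now appends one parsed object per line instead.
data = []
for line in jsonl_text.splitlines():
    if line.strip():
        data.append(json.loads(line.strip()))

assert [d["doc_id"] for d in data] == [0, 1]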

tests/models/test_neuralmagic.py

 ...
 @@ -23,6 +23,7 @@ DEEPSPARSE_MODELS_TASKS = [
 ]
 
 
+@pytest.mark.skip(reason="test failing")
 @pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS)
 def test_sparseml_eval(model_id, task):
     lm = get_model("sparseml").create_from_arg_string(
 ...

tests/models/test_vllm.py

 from typing import List
 
 import pytest
-import torch
 
 from lm_eval import tasks
 from lm_eval.api.instance import Instance
 ...
 @@ -11,7 +10,7 @@ task_manager = tasks.TaskManager()
 
 
 @pytest.mark.skip(reason="requires CUDA")
-class TEST_VLLM:
+class Test_VLLM:
     vllm = pytest.importorskip("vllm")
     try:
         from lm_eval.models.vllm_causallms import VLLM
 ...
 @@ -19,7 +18,7 @@ class TEST_VLLM:
         LM = VLLM(pretrained="EleutherAI/pythia-70m")
     except ModuleNotFoundError:
         pass
 
-    torch.use_deterministic_algorithms(True)
+    # torch.use_deterministic_algorithms(True)
     task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
     multiple_choice_task = task_list["arc_easy"]  # type: ignore
     multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
 ...

tests/test_evaluator.py

 import os
+import re
 from typing import List
 
 import pytest
 ...
 @@ -6,6 +7,7 @@ import pytest
 import lm_eval.api as api
 import lm_eval.evaluator as evaluator
 from lm_eval import tasks
+from lm_eval.utils import make_table
 
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 ...
 @@ -31,6 +33,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false"
             10000,
         ),
     ],
+    ids=lambda d: f"{d}",
 )
 def test_evaluator(
     task_name: List[str], limit: int, model: str, model_args: str, bootstrap_iters: int
 ...
 @@ -75,3 +78,74 @@ def test_evaluator(
         x == y
         for x, y in zip([y for _, y in r(e1).items()], [y for _, y in r(e2).items()])
     )
+
+
+@pytest.mark.parametrize(
+    "task_name,limit,model,model_args",
+    [
+        (
+            ["ai2_arc"],
+            10,
+            "hf",
+            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
+        ),
+        (
+            ["mmlu_abstract_algebra", "mmlu_global_facts", "mmlu_public_relations"],
+            10,
+            "hf",
+            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
+        ),
+        (
+            ["lambada_openai"],
+            10,
+            "hf",
+            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
+        ),
+        (
+            ["wikitext"],
+            10,
+            "hf",
+            "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu",
+        ),
+    ],
+    ids=lambda d: f"{d}",
+)
+def test_printed_results(
+    task_name: List[str], limit: int, model: str, model_args: str
+):
+    results = evaluator.simple_evaluate(
+        model=model,
+        tasks=task_name,
+        limit=limit,
+        model_args=model_args,
+        bootstrap_iters=0,
+        random_seed=0,
+        numpy_random_seed=0,
+        torch_random_seed=0,
+        fewshot_random_seed=0,
+    )
+    filename = "_".join(
+        (
+            "-".join(task_name),
+            str(limit),
+            str(model),
+            re.sub(r"[^a-zA-Z0-9_\-\.]", "-", model_args),
+        )
+    )
+    filepath = f"./tests/testdata/{filename}.txt"
+    with open(filepath, "r") as f:
+        t1 = f.read().strip()
+    t2 = make_table(results).strip()
+    t1_lines, t2_lines = t1.splitlines(), t2.splitlines()
+    assert len(t1_lines) == len(t2_lines)
+    for t1_line, t2_line in zip(t1_lines, t2_lines):
+        t1_items, t2_items = t1_line.split("|"), t2_line.split("|")
+        assert len(t1_items) == len(t2_items)
+        for t1_item, t2_item in zip(t1_items, t2_items):
+            try:
+                t1_item = float(t1_item)
+                t2_item = float(t2_item)
+                assert abs(t1_item - t2_item) < 0.3
+            except ValueError:
+                assert t1_item == t2_item
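
For orientation, an illustrative check (not part of the commit) that the filename built by test_printed_results for its first parametrization resolves to the ai2_arc testdata file added below:

import re

task_name = ["ai2_arc"]
limit, model = 10, "hf"
model_args = "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu"

# Same construction as in test_printed_results above: non [a-zA-Z0-9_-.] chars
# in model_args ("=", "/", ",") are replaced with "-".
filename = "_".join(
    (
        "-".join(task_name),
        str(limit),
        str(model),
        re.sub(r"[^a-zA-Z0-9_\-\.]", "-", model_args),
    )
)

assert (
    filename
    == "ai2_arc_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu"
)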

tests/test_include_path.py (new file, 0 → 100644)

import os

import pytest

import lm_eval.api as api
import lm_eval.evaluator as evaluator
from lm_eval import tasks


@pytest.mark.parametrize(
    "limit,model,model_args",
    [
        (
            10,
            "hf",
            "pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
        ),
    ],
)
def test_include_correctness(limit: int, model: str, model_args: str):
    task_name = ["arc_easy"]
    task_manager = tasks.TaskManager()
    task_dict = tasks.get_task_dict(task_name, task_manager)

    e1 = evaluator.simple_evaluate(
        model=model,
        tasks=task_name,
        limit=limit,
        model_args=model_args,
    )
    assert e1 is not None

    # run with evaluate() and "arc_easy" test config (included from ./testconfigs path)
    lm = api.registry.get_model(model).create_from_arg_string(
        model_args,
        {
            "batch_size": None,
            "max_batch_size": None,
            "device": None,
        },
    )

    task_name = ["arc_easy"]
    task_manager = tasks.TaskManager(
        include_path=os.path.dirname(os.path.abspath(__file__)) + "/testconfigs",
        include_defaults=False,
    )
    task_dict = tasks.get_task_dict(task_name, task_manager)

    e2 = evaluator.evaluate(
        lm=lm,
        task_dict=task_dict,
        limit=limit,
    )

    assert e2 is not None

    # check that caching is working
    def r(x):
        return x["results"]["arc_easy"]

    assert all(
        x == y
        for x, y in zip([y for _, y in r(e1).items()], [y for _, y in r(e2).items()])
    )


# test that setting include_defaults = False works as expected and that include_path works
def test_no_include_defaults():
    task_name = ["arc_easy"]

    task_manager = tasks.TaskManager(
        include_path=os.path.dirname(os.path.abspath(__file__)) + "/testconfigs",
        include_defaults=False,
    )

    # should succeed, because we've included an 'arc_easy' task from this dir
    task_dict = tasks.get_task_dict(task_name, task_manager)

    # should fail, since ./testconfigs has no arc_challenge task
    task_name = ["arc_challenge"]
    with pytest.raises(KeyError):
        task_dict = tasks.get_task_dict(task_name, task_manager)  # noqa: F841


# test that include_path containing a task shadowing another task's name fails
# def test_shadowed_name_fails():
#     task_name = ["arc_easy"]
#     task_manager = tasks.TaskManager(include_path=os.path.dirname(os.path.abspath(__file__)) + "/testconfigs")
#     task_dict = tasks.get_task_dict(task_name, task_manager)

tests/testconfigs/arc_test.yaml (new file, 0 → 100644)

task: arc_easy
dataset_path: allenai/ai2_arc
dataset_name: ARC-Easy
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
should_decontaminate: true
doc_to_decontamination_query: "Question: {{question}}\nAnswer:"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
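
To make the template fields concrete, a rough rendering sketch (illustration only; the document below is invented, and in practice the rendering is done by lm-eval's Jinja templating rather than by this snippet):

# A made-up document with the allenai/ai2_arc field layout the YAML above expects.
doc = {
    "question": "Which gas do plants absorb from the air?",
    "choices": {
        "text": ["oxygen", "carbon dioxide", "nitrogen", "helium"],
        "label": ["A", "B", "C", "D"],
    },
    "answerKey": "B",
}

# What the YAML fields resolve to for this document:
prompt = f"Question: {doc['question']}\nAnswer:"                # doc_to_text
target_index = doc["choices"]["label"].index(doc["answerKey"])  # doc_to_target -> 1
choices = doc["choices"]["text"]                                # doc_to_choice

print(prompt)
print(choices[target_index])  # carbon dioxide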

tests/testdata/ai2_arc_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt (new file, 0 → 100644)
| Tasks |Version|Filter|n-shot| Metric | |Value| |Stderr|
|----------------|-------|------|-----:|--------|---|----:|---|------|
|ai2_arc |N/A |none | 0|acc |↑ | 0.15|± |N/A |
| | |none | 0|acc_norm|↑ | 0.05|± |N/A |
| - arc_challenge| 1|none | 0|acc |↑ | 0.00|± |N/A |
| | |none | 0|acc_norm|↑ | 0.00|± |N/A |
| - arc_easy | 1|none | 0|acc |↑ | 0.30|± |N/A |
| | |none | 0|acc_norm|↑ | 0.10|± |N/A |
\ No newline at end of file

tests/testdata/lambada_openai_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt (new file, 0 → 100644)
| Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
|--------------|------:|------|-----:|----------|---|-------:|---|------|
|lambada_openai| 1|none | 0|acc |↑ | 0.1000|± |N/A |
| | |none | 0|perplexity|↓ |605.4879|± |N/A |
\ No newline at end of file

tests/testdata/mmlu_abstract_algebra-mmlu_global_facts-mmlu_public_relations_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt (new file, 0 → 100644)
| Tasks |Version|Filter|n-shot|Metric| |Value| |Stderr|
|----------------|------:|------|-----:|------|---|----:|---|------|
|abstract_algebra| 0|none | 0|acc |↑ | 0.2|± |N/A |
|global_facts | 0|none | 0|acc |↑ | 0.2|± |N/A |
|public_relations| 0|none | 0|acc |↑ | 0.2|± |N/A |
\ No newline at end of file

tests/testdata/wikitext_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt (new file, 0 → 100644)
| Tasks |Version|Filter|n-shot| Metric | | Value | |Stderr|
|--------|------:|------|-----:|---------------|---|-------:|---|------|
|wikitext| 2|none | 0|bits_per_byte |↓ | 1.3394|± |N/A |
| | |none | 0|byte_perplexity|↓ | 2.5304|± |N/A |
| | |none | 0|word_perplexity|↓ |130.4812|± |N/A |
\ No newline at end of file