gaoqiong / lm-evaluation-harness / Commits

Commit 51f27158, authored Feb 01, 2024 by lintangsutawika
Commit message: "udpate with merge"
Parents: 924c9790, f5408b6b

This commit changes 50 files in total; this page shows 10 changed files with 108 additions and 25 deletions (+108 / -25).
Files on this page:

  pyproject.toml                                        +4   -5
  scripts/build_benchmark.py                            +2   -2
  scripts/clean_training_data/generate_13_grams.py      +2   -2
  scripts/clean_training_data/investigate_pile.py       +2   -2
  scripts/make_table_results.py                         +3   -3
  scripts/make_table_tasks.py                           +1   -1
  scripts/regression.py                                 +5   -1
  scripts/write_out.py                                  +1   -1
  scripts/zeno_visualize.py                             +12  -8
  tests/models/test_openvino.py (new)                   +76  -0
pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "lm_eval"
-version = "0.4.0"
+version = "0.4.1"
 authors = [
     {name="EleutherAI", email="contact@eleuther.ai"}
 ]
@@ -56,15 +56,14 @@ Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
 [project.optional-dependencies]
 anthropic = ["anthropic"]
 dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy"]
-gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"]
+gptq = ["auto-gptq[triton]>=0.6.0"]
 ifeval = ["langdetect", "immutabledict"]
 mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
 math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"]
 multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
 openai = ["openai==1.3.9", "tiktoken"]
-promptsource = [
-    "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
-]
+optimum = ["optimum[openvino]"]
+promptsource = ["promptsource>=0.2.3"]
 sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
 testing = ["pytest", "pytest-cov", "pytest-xdist"]
 vllm = ["vllm<=0.2.5"]
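In short: the release version is bumped from 0.4.0 to 0.4.1, auto-gptq is pinned to a released version instead of a Git source, promptsource moves to a PyPI release, and a new optimum extra pulls in optimum[openvino] for the OpenVINO test added below. A minimal sketch for confirming the bump from an installed copy (assumes the package was reinstalled from this revision, for example with pip install -e .):

# Sketch: check that the installed lm_eval distribution reflects the 0.4.1 bump.
from importlib.metadata import version

print(version("lm_eval"))  # expected to print 0.4.1 after reinstalling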
scripts/build_benchmark.py

@@ -23,7 +23,7 @@ def parse_args():
 if __name__ == "__main__":
     args = parse_args()

-    with open(args.benchmark_path) as file:
+    with open(args.benchmark_path, encoding="utf-8") as file:
         TASK_LIST = yaml.full_load(file)
     for task in tqdm(TASK_LIST):
         eval_logger.info(f"Processing {task}")
@@ -57,5 +57,5 @@ if __name__ == "__main__":
         file_save_path = os.path.join(file_path, full_file_name)
         eval_logger.info(f"Save to {file_save_path}")
-        with open(file_save_path, "w") as yaml_file:
+        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
             yaml.dump(config_dict, yaml_file)
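The change that repeats throughout this commit is adding an explicit encoding="utf-8" to open() calls. Without it, Python falls back to the platform's preferred locale encoding, which is often not UTF-8 on Windows, so YAML and JSON files containing non-ASCII text can fail to round-trip. An illustrative sketch (not repository code; the file name is hypothetical):

# Show the platform default, then write and read a UTF-8 file explicitly.
import locale

print(locale.getpreferredencoding(False))  # e.g. "cp1252" on many Windows setups

with open("benchmark.yaml", "w", encoding="utf-8") as fh:
    fh.write("description: évaluation multilingue\n")  # non-ASCII content

with open("benchmark.yaml", encoding="utf-8") as fh:
    print(fh.read())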
scripts/clean_training_data/generate_13_grams.py

@@ -119,7 +119,7 @@ class Buckets:

 def do_ngrams_in_buckets(n_value, working_directory, bucket_count):
-    pile_statistics = json.load(open("pile_statistics.json", "r"))
+    pile_statistics = json.load(open("pile_statistics.json", "r", encoding="utf-8"))
     pile_document_count = pile_statistics["Document Count"]
     start_offsets = pile_statistics["File Start Offsets"]
@@ -212,4 +212,4 @@ if __name__ == "__main__":
     info_dict = {"title": "dataset ngrams", "ngram_size": 13}
     info_dict_path = os.path.join(args.working_directory, "info.json")
-    json.dump(info_dict, open(info_dict_path, "w"))
+    json.dump(info_dict, open(info_dict_path, "w", encoding="utf-8"))
scripts/clean_training_data/investigate_pile.py

@@ -79,7 +79,7 @@ if __name__ == "__main__":
     stats_file_path = "pile_statistics.json"
     if os.path.exists(stats_file_path):
-        stats = json.load(open(stats_file_path, "r"))
+        stats = json.load(open(stats_file_path, "r", encoding="utf-8"))
     else:
         document_count, total_document_size_chars, start_offsets = get_stats()
         stats = {
@@ -88,7 +88,7 @@ if __name__ == "__main__":
             "Total Pile Characters": total_document_size_chars,
             "File Start Offsets": start_offsets,
         }
-        json.dump(stats, open(stats_file_path, "w"), indent=4)
+        json.dump(stats, open(stats_file_path, "w", encoding="utf-8"), indent=4)

     print(f"document_count: {stats['Document Count']}")
     print(f"total_chars: {stats['Total Pile Characters']}")
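Both Pile scripts keep the json.load(open(...)) / json.dump(..., open(...)) idiom and only gain the encoding argument, so the file handles are still closed only when garbage-collected. A context-manager variant is sketched below; this is a suggested restructuring, not part of the commit:

# Sketch: same statistics I/O with explicit UTF-8 and deterministic file closing.
import json

def read_stats(path="pile_statistics.json"):
    with open(path, "r", encoding="utf-8") as fh:
        return json.load(fh)

def write_stats(stats, path="pile_statistics.json"):
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(stats, fh, indent=4)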
scripts/make_table_results.py

@@ -61,14 +61,14 @@ if __name__ == "__main__":
         if not filenames:
             continue
         path_readme = os.path.join(dirpath, "README.md")
-        with open(path_readme, "w") as f:
+        with open(path_readme, "w", encoding="utf-8") as f:
             # get path name, only last folder
             path_name = dirpath.split("/")[-1]
             f.write(f"# {path_name}\n\n")
         for filename in sorted([f for f in filenames if f.endswith(".json")]):
             path = os.path.join(dirpath, filename)
-            with open(path, "r") as f:
+            with open(path, "r", encoding="utf-8") as f:
                 result_dict = json.load(f)
-            with open(path_readme, "a") as f:
+            with open(path_readme, "a", encoding="utf-8") as f:
                 f.write(f"## {filename}\n")
                 f.write(f"{make_table(result_dict)}\n")
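This script opens README.md in "w" mode for the header and then reopens it in "a" mode for every JSON result. A hypothetical consolidation that keeps a single UTF-8 handle; the function name and the make_table parameter mirror the snippet above but are assumptions, not repository code:

import json
import os

def write_readme(dirpath, filenames, make_table):
    path_readme = os.path.join(dirpath, "README.md")
    path_name = dirpath.split("/")[-1]  # get path name, only last folder
    with open(path_readme, "w", encoding="utf-8") as readme:
        readme.write(f"# {path_name}\n\n")
        for filename in sorted(f for f in filenames if f.endswith(".json")):
            with open(os.path.join(dirpath, filename), "r", encoding="utf-8") as f:
                result_dict = json.load(f)
            readme.write(f"## {filename}\n")
            readme.write(f"{make_table(result_dict)}\n")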
scripts/make_table_tasks.py

@@ -50,5 +50,5 @@ if __name__ == "__main__":
         values.append(v)
     writer.value_matrix = values
     table = writer.dumps()
-    with open(args.output, "w") as f:
+    with open(args.output, "w", encoding="utf-8") as f:
         f.write(table)
scripts/regression.py

@@ -94,7 +94,11 @@ def eval_models(args, branch=None):
         ret = os.system(command)

-        results[model] = json.load(open(output_path)) if ret == 0 else {"results": {}}
+        results[model] = (
+            json.load(open(output_path, encoding="utf-8"))
+            if ret == 0
+            else {"results": {}}
+        )

         end_time = time.time()
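The conditional keeps guarding against a failed os.system() call before reading the output file; only the encoding argument and the formatting change. For comparison, a sketch of the same guard built on subprocess.run, which makes the success check explicit (an alternative, not what the commit does):

import json
import subprocess

def run_and_load(command: str, output_path: str) -> dict:
    # Run the evaluation command; fall back to empty results on failure.
    proc = subprocess.run(command, shell=True, check=False)
    if proc.returncode != 0:
        return {"results": {}}
    with open(output_path, encoding="utf-8") as fh:
        return json.load(fh)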
scripts/write_out.py

@@ -53,7 +53,7 @@ def main():
     os.makedirs(args.output_base_path, exist_ok=True)
     for task_name, task in task_dict.items():
-        if type(task) == tuple:
+        if isinstance(task, tuple):
             group_name, task = task
         rnd = random.Random()
         rnd.seed(args.seed)
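Replacing type(task) == tuple with isinstance(task, tuple) is behaviour-preserving for plain tuples but also accepts tuple subclasses such as named tuples, and it is the comparison style flake8's E721 check expects. A short illustration, not repository code; the GroupedTask name is made up:

from collections import namedtuple

GroupedTask = namedtuple("GroupedTask", ["group_name", "task"])
item = GroupedTask("some_group", object())

print(type(item) == tuple)      # False: exact type check misses the subclass
print(isinstance(item, tuple))  # True: named tuples are tuple subclasses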
scripts/zeno_visualize.py

@@ -69,18 +69,20 @@ def main():
         model_args = re.sub(
             "/|=",
             "__",
-            json.load(open(Path(args.data_path, model, "results.json")))["config"][
-                "model_args"
-            ],
+            json.load(
+                open(Path(args.data_path, model, "results.json"), encoding="utf-8")
+            )["config"]["model_args"],
         )
         with open(
-            Path(args.data_path, model, f"{model_args}_{task}.jsonl"), "r"
+            Path(args.data_path, model, f"{model_args}_{task}.jsonl"),
+            "r",
+            encoding="utf-8",
         ) as file:
             data = json.loads(file.read())
-        configs = json.load(open(Path(args.data_path, model, "results.json")))[
-            "configs"
-        ]
+        configs = json.load(
+            open(Path(args.data_path, model, "results.json"), encoding="utf-8")
+        )["configs"]
         config = configs[task]
         if model_index == 0:  # Only need to assemble data for the first model
@@ -124,7 +126,9 @@ def tasks_for_model(model: str, data_path: str):
         list: A list of tasks for the model.
     """
     dir_path = Path(data_path, model)
-    config = (json.load(open(Path(dir_path, "results.json")))["configs"],)
+    config = (
+        json.load(open(Path(dir_path, "results.json"), encoding="utf-8"))["configs"],
+    )
     return list(config[0].keys())
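A model's results.json is now opened with an explicit encoding in three separate places (for model_args, for configs, and in tasks_for_model). A hypothetical helper that would centralize the pattern; this is a sketch of a possible refactor, not code from the commit:

import json
from pathlib import Path

def load_results(data_path: str, model: str) -> dict:
    # Load <data_path>/<model>/results.json once, always as UTF-8.
    with open(Path(data_path, model, "results.json"), encoding="utf-8") as fh:
        return json.load(fh)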
tests/models/test_openvino.py (new file, mode 100644)

import random
import tempfile

import pytest
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

import lm_eval.evaluator as evaluator
import lm_eval.tasks as tasks
from lm_eval.api.registry import get_model

tasks.initialize_tasks()

SUPPORTED_ARCHITECTURES_TASKS = {
    "facebook/opt-125m": "lambada_openai",
    "hf-internal-testing/tiny-random-gpt2": "wikitext",
}


@pytest.mark.parametrize("model_id,task", SUPPORTED_ARCHITECTURES_TASKS.items())
def test_evaluator(model_id, task):
    with tempfile.TemporaryDirectory() as tmpdirname:
        model = OVModelForCausalLM.from_pretrained(
            model_id, export=True, use_cache=True
        )
        model.save_pretrained(tmpdirname)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        tokenizer.save_pretrained(tmpdirname)

        lm = get_model("openvino").create_from_arg_string(
            f"pretrained={tmpdirname}",
            {
                "batch_size": 1,
                "device": "cpu",
            },
        )

        def ll_fn(reqs):
            for ctx, cont in [req.args for req in reqs]:
                if len(ctx) == 0:
                    continue
                # space convention
                assert ctx[-1] != " "
                assert cont[0] == " " or ctx[-1] == "\n"

            res = []
            random.seed(42)
            for _ in reqs:
                res.append((-random.random(), False))

            return res

        def ll_perp_fn(reqs):
            for (string,) in [req.args for req in reqs]:
                assert isinstance(string, str)

            res = []
            random.seed(42)
            for _ in reqs:
                res.append(-random.random())

            return res

        lm.loglikelihood = ll_fn
        lm.loglikelihood_rolling = ll_perp_fn

        limit = 10
        evaluator.simple_evaluate(
            model=lm,
            tasks=[task],
            num_fewshot=0,
            limit=limit,
            bootstrap_iters=10,
        )
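The new test replaces the model's loglikelihood and loglikelihood_rolling methods with seeded random stand-ins, so it exercises OpenVINO model export and loading plus the evaluator plumbing rather than output quality. Running it presumably requires the optional optimum and testing extras from pyproject.toml above; a small sketch for invoking just this file:

# Sketch: run only the OpenVINO test file programmatically (the pytest CLI works too).
import pytest

if __name__ == "__main__":
    raise SystemExit(pytest.main(["tests/models/test_openvino.py", "-q"]))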