gaoqiong / lm-evaluation-harness, commit 2106fbeb

Authored Jan 15, 2025 by Baber

    Merge branch 'main' into mathvista

    # Conflicts:
    #   lm_eval/models/openai_completions.py

Parents: 4354fe46, 703fbffd
Changes: 574 files in total; this page shows 14 changed files with 148 additions and 29 deletions (+148, -29).
Files in this page of the diff:

    lm_eval/tasks/xquad/xquad_hi.yaml                   +4   -0
    lm_eval/tasks/xquad/xquad_ro.yaml                   +4   -0
    lm_eval/tasks/xquad/xquad_ru.yaml                   +4   -0
    lm_eval/tasks/xquad/xquad_th.yaml                   +4   -0
    lm_eval/tasks/xquad/xquad_tr.yaml                   +4   -0
    lm_eval/tasks/xquad/xquad_vi.yaml                   +4   -0
    lm_eval/tasks/xquad/xquad_zh.yaml                   +4   -0
    lm_eval/utils.py                                    +10  -5
    pyproject.toml                                      +7   -2
    scripts/clean_training_data/generate_13_grams.py    +1   -1
    scripts/zeno_visualize.py                           +25  -13
    tests/models/test_api.py                            +3   -3
    tests/models/test_gptqmodel.py                      +54  -0
    tests/test_tasks.py                                 +20  -5
lm_eval/tasks/xquad/xquad_hi.yaml  (new file, mode 100755)

    include: xquad_common_yaml
    task: xquad_hi
    dataset_name: xquad.hi
    doc_to_text: "प्रसंग: {{context}}\n\nसवाल: {{question}}\n\nउत्तर:"

lm_eval/tasks/xquad/xquad_ro.yaml  (new file, mode 100755)

    include: xquad_common_yaml
    task: xquad_ro
    dataset_name: xquad.ro
    doc_to_text: "Context: {{context}}\n\nÎntrebare: {{question}}\n\nRăspuns:"

lm_eval/tasks/xquad/xquad_ru.yaml  (new file, mode 100755)

    include: xquad_common_yaml
    task: xquad_ru
    dataset_name: xquad.ru
    doc_to_text: "Контекст: {{context}}\n\nВопрос: {{question}}\n\nОтвет:"

lm_eval/tasks/xquad/xquad_th.yaml  (new file, mode 100755)

    include: xquad_common_yaml
    task: xquad_th
    dataset_name: xquad.th
    doc_to_text: "บริบท: {{context}}\n\nคำถาม: {{question}}\n\nคำตอบ:"

lm_eval/tasks/xquad/xquad_tr.yaml  (new file, mode 100755)

    include: xquad_common_yaml
    task: xquad_tr
    dataset_name: xquad.tr
    doc_to_text: "Bağlam: {{context}}\n\nSoru: {{question}}\n\nCevap:"

lm_eval/tasks/xquad/xquad_vi.yaml  (new file, mode 100755)

    include: xquad_common_yaml
    task: xquad_vi
    dataset_name: xquad.vi
    doc_to_text: "Bối cảnh: {{context}}\n\nCâu hỏi: {{question}}\n\nTrả lời:"

lm_eval/tasks/xquad/xquad_zh.yaml  (new file, mode 100755)

    include: xquad_common_yaml
    task: xquad_zh
    dataset_name: xquad.zh
    doc_to_text: "语境: {{context}}\n\n问题: {{question}}\n\n回答:"
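
These seven configs differ only in the language of the prompt labels; each doc_to_text value is a Jinja template over the XQuAD fields context and question. As a quick sketch (not part of the commit; English labels stand in for the localized ones), this is how a template of that shape renders, using jinja2 directly rather than the harness's own template handling:

    # Illustrative only: mirrors the shape of the doc_to_text templates above.
    from jinja2 import Template

    template = Template("Context: {{context}}\n\nQuestion: {{question}}\n\nAnswer:")
    doc = {"context": "<passage text>", "question": "<question text>"}  # placeholder values
    print(template.render(**doc))
    # Context: <passage text>
    #
    # Question: <question text>
    #
    # Answer:
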
lm_eval/utils.py

@@ -10,7 +10,7 @@ import os
 import re
 from dataclasses import asdict, is_dataclass
 from itertools import islice
-from typing import Any, Callable, List
+from typing import Any, Callable, Generator, List, Tuple

 import numpy as np
 import yaml

@@ -104,7 +104,8 @@ def simple_parse_args_string(args_string):
         return {}
     arg_list = [arg for arg in args_string.split(",") if arg]
-    args_dict = {
-        k: handle_arg_string(v) for k, v in [arg.split("=") for arg in arg_list]
-    }
+    args_dict = {
+        kv[0]: handle_arg_string("=".join(kv[1:]))
+        for kv in [arg.split("=") for arg in arg_list]
+    }
     return args_dict
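
A quick sketch of what changes in practice (not part of the commit; the argument string is made up): only the first "=" in each comma-separated item now separates key from value, so values that themselves contain "=" no longer break the parser.

    from lm_eval.utils import simple_parse_args_string

    print(simple_parse_args_string("pretrained=EleutherAI/pythia-160m,extra=key=value"))
    # {'pretrained': 'EleutherAI/pythia-160m', 'extra': 'key=value'}
    # (previously, two-way unpacking of "extra=key=value" raised ValueError)
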
@@ -201,7 +202,9 @@ def get_sample_results_filenames(filenames: List[str]) -> List[str]:
     return [f for f in filenames if "/samples_" in f and ".json" in f]


-def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
+def get_rolling_token_windows(
+    token_list: List[int], prefix_token: int, max_seq_len: int, context_len: int
+) -> Generator[Tuple[List[int], List[int]], None, None]:
     """
     - context_len allows for a rolling window context, allowing each prediction window to potentially
       condition on some context

@@ -228,7 +231,7 @@ def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len
     # Special handling for first window: predict all tokens
     first_seq_len = min(max_seq_len, len(token_list))
-    yield ([prefix_token] + token_list[: first_seq_len - 1], token_list[:first_seq_len])
+    yield [prefix_token] + token_list[: first_seq_len - 1], token_list[:first_seq_len]
     predicted += first_seq_len

     while predicted < len(token_list):

@@ -242,7 +245,9 @@ def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len
         predicted += window_pred_len


-def make_disjoint_window(pair):
+def make_disjoint_window(
+    pair: Tuple[List[int], List[int]],
+) -> Tuple[List[int], List[int]]:
     """Takes output from get_rolling_token_windows and makes the context not overlap with the continuation"""
     a, b = pair
     return a[: len(a) - (len(b) - 1)], b
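
For reference (not part of the commit), a minimal usage sketch of the two helpers the new annotations describe; the token ids and window sizes below are made up:

    from lm_eval.utils import get_rolling_token_windows, make_disjoint_window

    token_list = list(range(10))  # stand-in token ids
    windows = get_rolling_token_windows(
        token_list=token_list, prefix_token=-1, max_seq_len=4, context_len=1
    )
    for context, continuation in map(make_disjoint_window, windows):
        # each pair is (List[int], List[int]), as the new type hints state
        print(context, continuation)
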
pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "lm_eval"
-version = "0.4.5"
+version = "0.4.7"
 authors = [
     {name="EleutherAI", email="contact@eleuther.ai"}
 ]

@@ -16,7 +16,7 @@ classifiers = [
     "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
 ]
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 license = { "text" = "MIT" }
 dependencies = [
     "accelerate>=0.26.0",

@@ -62,6 +62,7 @@ dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy"]
 deepsparse = ["deepsparse-nightly[llm]>=1.8.0.20240404"]
 gptq = ["auto-gptq[triton]>=0.6.0"]
 hf_transfer = ["hf_transfer"]
+ibm_watsonx_ai = ["ibm_watsonx_ai>=1.1.22"]
 ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"]
 neuronx = ["optimum[neuronx]"]
 mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]

@@ -75,12 +76,15 @@ testing = ["pytest", "pytest-cov", "pytest-xdist"]
 vllm = ["vllm>=0.4.2"]
 zeno = ["pandas", "zeno-client"]
 wandb = ["wandb>=0.16.3", "pandas", "numpy"]
+gptqmodel = ["gptqmodel>=1.0.9"]
+japanese_leaderboard = ["emoji==2.14.0", "neologdn==0.5.3", "fugashi[unidic-lite]", "rouge_score>=0.1.2"]
 all = [
     "lm_eval[anthropic]",
     "lm_eval[dev]",
     "lm_eval[deepsparse]",
     "lm_eval[gptq]",
     "lm_eval[hf_transfer]",
+    "lm_eval[ibm_watsonx_ai]",
     "lm_eval[ifeval]",
     "lm_eval[mamba]",
     "lm_eval[math]",

@@ -93,6 +97,7 @@ all = [
     "lm_eval[vllm]",
     "lm_eval[zeno]",
     "lm_eval[wandb]",
+    "lm_eval[japanese_leaderboard]",
 ]

 [tool.ruff.lint]
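
Side note (not from the diff): the new optional dependency groups install like the existing extras, for example pip install "lm_eval[gptqmodel]" or pip install "lm_eval[japanese_leaderboard]", assuming a package version that actually ships them (0.4.7 or a checkout of this branch).
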
scripts/clean_training_data/generate_13_grams.py

@@ -55,7 +55,7 @@ def yield_pile(start_offsets=None, checkpoint_offset=None):
         print(
             "We expect the pile archives to be in the 'pile' directory, but this was not found."
         )
-        raise Exception("Pile directory not found.")
+        raise FileNotFoundError("Pile directory not found.")

     files = list(sorted(glob.glob(os.path.join(directory, "*.jsonl.zst*"))))

scripts/zeno_visualize.py

@@ -109,13 +109,14 @@ def main():
         if model_index == 0:  # Only need to assemble data for the first model
             metrics = []
             for metric in config["metric_list"]:
-                metrics.append(
-                    ZenoMetric(
-                        name=metric["metric"],
-                        type="mean",
-                        columns=[metric["metric"]],
-                    )
-                )
+                if metric.get("aggregation") == "mean":
+                    metrics.append(
+                        ZenoMetric(
+                            name=metric["metric"],
+                            type="mean",
+                            columns=[metric["metric"]],
+                        )
+                    )
             project = client.create_project(
                 name=args.project_name + (f"_{task}" if len(tasks) > 1 else ""),
                 view="text-classification",

@@ -168,7 +169,11 @@ def generate_dataset(
     Returns:
         pd.Dataframe: A dataframe that is ready to be uploaded to Zeno.
     """
-    ids = [x["doc_id"] for x in data]
+    ids = (
+        [x["doc_id"] for x in data]
+        if not config.get("filter_list")
+        else [f"{x['doc_id']}.{x['filter']}" for x in data]
+    )
     labels = [x["target"] for x in data]
     instance = [""] * len(ids)

@@ -190,6 +195,7 @@ def generate_dataset(
     return pd.DataFrame(
         {
             "id": ids,
+            "doc_id": [x["doc_id"] for x in data],
             "data": instance,
             "input_len": [len(x) for x in instance],
             "labels": labels,

@@ -208,8 +214,15 @@ def generate_system_df(data, config):
     Returns:
         pd.Dataframe: A dataframe that is ready to be uploaded to Zeno as a system.
     """
-    ids = [x["doc_id"] for x in data]
+    ids = (
+        [x["doc_id"] for x in data]
+        if not config.get("filter_list")
+        else [f"{x['doc_id']}.{x['filter']}" for x in data]
+    )
     system_dict = {"id": ids}
+    system_dict["doc_id"] = [x["doc_id"] for x in data]
+    if config.get("filter_list"):
+        system_dict["filter"] = [x["filter"] for x in data]
     system_dict["output"] = [""] * len(ids)

     if config["output_type"] == "loglikelihood":

@@ -228,11 +241,10 @@ def generate_system_df(data, config):
         system_dict["output"] = [str(x["filtered_resps"][0]) for x in data]

     system_dict["output_length"] = [len(str(x["filtered_resps"][0])) for x in data]
-    metrics = {}
-    for metric in config["metric_list"]:
-        if "aggregation" in metric and metric["aggregation"] == "mean":
-            metrics[metric["metric"]] = [x[metric["metric"]] for x in data]
+    metrics = {
+        metric["metric"]: [x[metric["metric"]] for x in data]
+        for metric in config["metric_list"]
+    }
     system_dict.update(metrics)
     system_df = pd.DataFrame(system_dict)
     return system_df
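
For orientation (not part of the commit; the rows and filter names below are made up), the new id scheme keys each Zeno row by doc_id plus the filter name whenever the task config defines a filter_list, so the same document can appear once per filter without id collisions:

    data = [
        {"doc_id": 0, "filter": "none"},
        {"doc_id": 0, "filter": "strict-match"},
    ]
    config = {"filter_list": [{"name": "none"}, {"name": "strict-match"}]}

    ids = (
        [x["doc_id"] for x in data]
        if not config.get("filter_list")
        else [f"{x['doc_id']}.{x['filter']}" for x in data]
    )
    print(ids)  # ['0.none', '0.strict-match']
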
tests/models/test_api.py

@@ -63,13 +63,13 @@ def test_create_payload_loglikelihood(api):
     (
         ["Hello, how are"],
         True,
-        {"max_gen_toks": 100, "temperature": 0.7},
+        {"max_gen_toks": 100, "temperature": 0.7, "until": ["hi"]},
         {
             "prompt": "Hello, how are",
             "model": "gpt-3.5-turbo",
             "max_tokens": 100,
             "temperature": 0.7,
-            "stop": ["<|endoftext|>"],
+            "stop": ["hi"],
             "seed": 1234,
         },
     ),

@@ -82,7 +82,7 @@ def test_create_payload_loglikelihood(api):
             "model": "gpt-3.5-turbo",
             "max_tokens": 256,
             "temperature": 0,
-            "stop": ["<|endoftext|>"],
+            "stop": [],
             "seed": 1234,
         },
     ),

tests/models/test_gptqmodel.py  (new file, mode 100644)

    from typing import List

    import pytest

    import lm_eval


    def assert_less_than(value, threshold, desc):
        if value is not None:
            assert float(value) < threshold, f"{desc} should be less than {threshold}"


    @pytest.mark.skip(reason="requires CUDA")
    class Test_GPTQModel:
        gptqmodel = pytest.importorskip("gptqmodel", minversion="1.0.9")
        MODEL_ID = "ModelCloud/Opt-125-GPTQ-4bit-10-25-2024"

        def test_gptqmodel(self) -> None:
            acc = "acc"
            acc_norm = "acc_norm"
            acc_value = None
            acc_norm_value = None

            task = "arc_easy"
            model_args = f"pretrained={self.MODEL_ID},gptqmodel=True"
            tasks: List[str] = [task]
            results = lm_eval.simple_evaluate(
                model="hf",
                model_args=model_args,
                tasks=tasks,
                device="cuda",
            )

            column = "results"
            dic = results.get(column, {}).get(self.task)
            if dic is not None:
                if "alias" in dic:
                    _ = dic.pop("alias")

                items = sorted(dic.items())
                for k, v in items:
                    m, _, f = k.partition(",")
                    if m.endswith("_stderr"):
                        continue

                    if m == acc:
                        acc_value = "%.4f" % v if isinstance(v, float) else v

                    if m == acc_norm:
                        acc_norm_value = "%.4f" % v if isinstance(v, float) else v

            assert_less_than(acc_value, 0.43, "acc")
            assert_less_than(acc_norm_value, 0.39, "acc_norm")

tests/test_tasks.py

+import os
 from itertools import islice

 import datasets
 import pytest

 import lm_eval.tasks as tasks

@@ -10,6 +11,7 @@ from lm_eval.evaluator_utils import get_task_list
 from .utils import new_tasks


 datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
+os.environ["TOKENIZERS_PARALLELISM"] = "false"

 task_manager = tasks.TaskManager()

 # Default Task

@@ -77,10 +79,17 @@ class TestNewTasks:
             )
             _array = [task.doc_to_text(doc) for doc in arr]
             # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
-            assert all(
-                isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
-                for x in _array
-            )
+            target_delimiter: str = task.config.target_delimiter
+            if not task.multiple_input:
+                for x in _array:
+                    assert isinstance(x, str)
+                    assert (
+                        (x[-1].isspace() is False if len(x) > 0 else True)
+                        if target_delimiter.isspace()
+                        else True
+                    ), "doc_to_text ends in a whitespace and target delimiter also a whitespace"
+            else:
+                pass

     def test_create_choices(self, task_class, limit):
         task = task_class

@@ -121,5 +130,11 @@ class TestNewTasks:
             if task.has_test_docs()
             else list(islice(task.validation_docs(), limit))
         )
-        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
+        # ctx is "" for multiple input tasks
+        requests = [
+            task.construct_requests(
+                doc=doc, ctx="" if task.multiple_input else task.doc_to_text(doc)
+            )
+            for doc in arr
+        ]
         assert len(requests) == limit if limit else True