Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
35a24652
Unverified
Commit
35a24652
authored
Aug 15, 2023
by
Aflah
Committed by
GitHub
Aug 15, 2023
Browse files
Merge pull request #1 from EleutherAI/toxicity-test
Toxicity test
parents
52213e29
0021de21
Changes
157
Hide whitespace changes
Inline
Side-by-side
Showing
17 changed files
with
123 additions
and
34 deletions
+123
-34
lm_eval/tasks/xnli/xnli_el.yaml
lm_eval/tasks/xnli/xnli_el.yaml
+7
-0
lm_eval/tasks/xnli/xnli_en.yaml
lm_eval/tasks/xnli/xnli_en.yaml
+7
-0
lm_eval/tasks/xnli/xnli_es.yaml
lm_eval/tasks/xnli/xnli_es.yaml
+7
-0
lm_eval/tasks/xnli/xnli_fr.yaml
lm_eval/tasks/xnli/xnli_fr.yaml
+7
-0
lm_eval/tasks/xnli/xnli_hi.yaml
lm_eval/tasks/xnli/xnli_hi.yaml
+7
-0
lm_eval/tasks/xnli/xnli_ru.yaml
lm_eval/tasks/xnli/xnli_ru.yaml
+7
-0
lm_eval/tasks/xnli/xnli_sw.yaml
lm_eval/tasks/xnli/xnli_sw.yaml
+7
-0
lm_eval/tasks/xnli/xnli_th.yaml
lm_eval/tasks/xnli/xnli_th.yaml
+7
-0
lm_eval/tasks/xnli/xnli_tr.yaml
lm_eval/tasks/xnli/xnli_tr.yaml
+7
-0
lm_eval/tasks/xnli/xnli_ur.yaml
lm_eval/tasks/xnli/xnli_ur.yaml
+7
-0
lm_eval/tasks/xnli/xnli_vi.yaml
lm_eval/tasks/xnli/xnli_vi.yaml
+7
-0
lm_eval/tasks/xnli/xnli_zh.yaml
lm_eval/tasks/xnli/xnli_zh.yaml
+7
-0
lm_eval/utils.py
lm_eval/utils.py
+3
-6
main.py
main.py
+1
-1
setup.py
setup.py
+25
-19
tests/extra/test_new_tasks.py
tests/extra/test_new_tasks.py
+5
-4
tests/test_tasks.py
tests/test_tasks.py
+5
-4
No files found.
lm_eval/tasks/xnli/xnli_el.yaml
0 → 100644
View file @
35a24652
# Generated by utils.py
dataset_name
:
el
doc_to_choice
:
'
{{[premise+",
σωστός?
Ναί,
"+hypothesis,premise+",
σωστός?
Έτσι,
"+hypothesis,premise+",
σωστός?
όχι,
"+hypothesis]}}'
doc_to_text
:
'
'
include
:
xnli_common_yaml
task
:
xnli_el
lm_eval/tasks/xnli/xnli_en.yaml
0 → 100644
View file @
35a24652
# Generated by utils.py
dataset_name
:
en
doc_to_choice
:
'
{{[premise+",
right?
Yes,
"+hypothesis,premise+",
right?
Also,
"+hypothesis,premise+",
right?
No,
"+hypothesis]}}'
doc_to_text
:
'
'
include
:
xnli_common_yaml
task
:
xnli_en
lm_eval/tasks/xnli/xnli_es.yaml
0 → 100644
View file @
35a24652
# Generated by utils.py
dataset_name
:
es
doc_to_choice
:
'
{{[premise+",
correcto?
Sí,
"+hypothesis,premise+",
correcto?
Asi
que,
"+hypothesis,premise+",
correcto?
No,
"+hypothesis]}}'
doc_to_text
:
'
'
include
:
xnli_common_yaml
task
:
xnli_es
lm_eval/tasks/xnli/xnli_fr.yaml
0 → 100644
View file @
35a24652
# Generated by utils.py
dataset_name
:
fr
doc_to_choice
:
'
{{[premise+",
correct?
Oui,
"+hypothesis,premise+",
correct?
Aussi,
"+hypothesis,premise+",
correct?
Non,
"+hypothesis]}}'
doc_to_text
:
'
'
include
:
xnli_common_yaml
task
:
xnli_fr
lm_eval/tasks/xnli/xnli_hi.yaml
0 → 100644
View file @
35a24652
# Generated by utils.py
dataset_name
:
hi
doc_to_choice
:
'
{{[premise+",
सही?
हाँ,
"+hypothesis,premise+",
सही?
इसलिए,
"+hypothesis,premise+",
सही?
नहीं,
"+hypothesis]}}'
doc_to_text
:
'
'
include
:
xnli_common_yaml
task
:
xnli_hi
lm_eval/tasks/xnli/xnli_ru.yaml
0 → 100644
View file @
35a24652
# Generated by utils.py
dataset_name
:
ru
doc_to_choice
:
'
{{[premise+",
правильно?
Да,
"+hypothesis,premise+",
правильно?
Так,
"+hypothesis,premise+",
правильно?
Нет,
"+hypothesis]}}'
doc_to_text
:
'
'
include
:
xnli_common_yaml
task
:
xnli_ru
lm_eval/tasks/xnli/xnli_sw.yaml
0 → 100644
View file @
35a24652
# Generated by utils.py
dataset_name
:
sw
doc_to_choice
:
'
{{[premise+",
sahihi?
Ndiyo,
"+hypothesis,premise+",
sahihi?
Hivyo,
"+hypothesis,premise+",
sahihi?
Hapana,
"+hypothesis]}}'
doc_to_text
:
'
'
include
:
xnli_common_yaml
task
:
xnli_sw
lm_eval/tasks/xnli/xnli_th.yaml
0 → 100644
View file @
35a24652
# Generated by utils.py
dataset_name
:
th
doc_to_choice
:
'
{{[premise+",
ถูกต้อง?
ใช่,
"+hypothesis,premise+",
ถูกต้อง?
ดังนั้น,
"+hypothesis,premise+",
ถูกต้อง?
ไม่,
"+hypothesis]}}'
doc_to_text
:
'
'
include
:
xnli_common_yaml
task
:
xnli_th
lm_eval/tasks/xnli/xnli_tr.yaml
0 → 100644
View file @
35a24652
# Generated by utils.py
dataset_name
:
tr
doc_to_choice
:
'
{{[premise+",
doğru?
Evet,
"+hypothesis,premise+",
doğru?
Böylece,
"+hypothesis,premise+",
doğru?
Hayır,
"+hypothesis]}}'
doc_to_text
:
'
'
include
:
xnli_common_yaml
task
:
xnli_tr
lm_eval/tasks/xnli/xnli_ur.yaml
0 → 100644
View file @
35a24652
# Generated by utils.py
dataset_name
:
ur
doc_to_choice
:
'
{{[premise+",
صحیح?
جی
ہاں,
"+hypothesis,premise+",
صحیح?
اس
لئے,
"+hypothesis,premise+",
صحیح?
نہیں,
"+hypothesis]}}'
doc_to_text
:
'
'
include
:
xnli_common_yaml
task
:
xnli_ur
lm_eval/tasks/xnli/xnli_vi.yaml
0 → 100644
View file @
35a24652
# Generated by utils.py
dataset_name
:
vi
doc_to_choice
:
'
{{[premise+",
đúng?
Vâng,
"+hypothesis,premise+",
đúng?
Vì
vậy,
"+hypothesis,premise+",
đúng?
Không,
"+hypothesis]}}'
doc_to_text
:
'
'
include
:
xnli_common_yaml
task
:
xnli_vi
lm_eval/tasks/xnli/xnli_zh.yaml
0 → 100644
View file @
35a24652
# Generated by utils.py
dataset_name
:
zh
doc_to_choice
:
'
{{[premise+",
正确?
是的,
"+hypothesis,premise+",
正确?
所以,
"+hypothesis,premise+",
正确?
不是的,
"+hypothesis]}}'
doc_to_text
:
'
'
include
:
xnli_common_yaml
task
:
xnli_zh
lm_eval/utils.py
View file @
35a24652
...
...
@@ -286,7 +286,6 @@ def make_table(result_dict, column="results"):
latex_writer
.
headers
=
[
column_name
,
"Version"
,
"Fewshot"
,
"Filter"
,
"Metric"
,
"Value"
,
...
...
@@ -298,7 +297,6 @@ def make_table(result_dict, column="results"):
for
k
,
dic
in
result_dict
[
column
].
items
():
version
=
result_dict
[
"versions"
][
k
]
n
=
str
(
result_dict
[
"configs"
][
k
][
"num_fewshot"
])
for
(
mf
),
v
in
dic
.
items
():
m
,
_
,
f
=
mf
.
partition
(
","
)
if
m
.
endswith
(
"_stderr"
):
...
...
@@ -306,11 +304,10 @@ def make_table(result_dict, column="results"):
if
m
+
"_stderr"
+
","
+
f
in
dic
:
se
=
dic
[
m
+
"_stderr"
+
","
+
f
]
values
.
append
([
k
,
version
,
n
,
f
,
m
,
"%.4f"
%
v
,
"±"
,
"%.4f"
%
se
])
values
.
append
([
k
,
version
,
f
,
m
,
"%.4f"
%
v
,
"±"
,
"%.4f"
%
se
])
else
:
values
.
append
([
k
,
version
,
n
,
f
,
m
,
"%.4f"
%
v
,
""
,
""
])
values
.
append
([
k
,
version
,
f
,
m
,
"%.4f"
%
v
,
""
,
""
])
k
=
""
n
=
""
version
=
""
md_writer
.
value_matrix
=
values
latex_writer
.
value_matrix
=
values
...
...
@@ -459,7 +456,7 @@ env = Environment(loader=BaseLoader, undefined=StrictUndefined)
env
.
filters
[
"regex_replace"
]
=
regex_replace
def
apply_template
(
template
,
doc
)
:
def
apply_template
(
template
:
str
,
doc
:
dict
)
->
str
:
rtemplate
=
env
.
from_string
(
template
)
return
rtemplate
.
render
(
**
doc
)
...
...
main.py
View file @
35a24652
...
...
@@ -32,7 +32,7 @@ def parse_args():
default
=
None
,
help
=
"Number of examples in few-shot context"
,
)
parser
.
add_argument
(
"--batch_size"
,
type
=
int
,
default
=
1
)
# TODO: only integers
parser
.
add_argument
(
"--batch_size"
,
type
=
str
,
default
=
1
)
parser
.
add_argument
(
"--max_batch_size"
,
type
=
int
,
...
...
setup.py
View file @
35a24652
import
setuptools
import
itertools
with
open
(
"README.md"
,
"r"
,
encoding
=
"utf-8"
)
as
fh
:
long_description
=
fh
.
read
()
extras_require
=
{
"dev"
:
[
"black"
,
"flake8"
,
"pre-commit"
,
"pytest"
,
"pytest-cov"
],
"linting"
:
[
"flake8"
,
"pylint"
,
"mypy"
,
"pre-commit"
,
],
"testing"
:
[
"pytest"
,
"pytest-cov"
,
"pytest-xdist"
],
"multilingual"
:
[
"nagisa>=0.2.7"
,
"jieba>=0.42.1"
],
"sentencepiece"
:
[
"sentencepiece>=0.1.98"
,
"protobuf>=4.22.1"
],
"promptsource"
:
[
"promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
],
"gptq"
:
[
"auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"
],
"anthropic"
:
[
"anthropic"
],
"openai"
:
[
"openai"
,
"tiktoken"
],
}
extras_require
[
"all"
]
=
list
(
itertools
.
chain
.
from_iterable
(
extras_require
.
values
()))
setuptools
.
setup
(
name
=
"lm_eval"
,
version
=
"1.0.0"
,
...
...
@@ -15,7 +38,7 @@ setuptools.setup(
packages
=
setuptools
.
find_packages
(),
# required to include yaml files in pip installation
package_data
=
{
"lm_eval"
:
[
"**/*.yaml"
],
"lm_eval"
:
[
"**/*.yaml"
,
"tasks/**/*"
],
"examples"
:
[
"**/*.yaml"
],
},
entry_points
=
{
...
...
@@ -36,7 +59,6 @@ setuptools.setup(
"evaluate>=0.4.0"
,
"jsonlines"
,
"numexpr"
,
"openai>=0.6.4"
,
"omegaconf>=2.2"
,
"peft>=0.2.0"
,
"pybind11>=2.6.2"
,
...
...
@@ -51,21 +73,5 @@ setuptools.setup(
"transformers>=4.1"
,
"zstandard"
,
],
extras_require
=
{
"dev"
:
[
"black"
,
"flake8"
,
"pre-commit"
,
"pytest"
,
"pytest-cov"
],
"linting"
:
[
"flake8"
,
"pylint"
,
"mypy"
,
"pre-commit"
,
],
"testing"
:
[
"pytest"
,
"pytest-cov"
,
"pytest-xdist"
],
"multilingual"
:
[
"nagisa>=0.2.7"
,
"jieba>=0.42.1"
],
"sentencepiece"
:
[
"sentencepiece>=0.1.98"
,
"protobuf>=4.22.1"
],
"promptsource"
:
[
"promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
],
"gptq"
:
[
"auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"
],
"anthropic"
:
[
"anthropic"
],
},
extras_require
=
extras_require
,
)
tests/extra/test_new_tasks.py
View file @
35a24652
...
...
@@ -92,7 +92,7 @@ class TestNewTasks:
if
task
.
has_test_docs
()
else
list
(
islice
(
task
.
validation_docs
(),
limit
))
)
if
"multiple_choice"
in
task
.
_config
.
group
:
if
"multiple_choice"
in
task
.
_config
.
output_type
:
_array
=
[
task
.
doc_to_choice
(
doc
)
for
doc
in
arr
]
# assert all(len(x) == 4 for x in _array)
assert
all
(
isinstance
(
x
,
list
)
for
x
in
_array
)
...
...
@@ -106,8 +106,8 @@ class TestNewTasks:
else
list
(
islice
(
task
.
validation_docs
(),
limit
))
)
_array_target
=
[
task
.
doc_to_target
(
doc
)
for
doc
in
arr
]
assert
all
(
isinstance
(
label
,
int
)
for
label
in
_array_target
)
assert
len
(
_array_target
)
==
limit
if
limit
else
True
if
task
.
_config
.
output_type
==
"multiple_choice"
:
assert
all
(
isinstance
(
label
,
int
)
for
label
in
_array_target
)
# _array_text = [task.doc_to_text(doc) for doc in arr]
# Not working
# assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
...
...
@@ -116,6 +116,7 @@ class TestNewTasks:
task_class
().
build_all_requests
(
rank
=
1
,
limit
=
limit
,
world_size
=
1
)
assert
task_class
.
instances
is
not
None
# ToDO: Add proper testing
def
test_construct_requests
(
self
,
task_class
,
limit
):
task
=
task_class
()
arr
=
(
...
...
@@ -124,5 +125,5 @@ class TestNewTasks:
else
list
(
islice
(
task
.
validation_docs
(),
limit
))
)
requests
=
[
task
.
construct_requests
(
doc
,
task
.
doc_to_text
(
doc
))
for
doc
in
arr
]
assert
all
(
isinstance
(
doc
,
list
)
for
doc
in
requests
)
#
assert all(isinstance(doc, list) for doc in requests)
assert
len
(
requests
)
==
limit
if
limit
else
True
tests/test_tasks.py
View file @
35a24652
...
...
@@ -83,7 +83,7 @@ def test_create_choices(task_class, limit):
if
task
.
has_test_docs
()
else
list
(
islice
(
task
.
validation_docs
(),
limit
))
)
if
"multiple_choice"
in
task
.
_config
.
group
:
if
"multiple_choice"
in
task
.
_config
.
output_type
:
_array
=
[
task
.
doc_to_choice
(
doc
)
for
doc
in
arr
]
# assert all(len(x) == 4 for x in _array)
assert
all
(
isinstance
(
x
,
list
)
for
x
in
_array
)
...
...
@@ -98,8 +98,8 @@ def test_doc_to_target(task_class, limit):
else
list
(
islice
(
task
.
validation_docs
(),
limit
))
)
_array_target
=
[
task
.
doc_to_target
(
doc
)
for
doc
in
arr
]
assert
all
(
isinstance
(
label
,
int
)
for
label
in
_array_target
)
assert
len
(
_array_target
)
==
limit
if
limit
else
True
if
task
.
_config
.
output_type
==
"multiple_choice"
:
assert
all
(
isinstance
(
label
,
int
)
for
label
in
_array_target
)
# _array_text = [task.doc_to_text(doc) for doc in arr]
# Not working
# assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
...
...
@@ -110,6 +110,7 @@ def test_build_all_requests(task_class, limit):
assert
task_class
.
instances
is
not
None
# ToDO: Add proper testing
def
test_construct_requests
(
task_class
,
limit
):
task
=
task_class
()
arr
=
(
...
...
@@ -118,7 +119,7 @@ def test_construct_requests(task_class, limit):
else
list
(
islice
(
task
.
validation_docs
(),
limit
))
)
requests
=
[
task
.
construct_requests
(
doc
,
task
.
doc_to_text
(
doc
))
for
doc
in
arr
]
assert
all
(
isinstance
(
doc
,
list
)
for
doc
in
requests
)
#
assert all(isinstance(doc, list) for doc in requests)
assert
len
(
requests
)
==
limit
if
limit
else
True
...
...
Prev
1
…
4
5
6
7
8
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment