Unverified commit 35a24652 authored by Aflah, committed by GitHub

Merge pull request #1 from EleutherAI/toxicity-test

Toxicity test
parents 52213e29 0021de21
# Generated by utils.py
dataset_name: el
doc_to_choice: '{{[premise+", σωστός? Ναί, "+hypothesis,premise+", σωστός? Έτσι, "+hypothesis,premise+",
  σωστός? όχι, "+hypothesis]}}'
doc_to_text: ''
include: xnli_common_yaml
task: xnli_el

# Generated by utils.py
dataset_name: en
doc_to_choice: '{{[premise+", right? Yes, "+hypothesis,premise+", right? Also, "+hypothesis,premise+",
  right? No, "+hypothesis]}}'
doc_to_text: ''
include: xnli_common_yaml
task: xnli_en

# Generated by utils.py
dataset_name: es
doc_to_choice: '{{[premise+", correcto? Sí, "+hypothesis,premise+", correcto? Asi
  que, "+hypothesis,premise+", correcto? No, "+hypothesis]}}'
doc_to_text: ''
include: xnli_common_yaml
task: xnli_es

# Generated by utils.py
dataset_name: fr
doc_to_choice: '{{[premise+", correct? Oui, "+hypothesis,premise+", correct? Aussi,
  "+hypothesis,premise+", correct? Non, "+hypothesis]}}'
doc_to_text: ''
include: xnli_common_yaml
task: xnli_fr

# Generated by utils.py
dataset_name: hi
doc_to_choice: '{{[premise+", सही? हाँ, "+hypothesis,premise+", सही? इसलिए, "+hypothesis,premise+",
  सही? नहीं, "+hypothesis]}}'
doc_to_text: ''
include: xnli_common_yaml
task: xnli_hi

# Generated by utils.py
dataset_name: ru
doc_to_choice: '{{[premise+", правильно? Да, "+hypothesis,premise+", правильно? Так,
  "+hypothesis,premise+", правильно? Нет, "+hypothesis]}}'
doc_to_text: ''
include: xnli_common_yaml
task: xnli_ru

# Generated by utils.py
dataset_name: sw
doc_to_choice: '{{[premise+", sahihi? Ndiyo, "+hypothesis,premise+", sahihi? Hivyo,
  "+hypothesis,premise+", sahihi? Hapana, "+hypothesis]}}'
doc_to_text: ''
include: xnli_common_yaml
task: xnli_sw

# Generated by utils.py
dataset_name: th
doc_to_choice: '{{[premise+", ถูกต้อง? ใช่, "+hypothesis,premise+", ถูกต้อง? ดังนั้น,
  "+hypothesis,premise+", ถูกต้อง? ไม่, "+hypothesis]}}'
doc_to_text: ''
include: xnli_common_yaml
task: xnli_th

# Generated by utils.py
dataset_name: tr
doc_to_choice: '{{[premise+", doğru? Evet, "+hypothesis,premise+", doğru? Böylece,
  "+hypothesis,premise+", doğru? Hayır, "+hypothesis]}}'
doc_to_text: ''
include: xnli_common_yaml
task: xnli_tr

# Generated by utils.py
dataset_name: ur
doc_to_choice: '{{[premise+", صحیح? جی ہاں, "+hypothesis,premise+", صحیح? اس لئے,
  "+hypothesis,premise+", صحیح? نہیں, "+hypothesis]}}'
doc_to_text: ''
include: xnli_common_yaml
task: xnli_ur

# Generated by utils.py
dataset_name: vi
doc_to_choice: '{{[premise+", đúng? Vâng, "+hypothesis,premise+", đúng? vậy, "+hypothesis,premise+",
  đúng? Không, "+hypothesis]}}'
doc_to_text: ''
include: xnli_common_yaml
task: xnli_vi

# Generated by utils.py
dataset_name: zh
doc_to_choice: '{{[premise+", 正确? 是的, "+hypothesis,premise+", 正确? 所以, "+hypothesis,premise+",
  正确? 不是的, "+hypothesis]}}'
doc_to_text: ''
include: xnli_common_yaml
task: xnli_zh
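These generated configs all pull shared settings from `xnli_common_yaml` and differ only in the language-specific cue words (the XNLI "right? Yes / Also / No" pattern and its translations). As a minimal sketch of how one such `doc_to_choice` Jinja template expands, using the English config above, the same jinja2 environment as the `apply_template` helper further down this diff, and a made-up premise/hypothesis pair:

```python
from jinja2 import BaseLoader, Environment, StrictUndefined

env = Environment(loader=BaseLoader, undefined=StrictUndefined)

# Template body copied from the xnli_en config above.
template = (
    '{{[premise+", right? Yes, "+hypothesis,'
    'premise+", right? Also, "+hypothesis,'
    'premise+", right? No, "+hypothesis]}}'
)

# Hypothetical XNLI-style document.
doc = {"premise": "The cat sat on the mat", "hypothesis": "An animal is resting"}

# Rendering yields the string form of a three-element list:
# the entailment, neutral, and contradiction continuations.
print(env.from_string(template).render(**doc))
```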
@@ -286,7 +286,6 @@ def make_table(result_dict, column="results"):
     latex_writer.headers = [
         column_name,
         "Version",
-        "Fewshot",
         "Filter",
         "Metric",
         "Value",
@@ -298,7 +297,6 @@ def make_table(result_dict, column="results"):
     for k, dic in result_dict[column].items():
         version = result_dict["versions"][k]
-        n = str(result_dict["configs"][k]["num_fewshot"])
         for (mf), v in dic.items():
             m, _, f = mf.partition(",")
             if m.endswith("_stderr"):
@@ -306,11 +304,10 @@ def make_table(result_dict, column="results"):
             if m + "_stderr" + "," + f in dic:
                 se = dic[m + "_stderr" + "," + f]
-                values.append([k, version, n, f, m, "%.4f" % v, "±", "%.4f" % se])
+                values.append([k, version, f, m, "%.4f" % v, "±", "%.4f" % se])
             else:
-                values.append([k, version, n, f, m, "%.4f" % v, "", ""])
+                values.append([k, version, f, m, "%.4f" % v, "", ""])
             k = ""
-            n = ""
             version = ""
     md_writer.value_matrix = values
     latex_writer.value_matrix = values
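The net effect of this hunk is that each results row drops the few-shot count. A minimal sketch of the new row shape, assuming `md_writer` is a pytablewriter `MarkdownTableWriter` (as the attribute usage above suggests) and using made-up header names and numbers:

```python
from pytablewriter import MarkdownTableWriter

md_writer = MarkdownTableWriter()
# Same column layout as above, minus the removed "Fewshot" column.
md_writer.headers = ["Task", "Version", "Filter", "Metric", "Value", "", "Stderr"]
# One illustrative row in the new [k, version, f, m, value, "±", stderr] shape.
md_writer.value_matrix = [["xnli_en", "1.0", "none", "acc", "0.4512", "±", "0.0050"]]
print(md_writer.dumps())
```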
@@ -459,7 +456,7 @@ env = Environment(loader=BaseLoader, undefined=StrictUndefined)
 env.filters["regex_replace"] = regex_replace

-def apply_template(template, doc):
+def apply_template(template: str, doc: dict) -> str:
     rtemplate = env.from_string(template)
     return rtemplate.render(**doc)
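Since the environment is built with `StrictUndefined`, `apply_template` raises on any field missing from `doc` instead of silently rendering an empty string; a quick sketch (the template and doc here are made up):

```python
from jinja2.exceptions import UndefinedError

print(apply_template("{{premise}}, right?", {"premise": "The cat sat on the mat"}))
# -> The cat sat on the mat, right?

try:
    apply_template("{{premise}} -> {{label}}", {"premise": "p"})  # no "label" key
except UndefinedError as err:
    print(err)  # 'label' is undefined
```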
@@ -32,7 +32,7 @@ def parse_args():
         default=None,
         help="Number of examples in few-shot context",
     )
-    parser.add_argument("--batch_size", type=int, default=1)  # TODO: only integers
+    parser.add_argument("--batch_size", type=str, default=1)
     parser.add_argument(
         "--max_batch_size",
         type=int,
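Relaxing `--batch_size` to a string presumably allows non-numeric values such as `"auto"` alongside plain integer counts; the exact convention is not shown in this diff, but a consumer might normalize the flag along these lines (the function name and the `"auto"` sentinel are assumptions):

```python
def parse_batch_size(raw: str):
    # Accept either an integer count or a sentinel resolved later,
    # e.g. by probing how large a batch fits in memory.
    if raw == "auto":
        return "auto"
    return int(raw)

parse_batch_size("8")     # -> 8
parse_batch_size("auto")  # -> "auto"
```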
import setuptools
import itertools

with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

extras_require = {
    "dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
    "linting": [
        "flake8",
        "pylint",
        "mypy",
        "pre-commit",
    ],
    "testing": ["pytest", "pytest-cov", "pytest-xdist"],
    "multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
    "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
    "promptsource": [
        "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
    ],
    "gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
    "anthropic": ["anthropic"],
    "openai": ["openai", "tiktoken"],
}
extras_require["all"] = list(itertools.chain.from_iterable(extras_require.values()))

setuptools.setup(
    name="lm_eval",
    version="1.0.0",
@@ -15,7 +38,7 @@ setuptools.setup(
     packages=setuptools.find_packages(),
     # required to include yaml files in pip installation
     package_data={
-        "lm_eval": ["**/*.yaml"],
+        "lm_eval": ["**/*.yaml", "tasks/**/*"],
         "examples": ["**/*.yaml"],
     },
     entry_points={
@@ -36,7 +59,6 @@ setuptools.setup(
         "evaluate>=0.4.0",
         "jsonlines",
         "numexpr",
-        "openai>=0.6.4",
         "omegaconf>=2.2",
         "peft>=0.2.0",
         "pybind11>=2.6.2",
@@ -51,21 +73,5 @@ setuptools.setup(
         "transformers>=4.1",
         "zstandard",
     ],
-    extras_require={
-        "dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
-        "linting": [
-            "flake8",
-            "pylint",
-            "mypy",
-            "pre-commit",
-        ],
-        "testing": ["pytest", "pytest-cov", "pytest-xdist"],
-        "multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
-        "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1"],
-        "promptsource": [
-            "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
-        ],
-        "gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
-        "anthropic": ["anthropic"],
-    },
+    extras_require=extras_require,
 )
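Hoisting the extras dict to module scope is what enables the new combined `all` extra: `itertools.chain.from_iterable` flattens the per-extra dependency lists into one list. A standalone illustration with a trimmed-down dict:

```python
import itertools

extras = {
    "testing": ["pytest", "pytest-cov", "pytest-xdist"],
    "anthropic": ["anthropic"],
}
all_deps = list(itertools.chain.from_iterable(extras.values()))
print(all_deps)  # ['pytest', 'pytest-cov', 'pytest-xdist', 'anthropic']
```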
@@ -92,7 +92,7 @@ class TestNewTasks:
             if task.has_test_docs()
             else list(islice(task.validation_docs(), limit))
         )
-        if "multiple_choice" in task._config.group:
+        if "multiple_choice" in task._config.output_type:
             _array = [task.doc_to_choice(doc) for doc in arr]
             # assert all(len(x) == 4 for x in _array)
             assert all(isinstance(x, list) for x in _array)
@@ -106,8 +106,8 @@ class TestNewTasks:
             else list(islice(task.validation_docs(), limit))
         )
         _array_target = [task.doc_to_target(doc) for doc in arr]
-        assert all(isinstance(label, int) for label in _array_target)
         assert len(_array_target) == limit if limit else True
+        if task._config.output_type == "multiple_choice":
+            assert all(isinstance(label, int) for label in _array_target)
         # _array_text = [task.doc_to_text(doc) for doc in arr]
         # Not working
         # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
@@ -116,6 +116,7 @@ class TestNewTasks:
         task_class().build_all_requests(rank=1, limit=limit, world_size=1)
         assert task_class.instances is not None

+    # ToDO: Add proper testing
     def test_construct_requests(self, task_class, limit):
         task = task_class()
         arr = (
@@ -124,5 +125,5 @@ class TestNewTasks:
             else list(islice(task.validation_docs(), limit))
         )
         requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
-        assert all(isinstance(doc, list) for doc in requests)
+        # assert all(isinstance(doc, list) for doc in requests)
         assert len(requests) == limit if limit else True
@@ -83,7 +83,7 @@ def test_create_choices(task_class, limit):
         if task.has_test_docs()
         else list(islice(task.validation_docs(), limit))
     )
-    if "multiple_choice" in task._config.group:
+    if "multiple_choice" in task._config.output_type:
         _array = [task.doc_to_choice(doc) for doc in arr]
         # assert all(len(x) == 4 for x in _array)
         assert all(isinstance(x, list) for x in _array)
@@ -98,8 +98,8 @@ def test_doc_to_target(task_class, limit):
         else list(islice(task.validation_docs(), limit))
     )
     _array_target = [task.doc_to_target(doc) for doc in arr]
-    assert all(isinstance(label, int) for label in _array_target)
     assert len(_array_target) == limit if limit else True
+    if task._config.output_type == "multiple_choice":
+        assert all(isinstance(label, int) for label in _array_target)
     # _array_text = [task.doc_to_text(doc) for doc in arr]
     # Not working
     # assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
@@ -110,6 +110,7 @@ def test_build_all_requests(task_class, limit):
     assert task_class.instances is not None

+# ToDO: Add proper testing
 def test_construct_requests(task_class, limit):
     task = task_class()
     arr = (
@@ -118,7 +119,7 @@ def test_construct_requests(task_class, limit):
         else list(islice(task.validation_docs(), limit))
     )
     requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
-    assert all(isinstance(doc, list) for doc in requests)
+    # assert all(isinstance(doc, list) for doc in requests)
     assert len(requests) == limit if limit else True
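Both test modules now gate the integer-target assertion on `output_type` rather than `group`: only `multiple_choice` tasks promise that `doc_to_target` returns an index into the choice list, while other output types may return strings. A toy illustration of that contract (the field names and the non-multiple-choice branch are hypothetical):

```python
def doc_to_target(doc: dict, output_type: str):
    # multiple_choice: the target is an integer index into doc_to_choice(doc)
    if output_type == "multiple_choice":
        return int(doc["label"])
    # other output types: the target is the gold text itself
    return doc["answer"]

doc_to_target({"label": "2"}, "multiple_choice")  # -> 2
doc_to_target({"answer": "Paris"}, "generation")  # -> "Paris"
```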