Commit ac50adb5 authored by lintangsutawika

merged with latest big-refactor

parents 6355d06f a3252ed7
from datasets import Dataset
from functools import partial


def process_docs(dataset, set_answer_type="bool"):
    FEATURES = ["title", "abstract", "question", "answer", "answer_type"]

    def _categorise_answer(answer_blob):
        if answer_blob["unanswerable"]:
            answer = "unanswerable"
            answer_type = "unanswerable"
            return answer, answer_type
        elif answer_blob["yes_no"]:
            answer = "yes"
            answer_type = "bool"
            return answer, answer_type
        elif answer_blob["free_form_answer"]:
            answer = answer_blob["free_form_answer"]
            answer_type = "free form answer"
            return answer, answer_type
        elif answer_blob["extractive_spans"]:
            answer = answer_blob["extractive_spans"]
            answer_type = "extractive_spans"
            return answer, answer_type
        elif answer_blob["yes_no"] is False:
            answer = "no"
            answer_type = "bool"
            return answer, answer_type

    def _flatten(doc):
        """Given a `doc`, flatten it out so that each JSON blob
        contains exactly one question and one answer. Logic taken from
        the reference implementation available at
        https://github.com/allenai/qasper-led-baseline/blob/main/scripts/evaluator.py
        """
        obs_list = {
            "title": [],
            "abstract": [],
            "question": [],
            "answer": [],
            "answer_type": [],
        }
        title = doc.pop("title")
        abstract = doc.pop("abstract")
        for question, answer_list in zip(doc["qas"]["question"], doc["qas"]["answers"]):
            for answer_blob in answer_list["answer"]:
                answer, answer_type = _categorise_answer(answer_blob)
                if answer_type == set_answer_type:
                    obs_list["title"].append(title)
                    obs_list["abstract"].append(abstract)
                    obs_list["question"].append(question)
                    obs_list["answer_type"].append(answer_type)
                    if type(answer) == list:
                        answer = ", ".join(answer)
                    obs_list["answer"].append(answer)
        return obs_list

    dataset = dataset.map(
        _flatten,
        remove_columns=[key for key in dataset.features.keys() if key not in FEATURES],
    )
    new_dataset = {}
    for key in dataset.features.keys():
        new_dataset[key] = [x for row in dataset[key] for x in row]
    return Dataset.from_dict(new_dataset)


process_docs_bool = partial(process_docs, set_answer_type="bool")
process_docs_freeform = partial(process_docs, set_answer_type="free form answer")
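For orientation, here is a minimal sketch of what `process_docs_bool` does, run on an invented toy record that only mimics the nested QASPER-style schema (the field values are hypothetical, and `process_docs_bool` from this module is assumed to be in scope): the nested question/answer structure is flattened into one row per answer, and only rows whose answer type matches the filter are kept.

```
from datasets import Dataset

# Hypothetical toy record shaped like the QASPER schema; values are invented.
toy = Dataset.from_dict(
    {
        "title": ["A Paper"],
        "abstract": ["An abstract."],
        "qas": [
            {
                "question": ["Is the model pretrained?", "What corpus is used?"],
                "answers": [
                    {"answer": [{"unanswerable": False, "yes_no": True,
                                 "free_form_answer": "", "extractive_spans": []}]},
                    {"answer": [{"unanswerable": False, "yes_no": None,
                                 "free_form_answer": "Wikipedia", "extractive_spans": []}]},
                ],
            }
        ],
    }
)

flat = process_docs_bool(toy)
# Only the yes/no question survives the "bool" filter.
print(flat["question"])  # ['Is the model pretrained?']
print(flat["answer"])    # ['yes']
```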
# Task-name
### Paper
Title: `paper title goes here`
Abstract: `link to paper PDF or arXiv abstract goes here`
`Short description of paper / benchmark goes here:`
Homepage: `homepage to the benchmark's website goes here, if applicable`
### Citation
```
BibTeX-formatted citation goes here
```
### Subtasks
List or describe tasks defined in this folder, and their names here:
* `task_name`: `1-sentence description of what this particular task does`
* `task_name2`: .....
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: squadv2
dataset_path: squad_v2
output_type: greedy_until
training_split: train
validation_split: validation
doc_to_text: "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:"
doc_to_target: "{% if answers.text| length > 0 %}{{answers.text}}{% else %}{{['']}}{% endif %}"
target_delimiter: ""
should_decontaminate: true
doc_to_decontamination_query: context
generation_kwargs:
  until:
    - "\n"
# filter_list:
#   - name: remove_whitespace
#     filter:
#       - function: remove_whitespace
#       - function: take_first
metric_list:
  - metric: !function utils.exact
    aggregation: mean
    higher_is_better: true
  - metric: !function utils.f1
    aggregation: mean
    higher_is_better: true
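As an aside (not part of the config itself), the `doc_to_text` template above is Jinja, so the prompt it produces can be previewed by rendering it against a hypothetical SQuAD v2-style record:

```
from jinja2 import Environment

# Hypothetical SQuAD v2-style record; field values are illustrative only.
doc = {
    "title": "Normans",
    "context": "The Normans gave their name to Normandy, a region in France.",
    "question": "In what country is Normandy located?",
}

template = (
    "Title: {{title}}\n\nBackground: {{context}}\n\nQuestion: {{question}}\n\n Answer:"
)
print(Environment().from_string(template).render(**doc))
# Title: Normans
#
# Background: The Normans gave their name to Normandy, a region in France.
#
# Question: In what country is Normandy located?
#
#  Answer:
```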
include: default.yaml
task: squadv2_noans_loglikelihood
dataset_path: squad_v2
output_type: loglikelihood
training_split: train
validation_split: validation
doc_to_target: " unanswerable"
metric_list:
  - metric: perplexity
import re
import string
import collections


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def get_tokens(s):
    if not s:
        return []
    return normalize_answer(s).split()


# Exact match (the normalized answer exactly matches the gold answer)
def exact(predictions, references):
    return int(normalize_answer(references[0]) == normalize_answer(predictions[0]))


# The F-score of predicted tokens versus the gold answer
def f1(predictions, references):
    gold_toks = get_tokens(references[0])
    pred_toks = get_tokens(predictions[0])
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1
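A few illustrative spot checks (invented strings, not SQuAD data) of how these scorers behave:

```
# Case, punctuation and articles are stripped by normalize_answer before comparing.
print(exact(["The cat sat."], ["the cat sat"]))     # 1
# Token-level F1: precision 2/4, recall 2/2 -> 2 * 0.5 * 1.0 / 1.5 ≈ 0.667
print(f1(["the cat sat on the mat"], ["cat sat"]))  # 0.666...
# Two empty ("no answer") strings count as agreement.
print(f1([""], [""]))                               # 1
```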
group: squadv2_complete
task:
  - squadv2
  - squadv2_noans_loglikelihood
@@ -10,7 +10,7 @@ try:
 except ModuleNotFoundError:
     raise Exception(
         "`pycountry` is required for generating translation task prompt templates. \
-please install pycountry via pip install lm-eval[multilingua] or pip install -e .[multilingual]",
+please install pycountry via pip install lm-eval[multilingual] or pip install -e .[multilingual]",
     )
@@ -16,7 +16,6 @@ import gc
 import torch
 import transformers
-from omegaconf import OmegaConf
 from jinja2 import BaseLoader, Environment, StrictUndefined
 from itertools import islice
@@ -55,8 +54,8 @@ def simple_parse_args_string(args_string):
     args_string = args_string.strip()
     if not args_string:
         return {}
-    arg_list = args_string.split(",")
-    args_dict = OmegaConf.to_object(OmegaConf.from_dotlist(arg_list))
+    arg_list = [arg for arg in args_string.split(",") if arg]
+    args_dict = {k: v for k, v in [arg.split("=") for arg in arg_list]}
     return args_dict
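For illustration (not part of the change itself), the replacement parser splits on commas and equals signs directly, so every parsed value comes back as a plain string:

```
# Standalone re-implementation of the two new lines above, with hypothetical input.
args_string = "pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu"
arg_list = [arg for arg in args_string.split(",") if arg]
args_dict = {k: v for k, v in [arg.split("=") for arg in arg_list]}
print(args_dict)
# {'pretrained': 'EleutherAI/pythia-160m', 'dtype': 'float32', 'device': 'cpu'}
```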
@@ -267,9 +266,9 @@ def make_table(result_dict, column: str = "results"):
     from pytablewriter import MarkdownTableWriter, LatexTableWriter
     if column == "results":
-        column_name = "Task"
-    elif column == "aggregate":
-        column_name = "Benchmark"
+        column_name = "Tasks"
+    elif column == "groups":
+        column_name = "Groups"
     md_writer = MarkdownTableWriter()
     latex_writer = LatexTableWriter()
@@ -395,8 +394,10 @@ def import_function(loader, node):
     function_name = loader.construct_scalar(node)
     yaml_path = os.path.dirname(loader.name)
-    module_name, function_name = function_name.split(".")
-    module_path = os.path.join(yaml_path, "{}.py".format(module_name))
+    *module_name, function_name = function_name.split(".")
+    if type(module_name) == list:
+        module_name = ".".join(module_name)
+    module_path = os.path.normpath(os.path.join(yaml_path, "{}.py".format(module_name)))
     spec = importlib.util.spec_from_file_location(module_name, module_path)
     module = importlib.util.module_from_spec(spec)
@@ -11,7 +11,6 @@ from lm_eval import evaluator, utils
 from lm_eval.api.registry import ALL_TASKS
 from lm_eval.logger import eval_logger, SPACING
 from lm_eval.tasks import include_task_folder
-from lm_eval.benchmarks import include_benchmarks
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -209,8 +208,8 @@ def main() -> None:
             f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
         )
         print(evaluator.make_table(results))
-        if "aggregate" in results:
-            print(evaluator.make_table(results, "aggregate"))
+        if "groups" in results:
+            print(evaluator.make_table(results, "groups"))
 if __name__ == "__main__":
 [mypy]
-python_version = 3.9
+python_version = 3.8
 show_traceback = True
 check_untyped_defs = True
 no_implicit_reexport = True
[build-system]
requires = ["setuptools>=40.8.0", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "lm_eval"
version = "1.0.0"
authors = [
{name="EleutherAI", email="contact@eleuther.ai"}
]
description = "A framework for evaluating language models"
readme = "README.md"
classifiers = [
"Development Status :: 3 - Alpha",
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
requires-python = ">=3.8"
license = { "text" = "MIT" }
dependencies = [
"accelerate>=0.21.0",
"datasets>=2.0.0",
"evaluate>=0.4.0",
"jsonlines",
"numexpr",
"peft>=0.2.0",
"pybind11>=2.6.2",
"pytablewriter",
"rouge-score>=0.0.4",
"sacrebleu>=1.5.0",
"scikit-learn>=0.24.1",
"sqlitedict",
"torch>=1.8",
"tqdm-multiprocess",
"transformers>=4.1",
"zstandard",
]
[tool.setuptools]
packages = ["lm_eval"]
# required to include yaml files in pip installation
[tool.setuptools.package-data]
lm_eval = ["**/*.yaml", "tasks/**/*"]
examples = ["**/*.yaml"]
[project.scripts]
lm-eval = "main:main"
lm_eval = "main:main"
[project.urls]
Homepage = "https://github.com/EleutherAI/lm-evaluation-harness"
Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
[project.optional-dependencies]
dev = ["black", "flake8", "pre-commit", "pytest", "pytest-cov"]
linting = [
"flake8",
"pylint",
"mypy",
"pre-commit",
]
testing = ["pytest", "pytest-cov", "pytest-xdist"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
promptsource = [
"promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
]
gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"]
anthropic = ["anthropic"]
openai = ["openai", "tiktoken"]
all = [
"lm_eval[dev]",
"lm_eval[testing]",
"lm_eval[linting]",
"lm_eval[multilingual]",
"lm_eval[sentencepiece]",
"lm_eval[promptsource]",
"lm_eval[gptq]",
"lm_eval[anthropic]",
"lm_eval[openai]"
]
@@ -38,17 +38,21 @@ def main():
         iters = []
         for set in args.sets.split(","):
+            docs = None
             if set == "train" and task.has_training_docs():
                 docs = task.training_docs()
             if set == "val" and task.has_validation_docs():
                 docs = task.validation_docs()
             if set == "test" and task.has_test_docs():
                 docs = task.test_docs()
-            iters.append(docs)
+            if docs is not None:
+                iters.append(docs)
         docs = join_iters(iters)
-        with open(os.path.join(args.output_base_path, task_name), "w") as f:
+        with open(
+            os.path.join(args.output_base_path, task_name), "w", encoding="utf8"
+        ) as f:
             for i, doc in (
                 zip(range(args.num_examples), docs)
                 if args.num_examples > 0
 import setuptools
-import itertools
-with open("README.md", "r", encoding="utf-8") as fh:
-    long_description = fh.read()
-extras_require = {
-    "dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
-    "linting": [
-        "flake8",
-        "pylint",
-        "mypy",
-        "pre-commit",
-    ],
-    "testing": ["pytest", "pytest-cov", "pytest-xdist"],
-    "multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
-    "sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1", "pycountry"],
-    "promptsource": [
-        "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
-    ],
-    "gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
-    "anthropic": ["anthropic"],
-    "openai": ["openai", "tiktoken"],
-}
-extras_require["all"] = list(itertools.chain.from_iterable(extras_require.values()))
-setuptools.setup(
-    name="lm_eval",
-    version="1.0.0",
-    author="EleutherAI",
-    author_email="contact@eleuther.ai",
-    description="A framework for evaluating language models",
-    long_description=long_description,
-    long_description_content_type="text/markdown",
-    url="https://github.com/EleutherAI/lm-evaluation-harness",
-    packages=setuptools.find_packages(),
-    # required to include yaml files in pip installation
-    package_data={
-        "lm_eval": ["**/*.yaml", "tasks/**/*"],
-        "examples": ["**/*.yaml"],
-    },
-    entry_points={
-        "console_scripts": ["lm-eval = main:main", "lm_eval = main:main"],
-    },
-    include_package_data=True,
-    classifiers=[
-        "Development Status :: 3 - Alpha",
-        "Programming Language :: Python :: 3",
-        "License :: OSI Approved :: MIT License",
-        "Operating System :: OS Independent",
-    ],
-    python_requires=">=3.9",
-    install_requires=[
-        "accelerate>=0.21.0",
-        "evaluate",
-        "datasets>=2.0.0",
-        "evaluate>=0.4.0",
-        "jsonlines",
-        "numexpr",
-        "omegaconf>=2.2",
-        "peft>=0.2.0",
-        "pybind11>=2.6.2",
-        "pytablewriter",
-        "rouge-score>=0.0.4",
-        "sacrebleu>=1.5.0",
-        "scikit-learn>=0.24.1",
-        "sqlitedict",
-        "torch>=1.8",
-        "tqdm-multiprocess",
-        "transformers>=4.1",
-        "zstandard",
-    ],
-    extras_require=extras_require,
-)
+# This is to make sure that the package supports editable installs
+setuptools.setup()
@@ -7,6 +7,7 @@ import lm_eval.tasks as tasks
 # import lm_eval.models as models
 import lm_eval.api as api
 import lm_eval.evaluator as evaluator
+from typing import List
 import random
 import pytest
@@ -26,7 +27,7 @@ import pytest
         )
     ],
 )
-def test_evaluator(task_name: list[str], limit: int, model: str, model_args: str):
+def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str):
     task_name = task_name
     limit = 10
@@ -9,6 +9,7 @@ import os
 # This is the path where the output for the changed files for the tasks folder is stored
 # FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"
 # reads a text file and returns a list of words
+# used to read the output of the changed txt from tj-actions/changed-files
 def load_changed_files(file_path: str) -> List[str]:
@@ -32,7 +33,7 @@ def parser(full_path: List[str]) -> List[str]:
     return list(_output)
-def new_tasks() -> Union[list[str], None]:
+def new_tasks() -> Union[List[str], None]:
     FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
     if os.path.exists(FILENAME):
         # If tasks folder has changed then we get the list of files from FILENAME