"configs/vscode:/vscode.git/clone" did not exist on "c0acd06b052404b3737350ef3a4524c45701634d"
Commit 1f351067 authored by lintangsutawika's avatar lintangsutawika
Browse files

Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into qasper

parents 50f4428b 33d52483
@@ -63,10 +63,10 @@ jobs:
 - name: Test with pytest
 # if new tasks are added, run tests on them
 if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
-run: python -m pytest tests/test_tasks.py -s -vv -n=auto
+run: python -m pytest tests/test_tasks.py -s -vv
 # if api is modified, run tests on it
 - name: Test more tasks with pytest
 env:
 API: true
 if: steps.changed-tasks.outputs.api_any_modified == 'true'
-run: python -m pytest tests/test_tasks.py -s -vv -n=auto
+run: python -m pytest tests/test_tasks.py -s -vv
 import abc
 import os
-from typing import Union, List, Tuple
+import torch
+from typing import Union, List, Tuple, Optional, Type, TypeVar
 from sqlitedict import SqliteDict
 import json
 import hashlib
@@ -11,6 +12,8 @@ from tqdm import tqdm
 from lm_eval import utils
 from lm_eval.logger import eval_logger
+T = TypeVar("T", bound="LM")
 class LM(abc.ABC):
 def __init__(self) -> None:
@@ -111,11 +114,28 @@ class LM(abc.ABC):
 pass
 @classmethod
-def create_from_arg_string(cls, arg_string, additional_config=None):
+def create_from_arg_string(
+cls: Type[T], arg_string: str, additional_config: Optional[dict] = None
+) -> T:
+"""
+Creates an instance of the LM class using the given argument string and additional config.
+Parameters:
+- arg_string: A string containing arguments in the format key1=value1,key2=value2.
+- additional_config: Optional dictionary containing additional configuration parameters.
+Returns:
+- Instance of the LM class.
+"""
 additional_config = {} if additional_config is None else additional_config
 args = utils.simple_parse_args_string(arg_string)
 args2 = {k: v for k, v in additional_config.items() if v is not None}
-if args2.get("device") == "mps" or args.get("device") == "mps":
+# TODO: delete once float16 MPS is fixed in torch stable
+if (
+args2.get("device") in ("mps", "mps:0")
+or args.get("device") in ("mps", "mps:0")
+and "dev" not in torch.__version__
+):
 args["dtype"] = "float32"
 return cls(**args, **args2)
...
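The reworked `create_from_arg_string` above is easier to follow with a concrete walk-through. A minimal sketch, using made-up argument values and a stand-in `torch_version` string rather than the harness's own code:

```python
# Illustrative only: args comes from a "key1=value1,key2=value2" model-args
# string, additional_config carries CLI-level overrides, and MPS falls back
# to float32 on non-nightly PyTorch builds.
args = {"pretrained": "EleutherAI/pythia-160m", "device": "mps", "dtype": "float16"}
additional_config = {"batch_size": "8", "max_batch_size": None}

# None-valued overrides are dropped, mirroring the args2 comprehension above.
args2 = {k: v for k, v in additional_config.items() if v is not None}

torch_version = "2.0.1"  # a stable build: no "dev" marker, so the workaround applies
if (args.get("device") in ("mps", "mps:0") or args2.get("device") in ("mps", "mps:0")) and "dev" not in torch_version:
    args["dtype"] = "float32"

print({**args, **args2})
# {'pretrained': 'EleutherAI/pythia-160m', 'device': 'mps', 'dtype': 'float32', 'batch_size': '8'}
```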
@@ -674,22 +674,22 @@ class ConfigurableTask(Task):
 check_choices = test_choice
 else:
 check_choices = [test_target]
+if self.config.doc_to_choice is not None:
 for choice in check_choices:
-choice_has_whitespace = True if " " in choice else False
+choice_has_whitespace = True if choice[0].isspace() else False
 delimiter_has_whitespace = (
-True if " " in self.config.target_delimiter else False
+True if self.config.target_delimiter[-1].isspace() else False
 )
 if delimiter_has_whitespace and choice_has_whitespace:
 eval_logger.warning(
 f'Both target_delimiter and target choice: "{choice}" have whitespace'
 )
 elif (not delimiter_has_whitespace) and (not choice_has_whitespace):
 eval_logger.warning(
 f'Both target_delimiter and target choice: "{choice}" does not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
 )
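A small sketch (not part of the diff) of what the tightened check now looks at: only a choice's leading whitespace and the delimiter's trailing whitespace affect how delimiter and choice concatenate. The delimiter and choices below are made up:

```python
target_delimiter = " "
for choice in ["yes", " no"]:
    choice_has_whitespace = choice[0].isspace()
    delimiter_has_whitespace = target_delimiter[-1].isspace()
    joined = target_delimiter + choice
    if delimiter_has_whitespace and choice_has_whitespace:
        print(f"double whitespace at the join: {joined!r}")   # triggers for ' no'
    elif not delimiter_has_whitespace and not choice_has_whitespace:
        print(f"no whitespace at the join: {joined!r}")
```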
 def download(self, dataset_kwargs=None) -> None:
 self.dataset = datasets.load_dataset(
 path=self.DATASET_PATH,
@@ -1067,6 +1067,9 @@ class ConfigurableTask(Task):
 # it assumes that doc_to_target returns a number.
 choices = self.doc_to_choice(doc)
 gold = choices[gold]
+# we expect multiple_targets to be a list.
+elif self.multiple_target:
+gold = list(gold)
 else:
 gold = str(gold)
@@ -1077,6 +1080,10 @@ class ConfigurableTask(Task):
 # return true if any are true
 # TODO: this may break for multipLe_target, non zero-or-1 metrics
 scores = []
+if not isinstance(gold, list):
+# sometimes, a multiple_target dataset has exceptions where one doc has only one string answer
+# print(gold)
+gold = [gold]
 for gold_option in gold:
 try:
 result_score = self._metric_fn_list[metric](
...
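The two added blocks above normalize `gold` into a list before per-option scoring. A hedged sketch of the intent, with `score_fn` standing in for whatever `self._metric_fn_list[metric]` actually holds:

```python
def score_fn(gold_option: str, prediction: str) -> float:
    # stand-in metric: strict string match
    return float(gold_option.strip() == prediction.strip())

prediction = "Paris"
gold = "Paris"            # some docs in a multiple_target dataset carry a single string answer
if not isinstance(gold, list):
    gold = [gold]         # wrap it so the per-option loop always applies
scores = [score_fn(g, prediction) for g in gold]
print(scores)             # [1.0]; downstream code can then reduce over the options
```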
@@ -44,7 +44,7 @@ def include_benchmarks(task_dir: str) -> None:
 task_names = utils.pattern_match(task_list, ALL_TASKS)
 for task in task_names:
-if task in TASK_REGISTRY:
+if (task in TASK_REGISTRY) or (task in GROUP_REGISTRY):
 if group in GROUP_REGISTRY:
 GROUP_REGISTRY[group].append(task)
 else:
...
 group: pythia
 task:
   - lambada_openai
-  - wikitext
+  - logiqa
   - piqa
   - sciq
-  - wsc
+  - wikitext
   - winogrande
-  - arc
+  - wsc
-  - logiqa
+  - ai2_arc
   - blimp
   - hendrycksTest*
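The group file above mixes concrete task names with a wildcard, and the relaxed `include_benchmarks` check earlier accepts entries registered as groups as well as tasks. A rough sketch of the expansion step, using `fnmatch` as a stand-in for `utils.pattern_match` and a made-up slice of the registry:

```python
import fnmatch

ALL_TASKS = ["lambada_openai", "sciq", "hendrycksTest-anatomy", "hendrycksTest-astronomy", "ai2_arc"]
task_list = ["lambada_openai", "ai2_arc", "hendrycksTest*"]

task_names = sorted({t for pattern in task_list for t in fnmatch.filter(ALL_TASKS, pattern)})
print(task_names)
# ['ai2_arc', 'hendrycksTest-anatomy', 'hendrycksTest-astronomy', 'lambada_openai']
```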
@@ -120,6 +120,8 @@ def simple_evaluate(
 task_obj = task_dict[task_name]
 if type(task_obj) == tuple:
 group, task_obj = task_obj
+if task_obj is None:
+continue
 config = task_obj._config
 if num_fewshot is not None:
@@ -209,23 +211,30 @@ def evaluate(
 samples = collections.defaultdict(list)
 # tracks all Instances/requests a model must generate output on.
 requests = collections.defaultdict(list)
-# Stores task scores based on task grouping.
-aggregate = collections.defaultdict(dict)
-# tracks if a task was chosen via user selecting a group containing it
-task_groups = collections.defaultdict(dict)
+# Aggregated task scores presented with groups
+results_agg = collections.defaultdict(dict)
+# Aggregated groups scores only
+groups_agg = collections.defaultdict(dict)
 # stores the amount to pad out reqs per req. type so that
 # number of fwd passes per distributed rank is equal
 padding_requests = collections.defaultdict(int)
-# Stores group related keys and values for group-aggregation
-task_groups = collections.defaultdict(dict)
+# store the hierarchy to do proper ordering
+task_hierarchy = collections.defaultdict(list)
+# store the ordering of tasks and groups
+task_order = collections.defaultdict(int)
+# store the aggregation for aggregating across tasks in the same group
+sample_agg_fn = collections.defaultdict(dict)
 # get lists of each type of request
 for task_name, task in task_dict.items():
 if type(task) == tuple:
-group, task = task
-task_groups[task_name] = group
-aggregate[task_name] = {}
+group_name, task = task
+task_hierarchy[group_name].append(task_name)
+else:
+task_hierarchy[task_name] = []
+if task is None:
+continue
 versions[task_name] = task.VERSION
 configs[task_name] = dict(task.dump_config())
@@ -301,6 +310,8 @@ def evaluate(
 for task_name, task in task_dict.items():
 if type(task) == tuple:
 group, task = task
+if task is None:
+continue
 task.apply_filters()
 ### Collect values of metrics on all datapoints ###
@@ -310,6 +321,8 @@ def evaluate(
 for task_name, task in task_dict.items():
 if type(task) == tuple:
 group, task = task
+if task is None:
+continue
 # TODO: make it possible to use a different metric per filter
 # iterate over different filters used
 for key in task.instances[0].filtered_resps.keys():
@@ -396,27 +409,64 @@ def evaluate(
 vals = vals_torch
 if lm.rank == 0:
+### Get task ordering for correct sample-wide aggregation
+group_to_task = {}
+for group in task_hierarchy.keys():
+if group not in task_order:
+task_order[group] = 0
+if len(task_hierarchy[group]) > 0:
+group_to_task[group] = task_hierarchy[group].copy()
+for task in task_hierarchy[group]:
+if task in task_order:
+task_order[task] += 1
+else:
+task_order[task] = 1 + task_order[group]
+if task in task_hierarchy:
+group_to_task[group].remove(task)
+group_to_task[group].extend(task_hierarchy[task])
+task_to_group = {}
+for group in group_to_task:
+for task in group_to_task[group]:
+if task in task_to_group:
+task_to_group[task].append(group)
+else:
+task_to_group[task] = [group]
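A worked toy example (mine, not from the diff) of the ordering pass above, with a made-up hierarchy in which group "pythia" contains "lambada_openai" plus the nested group "ai2_arc", which in turn contains "arc_easy":

```python
import collections

task_hierarchy = {"pythia": ["lambada_openai", "ai2_arc"], "ai2_arc": ["arc_easy"]}
task_order = collections.defaultdict(int)

group_to_task = {}
for group in task_hierarchy.keys():
    if group not in task_order:
        task_order[group] = 0
    if len(task_hierarchy[group]) > 0:
        group_to_task[group] = task_hierarchy[group].copy()
        for task in task_hierarchy[group]:
            if task in task_order:
                task_order[task] += 1
            else:
                task_order[task] = 1 + task_order[group]
            if task in task_hierarchy:
                # replace a nested group by its member tasks
                group_to_task[group].remove(task)
                group_to_task[group].extend(task_hierarchy[task])

task_to_group = {}
for group in group_to_task:
    for task in group_to_task[group]:
        task_to_group.setdefault(task, []).append(group)

print(dict(task_order))  # {'pythia': 0, 'lambada_openai': 1, 'ai2_arc': 1, 'arc_easy': 2}
print(group_to_task)     # {'pythia': ['lambada_openai', 'arc_easy'], 'ai2_arc': ['arc_easy']}
print(task_to_group)     # {'lambada_openai': ['pythia'], 'arc_easy': ['pythia', 'ai2_arc']}
```

Every metric computed for arc_easy is then folded into both the pythia and ai2_arc aggregates, and the task_order values become the "-" prefixes used when the results table is assembled further down.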
 ### Aggregate results over all datapoints ###
 # aggregate results ; run bootstrap CIs
 for (task_name, key, metric), items in vals.items():
 task = task_dict[task_name]
+metric_key = metric + "," + key
 if type(task) == tuple:
-group, task = task
-task_score = task.aggregation()[metric](items)
-results[task_name][metric + "," + key] = task_score
-# Need to put back in results
-# pythia | acc
-# | perplexity
-# | word_perplexity
-# | byte_perplexity
-# | bits_per_byte
-if task_name in task_groups:
-group_name = task_groups[task_name]
-if metric in list(aggregate[group_name].keys()):
-aggregate[group_name][metric].append(task_score)
-else:
-aggregate[group_name][metric] = [task_score]
+group_name, task = task
+else:
+group_name = None
+agg_fn = task.aggregation()[metric]
+task_score = agg_fn(items)
+if group_name is not None:
+sample_metric_key = metric + "(sample agg)," + key
+for grouping in task_to_group[task_name]:
+if metric_key in results[grouping]:
+results[grouping][metric_key].append(task_score)
+else:
+results[grouping][metric_key] = [task_score]
+if sample_metric_key in results[grouping]:
+results[grouping][sample_metric_key] += items
+else:
+results[grouping][sample_metric_key] = items.copy()
+sample_agg_fn[grouping][sample_metric_key] = agg_fn
+results[task_name][metric_key] = task_score
 # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
 # so we run them less iterations. still looking for a cleaner way to do this
@@ -431,19 +481,38 @@ def evaluate(
 if stderr is not None:
 results[task_name][metric + "_stderr" + "," + key] = stderr(items)
-if bool(aggregate):
-for group in aggregate.keys():
-for metric in aggregate[group].keys():
-aggregate[group][metric] = np.average(aggregate[group][metric])
-versions[group] = "N/A"
+if bool(results):
+for task_or_group in results.keys():
+for metric in results[task_or_group].keys():
+if type(results[task_or_group][metric]) == list:
+if "(sample agg)" in metric:
+results[task_or_group][metric] = sample_agg_fn[
+task_or_group
+][metric](results[task_or_group][metric])
+else:
+results[task_or_group][metric] = np.average(
+results[task_or_group][metric]
+)
+versions[task_or_group] = "N/A"
+for task_name, task in task_dict.items():
+if type(task) == tuple:
+group_name, task = task
+order = task_order[group_name]
+tabbed_name = "-" * order + group_name
+results_agg[tabbed_name] = results[group_name]
+versions[tabbed_name] = versions[group_name]
+if order == 0:
+groups_agg[group_name] = results[group_name]
+order = task_order[task_name]
+tabbed_name = "-" * order + task_name
+results_agg[tabbed_name] = results[task_name]
+versions[tabbed_name] = versions[task_name]
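A toy rendering (not from the diff) of how those `task_order` values turn into the indented names stored in `results_agg`, reusing the hierarchy from the earlier sketch:

```python
task_order = {"pythia": 0, "lambada_openai": 1, "ai2_arc": 1, "arc_easy": 2}
for name, order in task_order.items():
    print("-" * order + name)
# pythia
# -lambada_openai
# -ai2_arc
# --arc_easy
```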
 results_dict = {
-"results": dict(sorted(results.items())),
-**(
-{"aggregate": dict(sorted(aggregate.items()))}
-if bool(aggregate)
-else {}
-),
+"results": dict(results_agg.items()),
+**({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}),
 "configs": dict(sorted(configs.items())),
 "versions": dict(sorted(versions.items())),
 }
...
@@ -107,17 +107,20 @@ class HFLM(LM):
 if not (parallelize or accelerator.num_processes > 1):
 # use user-passed device
 device_list = set(
-["cuda", "cpu", "mps"]
+["cuda", "cpu"]
 + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
++ ["mps", "mps:0"]
 )
 if device:
 if device not in device_list:
 device = int(device)
 self._device = torch.device(device)
 eval_logger.info(f"Using device '{device}'")
-if device == "mps":
+if device in ("mps", "mps:0") and "dev" not in torch.__version__:
 eval_logger.info(
-"MPS is still in beta and only supports float32; setting dtype to float32."
+"MPS: Setting dtype to float32. To use float16 with MPS, please install a nightly build of "
+"PyTorch: pip3 install --pre torch torchvision torchaudio --index-url "
+"https://download.pytorch.org/whl/nightly/cpu"
 )
 else:
 eval_logger.info("Device not specified")
...
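A rough sketch (not part of the diff) of the device/dtype rule the HFLM change encodes: keep float16 on MPS only when a PyTorch nightly (a "dev" build) is installed, otherwise drop to float32:

```python
import torch

def pick_dtype(device: str, requested: str = "float16") -> str:
    # mirrors the condition above; "dev" appears in nightly version strings
    if device in ("mps", "mps:0") and "dev" not in torch.__version__:
        return "float32"
    return requested

print(torch.__version__, pick_dtype("mps"), pick_dtype("cuda:0"))
```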
+import ast
+from typing import Dict
 from lm_eval import utils
 from lm_eval.logger import eval_logger
@@ -5,7 +8,7 @@ from lm_eval.logger import eval_logger
 # Stores prompts in a dictionary indexed by 2 levels:
 # prompt category name, and prompt name.
 # This allows us to access prompts
-PROMPT_REGISTRY: dict[str, dict[str, str]] = {
+PROMPT_REGISTRY: Dict[str, Dict[str, str]] = {
 "qa-basic": {
 "question-newline-answer": "Question: {{question}}\nAnswer:",
 "q-newline-a": "Q: {{question}}\nA:",
@@ -63,6 +66,12 @@ def load_prompt_list(use_prompt: str, dataset_name=None, subset_name=None, **kwa
 else:
 prompts = DatasetTemplates(dataset_name=dataset_name, subset_name=subset_name)
-category_name, prompt_name = use_prompt.split(":")
+category_name, *prompt_name = use_prompt.split(":")
+# TODO allow to multiple prompt naming
+# if len(prompt_name) > 1:
+# prompt_list = []
+# for prompt in prompt_name:
+# prompt_list.append(utils.pattern_match(prompt_name, prompts.all_template_names))
+# else:
 prompt_list = utils.pattern_match(prompt_name, prompts.all_template_names)
 return [":".join([category_name, prompt]) for prompt in prompt_list]
 import os
 import yaml
-from typing import List, Union
+from typing import List, Union, Dict
 from lm_eval import utils
 from lm_eval import prompts
@@ -15,7 +15,7 @@ from lm_eval.api.registry import (
 )
-def register_configurable_task(config: dict[str, str]) -> int:
+def register_configurable_task(config: Dict[str, str]) -> int:
 SubClass = type(
 config["task"] + "ConfigurableTask",
 (ConfigurableTask,),
@@ -38,7 +38,7 @@ def register_configurable_task(config: dict[str, str]) -> int:
 return 0
-def check_prompt_config(config: dict[str, str]) -> List[dict[str, str]]:
+def check_prompt_config(config: Dict[str, str]) -> List[Dict[str, str]]:
 all_configs = []
 if "use_prompt" in config:
 prompt_list = prompts.load_prompt_list(
@@ -69,7 +69,7 @@ def check_prompt_config(config: dict[str, str]) -> List[dict[str, str]]:
 return all_configs
-def get_task_name_from_config(task_config: dict[str, str]) -> str:
+def get_task_name_from_config(task_config: Dict[str, str]) -> str:
 if "dataset_name" in task_config:
 return "{dataset_path}_{dataset_name}".format(**task_config)
 else:
@@ -128,7 +128,7 @@ def get_task_name_from_object(task_object):
 # TODO: pass num_fewshot and other cmdline overrides in a better way
-def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
+def get_task_dict(task_name_list: List[Union[str, Dict, Task]], **kwargs):
 config = {**kwargs}
@@ -136,6 +136,9 @@ def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
 task_name_from_config_dict = {}
 task_name_from_object_dict = {}
+if type(task_name_list) != list:
+task_name_list = [task_name_list]
 for task_element in task_name_list:
 if isinstance(task_element, str):
@@ -143,12 +146,20 @@ def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
 group_name = task_element
 for task_name in GROUP_REGISTRY[task_element]:
 if task_name not in task_name_from_registry_dict:
+task_obj = get_task_dict(task_name)
+if task_name in task_obj.keys():
+task_dict = {
+task_name: (group_name, task_obj[task_name]),
+}
+else:
+task_dict = {
+task_name: (group_name, None),
+**task_obj,
+}
 task_name_from_registry_dict = {
 **task_name_from_registry_dict,
-task_name: (
-group_name,
-get_task(task_name=task_name, config=config),
-),
+**task_dict,
 }
 else:
 task_name = task_element
...
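A hedged illustration (task and group names are only examples) of the structure `get_task_dict` now builds when a group is requested: each entry is a `(group_name, task_object)` tuple, and a nested group contributes a `(parent_group, None)` placeholder plus its own members, which the evaluator skips via the new `if task is None: continue` guards:

```python
class FakeTask:
    # stand-in for a ConfigurableTask instance
    pass

task_dict = {
    "lambada_openai": ("pythia", FakeTask()),
    "ai2_arc": ("pythia", None),          # nested group placeholder
    "arc_easy": ("ai2_arc", FakeTask()),
    "arc_challenge": ("ai2_arc", FakeTask()),
}

for task_name, task in task_dict.items():
    if type(task) == tuple:
        group_name, task = task
    if task is None:   # the new guard: group placeholders carry no task object
        continue
    print(task_name)   # lambada_openai, arc_easy, arc_challenge
```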
task: nq_open
dataset_path: nq_open
output_type: greedy_until
training_split: train
validation_split: validation
description: "Answer these questions:\n"
doc_to_text: "Q: {{question}}?\nA:"
doc_to_target: "{{answer}}" # TODO: should be multi-target
fewshot_delimiter: "\n"
generation_kwargs:
until:
- "\n"
- "."
- ","
do_sample: false
temperature: 0.0
filter_list:
- name: remove_whitespace
filter:
- function: remove_whitespace
- function: take_first
target_delimiter: " "
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
regexes_to_ignore:
- "\ban|a|the\b"
@@ -10,7 +10,7 @@ try:
 except ModuleNotFoundError:
 raise Exception(
 "`pycountry` is required for generating translation task prompt templates. \
-please install pycountry via pip install lm-eval[multilingua] or pip install -e .[multilingual]",
+please install pycountry via pip install lm-eval[multilingual] or pip install -e .[multilingual]",
 )
...
@@ -16,7 +16,6 @@ import gc
 import torch
 import transformers
-from omegaconf import OmegaConf
 from jinja2 import BaseLoader, Environment, StrictUndefined
 from itertools import islice
@@ -55,8 +54,8 @@ def simple_parse_args_string(args_string):
 args_string = args_string.strip()
 if not args_string:
 return {}
-arg_list = args_string.split(",")
-args_dict = OmegaConf.to_object(OmegaConf.from_dotlist(arg_list))
+arg_list = [arg for arg in args_string.split(",") if arg]
+args_dict = {k: v for k, v in [arg.split("=") for arg in arg_list]}
 return args_dict
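A quick check (mine, not from the diff) of the replacement parser: a plain split keeps every value as a string and tolerates a trailing comma, whereas the old OmegaConf dotlist route would typically also coerce types:

```python
def simple_parse_args_string(args_string: str) -> dict:
    # same logic as the revised helper above
    args_string = args_string.strip()
    if not args_string:
        return {}
    arg_list = [arg for arg in args_string.split(",") if arg]
    return {k: v for k, v in [arg.split("=") for arg in arg_list]}

print(simple_parse_args_string("pretrained=gpt2,dtype=float32,batch_size=8,"))
# {'pretrained': 'gpt2', 'dtype': 'float32', 'batch_size': '8'}
```

One behavioural consequence: values such as 8 or True now arrive as strings, so callers have to do their own casting, and a value that itself contains "=" would raise a ValueError during the unpacking.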
@@ -267,9 +266,9 @@ def make_table(result_dict, column: str = "results"):
 from pytablewriter import MarkdownTableWriter, LatexTableWriter
 if column == "results":
-column_name = "Task"
-elif column == "aggregate":
-column_name = "Benchmark"
+column_name = "Tasks"
+elif column == "groups":
+column_name = "Groups"
 md_writer = MarkdownTableWriter()
 latex_writer = LatexTableWriter()
...
@@ -209,8 +209,8 @@ def main() -> None:
 f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
 )
 print(evaluator.make_table(results))
-if "aggregate" in results:
-print(evaluator.make_table(results, "aggregate"))
+if "groups" in results:
+print(evaluator.make_table(results, "groups"))
 if __name__ == "__main__":
...
[build-system]
requires = ["setuptools>=40.8.0", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "lm_eval"
version = "1.0.0"
authors = [
{name="EleutherAI", email="contact@eleuther.ai"}
]
description = "A framework for evaluating language models"
readme = "README.md"
classifiers = [
"Development Status :: 3 - Alpha",
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
requires-python = ">=3.9"
license = { "text" = "MIT" }
dependencies = [
"accelerate>=0.21.0",
"evaluate",
"datasets>=2.0.0",
"evaluate>=0.4.0",
"jsonlines",
"numexpr",
"peft>=0.2.0",
"pybind11>=2.6.2",
"pytablewriter",
"rouge-score>=0.0.4",
"sacrebleu>=1.5.0",
"scikit-learn>=0.24.1",
"sqlitedict",
"torch>=1.8",
"tqdm-multiprocess",
"transformers>=4.1",
"zstandard",
]
[tool.setuptools]
packages = ["lm_eval"]
# required to include yaml files in pip installation
[tool.setuptools.package-data]
lm_eval = ["**/*.yaml", "tasks/**/*"]
examples = ["**/*.yaml"]
[project.scripts]
lm-eval = "main:main"
lm_eval = "main:main"
[project.urls]
Homepage = "https://github.com/EleutherAI/lm-evaluation-harness"
Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
[project.optional-dependencies]
dev = ["black", "flake8", "pre-commit", "pytest", "pytest-cov"]
linting = [
"flake8",
"pylint",
"mypy",
"pre-commit",
]
testing = ["pytest", "pytest-cov", "pytest-xdist"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
promptsource = [
"promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
]
gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"]
anthropic = ["anthropic"]
openai = ["openai", "tiktoken"]
all = [
"lm_eval[dev]",
"lm_eval[testing]",
"lm_eval[linting]",
"lm_eval[multilingual]",
"lm_eval[sentencepiece]",
"lm_eval[promptsource]",
"lm_eval[gptq]",
"lm_eval[anthropic]",
"lm_eval[openai]"
]
@@ -38,13 +38,15 @@ def main():
 iters = []
 for set in args.sets.split(","):
+docs = None
 if set == "train" and task.has_training_docs():
 docs = task.training_docs()
 if set == "val" and task.has_validation_docs():
 docs = task.validation_docs()
 if set == "test" and task.has_test_docs():
 docs = task.test_docs()
-iters.append(docs)
+if docs is not None:
+iters.append(docs)
 docs = join_iters(iters)
...
 import setuptools
-import itertools
-with open("README.md", "r", encoding="utf-8") as fh:
-long_description = fh.read()
+# This is to make sure that the package supports editable installs
+setuptools.setup()
-extras_require = {
-"dev": ["black", "flake8", "pre-commit", "pytest", "pytest-cov"],
-"linting": [
-"flake8",
-"pylint",
-"mypy",
-"pre-commit",
-],
-"testing": ["pytest", "pytest-cov", "pytest-xdist"],
-"multilingual": ["nagisa>=0.2.7", "jieba>=0.42.1"],
-"sentencepiece": ["sentencepiece>=0.1.98", "protobuf>=4.22.1", "pycountry"],
-"promptsource": [
-"promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
-],
-"gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
-"anthropic": ["anthropic"],
-"openai": ["openai", "tiktoken"],
-}
-extras_require["all"] = list(itertools.chain.from_iterable(extras_require.values()))
-setuptools.setup(
-name="lm_eval",
-version="1.0.0",
-author="EleutherAI",
-author_email="contact@eleuther.ai",
-description="A framework for evaluating language models",
-long_description=long_description,
-long_description_content_type="text/markdown",
-url="https://github.com/EleutherAI/lm-evaluation-harness",
-packages=setuptools.find_packages(),
-# required to include yaml files in pip installation
-package_data={
-"lm_eval": ["**/*.yaml", "tasks/**/*"],
-"examples": ["**/*.yaml"],
-},
-entry_points={
-"console_scripts": ["lm-eval = main:main", "lm_eval = main:main"],
-},
-include_package_data=True,
-classifiers=[
-"Development Status :: 3 - Alpha",
-"Programming Language :: Python :: 3",
-"License :: OSI Approved :: MIT License",
-"Operating System :: OS Independent",
-],
-python_requires=">=3.9",
-install_requires=[
-"accelerate>=0.21.0",
-"evaluate",
-"datasets>=2.0.0",
-"evaluate>=0.4.0",
-"jsonlines",
-"numexpr",
-"omegaconf>=2.2",
-"peft>=0.2.0",
-"pybind11>=2.6.2",
-"pytablewriter",
-"rouge-score>=0.0.4",
-"sacrebleu>=1.5.0",
-"scikit-learn>=0.24.1",
-"sqlitedict",
-"torch>=1.8",
-"tqdm-multiprocess",
-"transformers>=4.1",
-"zstandard",
-],
-extras_require=extras_require,
-)
@@ -7,6 +7,7 @@ import lm_eval.tasks as tasks
 # import lm_eval.models as models
 import lm_eval.api as api
 import lm_eval.evaluator as evaluator
+from typing import List
 import random
 import pytest
@@ -26,7 +27,7 @@ import pytest
 )
 ],
 )
-def test_evaluator(task_name: list[str], limit: int, model: str, model_args: str):
+def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str):
 task_name = task_name
 limit = 10
...
@@ -9,6 +9,7 @@ import os
 # This is the path where the output for the changed files for the tasks folder is stored
 # FILE_PATH = file_path = ".github/outputs/tasks_all_changed_and_modified_files.txt"
+# reads a text file and returns a list of words
 # used to read the output of the changed txt from tj-actions/changed-files
 def load_changed_files(file_path: str) -> List[str]:
@@ -32,7 +33,7 @@ def parser(full_path: List[str]) -> List[str]:
 return list(_output)
-def new_tasks() -> Union[list[str], None]:
+def new_tasks() -> Union[List[str], None]:
 FILENAME = ".github/outputs/tasks_all_changed_and_modified_files.txt"
 if os.path.exists(FILENAME):
 # If tasks folder has changed then we get the list of files from FILENAME
...