Commit 2041dc34 authored by haileyschoelkopf

Merge branch 'big-refactor' into bigbench

parents 67c0f73a 15f4a3ef
name: Tasks Modified
# name: Tasks Modified
on:
push:
branches:
- 'big-refactor*'
pull_request:
branches:
- 'big-refactor*'
workflow_dispatch:
# comment/edit out the above to stop/change the triggers
jobs:
changed_files:
runs-on: ubuntu-latest # windows-latest || macos-latest
timeout-minutes: 120
name: Scan for changed tasks
steps:
- name: checkout
uses: actions/checkout@v3
with:
fetch-depth: 2 # OR "2" -> To retrieve the preceding commit.
# on:
# push:
# branches:
# - 'big-refactor*'
# pull_request:
# branches:
# - 'big-refactor*'
# workflow_dispatch:
# # comment/edit out the above to stop/change the triggers
# jobs:
# changed_files:
# runs-on: ubuntu-latest # windows-latest || macos-latest
# timeout-minutes: 120
# name: Scan for changed tasks
# steps:
# - name: checkout
# uses: actions/checkout@v3
# with:
# fetch-depth: 2 # OR "2" -> To retrieve the preceding commit.
# Uses the tj-actions/changed-files@v37 action to check for changes.
# Outputs provided here: https://github.com/tj-actions/changed-files#outputs
# The `files_yaml` input optionally takes a yaml string to specify filters,
# and prepends the filter name to the standard output names.
- name: Check task folders
id: changed-tasks
uses: tj-actions/changed-files@v37.1.2
with:
# tasks checks the tasks folder and api checks the api folder for changes
files_yaml: |
tasks:
- lm_eval/tasks/**
api:
- lm_eval/api/**
write_output_files: true
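# Note on the outputs produced (per the changed-files docs linked above): each filter
# name is prepended to the standard output names, so this step exposes e.g.
# steps.changed-tasks.outputs.tasks_any_modified, steps.changed-tasks.outputs.api_any_modified,
# and steps.changed-tasks.outputs.tasks_all_modified_files, which the later steps
# use in their `if:` conditions.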
# # Uses the tj-actions/changed-files@v37 action to check for changes.
# # Outputs provided here: https://github.com/tj-actions/changed-files#outputs
# # The `files_yaml` input optionally takes a yaml string to specify filters,
# # and prepends the filter name to the standard output names.
# - name: Check task folders
# id: changed-tasks
# uses: tj-actions/changed-files@v37.1.2
# with:
# # tasks checks the tasks folder and api checks the api folder for changes
# files_yaml: |
# tasks:
# - lm_eval/tasks/**
# api:
# - lm_eval/api/**
# write_output_files: true
# The next step is optional; the files are written to the workspace by default (above),
# so it's just for debugging.
- name: Run Tests
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
run: |
echo .github/outputs/tasks_all_changed_and_modified_files.txt >> 'GITHUB_ENV'
echo "One or more test file(s) has changed."
echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
# # The next step is optional; the files are written to the workspace by default (above).
# # so it's just for debugging
# - name: Run Tests
# if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
# run: |
# echo .github/outputs/tasks_all_changed_and_modified_files.txt >> 'GITHUB_ENV'
# echo "One or more test file(s) has changed."
# echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
- name: Set up Python 3.9
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
uses: actions/setup-python@v4
with:
python-version: 3.9
cache: 'pip'
cache-dependency-path: setup.py
- name: Install dependencies
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
run: |
python -m pip install --upgrade pip
pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
# Install optional git dependencies
# pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
# if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Test with pytest
# if new tasks are added, run tests on them
if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
run: python -m pytest tests/test_tasks.py -s -vv -n=auto
# if api is modified, run tests on it
- name: Test more tasks with pytest
env:
API: true
if: steps.changed-tasks.outputs.api_any_modified == 'true'
run: python -m pytest tests/test_tasks.py -s -vv -n=auto
# - name: Set up Python 3.9
# if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
# uses: actions/setup-python@v4
# with:
# python-version: 3.9
# cache: 'pip'
# cache-dependency-path: setup.py
# - name: Install dependencies
# if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
# run: |
# python -m pip install --upgrade pip
# pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
# # Install optional git dependencies
# # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
# # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
# - name: Test with pytest
# # if new tasks are added, run tests on them
# if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
# run: python -m pytest tests/test_tasks.py -s -vv
# # if api is modified, run tests on it
# - name: Test more tasks with pytest
# env:
# API: true
# if: steps.changed-tasks.outputs.api_any_modified == 'true'
# run: python -m pytest tests/test_tasks.py -s -vv
......@@ -22,10 +22,10 @@ jobs:
steps:
- name: Checkout Code
uses: actions/checkout@v3
- name: Set up Python 3.9
- name: Set up Python 3.8
uses: actions/setup-python@v4
with:
python-version: 3.9
python-version: 3.8
cache: pip
cache-dependency-path: setup.py
- name: Install dependencies
......@@ -40,39 +40,38 @@ jobs:
flake8 . --count --select=F,E9,E71,E72,E501,E112,E113,W6 --extend-ignore=F541 --show-source --statistics --exit-zero
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
# mypy turned off for now
# # mypy turned off for now
# - name: Lint with mypy
# run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
# Job 2
testcpu:
name: CPU Tests
runs-on: ubuntu-latest
strategy:
matrix:
python-version: [ "3.9", "3.10", "3.11" ]
timeout-minutes: 30
steps:
- name: Checkout Code
uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
cache: pip
cache-dependency-path: setup.py
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -e '.[testing,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu
# Install optional git dependencies
# pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
# if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Test with pytest
run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/tests_master --ignore=tests/extra
- name: Archive artifacts
uses: actions/upload-artifact@v3
with:
name: output_results
path: |
test_logs/*
# testcpu:
# name: CPU Tests
# runs-on: ubuntu-latest
# strategy:
# matrix:
# python-version: [ "3.8", "3.9", "3.10", "3.11" ]
# timeout-minutes: 30
# steps:
# - name: Checkout Code
# uses: actions/checkout@v3
# - name: Set up Python ${{ matrix.python-version }}
# uses: actions/setup-python@v4
# with:
# python-version: ${{ matrix.python-version }}
# cache: pip
# cache-dependency-path: setup.py
# - name: Install dependencies
# run: |
# python -m pip install --upgrade pip
# pip install -e '.[testing,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu
# # Install optional git dependencies
# # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
# # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
# - name: Test with pytest
# run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/tests_master --ignore=tests/extra
# - name: Archive artifacts
# uses: actions/upload-artifact@v3
# with:
# name: output_results
# path: |
# test_logs/*
......@@ -33,14 +33,14 @@ repos:
rev: 22.3.0
hooks:
- id: black
language_version: python3.9
language_version: python3.8
- repo: https://github.com/codespell-project/codespell
rev: v2.1.0
hooks:
- id: codespell
exclude: >
(?x)^(
.*\.json|ignore.txt
.*\.json|ignore.txt|lm_eval/tasks/.*|.*yaml
)$
args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
- repo: https://github.com/pre-commit/mirrors-mypy
......
import abc
import os
from typing import Union, List, Tuple
import torch
from typing import Union, List, Tuple, Optional, Type, TypeVar
from sqlitedict import SqliteDict
import json
import hashlib
......@@ -11,6 +12,8 @@ from tqdm import tqdm
from lm_eval import utils
from lm_eval.logger import eval_logger
T = TypeVar("T", bound="LM")
class LM(abc.ABC):
def __init__(self) -> None:
......@@ -111,11 +114,28 @@ class LM(abc.ABC):
pass
@classmethod
def create_from_arg_string(cls, arg_string, additional_config=None):
def create_from_arg_string(
cls: Type[T], arg_string: str, additional_config: Optional[dict] = None
) -> T:
"""
Creates an instance of the LM class using the given argument string and additional config.
Parameters:
- arg_string: A string containing arguments in the format key1=value1,key2=value2.
- additional_config: Optional dictionary containing additional configuration parameters.
Returns:
- Instance of the LM class.
"""
additional_config = {} if additional_config is None else additional_config
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
if args2.get("device") == "mps" or args.get("device") == "mps":
# TODO: delete once float16 MPS is fixed in torch stable
if (
args2.get("device") in ("mps", "mps:0")
or args.get("device") in ("mps", "mps:0")
and "dev" not in torch.__version__
):
args["dtype"] = "float32"
return cls(**args, **args2)
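# A minimal usage sketch (hypothetical; "MyLM" stands in for any concrete LM subclass):
#
#   lm = MyLM.create_from_arg_string(
#       "pretrained=gpt2,device=cuda:0",       # parsed by utils.simple_parse_args_string
#       additional_config={"batch_size": 8},   # non-None entries are passed as extra kwargs
#   )
#
# which resolves to MyLM(pretrained="gpt2", device="cuda:0", batch_size=8). If the
# requested device is "mps"/"mps:0" (on a stable, non-dev torch build), dtype is forced
# to "float32" before the constructor is called.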
......
class Sampler:
class ContextSampler:
def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None:
self.rnd = rnd
assert self.rnd, "must pass rnd to FewShotSampler!"
......@@ -46,14 +46,14 @@ class Sampler:
)
+ self.target_delimiter
+ (
self.doc_to_target(doc)[0]
str(self.doc_to_target(doc)[0])
if type(self.doc_to_target(doc)) is list
else self.doc_to_target(doc)
if (
self.config.doc_to_choice is None
or type(self.doc_to_target(doc)) is str
)
else self.doc_to_choice(doc)[self.doc_to_target(doc)]
else str(self.doc_to_choice(doc)[self.doc_to_target(doc)])
)
for doc in selected_docs
]
......@@ -71,7 +71,19 @@ class Sampler:
return self.rnd.sample(self.docs, n)
class BalancedSampler(Sampler):
class FirstNSampler(ContextSampler):
def sample(self, n) -> None:
"""
Draw the first `n` samples in order from the specified split.
Used for tasks with "canonical" ordered fewshot examples, such as MMLU and CMMLU.
"""
assert n <= len(
self.docs
), f"Error: number of fewshot samples requested exceeds the {len(self.docs)} that are available."
return self.docs[:n]
class BalancedSampler(ContextSampler):
def sample(self, n) -> None:
"""
TODO: this should return approximately class-balanced samples from our fewshot examples.
......@@ -81,12 +93,27 @@ class BalancedSampler(Sampler):
pass
class ManualSampler(Sampler):
class ManualSampler(ContextSampler):
def sample(self, n) -> None:
""" """
pass
SAMPLER_REGISTRY = {
"default": ContextSampler,
"first_n": FirstNSampler,
}
def get_sampler(name):
try:
return SAMPLER_REGISTRY[name]
except KeyError:
raise ValueError(
f"Attempted to use contextsampler '{name}', but no sampling strategy for this name found! Supported model names: {', '.join(SAMPLER_REGISTRY.keys())}"
)
# TODO: how should we design this? It might be better to have a single sampler and pass more kwargs at init,
# depending on what's easier for a new user to build their own functionality on top of.
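# A hedged usage sketch ("my_docs" and "my_task" are placeholders for a task's fewshot
# documents and the owning task object; assumes `import random`):
#
#   sampler_cls = get_sampler("first_n")                        # -> FirstNSampler
#   sampler = sampler_cls(my_docs, my_task, rnd=random.Random(1234))
#   fewshot_docs = sampler.sample(5)                            # the first 5 docs, in order
#
# Unknown names raise the ValueError above, which lists the supported sampler names.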
......
......@@ -75,6 +75,7 @@ class TaskConfig(dict):
description: str = ""
target_delimiter: str = " "
fewshot_delimiter: str = "\n\n"
fewshot_config: dict = None
# runtime configuration options
num_fewshot: int = 0
# scoring options
......@@ -581,7 +582,7 @@ class ConfigurableTask(Task):
INV_AGG_REGISTRY = {v: k for k, v in AGGREGATION_REGISTRY.items()}
metric_agg = get_default_aggregation(metric_name)
eval_logger.warning(
f"metric {metric_name} is defined, but aggregation is not. "
f"[Task: {self._config.task}] metric {metric_name} is defined, but aggregation is not. "
f"using default "
f"aggregation={INV_AGG_REGISTRY[metric_agg]}"
)
......@@ -593,7 +594,7 @@ class ConfigurableTask(Task):
]
else:
eval_logger.warning(
f"metric {metric_name} is defined, but higher_is_better is not. "
f"[Task: {self._config.task}] metric {metric_name} is defined, but higher_is_better is not. "
f"using default "
f"higher_is_better={is_higher_better(metric_name)}"
)
......@@ -629,9 +630,11 @@ class ConfigurableTask(Task):
self.prompt = None
if self.fewshot_docs() is not None:
self.sampler = samplers.Sampler(
list(self.fewshot_docs()), self, rnd=random.Random(1234)
)
self.sampler = samplers.get_sampler(
self.config.fewshot_config.get("sampler", "default")
if self.config.fewshot_config
else "default"
)(list(self.fewshot_docs()), self, rnd=random.Random(1234))
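# For example, a task YAML that sets
#   fewshot_config:
#     sampler: first_n
# (e.g. the ceval-valid base config) selects samplers.FirstNSampler here;
# tasks without a fewshot_config fall back to the "default" ContextSampler.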
if self.has_test_docs():
self.task_docs = self.test_docs()
......@@ -674,22 +677,22 @@ class ConfigurableTask(Task):
check_choices = test_choice
else:
check_choices = [test_target]
for choice in check_choices:
choice_has_whitespace = True if " " in choice else False
delimiter_has_whitespace = (
True if " " in self.config.target_delimiter else False
)
if delimiter_has_whitespace and choice_has_whitespace:
eval_logger.warning(
f'Both target_delimiter and target choice: "{choice}" have whitespace'
)
elif (not delimiter_has_whitespace) and (not choice_has_whitespace):
eval_logger.warning(
f'Both target_delimiter and target choice: "{choice}" does not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
if self.config.doc_to_choice is not None:
for choice in check_choices:
choice_has_whitespace = True if choice[0].isspace() else False
delimiter_has_whitespace = (
True if self.config.target_delimiter[-1].isspace() else False
)
if delimiter_has_whitespace and choice_has_whitespace:
eval_logger.warning(
f'Both target_delimiter and target choice: "{choice}" have whitespace'
)
elif (not delimiter_has_whitespace) and (not choice_has_whitespace):
eval_logger.warning(
f'Both target_delimiter and target choice: "{choice}" does not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
)
def download(self, dataset_kwargs=None) -> None:
self.dataset = datasets.load_dataset(
path=self.DATASET_PATH,
......@@ -838,7 +841,10 @@ class ConfigurableTask(Task):
and (target_string[0] == "[")
and (target_string[-1] == "]")
):
return ast.literal_eval(target_string)
try:
return ast.literal_eval(target_string)
except (SyntaxError, ValueError):
return target_string
else:
return target_string
elif type(doc_to_target) == list:
......@@ -1067,6 +1073,9 @@ class ConfigurableTask(Task):
# it assumes that doc_to_target returns a number.
choices = self.doc_to_choice(doc)
gold = choices[gold]
# for multiple_target tasks, we expect gold to be a list.
elif self.multiple_target:
gold = list(gold)
else:
gold = str(gold)
......@@ -1077,6 +1086,10 @@ class ConfigurableTask(Task):
# return true if any are true
# TODO: this may break for multipLe_target, non zero-or-1 metrics
scores = []
if not isinstance(gold, list):
# sometimes, a multiple_target dataset has exceptions where one doc has only one string answer
# print(gold)
gold = [gold]
for gold_option in gold:
try:
result_score = self._metric_fn_list[metric](
......
import os
import yaml
from lm_eval import utils
from lm_eval.tasks import register_configurable_task, check_prompt_config
from lm_eval.logger import eval_logger
from lm_eval.api.registry import (
TASK_REGISTRY,
GROUP_REGISTRY,
ALL_TASKS,
)
def include_benchmarks(task_dir: str) -> None:
for root, subdirs, file_list in os.walk(task_dir):
if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
for f in file_list:
if f.endswith(".yaml"):
try:
benchmark_path = os.path.join(root, f)
with open(benchmark_path, "rb") as file:
yaml_config = yaml.full_load(file)
assert "group" in yaml_config
group = yaml_config["group"]
all_task_list = yaml_config["task"]
config_list = [
task for task in all_task_list if type(task) != str
]
task_list = [
task for task in all_task_list if type(task) == str
]
for task_config in config_list:
var_configs = check_prompt_config(
{
**task_config,
**{"group": group},
}
)
for config in var_configs:
register_configurable_task(config)
task_names = utils.pattern_match(task_list, ALL_TASKS)
for task in task_names:
if task in TASK_REGISTRY:
if group in GROUP_REGISTRY:
GROUP_REGISTRY[group].append(task)
else:
GROUP_REGISTRY[group] = [task]
ALL_TASKS.add(group)
except Exception as error:
eval_logger.warning(
"Failed to load benchmark in\n"
f" {benchmark_path}\n"
" Benchmark will not be added to registry\n"
f" Error: {error}"
)
task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_benchmarks(task_dir)
group: minerva_math
task:
- minerva_math_algebra
- minerva_math_counting_and_prob
- minerva_math_geometry
- minerva_math_intermediate_algebra
- minerva_math_num_theory
- minerva_math_prealgebra
- minerva_math_precalc
......@@ -3,7 +3,7 @@ import string
import pickle
import traceback
from pprint import pprint
from typing import Iterator, Sequence, TypeVar
from typing import Iterator, Sequence, TypeVar, List, Tuple
# This is a cpp module. Compile janitor_util.cpp with:
# c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup
......@@ -21,7 +21,7 @@ T = TypeVar("T")
# Implementation from nltk source
# https://www.nltk.org/_modules/nltk/util.html
def form_ngrams(sequence: Iterator[T], n: int) -> Iterator[tuple[T, ...]]:
def form_ngrams(sequence: Iterator[T], n: int) -> Iterator[Tuple[T, ...]]:
history = []
while n > 1:
# PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator
......@@ -70,14 +70,14 @@ def word_ngrams(s: str, n: int) -> Iterator[str]:
# https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python
def split_indices(s: str) -> Iterator[tuple[str, tuple[int, int]]]:
def split_indices(s: str) -> Iterator[Tuple[str, Tuple[int, int]]]:
"""Splits a string on whitespaces and records the indices of each in the original string.
@:return generator((word, (start_idx, end_idx)), ...)
"""
return ((m.group(0), (m.start(), m.end() - 1)) for m in re.finditer(r"\S+", s))
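# Concrete example of the return shape:
#   list(split_indices("a bc")) -> [("a", (0, 0)), ("bc", (2, 3))]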
def word_ngrams_indices(s: str, n: int) -> Iterator[tuple[str, tuple[int, int]]]:
def word_ngrams_indices(s: str, n: int) -> Iterator[Tuple[str, Tuple[int, int]]]:
"""Splits a string into pairs of (ngram words, their start/end indices)"""
tokens_with_indices = split_indices(s)
......@@ -157,7 +157,7 @@ class Janitor:
print("WARNING: Janitor running in python mode")
return self.register_contaminant_python(dirt_string)
def clean(self, dirty_string: str) -> list[str]:
def clean(self, dirty_string: str) -> List[str]:
"""Clean a string (e.g. a training set) by removing all ngrams previously
registered as contaminants. Returns a list of clean chunks, or empty if
the string was too dirty"""
......@@ -168,8 +168,8 @@ class Janitor:
return self.clean_python(dirty_string)
def _split_chunks(
self, dirty_string: str, dirty_parts: Sequence[tuple]
) -> list[str]:
self, dirty_string: str, dirty_parts: Sequence[Tuple]
) -> List[str]:
clean_chunks = []
splice_idx = 0
end = -1
......@@ -197,7 +197,7 @@ class Janitor:
janitor_util.clean_ngram(dirt_string, self.delete_chars, self.ngram_n)
)
def clean_cpp(self, dirty_string: str) -> list[str]:
def clean_cpp(self, dirty_string: str) -> List[str]:
contamination_indices = janitor_util.clean_ngram_with_indices(
dirty_string, self.delete_chars, self.ngram_n
)
......@@ -215,7 +215,7 @@ class Janitor:
word_ngrams(self.normalize_string(dirt_string), self.ngram_n)
)
def clean_python(self, dirty_string: str) -> list[str]:
def clean_python(self, dirty_string: str) -> List[str]:
contamination_indices = (
(None, *idx_pair)
for dirty_ngram, idx_pair in word_ngrams_indices(dirty_string, self.ngram_n)
......
......@@ -11,7 +11,6 @@ import numpy as np
import lm_eval.api
import lm_eval.tasks
import lm_eval.benchmarks
import lm_eval.models
import lm_eval.api.metrics
import lm_eval.api.registry
......@@ -120,6 +119,8 @@ def simple_evaluate(
task_obj = task_dict[task_name]
if type(task_obj) == tuple:
group, task_obj = task_obj
if task_obj is None:
continue
config = task_obj._config
if num_fewshot is not None:
......@@ -209,23 +210,30 @@ def evaluate(
samples = collections.defaultdict(list)
# tracks all Instances/requests a model must generate output on.
requests = collections.defaultdict(list)
# Stores task scores based on task grouping.
aggregate = collections.defaultdict(dict)
# tracks if a task was chosen via user selecting a group containing it
task_groups = collections.defaultdict(dict)
# Aggregated task scores presented with groups
results_agg = collections.defaultdict(dict)
# Aggregated groups scores only
groups_agg = collections.defaultdict(dict)
# stores the amount to pad out reqs per req. type so that
# number of fwd passes per distributed rank is equal
padding_requests = collections.defaultdict(int)
# Stores group related keys and values for group-aggregation
task_groups = collections.defaultdict(dict)
# store the hierarchy to do proper ordering
task_hierarchy = collections.defaultdict(list)
# store the ordering of tasks and groups
task_order = collections.defaultdict(int)
# store the aggregation for aggregating across tasks in the same group
sample_agg_fn = collections.defaultdict(dict)
# get lists of each type of request
for task_name, task in task_dict.items():
if type(task) == tuple:
group, task = task
task_groups[task_name] = group
aggregate[task_name] = {}
group_name, task = task
task_hierarchy[group_name].append(task_name)
else:
task_hierarchy[task_name] = []
if task is None:
continue
versions[task_name] = task.VERSION
configs[task_name] = dict(task.dump_config())
......@@ -301,6 +309,8 @@ def evaluate(
for task_name, task in task_dict.items():
if type(task) == tuple:
group, task = task
if task is None:
continue
task.apply_filters()
### Collect values of metrics on all datapoints ###
......@@ -310,6 +320,8 @@ def evaluate(
for task_name, task in task_dict.items():
if type(task) == tuple:
group, task = task
if task is None:
continue
# TODO: make it possible to use a different metric per filter
# iterate over different filters used
for key in task.instances[0].filtered_resps.keys():
......@@ -396,34 +408,71 @@ def evaluate(
vals = vals_torch
if lm.rank == 0:
### Get task ordering for correct sample-wide aggregation
group_to_task = {}
for group in task_hierarchy.keys():
if group not in task_order:
task_order[group] = 0
if len(task_hierarchy[group]) > 0:
group_to_task[group] = task_hierarchy[group].copy()
for task in task_hierarchy[group]:
if task in task_order:
task_order[task] += 1
else:
task_order[task] = 1 + task_order[group]
if task in task_hierarchy:
group_to_task[group].remove(task)
group_to_task[group].extend(task_hierarchy[task])
task_to_group = {}
for group in group_to_task:
for task in group_to_task[group]:
if task in task_to_group:
task_to_group[task].append(group)
else:
task_to_group[task] = [group]
### Aggregate results over all datapoints ###
# aggregate results ; run bootstrap CIs
for (task_name, key, metric), items in vals.items():
task = task_dict[task_name]
metric_key = metric + "," + key
if type(task) == tuple:
group, task = task
task_score = task.aggregation()[metric](items)
results[task_name][metric + "," + key] = task_score
# Need to put back in results
# pythia | acc
# | perplexity
# | word_perplexity
# | byte_perplexity
# | bits_per_byte
if task_name in task_groups:
group_name = task_groups[task_name]
if metric in list(aggregate[group_name].keys()):
aggregate[group_name][metric].append(task_score)
else:
aggregate[group_name][metric] = [task_score]
group_name, task = task
else:
group_name = None
agg_fn = task.aggregation()[metric]
task_score = agg_fn(items)
if group_name is not None:
sample_metric_key = metric + "(sample agg)," + key
for grouping in task_to_group[task_name]:
if metric_key in results[grouping]:
results[grouping][metric_key].append(task_score)
else:
results[grouping][metric_key] = [task_score]
if sample_metric_key in results[grouping]:
results[grouping][sample_metric_key] += items
else:
results[grouping][sample_metric_key] = items.copy()
sample_agg_fn[grouping][sample_metric_key] = agg_fn
results[task_name][metric_key] = task_score
# hotfix: bleu, chrf, and ter seem to be really expensive to bootstrap,
# so we run them for fewer iterations. Still looking for a cleaner way to do this.
if False: # bootstrap_iters > 0:
if bootstrap_iters > 0:
stderr = lm_eval.api.metrics.stderr_for_metric(
metric=task.aggregation()[metric],
bootstrap_iters=min(bootstrap_iters, 1000)
bootstrap_iters=min(bootstrap_iters, 100)
if metric in ["bleu", "chrf", "ter"]
else bootstrap_iters,
)
......@@ -431,19 +480,38 @@ def evaluate(
if stderr is not None:
results[task_name][metric + "_stderr" + "," + key] = stderr(items)
if bool(aggregate):
for group in aggregate.keys():
for metric in aggregate[group].keys():
aggregate[group][metric] = np.average(aggregate[group][metric])
versions[group] = "N/A"
if bool(results):
for task_or_group in results.keys():
for metric in results[task_or_group].keys():
if type(results[task_or_group][metric]) == list:
if "(sample agg)" in metric:
results[task_or_group][metric] = sample_agg_fn[
task_or_group
][metric](results[task_or_group][metric])
else:
results[task_or_group][metric] = np.average(
results[task_or_group][metric]
)
versions[task_or_group] = "N/A"
for task_name, task in task_dict.items():
if type(task) == tuple:
group_name, task = task
order = task_order[group_name]
tabbed_name = "-" * order + group_name
results_agg[tabbed_name] = results[group_name]
versions[tabbed_name] = versions[group_name]
if order == 0:
groups_agg[group_name] = results[group_name]
order = task_order[task_name]
tabbed_name = "-" * order + task_name
results_agg[tabbed_name] = results[task_name]
versions[tabbed_name] = versions[task_name]
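# Illustrative ordering: for a group "pythia" containing "lambada_openai", results_agg
# ends up with keys like "pythia" (order 0) and "-lambada_openai" (order 1), i.e. one
# leading dash per level of nesting below a group.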
results_dict = {
"results": dict(sorted(results.items())),
**(
{"aggregate": dict(sorted(aggregate.items()))}
if bool(aggregate)
else {}
),
"results": dict(results_agg.items()),
**({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}),
"configs": dict(sorted(configs.items())),
"versions": dict(sorted(versions.items())),
}
......
......@@ -107,17 +107,20 @@ class HFLM(LM):
if not (parallelize or accelerator.num_processes > 1):
# use user-passed device
device_list = set(
["cuda", "cpu", "mps"]
["cuda", "cpu"]
+ [f"cuda:{i}" for i in range(torch.cuda.device_count())]
+ ["mps", "mps:0"]
)
if device:
if device not in device_list:
device = int(device)
self._device = torch.device(device)
eval_logger.info(f"Using device '{device}'")
if device == "mps":
if device in ("mps", "mps:0") and "dev" not in torch.__version__:
eval_logger.info(
"MPS is still in beta and only supports float32; setting dtype to float32."
"MPS: Setting dtype to float32. To use float16 with MPS, please install a nightly build of "
"PyTorch: pip3 install --pre torch torchvision torchaudio --index-url "
"https://download.pytorch.org/whl/nightly/cpu"
)
else:
eval_logger.info("Device not specified")
......@@ -505,7 +508,7 @@ class HFLM(LM):
self.tokenizer, stop, 1, context.shape[0]
)
return self.model.generate(
context,
input_ids=context,
max_length=max_length,
stopping_criteria=stopping_criteria,
pad_token_id=self.eot_token_id,
......
import ast
from typing import Dict
from lm_eval import utils
from lm_eval.logger import eval_logger
......@@ -5,7 +8,7 @@ from lm_eval.logger import eval_logger
# Stores prompts in a dictionary indexed by 2 levels:
# prompt category name, and prompt name.
# This allows us to access prompts
PROMPT_REGISTRY: dict[str, dict[str, str]] = {
PROMPT_REGISTRY: Dict[str, Dict[str, str]] = {
"qa-basic": {
"question-newline-answer": "Question: {{question}}\nAnswer:",
"q-newline-a": "Q: {{question}}\nA:",
......@@ -63,6 +66,12 @@ def load_prompt_list(use_prompt: str, dataset_name=None, subset_name=None, **kwa
else:
prompts = DatasetTemplates(dataset_name=dataset_name, subset_name=subset_name)
category_name, prompt_name = use_prompt.split(":")
category_name, *prompt_name = use_prompt.split(":")
# TODO: allow matching multiple prompt names
# if len(prompt_name) > 1:
# prompt_list = []
# for prompt in prompt_name:
# prompt_list.append(utils.pattern_match(prompt_name, prompts.all_template_names))
# else:
prompt_list = utils.pattern_match(prompt_name, prompts.all_template_names)
return [":".join([category_name, prompt]) for prompt in prompt_list]
......@@ -16,7 +16,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [x] MCTACO
- [x] Pubmed QA
- [x] SciQ
- [ ] QASPER
- [x] QASPER
- [x] QA4MRE
- [x] TriviaQA
- [x] AI2 ARC
......@@ -36,7 +36,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [x] TruthfulQA (mc1)
- [x] TruthfulQA (mc2)
- [x] TruthfulQA (gen)
- [ ] MuTual
- [x] MuTual
- [ ] Hendrycks Math (Hailey)
- [x] Asdiv
- [ ] GSM8k
......
import os
import yaml
from typing import List, Union
from typing import List, Union, Dict
from lm_eval import utils
from lm_eval import prompts
......@@ -15,7 +15,7 @@ from lm_eval.api.registry import (
)
def register_configurable_task(config: dict[str, str]) -> int:
def register_configurable_task(config: Dict[str, str]) -> int:
SubClass = type(
config["task"] + "ConfigurableTask",
(ConfigurableTask,),
......@@ -38,7 +38,35 @@ def register_configurable_task(config: dict[str, str]) -> int:
return 0
def check_prompt_config(config: dict[str, str]) -> List[dict[str, str]]:
def register_configurable_group(config: Dict[str, str]) -> int:
group = config["group"]
all_task_list = config["task"]
config_list = [task for task in all_task_list if type(task) != str]
task_list = [task for task in all_task_list if type(task) == str]
for task_config in config_list:
var_configs = check_prompt_config(
{
**task_config,
**{"group": group},
}
)
for config in var_configs:
register_configurable_task(config)
task_names = utils.pattern_match(task_list, ALL_TASKS)
for task in task_names:
if (task in TASK_REGISTRY) or (task in GROUP_REGISTRY):
if group in GROUP_REGISTRY:
GROUP_REGISTRY[group].append(task)
else:
GROUP_REGISTRY[group] = [task]
ALL_TASKS.add(group)
return 0
def check_prompt_config(config: Dict[str, str]) -> List[Dict[str, str]]:
all_configs = []
if "use_prompt" in config:
prompt_list = prompts.load_prompt_list(
......@@ -69,14 +97,14 @@ def check_prompt_config(config: dict[str, str]) -> List[dict[str, str]]:
return all_configs
def get_task_name_from_config(task_config: dict[str, str]) -> str:
def get_task_name_from_config(task_config: Dict[str, str]) -> str:
if "dataset_name" in task_config:
return "{dataset_path}_{dataset_name}".format(**task_config)
else:
return "{dataset_path}".format(**task_config)
def include_task_folder(task_dir: str) -> None:
def include_task_folder(task_dir: str, register_task=True) -> None:
"""
Walks `task_dir` for YAML files and registers the resulting task configs; when
`register_task` is False, registers group (benchmark) configs instead.
"""
......@@ -87,9 +115,16 @@ def include_task_folder(task_dir: str) -> None:
yaml_path = os.path.join(root, f)
try:
config = utils.load_yaml_config(yaml_path)
all_configs = check_prompt_config(config)
for config in all_configs:
register_configurable_task(config)
if register_task:
all_configs = check_prompt_config(config)
for config in all_configs:
register_configurable_task(config)
else:
# If a `task` in config is a list,
# that means it's a benchmark
if type(config["task"]) == list:
register_configurable_group(config)
except Exception as error:
eval_logger.warning(
......@@ -102,6 +137,8 @@ def include_task_folder(task_dir: str) -> None:
task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_task_folder(task_dir)
# Register Benchmarks after all tasks have been added
include_task_folder(task_dir, register_task=False)
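# Net effect of the two passes above: the first include_task_folder() call registers every
# individual task YAML under lm_eval/tasks/, and the second pass (register_task=False)
# registers group/benchmark YAMLs whose `task` field is a list, e.g. the "pythia" and
# "minerva_math" groups, adding them to GROUP_REGISTRY.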
def get_task(task_name, config):
......@@ -128,7 +165,7 @@ def get_task_name_from_object(task_object):
# TODO: pass num_fewshot and other cmdline overrides in a better way
def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
def get_task_dict(task_name_list: List[Union[str, Dict, Task]], **kwargs):
config = {**kwargs}
......@@ -136,6 +173,9 @@ def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
task_name_from_config_dict = {}
task_name_from_object_dict = {}
if type(task_name_list) != list:
task_name_list = [task_name_list]
for task_element in task_name_list:
if isinstance(task_element, str):
......@@ -143,12 +183,20 @@ def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
group_name = task_element
for task_name in GROUP_REGISTRY[task_element]:
if task_name not in task_name_from_registry_dict:
task_obj = get_task_dict(task_name)
if task_name in task_obj.keys():
task_dict = {
task_name: (group_name, task_obj[task_name]),
}
else:
task_dict = {
task_name: (group_name, None),
**task_obj,
}
task_name_from_registry_dict = {
**task_name_from_registry_dict,
task_name: (
group_name,
get_task(task_name=task_name, config=config),
),
**task_dict,
}
else:
task_name = task_element
......
# ASDiv
### Paper
Title: `ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers`
Abstract: https://arxiv.org/abs/2106.15772
ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language
patterns and problem types) English math word problem (MWP) corpus for evaluating
the capability of various MWP solvers. Existing MWP corpora for studying AI progress
remain limited either in language usage patterns or in problem types. We thus present
a new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem
types taught in elementary school. Each MWP is annotated with its problem type and grade
level (for indicating the level of difficulty).
NOTE: We currently ignore formulas for answer generation.
Homepage: https://github.com/chaochun/nlu-asdiv-dataset
### Citation
```
@misc{miao2021diverse,
title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},
author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},
year={2021},
eprint={2106.15772},
archivePrefix={arXiv},
primaryClass={cs.AI}
}
```
### Groups and Tasks
#### Groups
* Not part of a group yet.
#### Tasks
* `asdiv`
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group: pythia
task:
- lambada_openai
- wikitext
- logiqa
- piqa
- sciq
- wsc
- wikitext
- winogrande
- arc
- logiqa
- wsc
- ai2_arc
- blimp
- hendrycksTest*
# C-Eval (Validation)
### Paper
C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models
https://arxiv.org/pdf/2305.08322.pdf
C-Eval is a comprehensive Chinese evaluation suite for foundation models.
It consists of 13948 multi-choice questions spanning 52 diverse disciplines
and four difficulty levels.
Homepage: https://cevalbenchmark.com/
### Citation
```bibtex
@article{huang2023ceval,
title={C-Eval: A Multi-Level Multi-Discipline Chinese Evaluation Suite for Foundation Models},
author={Huang, Yuzhen and Bai, Yuzhuo and Zhu, Zhihao and Zhang, Junlei and Zhang, Jinghan and Su, Tangjun and Liu, Junteng and Lv, Chuancheng and Zhang, Yikai and Lei, Jiayi and Fu, Yao and Sun, Maosong and He, Junxian},
journal={arXiv preprint arXiv:2305.08322},
year={2023}
}
```
SUBJECTS = {
"computer_network":"计算机网络",
"operating_system":"操作系统",
"computer_architecture":"计算机组成",
"college_programming":"大学编程",
"college_physics":"大学物理",
"college_chemistry":"大学化学",
"advanced_mathematics":"高等数学",
"probability_and_statistics":"概率统计",
"discrete_mathematics":"离散数学",
"electrical_engineer":"注册电气工程师",
"metrology_engineer":"注册计量师",
"high_school_mathematics":"高中数学",
"high_school_physics":"高中物理",
"high_school_chemistry":"高中化学",
"high_school_biology":"高中生物",
"middle_school_mathematics":"初中数学",
"middle_school_biology":"初中生物",
"middle_school_physics":"初中物理",
"middle_school_chemistry":"初中化学",
"veterinary_medicine":"兽医学",
"college_economics":"大学经济学",
"business_administration":"工商管理",
"marxism":"马克思主义基本原理",
"mao_zedong_thought":"毛泽东思想和中国特色社会主义理论体系概论",
"education_science":"教育学",
"teacher_qualification":"教师资格",
"high_school_politics":"高中政治",
"high_school_geography":"高中地理",
"middle_school_politics":"初中政治",
"middle_school_geography":"初中地理",
"modern_chinese_history":"近代史纲要",
"ideological_and_moral_cultivation":"思想道德修养与法律基础",
"logic":"逻辑学",
"law":"法学",
"chinese_language_and_literature":"中国语言文学",
"art_studies":"艺术学",
"professional_tour_guide":"导游资格",
"legal_professional":"法律职业资格",
"high_school_chinese":"高中语文",
"high_school_history":"高中历史",
"middle_school_history":"初中历史",
"civil_servant":"公务员",
"sports_science":"体育学",
"plant_protection":"植物保护",
"basic_medicine":"基础医学",
"clinical_medicine":"临床医学",
"urban_and_rural_planner":"注册城乡规划师",
"accountant":"注册会计师",
"fire_engineer":"注册消防工程师",
"environmental_impact_assessment_engineer":"环境影响评价工程师",
"tax_accountant":"税务师",
"physician":"医师资格"
}
# CMMLU
### Paper
CMMLU: Measuring massive multitask language understanding in Chinese
https://arxiv.org/abs/2306.09212
CMMLU is a comprehensive evaluation benchmark specifically designed to evaluate the knowledge and reasoning abilities of LLMs within the context of Chinese language and culture.
CMMLU covers a wide range of subjects, comprising 67 topics that span from elementary to advanced professional levels.
Homepage: https://github.com/haonan-li/CMMLU
### Citation
```bibtex
@misc{li2023cmmlu,
title={CMMLU: Measuring massive multitask language understanding in Chinese},
author={Haonan Li and Yixuan Zhang and Fajri Koto and Yifei Yang and Hai Zhao and Yeyun Gong and Nan Duan and Timothy Baldwin},
year={2023},
eprint={2306.09212},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
### Groups and Tasks
#### Groups
- `ceval-valid`: All 52 subjects of the C-Eval dataset, evaluated following the methodology in MMLU's original implementation. This implementation consists solely of the validation set of C-Eval, as the test set requires submission of model predictions to an external site.
#### Tasks
The following tasks evaluate subjects in the C-Eval dataset using loglikelihood-based multiple-choice scoring:
- `ceval-valid_{subject_english}`
### Checklist
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation?
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group: ceval-valid
dataset_path: ceval/ceval-exam
validation_split: val
fewshot_split: dev
fewshot_config:
sampler: first_n
output_type: multiple_choice
doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
version: "1.0"
"""
Take in a base YAML and generate one task YAML per C-Eval subject from it.
"""
import os
import yaml
import argparse
from tqdm import tqdm
from lm_eval.logger import eval_logger
SUBJECTS = {
"computer_network": "计算机网络",
"operating_system": "操作系统",
"computer_architecture": "计算机组成",
"college_programming": "大学编程",
"college_physics": "大学物理",
"college_chemistry": "大学化学",
"advanced_mathematics": "高等数学",
"probability_and_statistics": "概率统计",
"discrete_mathematics": "离散数学",
"electrical_engineer": "注册电气工程师",
"metrology_engineer": "注册计量师",
"high_school_mathematics": "高中数学",
"high_school_physics": "高中物理",
"high_school_chemistry": "高中化学",
"high_school_biology": "高中生物",
"middle_school_mathematics": "初中数学",
"middle_school_biology": "初中生物",
"middle_school_physics": "初中物理",
"middle_school_chemistry": "初中化学",
"veterinary_medicine": "兽医学",
"college_economics": "大学经济学",
"business_administration": "工商管理",
"marxism": "马克思主义基本原理",
"mao_zedong_thought": "毛泽东思想和中国特色社会主义理论体系概论",
"education_science": "教育学",
"teacher_qualification": "教师资格",
"high_school_politics": "高中政治",
"high_school_geography": "高中地理",
"middle_school_politics": "初中政治",
"middle_school_geography": "初中地理",
"modern_chinese_history": "近代史纲要",
"ideological_and_moral_cultivation": "思想道德修养与法律基础",
"logic": "逻辑学",
"law": "法学",
"chinese_language_and_literature": "中国语言文学",
"art_studies": "艺术学",
"professional_tour_guide": "导游资格",
"legal_professional": "法律职业资格",
"high_school_chinese": "高中语文",
"high_school_history": "高中历史",
"middle_school_history": "初中历史",
"civil_servant": "公务员",
"sports_science": "体育学",
"plant_protection": "植物保护",
"basic_medicine": "基础医学",
"clinical_medicine": "临床医学",
"urban_and_rural_planner": "注册城乡规划师",
"accountant": "注册会计师",
"fire_engineer": "注册消防工程师",
"environmental_impact_assessment_engineer": "环境影响评价工程师",
"tax_accountant": "税务师",
"physician": "医师资格",
}
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_yaml_path", required=True)
parser.add_argument("--save_prefix_path", default="ceval-valid")
parser.add_argument("--cot_prompt_path", default=None)
parser.add_argument("--task_prefix", default="")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our other YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path) as f:
base_yaml = yaml.full_load(f)
if args.cot_prompt_path is not None:
import json
with open(args.cot_prompt_path) as f:
cot_file = json.load(f)
for subject_eng, subject_zh in tqdm(SUBJECTS.items()):
if args.cot_prompt_path is not None:
description = cot_file[subject_eng]
else:
description = f"以下是中国关于{subject_zh}的单项选择题,请选出其中的正确答案。\n\n"
yaml_dict = {
"include": base_yaml_name,
"task": f"ceval-valid_{args.task_prefix}_{subject_eng}"
if args.task_prefix != ""
else f"ceval-valid_{subject_eng}",
"dataset_name": subject_eng,
"description": description,
}
file_save_path = args.save_prefix_path + f"_{subject_eng}.yaml"
eval_logger.info(f"Saving yaml for subset {subject_eng} to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
width=float("inf"),
allow_unicode=True,
default_style='"',
)
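# Hedged usage sketch (the script and base-YAML file names below are illustrative):
#   python _generate_configs.py --base_yaml_path ceval-valid_base.yaml --save_prefix_path ceval-valid
# would write one YAML per subject, e.g. ceval-valid_computer_network.yaml, each of which
# `include`s the base YAML and overrides "task", "dataset_name", and "description".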