Commit 0348ed97 authored by lintangsutawika

merged latest update

parents 451a1873 6769119f
# datasets
This directory contains custom HuggingFace [dataset loading scripts](https://huggingface.co/docs/datasets/dataset_script). They are provided to maintain backward compatibility with the ad-hoc data downloaders used in earlier versions of the `lm-evaluation-harness`, before HuggingFace [`datasets`](https://huggingface.co/docs/datasets/index) was adopted as the default download manager. For example, some datasets in the HuggingFace `datasets` repository process features (e.g. whitespace stripping, lower-casing, etc.) in ways that the `lm-evaluation-harness` did not.
__NOTE__: We are __not__ accepting any additional loading scripts into the main branch! If you'd like to use a custom dataset, fork the repo and follow HuggingFace's loading script guide found [here](https://huggingface.co/docs/datasets/dataset_script). You can then override your `Task`'s `DATASET_PATH` attribute to point to this script's local path (see the sketch below).
__WARNING__: A handful of loading scripts are included in this collection because they have not yet been pushed to the HuggingFace Hub or a HuggingFace organization repo. We will remove these scripts once they are pushed.
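For illustration, a minimal sketch of the `DATASET_PATH` override mentioned in the note above; the base-class import path and the script location are assumptions rather than something this commit prescribes:

```python
# Sketch only: point a custom Task at a local loading script instead of a Hub
# dataset. The import path and script location below are assumptions.
from lm_eval.api.task import Task  # base-class path assumed for this branch


class MyCustomTask(Task):  # abstract methods omitted for brevity
    DATASET_PATH = "lm_eval/datasets/my_dataset/my_dataset.py"  # local loading script
    DATASET_NAME = None  # or a config name defined by the script
```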
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""ASDIV dataset."""
import os
import xml.etree.ElementTree as ET
import datasets
_CITATION = """\
@misc{miao2021diverse,
title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},
author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},
year={2021},
eprint={2106.15772},
archivePrefix={arXiv},
primaryClass={cs.AI}
}
"""
_DESCRIPTION = """\
ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language
patterns and problem types) English math word problem (MWP) corpus for evaluating
the capability of various MWP solvers. Existing MWP corpora for studying AI progress
remain limited either in language usage patterns or in problem types. We thus present
a new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem
types taught in elementary school. Each MWP is annotated with its problem type and grade
level (for indicating the level of difficulty).
"""
_HOMEPAGE = "https://github.com/chaochun/nlu-asdiv-dataset"
# TODO: Add the licence for the dataset here if you can find it
_LICENSE = ""
_URLS = "https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip"
class ASDiv(datasets.GeneratorBasedBuilder):
    """ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers"""

    VERSION = datasets.Version("0.0.1")

    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name="asdiv",
            version=VERSION,
            description="A diverse corpus for evaluating and developing English math word problem solvers",
        )
    ]

    def _info(self):
        features = datasets.Features(
            {
                "body": datasets.Value("string"),
                "question": datasets.Value("string"),
                "solution_type": datasets.Value("string"),
                "answer": datasets.Value("string"),
                "formula": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        urls = _URLS
        data_dir = dl_manager.download_and_extract(urls)
        base_filepath = "nlu-asdiv-dataset-55790e5270bb91ccfa5053194b25732534696b50"
        return [
            datasets.SplitGenerator(
                name=datasets.Split.VALIDATION,
                # These kwargs will be passed to _generate_examples
                gen_kwargs={
                    "filepath": os.path.join(
                        data_dir, base_filepath, "dataset", "ASDiv.xml"
                    ),
                    "split": datasets.Split.VALIDATION,
                },
            ),
        ]

    # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    def _generate_examples(self, filepath, split):
        tree = ET.parse(filepath)
        root = tree.getroot()
        for key, problem in enumerate(root.iter("Problem")):
            yield key, {
                "body": problem.find("Body").text,
                "question": problem.find("Question").text,
                "solution_type": problem.find("Solution-Type").text,
                "answer": problem.find("Answer").text,
                "formula": problem.find("Formula").text,
            }
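As a quick sanity check, the loader above can be exercised directly with `datasets.load_dataset`; the local path below is an assumption about where the file lives in the repo:

```python
# Sketch: load the ASDiv script locally and inspect one example.
import datasets

asdiv = datasets.load_dataset("lm_eval/datasets/asdiv/asdiv.py", split="validation")
print(asdiv[0]["question"], "->", asdiv[0]["answer"])
```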
{"asdiv": {"description": "ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language\npatterns and problem types) English math word problem (MWP) corpus for evaluating\nthe capability of various MWP solvers. Existing MWP corpora for studying AI progress\nremain limited either in language usage patterns or in problem types. We thus present\na new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem\ntypes taught in elementary school. Each MWP is annotated with its problem type and grade\nlevel (for indicating the level of difficulty).\n", "citation": "@misc{miao2021diverse,\n title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},\n author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},\n year={2021},\n eprint={2106.15772},\n archivePrefix={arXiv},\n primaryClass={cs.AI}\n}\n", "homepage": "https://github.com/chaochun/nlu-asdiv-dataset", "license": "", "features": {"body": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "solution_type": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "formula": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "as_div", "config_name": "asdiv", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 501489, "num_examples": 2305, "dataset_name": "as_div"}}, "download_checksums": {"https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip": {"num_bytes": 440966, "checksum": "8f1fe4f6d5f170ec1e24ab78c244153c14c568b1bb2b1dad0324e71f37939a2d"}}, "download_size": 440966, "post_processing_size": null, "dataset_size": 501489, "size_in_bytes": 942455}}
import os
from typing import Any
import zstandard
import json
import jsonlines
......@@ -9,7 +10,7 @@ import tqdm
from pathlib import Path
def json_serial(obj):
def json_serial(obj: Any) -> str:
"""JSON serializer for objects not serializable by default json code"""
if isinstance(obj, (datetime.datetime,)):
......@@ -19,7 +20,7 @@ def json_serial(obj):
# Modified version of lm_dataformat Archive for single file.
class Archive:
def __init__(self, file_path, compression_level=3):
def __init__(self, file_path: str, compression_level: int = 3) -> None:
self.file_path = file_path
dir_name = os.path.dirname(file_path)
if dir_name:
......@@ -28,7 +29,7 @@ class Archive:
self.cctx = zstandard.ZstdCompressor(level=compression_level)
self.compressor = self.cctx.stream_writer(self.fh)
def add_data(self, data, meta={}):
def add_data(self, data, meta={}) -> None:
self.compressor.write(
json.dumps({"text": data, "meta": meta}, default=json_serial).encode(
"UTF-8"
......@@ -36,7 +37,7 @@ class Archive:
+ b"\n"
)
def commit(self):
def commit(self) -> None:
self.compressor.flush(zstandard.FLUSH_FRAME)
self.fh.flush()
self.fh.close()
......@@ -44,10 +45,16 @@ class Archive:
# Modified version of lm_dataformat Reader with self.fh set, allowing peeking for tqdm.
class Reader:
def __init__(self):
def __init__(self) -> None:
pass
def read(self, file, get_meta=False, autojoin_paragraphs=True, para_joiner="\n\n"):
def read(
self,
file,
get_meta: bool = False,
autojoin_paragraphs: bool = True,
para_joiner: str = "\n\n",
):
with open(file, "rb") as fh:
self.fh = fh
cctx = zstandard.ZstdDecompressor()
......@@ -72,7 +79,7 @@ class Reader:
class TextArchive:
def __init__(self, file_path, mode="rb+"):
def __init__(self, file_path, mode: str = "rb+") -> None:
self.file_path = file_path
dir_name = os.path.dirname(file_path)
if dir_name:
......@@ -83,21 +90,21 @@ class TextArchive:
self.fh = open(self.file_path, mode)
def add_data(self, data):
def add_data(self, data) -> None:
self.fh.write(data.encode("UTF-8") + b"\n")
def commit(self):
def commit(self) -> None:
self.fh.flush()
self.fh.close()
class TextReader:
def __init__(self, file_path):
def __init__(self, file_path) -> None:
self.file_path = file_path
# Optimized mmap read with infrequent tqdm updates to maintain speed
# Tested up to 250MB/s.
def read_tqdm(self, update_frequency=10000):
def read_tqdm(self, update_frequency: int = 10000):
current_file_position = 0
line_counter = 0
with open(self.file_path, "r") as fh, tqdm.tqdm(
......@@ -149,7 +156,7 @@ class TextReader:
# Optimized for speed. Decompresses the archive in shell before
# using the mmap'd TextReader.
class ZStdTextReader:
def __init__(self, file):
def __init__(self, file) -> None:
self.file = file
def read_tqdm(self):
......
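A hedged round-trip sketch of the `Archive` / `Reader` pair above: write a few zstd-compressed jsonl records, commit, then stream them back. The file name is illustrative, and the read loop assumes `Reader.read` yields one text record at a time, as in upstream `lm_dataformat`:

```python
# Sketch: write then read back a single-file zstd archive.
archive = Archive("data/example.jsonl.zst")
archive.add_data("first document", meta={"source": "demo"})
archive.add_data("second document")
archive.commit()

reader = Reader()
for text in reader.read("data/example.jsonl.zst"):  # yields the stored text per record
    print(text)
```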
......@@ -11,7 +11,7 @@ from .archiver import ZStdTextReader
# Was used for testing the evaluator decoupled from the full logic below
def get_train_overlap_stub(docs, ngrams_path, ngrams_n_size):
def get_train_overlap_stub(docs: dict, ngrams_path: str, ngrams_n_size: str):
simulated_overlap = 0.1
contaminated = int(len(docs) * simulated_overlap)
return random.sample(range(len(docs)), contaminated)
......@@ -25,6 +25,7 @@ def get_train_overlap_stub(docs, ngrams_path, ngrams_n_size):
# scripts are an info.json file containing the n_gram_size (13) and a bunch of "ngrams_{x}.bkt.txt.sorted.zst"
# files. These should exist in the "ngrams_path" provided to this function.
# Algorithm:
# 1. Build lookups for each dataset {ngram: list(document_ids)}
# 2. Merge into an overall lookup {ngram: [(task_name, task_set, doc_ids),]}
......@@ -33,7 +34,7 @@ def get_train_overlap_stub(docs, ngrams_path, ngrams_n_size):
# 4. Strip the task_set from the dictionary keys and return
#
# We cache the task+set lookups as well as the overlaps.
def get_train_overlap(docs_by_task_set, ngrams_path, limit):
def get_train_overlap(docs_by_task_set: dict, ngrams_path: str, limit: int) -> dict:
# return get_train_overlap_stub(docs, ngrams_path, ngrams_n_size)
info_dict_path = os.path.join(ngrams_path, "info.json")
......@@ -46,7 +47,7 @@ def get_train_overlap(docs_by_task_set, ngrams_path, limit):
print("Building Lookups...")
start = time.perf_counter()
def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit):
def get_overlaps_dump_path(task_name, task_set, ngrams_n_size, limit) -> str:
return f"data/{task_name}/{task_set}_{ngrams_n_size}grams_limit{limit}.overlaps"
lookups = {}
......
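Schematically, steps 1 and 2 of the algorithm comment above amount to building per-(task, set) n-gram lookups and merging them into one overall lookup. The helper names in this sketch are illustrative, not the actual functions in this module:

```python
# Illustrative sketch of steps 1-2 above; not the real helpers in this file.
from collections import defaultdict


def build_lookup(docs, ngrams_fn, n):
    lookup = defaultdict(list)  # {ngram: [doc_id, ...]}
    for doc_id, doc in enumerate(docs):
        for ngram in ngrams_fn(doc, n):
            lookup[ngram].append(doc_id)
    return lookup


def merge_lookups(lookups_by_task_set):
    merged = defaultdict(list)  # {ngram: [(task_name, task_set, doc_ids), ...]}
    for (task_name, task_set), lookup in lookups_by_task_set.items():
        for ngram, doc_ids in lookup.items():
            merged[ngram].append((task_name, task_set, doc_ids))
    return merged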
import re
import string
import timeit
import pickle
import traceback
from pprint import pprint
from typing import Iterator, Sequence, TypeVar, List, Tuple
# This is a cpp module. Compile janitor_util.cpp with:
# c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) -undefined dynamic_lookup
......@@ -16,10 +16,12 @@ except Exception:
traceback.print_exc()
JANITOR_CPP = False
T = TypeVar("T")
# Implementation from nltk source
# https://www.nltk.org/_modules/nltk/util.html
def form_ngrams(sequence, n):
def form_ngrams(sequence: Iterator[T], n: int) -> Iterator[Tuple[T, ...]]:
history = []
while n > 1:
# PEP 479, prevent RuntimeError from being raised when StopIteration bubbles out of generator
......@@ -36,7 +38,7 @@ def form_ngrams(sequence, n):
del history[0]
def word_ngrams(s, n):
def word_ngrams(s: str, n: int) -> Iterator[str]:
"""Splits a string into ngram words"""
tokens = s.split() # not a generator :(
ngram_seqs = form_ngrams(iter(tokens), n)
......@@ -68,14 +70,14 @@ def word_ngrams(s, n):
# https://stackoverflow.com/questions/13734451/string-split-with-indices-in-python
def split_indices(s):
def split_indices(s: str) -> Iterator[Tuple[str, Tuple[int, int]]]:
"""Splits a string on whitespaces and records the indices of each in the original string.
@:return generator((word, (start_idx, end_idx)), ...)
"""
return ((m.group(0), (m.start(), m.end() - 1)) for m in re.finditer(r"\S+", s))
def word_ngrams_indices(s, n):
def word_ngrams_indices(s: str, n: int) -> Iterator[Tuple[str, Tuple[int, int]]]:
"""Splits a string into pairs of (ngram words, their start/end indices)"""
tokens_with_indices = split_indices(s)
......@@ -104,16 +106,15 @@ def word_ngrams_indices(s, n):
class Janitor:
# FIXME delete_chars: Should anything else go here? Special chars?
def __init__(
self,
ngram_n=13,
window_to_remove=200,
too_dirty_cutoff=10,
minimum_slice_length=200,
delete_chars=string.punctuation,
):
ngram_n: int = 13,
window_to_remove: int = 200,
too_dirty_cutoff: int = 10,
minimum_slice_length: int = 200,
delete_chars: str = string.punctuation,
) -> None:
self.ngram_n = ngram_n
self.window_to_remove = window_to_remove
self.too_dirty_cutoff = too_dirty_cutoff
......@@ -135,11 +136,11 @@ class Janitor:
# I/O for saving contamination ngrams
##############
def save_contamination_ngrams(self, filename):
def save_contamination_ngrams(self, filename: str) -> None:
with open(filename, "wb") as fp:
pickle.dump(filename, fp)
def load_contamination_ngrams(self, filename):
def load_contamination_ngrams(self, filename: str) -> None:
with open(filename, "rb") as fp:
self.dirt_ngrams = pickle.load(fp)
......@@ -147,7 +148,7 @@ class Janitor:
# Call these :)
##############
def register_contaminant(self, dirt_string):
def register_contaminant(self, dirt_string: str) -> None:
"""Register a string as contamination to be removed, e.g. a test set
This breaks the dirt_string into ngrams to store for future cleaning"""
if JANITOR_CPP:
......@@ -156,7 +157,7 @@ class Janitor:
print("WARNING: Janitor running in python mode")
return self.register_contaminant_python(dirt_string)
def clean(self, dirty_string):
def clean(self, dirty_string: str) -> List[str]:
"""Clean a string (e.g. a training set) by removing all ngrams previously
registered as contaminants. Returns a list of clean chunks, or empty if
the string was too dirty"""
......@@ -166,7 +167,9 @@ class Janitor:
print("WARNING: Janitor running in python mode")
return self.clean_python(dirty_string)
def _split_chunks(self, dirty_string, dirty_parts):
def _split_chunks(
self, dirty_string: str, dirty_parts: Sequence[Tuple]
) -> List[str]:
clean_chunks = []
splice_idx = 0
end = -1
......@@ -189,12 +192,12 @@ class Janitor:
# Fast C++
##############
def register_contaminant_cpp(self, dirt_string):
def register_contaminant_cpp(self, dirt_string) -> None:
self.dirt_ngrams.update(
janitor_util.clean_ngram(dirt_string, self.delete_chars, self.ngram_n)
)
def clean_cpp(self, dirty_string):
def clean_cpp(self, dirty_string: str) -> List[str]:
contamination_indices = janitor_util.clean_ngram_with_indices(
dirty_string, self.delete_chars, self.ngram_n
)
......@@ -204,15 +207,15 @@ class Janitor:
# Slow python
##############
def normalize_string(self, s):
def normalize_string(self, s: str) -> str:
return s.translate(self.translation_table)
def register_contaminant_python(self, dirt_string):
def register_contaminant_python(self, dirt_string: str) -> None:
self.dirt_ngrams.update(
word_ngrams(self.normalize_string(dirt_string), self.ngram_n)
)
def clean_python(self, dirty_string):
def clean_python(self, dirty_string: str) -> List[str]:
contamination_indices = (
(None, *idx_pair)
for dirty_ngram, idx_pair in word_ngrams_indices(dirty_string, self.ngram_n)
......
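A hedged usage sketch of the `Janitor` workflow documented above: register a held-out document as contamination, then clean a training document against the stored 13-grams. The strings and constructor values are illustrative:

```python
# Usage sketch only; the class falls back to the pure-Python path when the
# compiled janitor_util module is unavailable.
test_doc = "the quick brown fox jumps over the lazy dog and then naps in the warm afternoon sun"
train_doc = "unrelated training text. " * 5 + test_doc + " more unrelated training text. " * 5

janitor = Janitor(ngram_n=13, window_to_remove=50, minimum_slice_length=20)
janitor.register_contaminant(test_doc)
clean_chunks = janitor.clean(train_doc)  # list of clean slices, [] if too dirty
print(clean_chunks)
```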
......@@ -11,7 +11,6 @@ import numpy as np
import lm_eval.api
import lm_eval.tasks
import lm_eval.benchmarks
import lm_eval.models
import lm_eval.api.metrics
import lm_eval.api.registry
......@@ -42,11 +41,11 @@ def simple_evaluate(
device=None,
use_cache=None,
limit=None,
bootstrap_iters=100000,
check_integrity=False,
bootstrap_iters: int = 100000,
check_integrity: bool = False,
decontamination_ngrams_path=None,
write_out=False,
log_samples=True,
write_out: bool = False,
log_samples: bool = True,
):
"""Instantiate and evaluate a model on a list of tasks.
......@@ -117,10 +116,11 @@ def simple_evaluate(
task_dict = lm_eval.tasks.get_task_dict(tasks)
for task_name in task_dict.keys():
task_obj = task_dict[task_name]
if type(task_obj) == tuple:
group, task_obj = task_obj
if task_obj is None:
continue
config = task_obj._config
if num_fewshot is not None:
......@@ -175,17 +175,17 @@ def evaluate(
lm,
task_dict,
limit=None,
bootstrap_iters=100000,
bootstrap_iters: int = 100000,
decontamination_ngrams_path=None,
write_out=False,
log_samples=True,
write_out: bool = False,
log_samples: bool = True,
):
"""Instantiate and evaluate a model on a list of tasks.
:param lm: obj
Language Model
:param task_dict: dict[str, Task]
Dictionary of tasks. Tasks will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
Dictionary of tasks. Tasks will be taken to have name type(task).config.task .
:param limit: int, optional
Limit the number of examples per task (only use this for testing)
:param bootstrap_iters:
......@@ -210,24 +210,30 @@ def evaluate(
samples = collections.defaultdict(list)
# tracks all Instances/requests a model must generate output on.
requests = collections.defaultdict(list)
# Stores task scores based on task grouping.
aggregate = collections.defaultdict(dict)
# tracks if a task was chosen via user selecting a group containing it
task_groups = collections.defaultdict(dict)
# Aggregated task scores presented with groups
results_agg = collections.defaultdict(dict)
# Aggregated groups scores only
groups_agg = collections.defaultdict(dict)
# stores the amount to pad out reqs per req. type so that
# number of fwd passes per distributed rank is equal
padding_requests = collections.defaultdict(int)
# Stores group related keys and values for group-aggregation
task_groups = collections.defaultdict(dict)
# store the hierarchy to do proper ordering
task_hierarchy = collections.defaultdict(list)
# store the ordering of tasks and groups
task_order = collections.defaultdict(int)
# store the aggregation for aggregating across tasks in the same group
sample_agg_fn = collections.defaultdict(dict)
# get lists of each type of request
for task_name, task in task_dict.items():
if type(task) == tuple:
group, task = task
task_groups[task_name] = group
aggregate[task_name] = {}
group_name, task = task
task_hierarchy[group_name].append(task_name)
else:
task_hierarchy[task_name] = []
if task is None:
continue
versions[task_name] = task.VERSION
configs[task_name] = dict(task.dump_config())
......@@ -252,7 +258,8 @@ def evaluate(
# print the prompt for the first few documents
if inst.doc_id < 1:
eval_logger.info(
f"Task: {task_name}; document {inst.doc_id}; context prompt (starting on next line):\n{inst.args[0]}\n(end of prompt on previous line)"
f"Task: {task_name}; document {inst.doc_id}; context prompt (starting on next line):\
\n{inst.args[0]}\n(end of prompt on previous line)\ntarget string or answer choice index (starting on next line):\n{task.doc_to_target(inst.doc)}\n(end of target on previous line)"
)
eval_logger.info(f"Request: {str(inst)}")
......@@ -302,6 +309,8 @@ def evaluate(
for task_name, task in task_dict.items():
if type(task) == tuple:
group, task = task
if task is None:
continue
task.apply_filters()
### Collect values of metrics on all datapoints ###
......@@ -311,6 +320,8 @@ def evaluate(
for task_name, task in task_dict.items():
if type(task) == tuple:
group, task = task
if task is None:
continue
# TODO: make it possible to use a different metric per filter
# iterate over different filters used
for key in task.instances[0].filtered_resps.keys():
......@@ -349,7 +360,6 @@ def evaluate(
# if multigpu, then gather data across all ranks
# first gather logged samples across all ranks
for task_name, task_samples in list(samples.items()):
full_samples = [None] * lm.world_size
torch.distributed.all_gather_object(full_samples, task_samples)
......@@ -358,33 +368,39 @@ def evaluate(
# then collect metrics across all ranks
vals_torch = collections.defaultdict(list)
for (task_name, key, metric), items in vals.items():
numitem = 0
if type(items[0]) == tuple:
numitem = len(items[0])
# distributed gather requires all ranks to have same dimensions
# so we pad out with float32 min value
pad_value = torch.finfo(torch.float32).min
metrics_tensor = torch.tensor(items, device=lm.device)
original_dtype = metrics_tensor.dtype # store original dtype
torch_device_tensor = lm.accelerator.pad_across_processes(
metrics_tensor.to(torch.float32), pad_index=pad_value
)
gathered_item = lm.accelerator.gather(torch_device_tensor)
if isinstance(items[0], (str, list)):
# handle the string case
gathered_items = [None] * lm.accelerator.num_processes
torch.distributed.all_gather_object(gathered_items, items)
if numitem > 0:
gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
gathered_item = list(itertools.chain.from_iterable(gathered_items))
else:
gathered_filtered = gathered_item[gathered_item != pad_value]
# distributed gather requires all ranks to have same dimensions
# so we pad out with float32 min value
pad_value = torch.finfo(torch.float32).min
metrics_tensor = torch.tensor(items, device=lm.device)
original_dtype = metrics_tensor.dtype # store original dtype
torch_device_tensor = lm.accelerator.pad_across_processes(
metrics_tensor.to(torch.float32), pad_index=pad_value
)
gathered_item = lm.accelerator.gather(torch_device_tensor)
gathered_item = (
gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
)
# reconvert if we were passed a tuple of values
if numitem > 0:
gathered_item = [tuple(g) for g in gathered_item]
if numitem > 0:
gathered_filtered = gathered_item[gathered_item[:, 0] != pad_value]
else:
gathered_filtered = gathered_item[gathered_item != pad_value]
gathered_item = (
gathered_filtered.to(original_dtype).cpu().detach().numpy().tolist()
)
# reconvert if we were passed a tuple of values
if numitem > 0:
gathered_item = [tuple(g) for g in gathered_item]
if lm.rank == 0:
vals_torch[(task_name, key, metric)] = gathered_item
......@@ -392,34 +408,71 @@ def evaluate(
vals = vals_torch
if lm.rank == 0:
### Get task ordering for correct sample-wide aggregation
group_to_task = {}
for group in task_hierarchy.keys():
if group not in task_order:
task_order[group] = 0
if len(task_hierarchy[group]) > 0:
group_to_task[group] = task_hierarchy[group].copy()
for task in task_hierarchy[group]:
if task in task_order:
task_order[task] += 1
else:
task_order[task] = 1 + task_order[group]
if task in task_hierarchy:
group_to_task[group].remove(task)
group_to_task[group].extend(task_hierarchy[task])
task_to_group = {}
for group in group_to_task:
for task in group_to_task[group]:
if task in task_to_group:
task_to_group[task].append(group)
else:
task_to_group[task] = [group]
### Aggregate results over all datapoints ###
# aggregate results ; run bootstrap CIs
for (task_name, key, metric), items in vals.items():
task = task_dict[task_name]
metric_key = metric + "," + key
if type(task) == tuple:
group, task = task
task_score = task.aggregation()[metric](items)
results[task_name][metric + "," + key] = task_score
# Need to put back in results
# pythia | acc
# | perplexity
# | word_perplexity
# | byte_perplexity
# | bits_per_byte
if task_name in task_groups:
group_name = task_groups[task_name]
if metric in list(aggregate[group_name].keys()):
aggregate[group_name][metric].append(task_score)
else:
aggregate[group_name][metric] = [task_score]
group_name, task = task
else:
group_name = None
agg_fn = task.aggregation()[metric]
task_score = agg_fn(items)
if group_name is not None:
sample_metric_key = metric + "(sample agg)," + key
for grouping in task_to_group[task_name]:
if metric_key in results[grouping]:
results[grouping][metric_key].append(task_score)
else:
results[grouping][metric_key] = [task_score]
if sample_metric_key in results[grouping]:
results[grouping][sample_metric_key] += items
else:
results[grouping][sample_metric_key] = items.copy()
sample_agg_fn[grouping][sample_metric_key] = agg_fn
results[task_name][metric_key] = task_score
# hotfix: bleu, chrf, ter seem to be really expensive to bootstrap
# so we run them less iterations. still looking for a cleaner way to do this
if bootstrap_iters > 0:
stderr = lm_eval.api.metrics.stderr_for_metric(
metric=task.aggregation()[metric],
bootstrap_iters=min(bootstrap_iters, 1000)
bootstrap_iters=min(bootstrap_iters, 100)
if metric in ["bleu", "chrf", "ter"]
else bootstrap_iters,
)
......@@ -427,19 +480,38 @@ def evaluate(
if stderr is not None:
results[task_name][metric + "_stderr" + "," + key] = stderr(items)
if bool(aggregate):
for group in aggregate.keys():
for metric in aggregate[group].keys():
aggregate[group][metric] = np.average(aggregate[group][metric])
versions[group] = "N/A"
if bool(results):
for task_or_group in results.keys():
for metric in results[task_or_group].keys():
if type(results[task_or_group][metric]) == list:
if "(sample agg)" in metric:
results[task_or_group][metric] = sample_agg_fn[
task_or_group
][metric](results[task_or_group][metric])
else:
results[task_or_group][metric] = np.average(
results[task_or_group][metric]
)
versions[task_or_group] = "N/A"
for task_name, task in task_dict.items():
if type(task) == tuple:
group_name, task = task
order = task_order[group_name]
tabbed_name = "-" * order + group_name
results_agg[tabbed_name] = results[group_name]
versions[tabbed_name] = versions[group_name]
if order == 0:
groups_agg[group_name] = results[group_name]
order = task_order[task_name]
tabbed_name = "-" * order + task_name
results_agg[tabbed_name] = results[task_name]
versions[tabbed_name] = versions[task_name]
results_dict = {
"results": dict(sorted(results.items())),
**(
{"aggregate": dict(sorted(aggregate.items()))}
if bool(aggregate)
else {}
),
"results": dict(results_agg.items()),
**({"groups": dict(groups_agg.items())} if bool(groups_agg) else {}),
"configs": dict(sorted(configs.items())),
"versions": dict(sorted(versions.items())),
}
......
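Schematically, the `task_order` / `tabbed_name` logic above indents each task under its group by hierarchy depth when results are tabulated; the task names and scores below are purely illustrative:

```python
# Illustrative only: how "-" * order prefixes produce the grouped result rows.
task_order = {"pythia": 0, "lambada_openai": 1, "arc_easy": 1}
results = {"pythia": {"acc": 0.61}, "lambada_openai": {"acc": 0.70}, "arc_easy": {"acc": 0.52}}
for name, order in task_order.items():
    print("-" * order + name, results[name])
# pythia {'acc': 0.61}
# -lambada_openai {'acc': 0.70}
# -arc_easy {'acc': 0.52}
```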
from lm_eval.api.filter import FilterEnsemble
from . import selection
from . import extraction
from . import transformation
FILTER_REGISTRY = {
......@@ -10,6 +11,9 @@ FILTER_REGISTRY = {
"take_first_k": selection.TakeKFilter,
"remove_whitespace": extraction.WhitespaceFilter,
"cot_filter": extraction.CoTFilter,
"lowercase": transformation.LowercaseFilter,
"uppercase": transformation.UppercaseFilter,
"map": transformation.MapFilter,
# TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function
# that takes an input and returns a scalar and then should select the max reward,
# or should implement different filters for different ways of handling a reward model's inference.
......@@ -18,14 +22,16 @@ FILTER_REGISTRY = {
def get_filter(filter_name):
return FILTER_REGISTRY[filter_name]
if filter_name in FILTER_REGISTRY:
return FILTER_REGISTRY[filter_name]
else:
return filter_name
def build_filter_ensemble(filter_name, components):
"""
Create a filtering pipeline.
"""
filters = []
for (function, kwargs) in components:
if kwargs is None:
......
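With the fallback added above, `get_filter` resolves registered names and passes anything else through unchanged; a small sketch (the custom filter class is hypothetical):

```python
# Sketch: a registry hit returns the filter class; anything else (for example
# a user-supplied filter class) is passed back unchanged.
class MyCustomFilter:
    def apply(self, resps, docs):
        return resps


assert get_filter("lowercase") is transformation.LowercaseFilter
assert get_filter(MyCustomFilter) is MyCustomFilter  # not in FILTER_REGISTRY, returned as-is
```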
......@@ -9,7 +9,7 @@ class DecontaminationFilter(Filter):
name = "track_decontamination"
def __init__(self, path):
def __init__(self, path) -> None:
"""
TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path").
......@@ -17,7 +17,7 @@ class DecontaminationFilter(Filter):
"""
self._decontam_results = None
def apply(self, reps):
def apply(self, resps, docs) -> None:
"""
Return {"no_contamination", "only_contamination"} keys for the 2 different subsets
"""
......
......@@ -6,7 +6,9 @@ from lm_eval.api.filter import Filter
class RegexFilter(Filter):
""" """
def __init__(self, regex_pattern=r"#### (\-?[0-9\.\,]+)", fallback="[invalid]"):
def __init__(
self, regex_pattern: str = r"#### (\-?[0-9\.\,]+)", fallback: str = "[invalid]"
) -> None:
"""
pass a string `regex` to run `re.compile(r"regex")` on.
`fallback` defines the output returned if no matches for the regex are located.
......@@ -15,7 +17,7 @@ class RegexFilter(Filter):
self.regex = re.compile(regex_pattern)
self.fallback = fallback
def apply(self, resps):
def apply(self, resps, docs):
# here, we assume we have a list, in which each element is
# a list of model responses for some particular input/target pair.
# so we process each of these (same input/target response sets)
......@@ -41,12 +43,11 @@ class RegexFilter(Filter):
class WhitespaceFilter(Filter):
""" """
def __init__(self):
def __init__(self) -> None:
pass
def apply(self, resps):
def apply(self, resps, docs):
def filter_set(inst):
filtered_resp = []
for resp in inst:
if resp.startswith(" "):
......
......@@ -4,12 +4,12 @@ from lm_eval.api.filter import Filter
class TakeFirstFilter(Filter):
def __init__(self):
def __init__(self) -> None:
"""
Can define custom behavior here, if an individual instantiation of a Filter class should have state.
"""
def apply(self, resps):
def apply(self, resps, docs):
"""
Assuming each entry of `resps` is a list of model responses, we discard all but the first response.
"""
......@@ -17,13 +17,12 @@ class TakeFirstFilter(Filter):
class TakeKFilter(Filter):
def __init__(self, *args, **kwargs):
def __init__(self, *args, **kwargs) -> None:
self.k = kwargs.pop("k")
super().__init__(*args, **kwargs)
def apply(self, resps):
def apply(self, resps, docs):
# check we have at least k responses per doc, else we can't take the first k
assert (
len(resps[0]) >= self.k
......@@ -32,12 +31,12 @@ class TakeKFilter(Filter):
class MajorityVoteFilter(Filter):
def __init__(self):
def __init__(self) -> None:
"""
Can define custom behavior here, if an individual instantiation of a Filter class should have state.
"""
def apply(self, resps):
def apply(self, resps, docs):
"""
Each entry of `resps` is a list of model responses.
We select the response that occurs most frequently in each entry of `resps`.
......
from lm_eval.api.filter import Filter


class LowercaseFilter(Filter):
    def __init__(self) -> None:
        pass

    def apply(self, resps, docs):
        def filter_set(inst):
            return [resp.lower() for resp in inst]

        return [filter_set(resp) for resp in resps]


class UppercaseFilter(Filter):
    def __init__(self) -> None:
        pass

    def apply(self, resps, docs):
        def filter_set(inst):
            return [resp.upper() for resp in inst]

        return [filter_set(resp) for resp in resps]


class MapFilter(Filter):
    def __init__(self, mapping_dict: dict = {}, default_value=None) -> None:
        """
        Initializes the MapFilter with a given mapping dictionary and default value.

        Args:
        - mapping_dict (dict): A dictionary containing the key-value mappings.
          Default is an empty dictionary.
        - default_value (Any): The value to be returned when a key is not found in the mapping_dict.
          Default is None.

        Example:
        mapper = MapFilter({'A': 1, 'B': 2}, default_value=0)
        """
        assert isinstance(
            mapping_dict, dict
        ), "Provided mapping_dict is not a dictionary"
        self.mapping_dict = mapping_dict
        self.default_value = default_value

    def apply(self, resps, docs):
        def filter_set(inst):
            return [self.mapping_dict.get(resp, self.default_value) for resp in inst]

        return [filter_set(resp) for resp in resps]
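A hedged example of the new transformation filters in use; `resps` is a list of per-document response lists, and `docs` is unused by these particular filters:

```python
# Usage sketch for the LowercaseFilter / MapFilter defined above.
resps = [["Yes", "Maybe"], ["No"]]
print(LowercaseFilter().apply(resps, docs=None))  # [['yes', 'maybe'], ['no']]

mapper = MapFilter({"Yes": 1, "No": 0}, default_value=-1)
print(mapper.apply(resps, docs=None))  # [[1, -1], [0]]
```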
......@@ -6,3 +6,5 @@ logging.basicConfig(
level=logging.INFO,
)
eval_logger = logging.getLogger("lm-eval")
SPACING = " " * 47
......@@ -76,7 +76,7 @@ class AnthropicLM(LM):
max_tokens_to_sample: int = 256,
temperature: float = 0, # defaults to 1
**kwargs, # top_p, top_k, etc.
):
) -> None:
"""Anthropic API wrapper.
:param model: str
......@@ -135,11 +135,10 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
def tok_decode(self, tokens: List[int]) -> str:
return self.tokenizer.decode(tokens)
def _loglikelihood_tokens(self, requests, disable_tqdm=False):
def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.")
def greedy_until(self, requests) -> List[str]:
if not requests:
return []
......
......@@ -5,7 +5,7 @@ from lm_eval.api.registry import register_model
@register_model("dummy")
class DummyLM(LM):
def __init__(self):
def __init__(self) -> None:
super().__init__()
@classmethod
......
import os
import torch
import transformers
from transformers.models.auto.modeling_auto import (
......@@ -20,7 +22,7 @@ from lm_eval.api.registry import register_model
from lm_eval.utils import MultiTokenEOSCriteria, stop_sequences_criteria
from accelerate import Accelerator, find_executable_batch_size
from accelerate import Accelerator, find_executable_batch_size, DistributedType
from typing import List, Optional, Union
......@@ -67,6 +69,7 @@ class HFLM(LM):
revision: Optional[str] = "main",
subfolder: Optional[str] = None,
tokenizer: Optional[str] = None,
truncation: Optional[bool] = False,
max_length: Optional[int] = None,
device: Optional[str] = "cuda",
dtype: Optional[Union[str, torch.dtype]] = "auto",
......@@ -75,6 +78,7 @@ class HFLM(LM):
low_cpu_mem_usage: Optional[bool] = True,
trust_remote_code: Optional[bool] = False,
use_fast_tokenizer: Optional[bool] = True,
cache_dir: Optional[Union[str, os.PathLike]] = None,
# arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`.
parallelize: Optional[bool] = False,
......@@ -90,7 +94,7 @@ class HFLM(LM):
bnb_4bit_compute_dtype: Optional[Union[str, torch.dtype]] = None,
gptq: Optional[Union[bool, str]] = False,
gptq_use_triton: Optional[bool] = False,
):
) -> None:
super().__init__()
assert isinstance(device, str)
......@@ -103,17 +107,20 @@ class HFLM(LM):
if not (parallelize or accelerator.num_processes > 1):
# use user-passed device
device_list = set(
["cuda", "cpu", "mps"]
["cuda", "cpu"]
+ [f"cuda:{i}" for i in range(torch.cuda.device_count())]
+ ["mps", "mps:0"]
)
if device:
if device not in device_list:
device = int(device)
self._device = torch.device(device)
eval_logger.info(f"Using device '{device}'")
if device == "mps":
if device in ("mps", "mps:0") and "dev" not in torch.__version__:
eval_logger.info(
"MPS is still in beta and only supports float32; setting dtype to float32."
"MPS: Setting dtype to float32. To use float16 with MPS, please install a nightly build of "
"PyTorch: pip3 install --pre torch torchvision torchaudio --index-url "
"https://download.pytorch.org/whl/nightly/cpu"
)
else:
eval_logger.info("Device not specified")
......@@ -240,6 +247,8 @@ class HFLM(LM):
use_fast=use_fast_tokenizer,
)
self.truncation = truncation
self.vocab_size = self.tokenizer.vocab_size
self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
......@@ -289,9 +298,16 @@ class HFLM(LM):
"Failed to place model onto specified device. This may be because the model is quantized via `bitsandbytes`. If the desired GPU is being used, this message is safe to ignore."
)
else:
self._model = accelerator.prepare_model(
self.model, evaluation_mode=True
)
assert accelerator.distributed_type in [
DistributedType.FSDP,
DistributedType.MULTI_GPU,
], "Unsupported distributed type provided. Only DDP and FSDP are supported."
if accelerator.distributed_type == DistributedType.FSDP:
self._model = accelerator.prepare(self.model)
else:
self._model = accelerator.prepare_model(
self.model, evaluation_mode=True
)
self._device = torch.device(f"cuda:{accelerator.local_process_index}")
self.accelerator = accelerator
......@@ -334,7 +350,7 @@ class HFLM(LM):
return self._DEFAULT_MAX_LENGTH
@property
def max_gen_toks(self):
def max_gen_toks(self) -> int:
return 256
@property
......@@ -353,7 +369,7 @@ class HFLM(LM):
def world_size(self):
return self._world_size
def _detect_batch_size(self, requests=None, pos=0):
def _detect_batch_size(self, requests=None, pos: int = 0):
if requests:
_, context_enc, continuation_enc = requests[pos]
max_length = len(
......@@ -403,12 +419,13 @@ class HFLM(LM):
utils.clear_torch_cache()
return batch_size
def tok_encode(self, string: str, left_truncate_len=None):
def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None):
""" """
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
add_special_tokens = False
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
add_special_tokens = True
if add_special_tokens is None:
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
add_special_tokens = False
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
add_special_tokens = True
encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens)
......@@ -419,7 +436,11 @@ class HFLM(LM):
return encoding
def tok_batch_encode(
self, strings: List[str], padding_side="left", left_truncate_len=None
self,
strings: List[str],
padding_side: str = "left",
left_truncate_len: int = None,
truncation: bool = False,
):
# encode a batch of strings. converts to tensors and pads automatically, unlike tok_encode.
old_padding_side = self.tokenizer.padding_side
......@@ -432,6 +453,7 @@ class HFLM(LM):
encoding = self.tokenizer(
strings,
truncation=truncation,
padding="longest",
return_tensors="pt",
add_special_tokens=add_special_tokens,
......@@ -487,7 +509,7 @@ class HFLM(LM):
self.tokenizer, stop, 1, context.shape[0]
)
return self.model.generate(
context,
input_ids=context,
max_length=max_length,
stopping_criteria=stopping_criteria,
pad_token_id=self.eot_token_id,
......@@ -518,8 +540,12 @@ class HFLM(LM):
if n_spaces > 0:
continuation = context[-n_spaces:] + continuation
context = context[:-n_spaces]
whole_enc = self.tok_encode(context + continuation)
context_enc = self.tok_encode(context)
whole_enc = self.tok_encode(context + continuation, add_special_tokens=False)
context_enc = self.tok_encode(context, add_special_tokens=False)
# whole_enc = self.tok_encode(context + continuation)
# context_enc = self.tok_encode(context, add_special_tokens=False)
context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:]
return context_enc, continuation_enc
......@@ -595,7 +621,9 @@ class HFLM(LM):
return loglikelihoods
def _loglikelihood_tokens(self, requests, disable_tqdm=False, override_bs=None):
def _loglikelihood_tokens(
self, requests, disable_tqdm: bool = False, override_bs=None
):
# TODO: implement some kind of efficient-request-middleware that lumps together requests with the same context
res = []
......@@ -856,7 +884,9 @@ class HFLM(LM):
# encode, pad, and truncate contexts for this batch
context_enc, attn_masks = self.tok_batch_encode(
contexts, left_truncate_len=max_ctx_len
contexts,
left_truncate_len=max_ctx_len,
truncation=self.truncation,
)
context_enc = context_enc.to(self.device)
attn_masks = attn_masks.to(self.device)
......
......@@ -69,7 +69,7 @@ class OpenaiCompletionsLM(LM):
engine: str = "text-davinci-003",
truncate: bool = False,
batch_size: int = 1,
):
) -> None:
"""
:param engine: str
......@@ -99,12 +99,12 @@ class OpenaiCompletionsLM(LM):
return self.end_of_text_token_id
@property
def max_length(self):
def max_length(self) -> int:
# Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token
return 2048
@property
def max_gen_toks(self):
def max_gen_toks(self) -> int:
return 256
@property
......@@ -152,7 +152,7 @@ class OpenaiCompletionsLM(LM):
return self._loglikelihood_tokens(new_reqs)
def _loglikelihood_tokens(
self, requests, disable_tqdm=False
self, requests, disable_tqdm: bool = False
) -> List[Tuple[float, bool]]:
res = []
......
......@@ -41,7 +41,7 @@ def textsynth_completion(**kwargs):
@register_model("textsynth")
class TextSynthLM(LM):
def __init__(self, engine, truncate=False):
def __init__(self, engine, truncate: bool = False) -> None:
"""
:param engine: str
TextSynth API engine (e.g. `gptj_6B`)
......@@ -62,12 +62,12 @@ class TextSynthLM(LM):
raise NotImplementedError()
@property
def max_length(self):
def max_length(self) -> int:
# NOTE: Turn on truncation to avoid errors on long inputs.
return 2048
@property
def max_gen_toks(self):
def max_gen_toks(self) -> int:
return 256
@property
......
import os
import ast
from typing import Dict
from lm_eval import utils
from lm_eval.logger import eval_logger
......@@ -5,7 +9,7 @@ from lm_eval.logger import eval_logger
# Stores prompts in a dictionary indexed by 2 levels:
# prompt category name, and prompt name.
# This allows us to access prompts by both category and name.
PROMPT_REGISTRY = {
PROMPT_REGISTRY: Dict[str, Dict[str, str]] = {
"qa-basic": {
"question-newline-answer": "Question: {{question}}\nAnswer:",
"q-newline-a": "Q: {{question}}\nA:",
......@@ -13,7 +17,7 @@ PROMPT_REGISTRY = {
}
def get_prompt(prompt_id: str, dataset_name=None, subset_name=None):
def get_prompt(prompt_id: str, dataset_name: str = None, subset_name: str = None):
# unpack prompt name
category_name, prompt_name = prompt_id.split(":")
if subset_name is None:
......@@ -44,6 +48,14 @@ def get_prompt(prompt_id: str, dataset_name=None, subset_name=None):
raise ValueError(
f"{prompt_name} not in prompt list {prompts.all_template_names}"
)
elif ".yaml" in category_name:
import yaml
with open(category_name, "rb") as file:
prompt_yaml_file = yaml.full_load(file)
prompt_string = prompt_yaml_file["prompts"][prompt_name]
return PromptString(prompt_string)
else:
try:
return PROMPT_REGISTRY[category_name][prompt_name]
......@@ -54,15 +66,62 @@ def get_prompt(prompt_id: str, dataset_name=None, subset_name=None):
)
def load_prompt_list(use_prompt: str, dataset_name=None, subset_name=None, **kwargs):
def load_prompt_list(
use_prompt: str, dataset_name=None, subset_name=None, yaml_path=None, **kwargs
):
from promptsource.templates import DatasetTemplates
category_name, prompt_name = use_prompt.split(":")
if subset_name is None:
prompts = DatasetTemplates(dataset_name=dataset_name)
else:
prompts = DatasetTemplates(dataset_name=dataset_name, subset_name=subset_name)
if category_name == "promptsource":
from promptsource.templates import DatasetTemplates
category_name, prompt_name = use_prompt.split(":")
prompt_list = utils.pattern_match(prompt_name, prompts.all_template_names)
if subset_name is None:
prompts = DatasetTemplates(dataset_name=dataset_name)
else:
prompts = DatasetTemplates(
dataset_name=dataset_name, subset_name=subset_name
)
prompt_list = utils.pattern_match(prompt_name, prompts.all_template_names)
elif ".yaml" in category_name:
import yaml
if yaml_path is not None:
category_name = os.path.realpath(os.path.join(yaml_path, category_name))
with open(category_name, "rb") as file:
prompt_yaml_file = yaml.full_load(file)
prompt_list = utils.pattern_match(
prompt_name, prompt_yaml_file["prompts"].keys()
)
# category_name, *prompt_name = use_prompt.split(":")
# TODO allow to multiple prompt naming
# if len(prompt_name) > 1:
# prompt_list = []
# for prompt in prompt_name:
# prompt_list.append(utils.pattern_match(prompt_name, prompts.all_template_names))
# else:
# prompt_list = utils.pattern_match(prompt_name, prompts.all_template_names)
return [":".join([category_name, prompt]) for prompt in prompt_list]
class PromptString:
    def __init__(self, prompt_string):
        self.prompt_string = prompt_string

    def apply(self, doc):
        doc_to_text = self.prompt_string["doc_to_text"]
        doc_to_target = self.prompt_string["doc_to_target"]

        # TODO need a way to process doc_to_choice
        if "doc_to_choice" in self.prompt_string:
            raise NotImplementedError("Not yet implemented to accept doc_to_choice")

        text_string = utils.apply_template(doc_to_text, doc)
        target_string = utils.apply_template(doc_to_target, doc)

        return [text_string, target_string]
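For the `.yaml` prompt path added above, the file is expected to hold a top-level `prompts` mapping whose entries carry `doc_to_text` / `doc_to_target` templates. A minimal sketch of what `PromptString` does with one such entry; the templates and document are illustrative, and the exact rendered output depends on `utils.apply_template`:

```python
# Sketch: the dict below mirrors one entry under "prompts" in a prompt yaml file.
prompt_entry = {
    "doc_to_text": "Question: {{question}}\nAnswer:",
    "doc_to_target": "{{answer}}",
}
prompt = PromptString(prompt_entry)
doc = {"question": "2 + 2 = ?", "answer": "4"}
print(prompt.apply(doc))  # ["Question: 2 + 2 = ?\nAnswer:", "4"]
```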