Commit 5d3bf2e7 authored by lintangsutawika


Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into openai_completions
parents f66730c4 bf26d979
"dataset_name": "tracking_shuffled_objects_five_objects"
"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
"doc_to_text": "Q: {{input}}\nA:"
"include": "_flan_zeroshot_template_yaml"
"task": "bbh_flan_zeroshot_tracking_shuffled_objects_five_objects"
"include": "_zeroshot_template_yaml"
"task": "bbh_zeroshot_tracking_shuffled_objects_five_objects"
"dataset_name": "tracking_shuffled_objects_seven_objects"
"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
"doc_to_text": "Q: {{input}}\nA:"
"include": "_flan_zeroshot_template_yaml"
"task": "bbh_flan_zeroshot_tracking_shuffled_objects_seven_objects"
"include": "_zeroshot_template_yaml"
"task": "bbh_zeroshot_tracking_shuffled_objects_seven_objects"
"dataset_name": "tracking_shuffled_objects_three_objects"
"description": "A task requiring determining the final positions of a set of objects given their initial positions and a description of a sequence of swaps.\n\n"
"doc_to_text": "Q: {{input}}\nA:"
"include": "_flan_zeroshot_template_yaml"
"task": "bbh_flan_zeroshot_tracking_shuffled_objects_three_objects"
"include": "_zeroshot_template_yaml"
"task": "bbh_zeroshot_tracking_shuffled_objects_three_objects"
"dataset_name": "web_of_lies"
"description": "Evaluate a random boolean function expressed as a word problem.\n\n"
"doc_to_text": "Q: {{input}}\nA:"
"include": "_flan_zeroshot_template_yaml"
"task": "bbh_flan_zeroshot_web_of_lies"
"include": "_zeroshot_template_yaml"
"task": "bbh_zeroshot_web_of_lies"
"dataset_name": "word_sorting"
"description": "Sort a list of words.\n\n"
"doc_to_text": "Q: {{input}}\nA:"
"include": "_flan_zeroshot_template_yaml"
"task": "bbh_flan_zeroshot_word_sorting"
"include": "_zeroshot_template_yaml"
"task": "bbh_zeroshot_word_sorting"
# Social IQA
### Paper
Title: Social IQA: Commonsense Reasoning about Social Interactions
Abstract: https://arxiv.org/abs/1904.09728
> We introduce Social IQa, the first large-scale benchmark for commonsense reasoning about social situations. Social IQa contains 38,000 multiple choice questions for probing emotional and social intelligence in a variety of everyday situations (e.g., Q: "Jordan wanted to tell Tracy a secret, so Jordan leaned towards Tracy. Why did Jordan do this?" A: "Make sure no one else could hear"). Through crowdsourcing, we collect commonsense questions along with correct and incorrect answers about social interactions, using a new framework that mitigates stylistic artifacts in incorrect answers by asking workers to provide the right answer to a different but related question. Empirical results show that our benchmark is challenging for existing question-answering models based on pretrained language models, compared to human performance (>20% gap). Notably, we further establish Social IQa as a resource for transfer learning of commonsense knowledge, achieving state-of-the-art performance on multiple commonsense reasoning tasks (Winograd Schemas, COPA).
Homepage: https://allenai.org/data/socialiqa
### Citation
```
@inproceedings{sap2019social,
  title={Social IQa: Commonsense Reasoning about Social Interactions},
  author={Sap, Maarten and Rashkin, Hannah and Chen, Derek and Le Bras, Ronan and Choi, Yejin},
  booktitle={Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
  pages={4463--4473},
  year={2019}
}
```
### Checklist
For adding novel benchmarks/datasets to the library:
* [X] Is the task an existing benchmark in the literature?
* [X] Have you referenced the original paper that introduced the task?
* [X] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? The original paper doesn't have an associated implementation, but there is an official entry in [BigBench](https://github.com/google/BIG-bench/tree/main/bigbench/benchmark_tasks/social_iqa). I use the same prompting format as BigBench.
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: social_iqa
dataset_path: social_i_qa
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: "Q: {{context}} {{question}}\nA:"
target_delimiter: " "
doc_to_choice: ["{{answerA}}", "{{answerB}}", "{{answerC}}"]
doc_to_target: "{{label}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
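For the `multiple_choice` output type, `doc_to_choice` produces the candidate continuations and `doc_to_target` (here `"{{label}}"`, which is a 1-indexed string in social_i_qa) identifies the gold one; accuracy is then averaged over documents. A sketch with a hypothetical row:

```python
# Sketch of how the fields above combine for one (hypothetical) social_i_qa row.
from jinja2 import Template

doc = {
    "context": "Jordan wanted to tell Tracy a secret, so Jordan leaned towards Tracy.",
    "question": "Why did Jordan do this?",
    "answerA": "Make sure no one else could hear",
    "answerB": "Speak louder",
    "answerC": "Walk away",
    "label": "1",
}
prompt = Template("Q: {{context}} {{question}}\nA:").render(**doc)
choices = [Template(t).render(**doc) for t in ("{{answerA}}", "{{answerB}}", "{{answerC}}")]
gold = choices[int(doc["label"]) - 1]  # assumes social_i_qa's 1-indexed string labels
```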
@@ -10,7 +10,7 @@ import collections
import importlib.util
import fnmatch
-from typing import Iterator, List, Literal, Union
+from typing import Iterator, List, Literal, Union, Any, Callable
import gc
import torch
@@ -59,6 +59,11 @@ def handle_arg_string(arg):
        return True
    elif arg.lower() == "false":
        return False
    elif arg.isnumeric():
        return int(arg)
    try:
        return float(arg)
    except ValueError:
        return arg
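    # Illustrative behavior of this coercion chain (the opening
    # `if arg.lower() == "true":` branch sits just above this hunk):
    #     handle_arg_string("True")   -> True
    #     handle_arg_string("42")     -> 42       (str.isnumeric -> int)
    #     handle_arg_string("3.14")   -> 3.14     (float() succeeds)
    #     handle_arg_string("-1")     -> -1.0     ("-1".isnumeric() is False)
    #     handle_arg_string("gpt2")   -> "gpt2"   (falls through unchanged)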
@@ -84,6 +89,32 @@ def join_iters(iters):
def chunks(iter, n: int = 0, fn=None):
"""
Divides an iterable into chunks of specified size or based on a given function.
Useful for batching
Parameters:
- iter: The input iterable to be divided into chunks.
- n: An integer representing the size of each chunk. Default is 0.
- fn: A function that takes the current index and the iterable as arguments and returns the size of the chunk. Default is None.
Returns:
An iterator that yields chunks of the input iterable.
Example usage:
```
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for chunk in chunks(data, 3):
print(chunk)
```
Output:
```
[1, 2, 3]
[4, 5, 6]
[7, 8, 9]
[10]
```
"""
arr = []
for i, x in enumerate(iter):
arr.append(x)
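        # The hunk is truncated here; a plausible completion consistent with
        # the docstring above (yield once the chunk reaches n, or the size
        # returned by fn(i, iter), then flush the remainder):
        #     if len(arr) == (fn(i, iter) if fn else n):
        #         yield arr
        #         arr = []
        # if arr:
        #     yield arr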
@@ -194,7 +225,13 @@ def make_disjoint_window(pair):
class Reorderer:
-    def __init__(self, arr, fn) -> None:
+    def __init__(self, arr: List[Any], fn: Callable) -> None:
+        """Reorder an array according to some function
+
+        Args:
+            arr (List[Any]): The initial array
+            fn (Callable[[Any], Any]): A function to determine the priority of elements
+        """
        self.size = len(arr)
        arr = list(enumerate(arr))
        arr = group(arr, lambda x: fn(x[1]))
@@ -206,9 +243,22 @@ class Reorderer:
        self.arr = arr

    def get_reordered(self):
+        """Gets the reordered array
+
+        Returns:
+            List[Any]: The reordered array
+        """
        return [x[1] for x in self.arr]

    def get_original(self, newarr):
+        """Restores the original order of a new array based on the old array's order
+
+        Args:
+            newarr (List[Any]): The array to be restored
+
+        Returns:
+            List[Any]: The array restored to the original order
+        """
        res = [None] * self.size
        cov = [False] * self.size
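        # Illustrative round trip (hypothetical values): process items in
        # fn-determined order, then map results back to the caller's order:
        #     ro = Reorderer(["bb", "a", "ccc"], len)
        #     reordered = ro.get_reordered()      # e.g. ["a", "bb", "ccc"]
        #     processed = [s.upper() for s in reordered]
        #     ro.get_original(processed)          # ["BB", "A", "CCC"]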
@@ -435,7 +485,6 @@ yaml.add_constructor("!function", import_function)
def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None):
    if yaml_config is None:
        with open(yaml_path, "rb") as file:
            yaml_config = yaml.full_load(file)
@@ -456,7 +505,6 @@ def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None):
        include_path.reverse()
        final_yaml_config = {}
        for path in include_path:
            # Assumes that path is a full path.
            # If not found, assume the included yaml
            # is in the same dir as the original yaml
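            # Net effect (assumed semantics, based on the reversed iteration
            # order above): included files are merged lowest-priority-first
            # into final_yaml_config, so when the including file's own keys
            # are applied last, they win on any conflict.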
@@ -579,7 +627,14 @@ class MultiTokenEOSCriteria(transformers.StoppingCriteria):
        self.done_tracker = [False] * batch_size
        self.sequence = sequence
        self.sequence_ids = tokenizer.encode(sequence, add_special_tokens=False)
-        self.sequence_id_len = len(self.sequence_ids)
+        # we look back for 2 more tokens than it takes to encode our stop sequence
+        # because tokenizers suck, and a model might generate `['\n', '\n']` but our `sequence` is `['\n\n']`
+        # and we don't want to mistakenly not stop a generation because our
+        # (string) stop sequence was output in a different tokenization
+        # NOTE: there is a minor danger that this will end up looking back 2 tokens into the past, into the inputs to the model,
+        # and stopping generation immediately as a result. With only 2 extra tokens of lookback, this risk is minimized
+        self.sequence_id_len = len(self.sequence_ids) + 2
        self.tokenizer = tokenizer

    def __call__(self, input_ids, scores, **kwargs) -> bool:
@@ -589,7 +644,6 @@ class MultiTokenEOSCriteria(transformers.StoppingCriteria):
        ]
        lookback_tokens_batch = self.tokenizer.batch_decode(lookback_ids_batch)
        for i, done in enumerate(self.done_tracker):
            if not done:
                self.done_tracker[i] = self.sequence in lookback_tokens_batch[i]
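        # Why the +2 lookback matters, sketched with assumed GPT-2 token ids:
        #     tok = AutoTokenizer.from_pretrained("gpt2")
        #     tok.encode("\n\n", add_special_tokens=False)  # [628] -- one token
        #     generated = [198, 198]                        # model emitted "\n" twice
        #     "\n\n" in tok.decode(generated)               # True -- caught only because
        #                                                   # the decoded window is wider
        #                                                   # than len([628])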
@@ -71,6 +71,7 @@ promptsource = [
gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"]
anthropic = ["anthropic"]
openai = ["openai", "tiktoken"]
+vllm = ["vllm"]
all = [
    "lm_eval[dev]",
    "lm_eval[testing]",
@@ -80,5 +81,6 @@ all = [
"lm_eval[promptsource]",
"lm_eval[gptq]",
"lm_eval[anthropic]",
"lm_eval[openai]"
"lm_eval[openai]",
"lm_eval[vllm]",
]
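With these extras defined, the vLLM backend's dependencies can be pulled in as an optional install, e.g. `pip install "lm_eval[vllm]"`, or transitively via `lm_eval[all]`.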
import unittest
from unittest.mock import patch
import hashlib
import json
import os
import pickle
from lm_eval.models.gguf import GGUFLM
from lm_eval.api.instance import Instance
base_url = "https://matthoffner-ggml-llm-api.hf.space"
def gguf_completion_mock(base_url=None, **kwargs):
    # Generate a hash from the parameters
    hash_kwargs = {"base_url": base_url, **kwargs}
    hash = hashlib.sha256(
        json.dumps(hash_kwargs, sort_keys=True).encode("utf-8")
    ).hexdigest()

    fname = f"./tests/testdata/gguf_test_{hash}.pkl"

    if os.path.exists(fname):
        with open(fname, "rb") as fh:
            return pickle.load(fh)
    else:
        print("The file does not exist, attempting to write...")
        if "stop" in kwargs:
            result = {
                "choices": [
                    {
                        "text": f"generated text until {kwargs['stop']}",
                        "logprobs": {"token_logprobs": [-1.2345], "text_offset": 0},
                        "finish_reason": "length",
                    }
                ]
            }
        else:
            # generated with # curl -X 'POST' 'http://localhost:8000/v1/completions' -H 'accept: application/json' -H 'Content-Type: application/json' -d '{"prompt": "string", "logprobs": 10, "temperature": 0.0, "max_tokens": 1, "echo": true}'
            result = {
                "id": "cmpl-4023976b-bc6a-43b0-a5a9-629f4216c7f3",
                "object": "text_completion",
                "created": 1700511361,
                "model": "../llama-2-7b.Q8_0.gguf",
                "choices": [
                    {
                        "text": "string(",
                        "index": 0,
                        "logprobs": {
                            "text_offset": [0, 7],
                            "token_logprobs": [None, -1.033263319857306],
                            "tokens": [" string", "("],
                            "top_logprobs": [
                                None,
                                {
                                    "(": -1.033263319857306,
                                    "[]": -2.6530743779017394,
                                    ".": -3.0377145947291324,
                                    "\n": -3.0399156750513976,
                                    "_": -3.510376089937872,
                                    " =": -3.6957918347193663,
                                    ",": -3.9309459866358702,
                                    " of": -4.2834550083949035,
                                    '("': -4.322762841112799,
                                    "()": -4.426229113466925,
                                },
                            ],
                        },
                        "finish_reason": "length",
                    }
                ],
                "usage": {
                    "prompt_tokens": 2,
                    "completion_tokens": 1,
                    "total_tokens": 3,
                },
            }

        try:
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            print("Writing file at", fname)
            with open(fname, "wb") as fh:
                pickle.dump(result, fh)
            print("File written successfully")
        except Exception as e:
            print("File writing failed:", e)

        return result
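# In effect, gguf_completion_mock is a record/replay fixture: it keys each
# canned response on a sha256 hash of the call's kwargs, writes it to
# tests/testdata/gguf_test_<hash>.pkl on first use, and replays it afterwards.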
class GGUFLMTest(unittest.TestCase):
    @patch(
        "lm_eval.models.gguf.GGUFLM.gguf_completion", side_effect=gguf_completion_mock
    )
    def test_loglikelihood(self, gguf_completion_mock):
        lm = GGUFLM(base_url)

        # Test loglikelihood
        requests = [
            Instance(
                request_type="loglikelihood",
                doc=args,
                arguments=args,
                idx=i,
            )
            for i, args in enumerate([("str", "ing"), ("str", "ing")])
        ]
        res = lm.loglikelihood(requests)

        # Assert the loglikelihood response is correct
        expected_res = [(logprob, True) for logprob in [0, 0]]
        self.assertEqual(res, expected_res)

    @patch(
        "lm_eval.models.gguf.GGUFLM.gguf_completion", side_effect=gguf_completion_mock
    )
    def test_generate_until(self, gguf_completion_mock):
        lm = GGUFLM(base_url)

        # Test generate_until
        requests = [
            Instance(
                request_type="generate_until",
                doc={"input": doc},
                arguments=(doc, {"until": stop}),
                idx=i,
            )
            for i, (doc, stop) in enumerate([("input1", "stop1"), ("input2", "stop2")])
        ]
        res = lm.generate_until(requests)

        # Assert the generate_until response is correct
        expected_res = ["generated text until stop1", "generated text until stop2"]
        self.assertEqual(res, expected_res)

    # @patch('lm_eval.models.gguf.GGUFLM.gguf_completion', side_effect=gguf_completion_mock)
    # def test_loglikelihood_rolling(self, gguf_completion_mock):
    #     lm = GGUFLM(base_url)
    #     # Test loglikelihood_rolling
    #     requests = ["input1", "input2"]
    #     res = lm.loglikelihood_rolling(requests)
    #     # Assert the loglikelihood_rolling response is correct
    #     expected_res = [(-1.2345, True), (-1.2345, True)]
    #     self.assertEqual(res, expected_res)


if __name__ == "__main__":
    unittest.main()
import pytest
from typing import List
from lm_eval.api.instance import Instance
import lm_eval.tasks as tasks
import sys
import torch
@pytest.mark.skip(reason="requires CUDA")
class TestVLLM:
    vllm = pytest.importorskip("vllm")
    try:
        from lm_eval.models.vllm_causallms import VLLM

        LM = VLLM(pretrained="EleutherAI/pythia-70m")
    except ModuleNotFoundError:
        pass
    torch.use_deterministic_algorithms(True)
    tasks.initialize_tasks()
    multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: List[Instance] = multiple_choice_task.instances
    generate_until_task = tasks.TASK_REGISTRY.get("gsm8k")()  # type: ignore
    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
    generate_until: List[Instance] = generate_until_task.instances
    rolling_task = tasks.TASK_REGISTRY.get("wikitext")()  # type: ignore
    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
    ROLLING: List[Instance] = rolling_task.instances

    # TODO: make proper tests
    def test_loglikelihood(self) -> None:
        res = self.LM.loglikelihood(self.MULTIPLE_CH)
        assert len(res) == len(self.MULTIPLE_CH)
        for x in res:
            assert isinstance(x[0], float)

    def test_generate_until(self) -> None:
        res = self.LM.generate_until(self.generate_until)
        assert len(res) == len(self.generate_until)
        for x in res:
            assert isinstance(x, str)

    def test_loglikelihood_rolling(self) -> None:
        res = self.LM.loglikelihood_rolling(self.ROLLING)
        for x in res:
            assert isinstance(x, float)
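# Note: this suite is skipped by default (class-level pytest.mark.skip with
# reason="requires CUDA") and additionally requires the optional vllm
# dependency via pytest.importorskip("vllm").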