Commit 6e3ef5ff authored by Benjamin Fattori's avatar Benjamin Fattori

Merge remote-tracking branch 'upstream/big-refactor' into refactor-more-tasks

parents 026d2c21 070b6b9c
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "best_option"
use_prompt: "promptsource:best_option"
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "cause_effect"
use_prompt: "promptsource:cause_effect"
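Each of these task configs pulls shared defaults from promptsource-00.yaml and overrides only the task name and prompt, as sketched below. This is a hedged sketch, not the harness's actual loader; the path resolution and shallow-merge behavior are assumptions.

import yaml

def load_task_config(path):
    with open(path) as f:
        cfg = yaml.safe_load(f)
    base_path = cfg.pop("include", None)
    if base_path:
        merged = load_task_config(base_path)  # assumed: resolved relative to cwd
        merged.update(cfg)                    # task-level keys override the base
        return merged
    return cfg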
 group:
 - super-glue-t5-prompt
-task: t5-prompt
+task: super_glue-copa-t5-prompt
 reference: "From Raffel et al., 2019"
 dataset_path: super_glue
 dataset_name: copa
 training_split: train
...
def convert_choice(choice):
    # Lowercase the first character so the choice reads as a mid-sentence clause
    return choice[0].lower() + choice[1:]


def doc_to_text(doc):
    connector = {
        "cause": "because",
        "effect": "therefore",
    }[doc["question"]]
    # Drop the premise's trailing period, then append the connective
    return doc["premise"].strip()[:-1] + f" {connector}"


def doc_to_target(doc):
    correct_choice = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
    # Connect the sentences with a leading space
    return " " + convert_choice(correct_choice)
 group:
 - super-glue-t5-prompt
-task: t5-prompt
+task: super_glue-record-t5-prompt
 reference: "From Raffel et al., 2019"
 dataset_path: super_glue
 dataset_name: record
 training_split: train
...
group:
- super-glue-lm-eval-v1
task: "wic"
dataset_path: super_glue
dataset_name: wic
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
gold_alias: "{{label}}" # this will be cast to an int.
template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
metric_list:
- metric: acc
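A hedged illustration of how the `template_aliases` and `gold_alias` fields above could resolve for a document with label == 1. This assumes plain Jinja2 rendering; the harness's actual alias resolution may differ.

from jinja2 import Environment

doc = {"label": 1}
template_aliases = "{% set answer_choices = ['no', 'yes'] %}"
rendered = Environment().from_string(template_aliases + "{{label}}").render(**doc)
gold = int(rendered)  # per the YAML comment, the alias is cast to an int
print(gold)  # 1 -> answer_choices[1] == "yes"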
group:
- super-glue-promptsource
task: "GPT-3-prompt"
dataset_path: super_glue
dataset_name: wic
training_split: train
validation_split: validation
use_prompt: "promptsource:GPT-3-prompt"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "GPT-3-prompt-with-label"
use_prompt: "promptsource:GPT-3-prompt-with-label"
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "affirmation_true_or_false"
use_prompt: "promptsource:affirmation_true_or_false"
def doc_to_text(doc):
    # The target word is recovered from sentence1 via its character span
    return (
        "Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the"
        " two sentences above?\nAnswer:".format(
            doc["sentence1"],
            doc["sentence2"],
            doc["sentence1"][doc["start1"] : doc["end1"]],
        )
    )


def doc_to_target(doc):
    return " {}".format({0: "no", 1: "yes"}[doc["label"]])
 group:
 - super-glue-t5-prompt
-task: t5-prompt
+task: super_glue-wsc-t5-prompt
 reference: "From Raffel et al., 2019"
 dataset_path: super_glue
 dataset_name: wsc
 training_split: train
...
@@ -10,7 +10,7 @@ import collections
 import importlib.util
 import fnmatch
-from typing import List, Union
+from typing import List, Literal, Union
 import gc
 import torch
@@ -23,15 +23,6 @@ from itertools import islice
 from lm_eval.logger import eval_logger
-class ExitCodeError(Exception):
-    pass
-
-
-def sh(x):
-    if os.system(x):
-        raise ExitCodeError()
 def escaped_split(text, sep_char, maxsplit=-1):
     """Split text into a list on occurrences of the given separation
     character `sep_char`. The separation character may be escaped by a
@@ -181,26 +172,6 @@ def make_disjoint_window(pair):
     return a[: len(a) - (len(b) - 1)], b
-def select_continuation_from_batch_left_padding(
-    generations: Union[List[List[int]], torch.Tensor], max_context_size: int
-):
-    """Select the continuation from the batch, removing prompts of different lengths.
-
-    Args:
-        generations (Union[List[List[int]], torch.Tensor]):
-            A tensor or list-of-lists of shape [batch_size, sequence length].
-        max_context_size (int):
-            The size of the biggest context; generations will proceed from that
-            index.
-
-    Example:
-        PAD PAD Continue : The dog chased the cat [every day of the week]
-        Riddle me this   : The dog chased the cat [yesterday] PAD PAD PAD PAD
-
-    Output:
-        [every day of the week]
-        [yesterday] PAD PAD PAD PAD
-    """
-    return generations[:, max_context_size:]
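For reference, the removed helper relied on left-padding to make every continuation start at the same column, so a single slice recovers all of them. A quick illustration with invented token ids:

import torch

# Two left-padded contexts of length 4 (0 = PAD), followed by continuations.
gens = torch.tensor([
    [0, 0, 7, 8, 21, 22, 23],
    [5, 6, 7, 8, 31, 32, 0],
])
print(gens[:, 4:])  # -> tensor([[21, 22, 23], [31, 32,  0]])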
 class Reorderer:
     def __init__(self, arr, fn):
         self.size = len(arr)
@@ -398,7 +369,8 @@ def get_git_commit_hash():
     try:
         git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
         git_hash = git_hash.decode()
-    except subprocess.CalledProcessError:
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        # FileNotFoundError occurs when git is not installed on the system
         git_hash = None
     return git_hash
@@ -481,7 +453,11 @@ def create_iterator(raw_iterator, rank, world_size, limit=None):
     return islice(raw_iterator, rank, limit, world_size)
-def pad_and_concat(max_length: int, tensors: List[torch.Tensor], padding_side="right"):
+def pad_and_concat(
+    max_length: int,
+    tensors: List[torch.Tensor],
+    padding_side: Literal["right", "left"] = "right",
+):
     """
     Method for padding a list of tensors given the maximum tensor
     length in the batch. Used for batching inputs and continuations in
...
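The body of pad_and_concat is truncated above. As a hedged sketch under assumed behavior (pad each 1-D tensor to max_length on the chosen side, then stack), the Literal-typed padding_side could work like this; pad_and_concat_sketch is a hypothetical stand-in, not the harness's implementation.

import torch
import torch.nn.functional as F
from typing import List, Literal

def pad_and_concat_sketch(
    max_length: int,
    tensors: List[torch.Tensor],
    padding_side: Literal["right", "left"] = "right",
) -> torch.Tensor:
    padded = []
    for t in tensors:
        pad_len = max_length - t.size(0)
        # F.pad takes (left, right) amounts for a 1-D tensor
        pad = (0, pad_len) if padding_side == "right" else (pad_len, 0)
        padded.append(F.pad(t, pad, value=0))
    return torch.stack(padded)

print(pad_and_concat_sketch(4, [torch.tensor([1, 2]), torch.tensor([3, 4, 5])], "left"))
# tensor([[0, 0, 1, 2],
#         [0, 3, 4, 5]])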
@@ -55,7 +55,7 @@ setuptools.setup(
         "promptsource": [
             "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
         ],
-        "auto-gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
+        "gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
         "anthropic": ["anthropic"],
     },
 )
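With the extra renamed from auto-gptq to gptq, the Triton build of AutoGPTQ is now pulled in via the gptq extra, e.g. pip install -e ".[gptq]" from a local checkout (the editable-install invocation is an assumption; any standard extras syntax applies).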
@@ -6,14 +6,18 @@ import lm_eval.models
 def test_description():
     seed = 42
     num_examples = 1
-    task_names = ["hellaswag", "winogrande"]
+    task_names = ["arc_challenge", "lambada"]
     description_dict = {
-        "hellaswag": "Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation.",
-        "winogrande": "Winograd schema sentence including either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
+        "arc_challenge": "Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation.",
+        "lambada": "Winograd schema sentence including either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
     }
     task_dict = lm_eval.tasks.get_task_dict(task_names)
     for task_name, task in task_dict.items():
+        # patch the description field on each task (TODO: clean this up)
+        task._config.description = description_dict[task_name]
         rnd = random.Random()
         rnd.seed(seed)
...
 import os
-import lm_eval.base as base
+# import lm_eval.base as base
+import lm_eval.api.registry as registry
 import lm_eval.tasks as tasks
-import lm_eval.models as models
+# import lm_eval.models as models
 import lm_eval.evaluator as evaluator
 import random
 import pytest
@@ -15,8 +19,10 @@ import pytest
 def test_evaluator(taskname, task_class):
     task_dict = tasks.get_task_dict([taskname])
-    os.system("rm test_cache.db")
-    lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")
+    # TODO: re-add CachingLM
+    # os.system("rm test_cache.db")
+    # lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")
+    lm = registry.get_model("dummy")()
     def ll_fn(reqs):
         for ctx, cont in reqs:
...
 import pytest
-import lm_eval.metrics as metrics
+import lm_eval.api.metrics as metrics
 import random
...
 import lm_eval.tasks as tasks
-import lm_eval.base as base
 import pytest
 from itertools import islice
@@ -100,5 +100,5 @@ def test_documents_and_requests(taskname, task_class):
         reqs = [reqs]
     # todo: mock lm after refactoring evaluator.py to not be a mess
-    for req in reqs:
-        assert isinstance(req, base.Request)
+    # for req in reqs:
+    #     assert isinstance(req, base.Request)