Commit 6e3ef5ff authored by Benjamin Fattori

Merge remote-tracking branch 'upstream/big-refactor' into refactor-more-tasks

parents 026d2c21 070b6b9c
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "best_option"
use_prompt: "promptsource:best_option"
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "cause_effect"
use_prompt: "promptsource:cause_effect"
group:
- super-glue-t5-prompt
task: t5-prompt
reference: "From Raffel et al. 2019"
task: super_glue-copa-t5-prompt
dataset_path: super_glue
dataset_name: copa
training_split: train
......
def convert_choice(choice):
    return choice[0].lower() + choice[1:]


def doc_to_text(doc):
    # Drop the period
    connector = {
        "cause": "because",
        "effect": "therefore",
    }[doc["question"]]
    return doc["premise"].strip()[:-1] + f" {connector}"


def doc_to_target(doc):
    correct_choice = doc["choice1"] if doc["label"] == 0 else doc["choice2"]
    # Connect the sentences
    return " " + convert_choice(correct_choice)
group:
- super-glue-t5-prompt
task: t5-prompt
reference: "From Raffel et al. 2019"
task: super_glue-record-t5-prompt
dataset_path: super_glue
dataset_name: record
training_split: train
......
group:
- super-glue-lm-eval-v1
task: "wic"
dataset_path: super_glue
dataset_name: wic
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
gold_alias: "{{label}}" # this will be cast to an int.
template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
metric_list:
- metric: acc
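A rough sketch of what the template_aliases and gold_alias fields express, rendered here with plain Jinja2 for illustration (the rendering call below is not the harness's actual API):

import jinja2

# "{% set answer_choices = ['no', 'yes'] %}" makes answer_choices available
# to templates; gold_alias "{{label}}" renders the raw label, cast to an int.
template = jinja2.Environment().from_string(
    "{% set answer_choices = ['no', 'yes'] %}{{ answer_choices[label] }}"
)
print(template.render(label=1))  # -> yes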
group:
- super-glue-promptsource
task: "GPT-3-prompt"
dataset_path: super_glue
dataset_name: wic
training_split: train
validation_split: validation
use_prompt: "promptsource:GPT-3-prompt"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "GPT-3-prompt-with-label"
use_prompt: "promptsource:GPT-3-prompt-with-label"
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "affirmation_true_or_false"
use_prompt: "promptsource:affirmation_true_or_false"
def doc_to_text(doc):
    return (
        "Sentence 1: {}\nSentence 2: {}\nQuestion: Is the word '{}' used in the same way in the"
        " two sentences above?\nAnswer:".format(
            doc["sentence1"],
            doc["sentence2"],
            doc["sentence1"][doc["start1"] : doc["end1"]],
        )
    )


def doc_to_target(doc):
    return " {}".format({0: "no", 1: "yes"}[doc["label"]])
group:
- super-glue-t5-prompt
task: t5-prompt
reference: "From Raffel et al. 2019"
task: super_glue-wsc-t5-prompt
dataset_path: super_glue
dataset_name: wsc
training_split: train
......
@@ -10,7 +10,7 @@ import collections
import importlib.util
import fnmatch
from typing import List, Union
from typing import List, Literal, Union
import gc
import torch
@@ -23,15 +23,6 @@ from itertools import islice
from lm_eval.logger import eval_logger
class ExitCodeError(Exception):
    pass


def sh(x):
    if os.system(x):
        raise ExitCodeError()


def escaped_split(text, sep_char, maxsplit=-1):
    """Split text into a list on occurrences of the given separation
    character `sep_char`. The separation character may be escaped by a
@@ -181,26 +172,6 @@ def make_disjoint_window(pair):
    return a[: len(a) - (len(b) - 1)], b


def select_continuation_from_batch_left_padding(
    generations: Union[List[List[int]], torch.Tensor], max_context_size: int
):
    """Select the continuation from the batch, removing prompts of different lengths.

    Args:
        generations (Union[List[List[int]], torch.Tensor]):
            A tensor or list-of-lists of shape [batch_size, sequence length].
        max_context_size (int):
            The size of the biggest context; generations will proceed from that
            index.

    Example:
        PAD PAD Continue : The dog chased the cat [every day of the week]
        Riddle me this   : The dog chased the cat [yesterday] PAD PAD PAD PAD

        Output:
        [every day of the week]
        [yesterday] PAD PAD PAD PAD
    """
    return generations[:, max_context_size:]
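The removed helper was a single slice; a minimal sketch of the behavior its docstring describes (tensor values are made up):

import torch

# Two sequences whose prompts were left-padded to max_context_size = 3.
generations = torch.tensor([
    [0, 0, 9, 11, 12, 13],  # PAD PAD prompt | continuation
    [5, 6, 9, 21, 22, 0],   # full prompt    | continuation PAD
])
print(generations[:, 3:])  # the slice the helper performed
# tensor([[11, 12, 13],
#         [21, 22,  0]])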
class Reorderer:
    def __init__(self, arr, fn):
        self.size = len(arr)
@@ -398,7 +369,8 @@ def get_git_commit_hash():
    try:
        git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
        git_hash = git_hash.decode()
    except (subprocess.CalledProcessError, FileNotFoundError):
        # FileNotFoundError occurs when git is not installed on the system.
        git_hash = None
    return git_hash
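Note that `except A or B:` evaluates `A or B` as an expression first, yielding just `A`, so the second exception type would never be caught; the tuple form above matches both. A minimal demonstration:

try:
    raise KeyError("missing")
except (ValueError, KeyError) as e:  # a tuple matches either exception type
    print(type(e).__name__)  # -> KeyError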
@@ -481,7 +453,11 @@ def create_iterator(raw_iterator, rank, world_size, limit=None):
    return islice(raw_iterator, rank, limit, world_size)


def pad_and_concat(
    max_length: int,
    tensors: List[torch.Tensor],
    padding_side: Literal["right", "left"] = "right",
):
    """
    Method for padding a list of tensors given the maximum tensor
    length in the batch. Used for batching inputs and continuations in
......
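The docstring is truncated here, but the intended behavior, padding each tensor to max_length on the chosen side before concatenation, can be sketched roughly as follows (an illustration of the behavior, not the function's actual body):

import torch
import torch.nn.functional as F

tensors = [torch.tensor([1, 2, 3]), torch.tensor([4, 5])]
max_length = 4
# padding_side="right": pad on the right; "left" would flip the pad widths.
padded = torch.stack([F.pad(t, (0, max_length - t.size(0))) for t in tensors])
print(padded)
# tensor([[1, 2, 3, 0],
#         [4, 5, 0, 0]])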
@@ -55,7 +55,7 @@ setuptools.setup(
"promptsource": [
"promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
],
"auto-gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
"gptq": ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"],
"anthropic": ["anthropic"],
},
)
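With this rename, the GPTQ extra would now be requested as pip install lm_eval[gptq] rather than lm_eval[auto-gptq] (assuming the distribution name declared in this setup.py).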
@@ -6,14 +6,18 @@ import lm_eval.models
def test_description():
    seed = 42
    num_examples = 1
    task_names = ["hellaswag", "winogrande"]
    task_names = ["arc_challenge", "lambada"]
    description_dict = {
        "hellaswag": "Label for the relevant action:\nSentences describing context, with an incomplete sentence and a trailing answer that plausibly completes the situation.",
        "winogrande": "Winograd schema sentence including either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
        "arc_challenge": "Label for the relevant action:\nSentences describing context, with an incomplete sentence and a trailing answer that plausibly completes the situation.",
        "lambada": "Winograd schema sentence including either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
    }
    task_dict = lm_eval.tasks.get_task_dict(task_names)
    for task_name, task in task_dict.items():
        # patch description field in task (TODO: make this cleaner)
        task._config.description = description_dict[task_name]
        rnd = random.Random()
        rnd.seed(seed)
......
import os
import lm_eval.base as base
# import lm_eval.base as base
import lm_eval.api.registry as registry
import lm_eval.tasks as tasks
import lm_eval.models as models
# import lm_eval.models as models
import lm_eval.evaluator as evaluator
import random
import pytest
@@ -15,8 +19,10 @@ import pytest
def test_evaluator(taskname, task_class):
    task_dict = tasks.get_task_dict([taskname])
    os.system("rm test_cache.db")
    lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")
    # TODO: re-add CachingLM
    # os.system("rm test_cache.db")
    # lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")
    lm = registry.get_model("dummy")()

    def ll_fn(reqs):
        for ctx, cont in reqs:
......
import pytest
import lm_eval.metrics as metrics
import lm_eval.api.metrics as metrics
import random
......
import lm_eval.tasks as tasks
import lm_eval.base as base
import pytest
from itertools import islice
@@ -100,5 +100,5 @@ def test_documents_and_requests(taskname, task_class):
        reqs = [reqs]

    # todo: mock lm after refactoring evaluator.py to not be a mess
    for req in reqs:
        assert isinstance(req, base.Request)
    # for req in reqs:
    #     assert isinstance(req, base.Request)