Commit 0d1ef037 authored by lintangsutawika

solved merge conflict

parents aa44be3f ada4a31d
......@@ -8,18 +8,18 @@ Arguments
directory and the unsorted buckets are removed after.
"""
import argparse
import glob
import logging
import os
import signal
import subprocess
from signal import SIGINT
from tqdm import tqdm
from tqdm_multiprocess.logger import setup_logger_tqdm
logger = logging.getLogger(__name__)
terminate = False
......@@ -31,7 +31,7 @@ def handler(signal_received, frame):
def sort_13_gram_buckets(working_directory):
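    # Collect all unsorted 13-gram bucket files in the working directory.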
    bucket_file_paths = glob.glob(os.path.join(working_directory, "*.bkt.txt"))
for bucket_file_path in tqdm(bucket_file_paths, dynamic_ncols=True):
sorted_file_path = bucket_file_path + ".sorted"
......@@ -49,7 +49,6 @@ parser = argparse.ArgumentParser(description="sort 13gram buckets")
parser.add_argument("-dir", "--working_directory", default="")
if __name__ == "__main__":
version = 1.00
print(f"Running version {version}")
......
import random
import transformers
from lm_eval import evaluator, tasks
from lm_eval.base import LM
......
from itertools import islice
from lm_eval import tasks
ct = 3
for (
......
import random
import torch
import torch.nn.functional as F
import transformers
random.seed(42)
......
......@@ -2,10 +2,11 @@
Usage:
python make_table_tasks.py --output <markdown_filename>
"""
import json
import logging
import os
from pytablewriter import LatexTableWriter, MarkdownTableWriter
logging.basicConfig(level=logging.INFO)
......
......@@ -4,9 +4,11 @@ Usage:
"""
import argparse
import logging
from pytablewriter import MarkdownTableWriter
from lm_eval import tasks
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
......
import argparse
import os
from typing import Dict, List, Tuple
import numpy as np
import pandas as pd
import scipy.stats
import torch
import lm_eval.evaluator
from lm_eval import tasks, utils
os.environ["TOKENIZERS_PARALLELISM"] = "false"
eval_logger = utils.eval_logger
......
......@@ -5,7 +5,7 @@ import subprocess
import time
from pathlib import Path
from lm_eval import utils
from lm_eval.api.registry import ALL_TASKS
......@@ -136,14 +136,16 @@ def main():
args = parse_args()
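    # Normalize comma-separated CLI arguments into Python lists.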
    args.branches = (
        args.branches.split(",") if isinstance(args.branches, str) else args.branches
    )
    args.models = (
        args.models.split(",") if isinstance(args.models, str) else args.models
    )
args.tasks = (
ALL_TASKS
if args.tasks == "all_tasks"
else utils.pattern_match(args.tasks.split(","), ALL_TASKS)
        if isinstance(args.tasks, str)
else args.tasks
)
......
import argparse
import json
import os
import random
import numpy as np
from lm_eval import tasks
from lm_eval.tasks import include_path, initialize_tasks
from lm_eval.utils import eval_logger, join_iters
EXAMPLE_DIVIDER = "!!@@##@@!! -- Example {i}\n"
......
import argparse
import json
import os
import re
from pathlib import Path
import pandas as pd
from zeno_client import ZenoClient, ZenoMetric
from lm_eval.utils import eval_logger
def parse_args():
parser = argparse.ArgumentParser(
description="Upload your data to the Zeno AI evaluation platform to visualize results. This requires a ZENO_API_KEY in your environment variables. The eleuther harness must be run with log_samples=True and an output_path set for data to be written to disk."
)
parser.add_argument(
"--data_path",
required=True,
help="Where to find the results of the benchmarks that have been run. Uses the name of each subfolder as the model name.",
)
parser.add_argument(
"--project_name",
required=True,
help="The name of the generated Zeno project.",
)
return parser.parse_args()
def main():
"""Upload the results of your benchmark tasks to the Zeno AI evaluation platform.
    This script expects your results to live in a data folder where subfolders contain results of individual models.
"""
args = parse_args()
client = ZenoClient(os.environ["ZENO_API_KEY"])
# Get all model subfolders from the parent data folder.
models = [
os.path.basename(os.path.normpath(f))
for f in os.scandir(Path(args.data_path))
if f.is_dir()
]
assert len(models) > 0, "No model directories found in the data_path."
tasks = set(tasks_for_model(models[0], args.data_path))
for model in models: # Make sure that all models have the same tasks.
old_tasks = tasks.copy()
task_count = len(tasks)
model_tasks = tasks_for_model(model, args.data_path)
        tasks = tasks.intersection(set(model_tasks))
if task_count != len(tasks):
eval_logger.warning(
f"All models must have the same tasks. {model} has tasks: {model_tasks} but have already recorded tasks: {old_tasks}. Taking intersection {tasks}"
)
assert (
len(tasks) > 0
), "Must provide at least one task in common amongst models to compare."
for task in tasks:
# Upload data for all models
for model_index, model in enumerate(models):
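            # Normalize model_args ("/" and "=" become "__") so the path matches the per-task sample files on disk.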
model_args = re.sub(
"/|=",
"__",
json.load(open(Path(args.data_path, model, "results.json")))["config"][
"model_args"
],
)
with open(
Path(args.data_path, model, f"{model_args}_{task}.jsonl"), "r"
) as file:
data = json.loads(file.read())
configs = json.load(open(Path(args.data_path, model, "results.json")))[
"configs"
]
config = configs[task]
if model_index == 0: # Only need to assemble data for the first model
metrics = []
for metric in config["metric_list"]:
metrics.append(
ZenoMetric(
name=metric["metric"],
type="mean",
columns=[metric["metric"]],
)
)
project = client.create_project(
name=args.project_name + (f"_{task}" if len(tasks) > 1 else ""),
view="text-classification",
metrics=metrics,
)
project.upload_dataset(
generate_dataset(data, config),
id_column="id",
data_column="data",
label_column="labels",
)
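            # Upload this model's outputs to the project as a separate Zeno system.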
project.upload_system(
generate_system_df(data, config),
name=model,
id_column="id",
output_column="output",
)
def tasks_for_model(model: str, data_path: str):
"""Get the tasks for a specific model.
Args:
model (str): The name of the model.
data_path (str): The path to the data.
Returns:
list: A list of tasks for the model.
"""
dir_path = Path(data_path, model)
    config = json.load(open(Path(dir_path, "results.json")))["configs"]
    return list(config.keys())
def generate_dataset(
data,
config,
):
"""Generate a Zeno dataset from evaluation data.
Args:
data: The data to generate a dataset for.
config: The configuration of the task.
Returns:
        pd.DataFrame: A dataframe ready to be uploaded to Zeno.
"""
ids = [x["doc_id"] for x in data]
labels = [x["target"] for x in data]
instance = [""] * len(ids)
if config["output_type"] == "loglikelihood":
instance = [x["arguments"][0][0] for x in data]
labels = [x["arguments"][0][1] for x in data]
elif config["output_type"] == "multiple_choice":
instance = [
x["arguments"][0][0]
+ "\n\n"
+ "\n".join([f"- {y[1]}" for y in x["arguments"]])
for x in data
]
elif config["output_type"] == "loglikelihood_rolling":
instance = [x["arguments"][0][0] for x in data]
elif config["output_type"] == "generate_until":
instance = [x["arguments"][0][0] for x in data]
return pd.DataFrame(
{
"id": ids,
"data": instance,
"input_len": [len(x) for x in instance],
"labels": labels,
"output_type": config["output_type"],
}
)
def generate_system_df(data, config):
"""Generate a dataframe for a specific system to be uploaded to Zeno.
Args:
data: The data to generate a dataframe from.
config: The configuration of the task.
Returns:
        pd.DataFrame: A dataframe ready to be uploaded to Zeno as a system.
"""
ids = [x["doc_id"] for x in data]
system_dict = {"id": ids}
system_dict["output"] = [""] * len(ids)
if config["output_type"] == "loglikelihood":
system_dict["output"] = [
"correct" if x["filtered_resps"][0][1] is True else "incorrect"
for x in data
]
elif config["output_type"] == "multiple_choice":
system_dict["output"] = [
", ".join([str(y[0]) for y in x["filtered_resps"]]) for x in data
]
system_dict["num_answers"] = [len(x["filtered_resps"]) for x in data]
elif config["output_type"] == "loglikelihood_rolling":
system_dict["output"] = [str(x["filtered_resps"][0]) for x in data]
elif config["output_type"] == "generate_until":
system_dict["output"] = [str(x["filtered_resps"][0]) for x in data]
system_dict["output_length"] = [len(str(x["filtered_resps"][0])) for x in data]
metrics = {}
for metric in config["metric_list"]:
if "aggregation" in metric and metric["aggregation"] == "mean":
metrics[metric["metric"]] = [x[metric["metric"]] for x in data]
system_dict.update(metrics)
system_df = pd.DataFrame(system_dict)
return system_df
if __name__ == "__main__":
main()
import setuptools
# This is to make sure that the package supports editable installs
setuptools.setup()
import hashlib
import json
import os
import pickle
import unittest
from unittest.mock import patch
from lm_eval.api.instance import Instance
from lm_eval.models.gguf import GGUFLM
base_url = "https://matthoffner-ggml-llm-api.hf.space"
......
from __future__ import annotations
import sys
from pathlib import Path
import numpy as np
import pytest
import torch
import lm_eval.tasks as tasks
from lm_eval.api.instance import Instance
from lm_eval.models.huggingface import HFLM
tasks.initialize_tasks()
......@@ -106,9 +109,10 @@ class Test_HFLM:
f.write("\n".join(str(x) for x in _res))
assert np.allclose(_res, _RES, atol=1e-2)
# check indices for Multiple Choice
        argmax_RES, argmax_res = (
            np.argmax(np.array(_RES).reshape(-1, 4), axis=1),
            np.argmax(np.array(_res).reshape(-1, 4), axis=1),
        )
assert (argmax_RES == argmax_res).all()
def test_generate_until(self) -> None:
......
import sys
from typing import List
import pytest
import torch
import lm_eval.tasks as tasks
from lm_eval.api.instance import Instance
@pytest.mark.skip(reason="requires CUDA")
class TEST_VLLM:
......
import os
import random
from typing import List
import pytest
# import lm_eval.base as base
import lm_eval.api as api
import lm_eval.api.registry as registry
import lm_eval.evaluator as evaluator
# import lm_eval.models as models
import lm_eval.tasks as tasks
tasks.initialize_tasks()
......
import re
from collections import defaultdict
from lm_eval.decontamination.janitor import (
Janitor,
form_ngrams,
    split_indices,
    word_ngrams,
word_ngrams_indices,
)
......@@ -81,7 +80,6 @@ def test_split_indices():
def test_word_ngrams_indices():
sequence = (
"Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
" more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
......@@ -119,9 +117,9 @@ def test_word_ngrams_indices():
# Assumptions from GPT3 Paper:
# the 200 characters to remove include punctuation and are actually a half-window
# All tests below initially test without any registered contaminants, expecting the same sequence back.
def test_janitor1():
    # First test using a 1-gram, expecting the first block before the filth to have some remaining
    # characters, but the second block to be removed completely.
......@@ -165,7 +163,6 @@ def test_janitor1():
def test_janitor2():
    # Second test using a 1-gram, expecting the first block before the filth to have some remaining
    # characters, and the second block is longer than 200 characters so should also have some remaining.
......@@ -214,7 +211,6 @@ def test_janitor2():
def test_janitor3():
# Same test as above but with a 6gram.
sequence = (
......@@ -262,7 +258,6 @@ def test_janitor3():
def test_janitor4():
# This test adds another block to that from the previous. The middle block should be entirely
# removed as the 200 characters are removed from each side.
......@@ -318,7 +313,6 @@ def test_janitor4():
def test_janitor5():
# Same as above but using multiple different filth 6grams.
sequence = (
......@@ -374,7 +368,6 @@ def test_janitor5():
def test_janitor6():
# Same as above but now we add 10 filths and expect the same result, the following test does 11.
sequence = (
......@@ -438,7 +431,6 @@ def test_janitor6():
def test_janitor7():
# Same as above but now we add 9 filths and expect the same result, the following test does 10.
sequence = (
......
import random
import pytest
import lm_eval.api.metrics as metrics
def test_bootstrapping():
......
from itertools import islice
import pytest
import lm_eval.tasks as tasks
from lm_eval.api.task import ConfigurableTask
from .utils import new_tasks
tasks.initialize_tasks()
# Default Task
TASKS = ["arc_easy"]
......@@ -26,7 +30,7 @@ def limit() -> int:
# Tests
@pytest.mark.parametrize("task_class", task_class())
@pytest.mark.parametrize("task_class", task_class(), ids=lambda x: f"{x.config.task}")
class TestNewTasks:
def test_download(self, task_class: ConfigurableTask):
task_class.download()
......
import pytest
from lm_eval.utils import Collator, get_rolling_token_windows, make_disjoint_window
# noinspection DuplicatedCode
......@@ -220,3 +222,76 @@ def test_make_disjoint_window():
)
assert make_disjoint_window(([1, 2, 3, 4, 5], [4, 5, 6])) == ([1, 2, 3], [4, 5, 6])
assert make_disjoint_window(([1, 2, 3, 4, 5], [6])) == ([1, 2, 3, 4, 5], [6])
class TestCollator:
def make_generate_sample(self, end=10):
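        # Build (string, gen_kwargs) request pairs; the first half share one gen_kwargs dict and the second half another, to exercise grouping.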
strings = ["x" * i for i in range(1, end + 1)]
gen_kwargs1, gen_kwargs2 = (
{"temperature": 0},
{"temperature": 0, "until": ["nn", "\n\n"]},
)
args = [
(string, gen_kwargs1 if i < len(strings) // 2 else gen_kwargs2)
for i, string in enumerate(strings)
]
return args
def make_loglikelihood_sample(self, end=11):
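        # Build ((context, continuation), token_list) samples with token lists of increasing length.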
samples = [
(("x", "x"), list(range(1, total_length + 1)))
for total_length in range(1, end + 1)
]
return samples
@pytest.mark.parametrize("batch_size, end", [(17, 30), (8, 61), (12, 48), (0, 9)])
def test_generations(self, batch_size, end):
_collate_gen = lambda x: (-len(x[0]), x[0]) # noqa: E731
generation_samples = self.make_generate_sample(int(end))
gens = Collator(generation_samples, _collate_gen, grouping=True)
chunks = gens.get_batched(n=int(batch_size), batch_fn=None)
output = []
        for batch in chunks:
            # check batching
            group_one = end // 2
            group_two = end - end // 2
            assert (
                len(batch) <= batch_size
                if batch_size != 0
                else len(batch) in [group_one, group_two]
            )
            # check that the re-ordering is working correctly
            assert all(
                len(batch[i][0]) <= len(batch[i - 1][0])
                for i in range(1, len(batch))
            )
            # check that grouping is correct
            assert all(x[1] == batch[0][1] for x in batch)
            for x in batch:
                output.append(x)
reordered_output = gens.get_original(output)
# check get original
assert reordered_output == generation_samples
@pytest.mark.parametrize("batch_size, end", [(17, 30), (8, 61), (12, 48), (0, 3)])
def test_loglikelihood(self, batch_size, end):
_collate_log = lambda x: (-len(x[1]), tuple(x[1])) # noqa: E731
loglikelihood_samples = self.make_loglikelihood_sample(int(end))
loglikelihoods = Collator(loglikelihood_samples, _collate_log, grouping=False)
chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
output = []
        for batch in chunks:
            # check batching
            assert len(batch) <= batch_size if batch_size != 0 else len(batch) == end
            # check reorder
            assert all(
                len(batch[i][1]) <= len(batch[i - 1][1])
                for i in range(1, len(batch))
            )
            for x in batch:
                output.append(x[1])
# check indices
reordered_output = loglikelihoods.get_original(output)
assert reordered_output == [x[1] for x in loglikelihood_samples]
import random
import lm_eval.tasks
import lm_eval.models
def test_description():
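    # Check that a per-task description injected into the task config appears in the generated few-shot context.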
seed = 42
num_examples = 1
task_names = ["arc_challenge", "arc_easy"]
description_dict = {
"arc_challenge": "Label for the relevant action:\nSentences describing context, with an incomplete sentence trailing answer that plausibly completes the situation.",
"lambada": "Winograd schema sentence including a either a ___ blank with a missing word, making the pronoun ambiguous, or the same with the word filled in.",
}
task_dict = lm_eval.tasks.get_task_dict(task_names)
for task_name, task in task_dict.items():
        # patch the description field in the task config (TODO: clean this up)
        task._config.description = description_dict.get(task_name, "")
rnd = random.Random()
rnd.seed(seed)
        if task.has_training_docs():
            docs = task.training_docs()
        elif task.has_validation_docs():
            docs = task.validation_docs()
        else:
            docs = task.test_docs()
description = (
description_dict[task_name]
if description_dict and task_name in description_dict
else ""
)
for _, doc in (
zip(range(num_examples), docs) if num_examples > 0 else enumerate(docs)
):
ctx = task.fewshot_context(
doc=doc,
num_fewshot=1,
)
assert description in ctx