Unverified commit cda25fef, authored by Lintang Sutawika, committed by GitHub

Merge branch 'main' into standardize_metrics

parents dfb41835 4d10ad56
-import os
-import yaml
import argparse
+import os
-from tqdm import tqdm
+import yaml
from promptsource.templates import DatasetTemplates
+from tqdm import tqdm
-from lm_eval import utils
# from lm_eval.api.registry import ALL_TASKS
from lm_eval.logger import eval_logger
# from lm_eval.tasks import include_task_folder
@@ -22,7 +21,6 @@ def parse_args():
if __name__ == "__main__":
args = parse_args()
with open(args.benchmark_path) as file:
...
-import glob
import argparse
+import glob
+import logging
import os
-import subprocess
import shutil
+import subprocess
from tqdm import tqdm
from tqdm_multiprocess import TqdmMultiProcessPool
-import logging
from tqdm_multiprocess.logger import setup_logger_tqdm
logger = logging.getLogger(__name__)
@@ -35,7 +35,7 @@ def compress_and_move(working_directory, output_directory, process_count):
tasks = []
bucket_file_paths = glob.glob(
-os.path.join(working_directory, "output", f"*.bkt.txt.sorted")
+os.path.join(working_directory, "output", "*.bkt.txt.sorted")
)
for bucket_file_path in bucket_file_paths:
task = (process_task, (working_directory, output_directory, bucket_file_path))
...
@@ -21,22 +21,22 @@ Arguments
"""
import argparse
+import glob
import json
-import pickle
+import logging
import os
+import pickle
+import signal
import sys
from pathlib import Path
-import glob
-import signal
from signal import SIGINT
from tqdm import tqdm
+from tqdm_multiprocess.logger import setup_logger_tqdm
+from lm_eval.decontamination.archiver import Reader, TextArchive
from lm_eval.decontamination.janitor import Janitor, word_ngrams
-from lm_eval.decontamination.archiver import TextArchive, Reader
-import logging
-from tqdm_multiprocess.logger import setup_logger_tqdm
logger = logging.getLogger(__name__)
@@ -89,7 +89,7 @@ class Buckets:
os.path.join(directory, f"ngrams_{i}.bkt.txt") for i in range(num_buckets)
]
self.buckets = list(map(TextArchive, self.bucket_files))
-self.checkpoint_file = os.path.join(directory, f"bucket_offsets.ckpt")
+self.checkpoint_file = os.path.join(directory, "bucket_offsets.ckpt")
if os.path.exists(self.checkpoint_file):
self.bucket_offsets = pickle.load(open(self.checkpoint_file, "rb"))
@@ -119,7 +119,6 @@ class Buckets:
def do_ngrams_in_buckets(n_value, working_directory, bucket_count):
pile_statistics = json.load(open("pile_statistics.json", "r"))
pile_document_count = pile_statistics["Document Count"]
start_offsets = pile_statistics["File Start Offsets"]
@@ -130,13 +129,13 @@ def do_ngrams_in_buckets(n_value, working_directory, bucket_count):
logger.info(f"Generating {n_value}-grams and bucketing.")
# Done file
-done_file = os.path.join(output_directory, f"ngram_buckets.done")
+done_file = os.path.join(output_directory, "ngram_buckets.done")
if os.path.exists(done_file):
logger.info("ngrams already generated and bucketed, skipping")
return
# Checkpoint
-checkpoint_file = os.path.join(working_directory, f"pile_offset.ckpt")
+checkpoint_file = os.path.join(working_directory, "pile_offset.ckpt")
if os.path.exists(checkpoint_file):
checkpoint_offset = pickle.load(open(checkpoint_file, "rb"))
iterate = True
...
-from lm_eval.decontamination.archiver import Reader
+import glob
-import os
import json
+import os
from functools import reduce
-import glob
import tqdm
from tqdm_multiprocess import TqdmMultiProcessPool
+from lm_eval.decontamination.archiver import Reader
def get_file_stats(file_path, tqdm_func, global_tqdm):
reader = Reader()
...
@@ -15,18 +15,18 @@ Arguments
import argparse
import glob
+import logging
import os
-from pathlib import Path
import re
import shutil
+from pathlib import Path
from tqdm import tqdm
from tqdm_multiprocess import TqdmMultiProcessPool
+from tqdm_multiprocess.logger import setup_logger_tqdm
-from scripts.clean_training_data.archiver import TextReader, TextArchive
+from scripts.clean_training_data.archiver import TextArchive, TextReader
-import logging
-from tqdm_multiprocess.logger import setup_logger_tqdm
logger = logging.getLogger(__name__)
@@ -35,7 +35,6 @@ logger = logging.getLogger(__name__)
def process_bucket(
bucket_file_path, processed_directory, move_dir, tqdm_func, global_tqdm
):
bucket_id = re.sub("\D", "", os.path.basename(bucket_file_path))  # noqa: W605
done_file = os.path.join(
processed_directory, f"ngram_bucket_processing_{bucket_id}.done"
@@ -96,7 +95,7 @@ def process_bucket(
def process_sorted_buckets(working_directory, move_dir, process_count):
-bucket_file_paths = glob.glob(os.path.join(working_directory, f"*.bkt.txt.sorted"))
+bucket_file_paths = glob.glob(os.path.join(working_directory, "*.bkt.txt.sorted"))
processed_directory = os.path.join(working_directory, "processed")
os.makedirs(processed_directory, exist_ok=True)
@@ -123,7 +122,6 @@ parser.add_argument("-move", "--move_dir", default="")
parser.add_argument("-procs", "--process_count", type=int, default=4)
if __name__ == "__main__":
logfile_path = "process13grams.log"
setup_logger_tqdm(logfile_path)
...
@@ -8,18 +8,18 @@ Arguments
directory and the unsorted buckets are removed after.
"""
-import glob
import argparse
+import glob
+import logging
import os
import signal
-from signal import SIGINT
import subprocess
+from signal import SIGINT
from tqdm import tqdm
-import logging
from tqdm_multiprocess.logger import setup_logger_tqdm
logger = logging.getLogger(__name__)
terminate = False
@@ -31,7 +31,7 @@ def handler(signal_received, frame):
def sort_13_gram_buckets(working_directory):
-bucket_file_paths = glob.glob(os.path.join(working_directory, f"*.bkt.txt"))
+bucket_file_paths = glob.glob(os.path.join(working_directory, "*.bkt.txt"))
for bucket_file_path in tqdm(bucket_file_paths, dynamic_ncols=True):
sorted_file_path = bucket_file_path + ".sorted"
@@ -49,7 +49,6 @@ parser = argparse.ArgumentParser(description="sort 13gram buckets")
parser.add_argument("-dir", "--working_directory", default="")
if __name__ == "__main__":
version = 1.00
print(f"Running version {version}")
...
import random
import transformers
-from lm_eval import tasks, evaluator
+from lm_eval import evaluator, tasks
from lm_eval.base import LM
...
-from lm_eval import tasks
from itertools import islice
+from lm_eval import tasks
ct = 3
for (
...
-import transformers
+import random
import torch
import torch.nn.functional as F
-import random
+import transformers
random.seed(42)
...
@@ -2,10 +2,11 @@
Usage:
python make_table_tasks.py --output <markdown_filename>
"""
+import json
import logging
-from pytablewriter import MarkdownTableWriter, LatexTableWriter
import os
-import json
+from pytablewriter import LatexTableWriter, MarkdownTableWriter
logging.basicConfig(level=logging.INFO)
...
@@ -4,9 +4,11 @@ Usage:
"""
import argparse
import logging
-from lm_eval import tasks
from pytablewriter import MarkdownTableWriter
+from lm_eval import tasks
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
...
import argparse
+import os
+from typing import Dict, List, Tuple
import numpy as np
-import lm_eval.evaluator
-from lm_eval import tasks
-from lm_eval import utils
-import scipy.stats
-from typing import Tuple, Dict, List
import pandas as pd
+import scipy.stats
import torch
-import os
+import lm_eval.evaluator
+from lm_eval import tasks, utils
os.environ["TOKENIZERS_PARALLELISM"] = "false"
eval_logger = utils.eval_logger
...
@@ -5,7 +5,7 @@ import subprocess
import time
from pathlib import Path
-from lm_eval import evaluator, utils
+from lm_eval import utils
from lm_eval.api.registry import ALL_TASKS
@@ -136,14 +136,16 @@ def main():
args = parse_args()
args.branches = (
-args.branches.split(",") if type(args.branches) == str else args.branches
+args.branches.split(",") if isinstance(args.branches, str) else args.branches
+)
+args.models = (
+args.models.split(",") if isinstance(args.models, str) else args.models
)
-args.models = args.models.split(",") if type(args.models) == str else args.models
args.tasks = (
ALL_TASKS
if args.tasks == "all_tasks"
else utils.pattern_match(args.tasks.split(","), ALL_TASKS)
-if type(args.tasks) == str
+if isinstance(args.tasks, str)
else args.tasks
)
...
import argparse
-import numpy as np
+import json
import os
import random
+import numpy as np
from lm_eval import tasks
-from lm_eval.utils import join_iters, eval_logger
+from lm_eval.tasks import include_path, initialize_tasks
-from lm_eval.tasks import initialize_tasks, include_path
+from lm_eval.utils import eval_logger, join_iters
EXAMPLE_DIVIDER = "!!@@##@@!! -- Example {i}\n"
...
import argparse
import json
import os
import re
from pathlib import Path
import pandas as pd
from zeno_client import ZenoClient, ZenoMetric
from lm_eval.utils import eval_logger
def parse_args():
parser = argparse.ArgumentParser(
description="Upload your data to the Zeno AI evaluation platform to visualize results. This requires a ZENO_API_KEY in your environment variables. The eleuther harness must be run with log_samples=True and an output_path set for data to be written to disk."
)
parser.add_argument(
"--data_path",
required=True,
help="Where to find the results of the benchmarks that have been run. Uses the name of each subfolder as the model name.",
)
parser.add_argument(
"--project_name",
required=True,
help="The name of the generated Zeno project.",
)
return parser.parse_args()
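# Example invocation (paths are illustrative, not taken from this commit):
#   python zeno_visualize.py --data_path <results_dir> --project_name <project_name>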
def main():
"""Upload the results of your benchmark tasks to the Zeno AI evaluation platform.
This script expects your results to live in a data folder whose subfolders contain the results of individual models.
"""
args = parse_args()
client = ZenoClient(os.environ["ZENO_API_KEY"])
# Get all model subfolders from the parent data folder.
models = [
os.path.basename(os.path.normpath(f))
for f in os.scandir(Path(args.data_path))
if f.is_dir()
]
assert len(models) > 0, "No model directories found in the data_path."
tasks = set(tasks_for_model(models[0], args.data_path))
for model in models: # Make sure that all models have the same tasks.
old_tasks = tasks.copy()
task_count = len(tasks)
model_tasks = tasks_for_model(model, args.data_path)
tasks = tasks.intersection(set(model_tasks))  # keep only tasks shared by every model
if task_count != len(tasks):
eval_logger.warning(
f"All models must have the same tasks. {model} has tasks: {model_tasks} but have already recorded tasks: {old_tasks}. Taking intersection {tasks}"
)
assert (
len(tasks) > 0
), "Must provide at least one task in common amongst models to compare."
for task in tasks:
# Upload data for all models
for model_index, model in enumerate(models):
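# results.json stores model_args verbatim; "/" and "=" are replaced with "__"
# to reconstruct the samples file name <model_args>_<task>.jsonl written by the harness.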
model_args = re.sub(
"/|=",
"__",
json.load(open(Path(args.data_path, model, "results.json")))["config"][
"model_args"
],
)
with open(
Path(args.data_path, model, f"{model_args}_{task}.jsonl"), "r"
) as file:
data = json.loads(file.read())
configs = json.load(open(Path(args.data_path, model, "results.json")))[
"configs"
]
config = configs[task]
if model_index == 0: # Only need to assemble data for the first model
metrics = []
for metric in config["metric_list"]:
metrics.append(
ZenoMetric(
name=metric["metric"],
type="mean",
columns=[metric["metric"]],
)
)
project = client.create_project(
name=args.project_name + (f"_{task}" if len(tasks) > 1 else ""),
view="text-classification",
metrics=metrics,
)
project.upload_dataset(
generate_dataset(data, config),
id_column="id",
data_column="data",
label_column="labels",
)
project.upload_system(
generate_system_df(data, config),
name=model,
id_column="id",
output_column="output",
)
def tasks_for_model(model: str, data_path: str):
"""Get the tasks for a specific model.
Args:
model (str): The name of the model.
data_path (str): The path to the data.
Returns:
list: A list of tasks for the model.
"""
dir_path = Path(data_path, model)
config = (json.load(open(Path(dir_path, "results.json")))["configs"],)
return list(config[0].keys())
def generate_dataset(
data,
config,
):
"""Generate a Zeno dataset from evaluation data.
Args:
data: The data to generate a dataset for.
config: The configuration of the task.
Returns:
pd.Dataframe: A dataframe that is ready to be uploaded to Zeno.
"""
ids = [x["doc_id"] for x in data]
labels = [x["target"] for x in data]
instance = [""] * len(ids)
if config["output_type"] == "loglikelihood":
instance = [x["arguments"][0][0] for x in data]
labels = [x["arguments"][0][1] for x in data]
elif config["output_type"] == "multiple_choice":
instance = [
x["arguments"][0][0]
+ "\n\n"
+ "\n".join([f"- {y[1]}" for y in x["arguments"]])
for x in data
]
elif config["output_type"] == "loglikelihood_rolling":
instance = [x["arguments"][0][0] for x in data]
elif config["output_type"] == "generate_until":
instance = [x["arguments"][0][0] for x in data]
return pd.DataFrame(
{
"id": ids,
"data": instance,
"input_len": [len(x) for x in instance],
"labels": labels,
"output_type": config["output_type"],
}
)
def generate_system_df(data, config):
"""Generate a dataframe for a specific system to be uploaded to Zeno.
Args:
data: The data to generate a dataframe from.
config: The configuration of the task.
Returns:
pd.Dataframe: A dataframe that is ready to be uploaded to Zeno as a system.
"""
ids = [x["doc_id"] for x in data]
system_dict = {"id": ids}
system_dict["output"] = [""] * len(ids)
if config["output_type"] == "loglikelihood":
system_dict["output"] = [
"correct" if x["filtered_resps"][0][1] is True else "incorrect"
for x in data
]
elif config["output_type"] == "multiple_choice":
system_dict["output"] = [
", ".join([str(y[0]) for y in x["filtered_resps"]]) for x in data
]
system_dict["num_answers"] = [len(x["filtered_resps"]) for x in data]
elif config["output_type"] == "loglikelihood_rolling":
system_dict["output"] = [str(x["filtered_resps"][0]) for x in data]
elif config["output_type"] == "generate_until":
system_dict["output"] = [str(x["filtered_resps"][0]) for x in data]
system_dict["output_length"] = [len(str(x["filtered_resps"][0])) for x in data]
metrics = {}
for metric in config["metric_list"]:
if "aggregation" in metric and metric["aggregation"] == "mean":
metrics[metric["metric"]] = [x[metric["metric"]] for x in data]
system_dict.update(metrics)
system_df = pd.DataFrame(system_dict)
return system_df
if __name__ == "__main__":
main()
import setuptools
# This is to make sure that the package supports editable installs
setuptools.setup()
-import unittest
-from unittest.mock import patch
import hashlib
import json
import os
import pickle
-from lm_eval.models.gguf import GGUFLM
+import unittest
+from unittest.mock import patch
from lm_eval.api.instance import Instance
+from lm_eval.models.gguf import GGUFLM
base_url = "https://matthoffner-ggml-llm-api.hf.space"
...
from __future__ import annotations
-import pytest
+import sys
from pathlib import Path
import numpy as np
-from lm_eval.models.huggingface import HFLM
-from lm_eval.api.instance import Instance
-import lm_eval.tasks as tasks
-import sys
import torch
+import lm_eval.tasks as tasks
+from lm_eval.api.instance import Instance
+from lm_eval.models.huggingface import HFLM
tasks.initialize_tasks()
@@ -106,9 +109,10 @@ class Test_HFLM:
f.write("\n".join(str(x) for x in _res))
assert np.allclose(_res, _RES, atol=1e-2)
# check indices for Multiple Choice
-argmax_RES, argmax_res = np.argmax(
-np.array(_RES).reshape(-1, 4), axis=1
-), np.argmax(np.array(_res).reshape(-1, 4), axis=1)
+argmax_RES, argmax_res = (
+np.argmax(np.array(_RES).reshape(-1, 4), axis=1),
+np.argmax(np.array(_res).reshape(-1, 4), axis=1),
+)
assert (argmax_RES == argmax_res).all()
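# Note: the reshape(-1, 4) above assumes every multiple-choice doc in this test
# contributes exactly four loglikelihood scores.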
def test_generate_until(self) -> None:
...
-import pytest
from typing import List
-from lm_eval.api.instance import Instance
-import lm_eval.tasks as tasks
+import pytest
-import sys
import torch
+import lm_eval.tasks as tasks
+from lm_eval.api.instance import Instance
@pytest.mark.skip(reason="requires CUDA")
class TEST_VLLM:
...
-import os
# import lm_eval.base as base
-import lm_eval.api.registry as registry
+from typing import List
-import lm_eval.tasks as tasks
+import pytest
# import lm_eval.models as models
import lm_eval.api as api
import lm_eval.evaluator as evaluator
-from typing import List
+import lm_eval.tasks as tasks
-import random
-import pytest
tasks.initialize_tasks()
...