zeno_visualize.py 8.84 KB
Newer Older
1
2
import argparse
import json
Lintang Sutawika's avatar
Lintang Sutawika committed
3
import logging
4
5
6
7
8
9
10
import os
import re
from pathlib import Path

import pandas as pd
from zeno_client import ZenoClient, ZenoMetric

11
12
13
14
15
from lm_eval.utils import (
    get_latest_filename,
    get_results_filenames,
    get_sample_results_filenames,
)
16
17


Lintang Sutawika's avatar
Lintang Sutawika committed
18
19
20
# Module-level logger named after this module; used by main() to emit warnings.
eval_logger = logging.getLogger(__name__)


21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
def parse_args():
    """Parse the command-line options of the Zeno upload script.

    Both options are mandatory:

    * ``--data_path``: folder holding one subfolder of results per model.
    * ``--project_name``: name given to the generated Zeno project.

    Returns:
        argparse.Namespace: The parsed ``data_path`` and ``project_name``.
    """
    description = "Upload your data to the Zeno AI evaluation platform to visualize results. This requires a ZENO_API_KEY in your environment variables. The eleuther harness must be run with log_samples=True and an output_path set for data to be written to disk."
    cli = argparse.ArgumentParser(description=description)

    # (flag, help text) pairs; every option is required.
    required_options = (
        (
            "--data_path",
            "Where to find the results of the benchmarks that have been run. Uses the name of each subfolder as the model name.",
        ),
        (
            "--project_name",
            "The name of the generated Zeno project.",
        ),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, help=help_text)

    return cli.parse_args()


def main():
    """Upload the results of your benchmark tasks to the Zeno AI evaluation platform.

    This script expects your results to live in a data folder where subfolders
    contain results of individual models. For every task that all models have
    in common, one Zeno project is created (from the first model's samples) and
    each model's outputs are uploaded to it as a separate system.

    Raises:
        KeyError: If the ``ZENO_API_KEY`` environment variable is not set.
        AssertionError: If no model directories are found, or if the models
            share no task.
    """
    args = parse_args()

    client = ZenoClient(os.environ["ZENO_API_KEY"])

    # Get all model subfolders from the parent data folder.
    models = [
        os.path.basename(os.path.normpath(f))
        for f in os.scandir(Path(args.data_path))
        if f.is_dir()
    ]

    assert len(models) > 0, "No model directories found in the data_path."

    # Get the tasks from the latest results file of the first model.
    tasks = set(tasks_for_model(models[0], args.data_path))

    # Get tasks names from the latest results file for each remaining model
    # and narrow `tasks` down to the intersection across all models.
    for model in models[1:]:
        old_tasks = tasks.copy()
        task_count = len(tasks)
        model_tasks = set(tasks_for_model(model, args.data_path))
        # In-place update: set.intersection() alone returns a new set and
        # would leave `tasks` unchanged.
        tasks.intersection_update(model_tasks)

        if task_count != len(tasks):
            eval_logger.warning(
                f"All models must have the same tasks. {model} has tasks: {model_tasks} but have already recorded tasks: {old_tasks}. Taking intersection {tasks}"
            )

    assert len(tasks) > 0, (
        "Must provide at least one task in common amongst models to compare."
    )

    for task in tasks:
        # Upload data for all models
        for model_index, model in enumerate(models):
            # Get latest results and sample results for a model
            model_dir = Path(args.data_path, model)
            model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
            model_results_filenames = get_results_filenames(model_files)
            model_sample_filenames = get_sample_results_filenames(model_files)
            latest_results = get_latest_filename(
                [Path(f).name for f in model_results_filenames]
            )
            latest_sample_results = get_latest_filename(
                [Path(f).name for f in model_sample_filenames if task in f]
            )

            # Read the aggregated results file once and close it promptly.
            with open(
                Path(args.data_path, model, latest_results), encoding="utf-8"
            ) as results_file:
                results = json.load(results_file)

            # Replace characters that are problematic in names with "__".
            model_args = re.sub(
                r"[\"<>:/\|\\?\*\[\]]+",
                "__",
                results["config"]["model_args"],
            )
            print(model_args)

            # The sample file is JSON Lines: one JSON document per line.
            # Blank lines (e.g. a trailing newline) are skipped.
            with open(
                Path(args.data_path, model, latest_sample_results),
                "r",
                encoding="utf-8",
            ) as file:
                data = [json.loads(line) for line in file if line.strip()]

            config = results["configs"][task]

            if model_index == 0:  # Only need to assemble data for the first model
                # Only metrics aggregated with "mean" are registered in Zeno.
                metrics = [
                    ZenoMetric(
                        name=metric["metric"],
                        type="mean",
                        columns=[metric["metric"]],
                    )
                    for metric in config["metric_list"]
                    if metric.get("aggregation") == "mean"
                ]
                # One project per task; the task suffix is only added when
                # several tasks are uploaded.
                project = client.create_project(
                    name=args.project_name + (f"_{task}" if len(tasks) > 1 else ""),
                    view="text-classification",
                    metrics=metrics,
                )
                project.upload_dataset(
                    generate_dataset(data, config),
                    id_column="id",
                    data_column="data",
                    label_column="labels",
                )

            # `project` was created while handling the first model of this task.
            project.upload_system(
                generate_system_df(data, config),
                name=model,
                id_column="id",
                output_column="output",
            )


def tasks_for_model(model: str, data_path: str):
    """Get the tasks for a specific model.

    Reads the latest results file found directly inside ``data_path/model``
    and returns the keys of its ``"configs"`` mapping.

    Args:
        model (str): The name of the model (its subfolder name in ``data_path``).
        data_path (str): The path to the data.

    Returns:
        list: A list of tasks for the model.
    """
    # get latest model results for a given name
    model_dir = Path(data_path, model)
    model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
    model_results_filenames = get_results_filenames(model_files)
    latest_results = get_latest_filename(model_results_filenames)

    # Use a context manager so the results file is closed after parsing.
    with open(latest_results, encoding="utf-8") as results_file:
        configs = json.load(results_file)["configs"]

    return list(configs.keys())


def generate_dataset(
    data,
    config,
):
    """Generate a Zeno dataset from evaluation data.

    Args:
        data: The data to generate a dataset for. A list of per-sample dicts;
            each is read for ``doc_id``, ``target``, ``arguments`` (a mapping of
            ``gen_args_N`` to ``{"arg_0": ..., "arg_1": ...}``) and, when the
            task has a ``filter_list``, ``filter``.
        config: The configuration of the task. ``output_type`` selects how the
            displayed instance text is built; a truthy ``filter_list`` makes the
            ids ``"<doc_id>.<filter>"`` instead of the bare ``doc_id``.

    Returns:
        pd.Dataframe: A dataframe that is ready to be uploaded to Zeno, with
        columns ``id``, ``doc_id``, ``data``, ``input_len``, ``labels`` and
        ``output_type``.
    """
    ids = (
        [x["doc_id"] for x in data]
        if not config.get("filter_list")
        else [f"{x['doc_id']}.{x['filter']}" for x in data]
    )
    labels = [x["target"] for x in data]
    # Unknown output types keep an empty instance text.
    instance = [""] * len(ids)

    output_type = config["output_type"]
    if output_type == "loglikelihood":
        instance = [x["arguments"]["gen_args_0"]["arg_0"] for x in data]
        labels = [x["arguments"]["gen_args_0"]["arg_1"] for x in data]
    elif output_type == "multiple_choice":
        # Show the first request's arg_0 followed by a bullet list of every
        # request's arg_1. Iterate the *values* of the arguments mapping:
        # iterating the dict itself yields the "gen_args_N" keys, not the
        # choices.
        instance = [
            x["arguments"]["gen_args_0"]["arg_0"]
            + "\n\n"
            + "\n".join(
                f"- {request['arg_1']}" for request in x["arguments"].values()
            )
            for x in data
        ]
    elif output_type in ("loglikelihood_rolling", "generate_until"):
        instance = [x["arguments"]["gen_args_0"]["arg_0"] for x in data]

    return pd.DataFrame(
        {
            "id": ids,
            "doc_id": [x["doc_id"] for x in data],
            "data": instance,
            "input_len": [len(x) for x in instance],
            "labels": labels,
            # Scalar value: pandas repeats it for every row.
            "output_type": config["output_type"],
        }
    )


def generate_system_df(data, config):
    """Build the per-model ("system") dataframe that is uploaded to Zeno.

    Args:
        data: The per-sample records to turn into rows.
        config: The configuration of the task (``output_type``,
            ``metric_list`` and, optionally, ``filter_list`` are read).

    Returns:
        pd.Dataframe: One row per record with the columns ``id``, ``doc_id``,
        optionally ``filter``, ``output``, any output-type specific extras,
        and one column per configured metric.
    """
    uses_filters = config.get("filter_list")

    # With filters the id is "<doc_id>.<filter>"; otherwise the bare doc_id.
    if uses_filters:
        row_ids = [f"{row['doc_id']}.{row['filter']}" for row in data]
    else:
        row_ids = [row["doc_id"] for row in data]

    columns = {"id": row_ids, "doc_id": [row["doc_id"] for row in data]}
    if uses_filters:
        columns["filter"] = [row["filter"] for row in data]
    # Default output for output types that are not handled below.
    columns["output"] = [""] * len(row_ids)

    kind = config["output_type"]
    if kind == "loglikelihood":
        verdicts = []
        for row in data:
            is_correct = row["filtered_resps"][0][1] is True
            verdicts.append("correct" if is_correct else "incorrect")
        columns["output"] = verdicts
    elif kind == "multiple_choice":
        joined = []
        answer_counts = []
        for row in data:
            responses = row["filtered_resps"]
            joined.append(", ".join(str(resp[0]) for resp in responses))
            answer_counts.append(len(responses))
        columns["output"] = joined
        columns["num_answers"] = answer_counts
    elif kind in ("loglikelihood_rolling", "generate_until"):
        texts = [str(row["filtered_resps"][0]) for row in data]
        columns["output"] = texts
        if kind == "generate_until":
            columns["output_length"] = [len(text) for text in texts]

    # One column per metric, holding each record's value for that metric.
    for spec in config["metric_list"]:
        metric_name = spec["metric"]
        columns[metric_name] = [row[metric_name] for row in data]

    return pd.DataFrame(columns)


# Run the upload only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()