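"""Upload lm-eval results to the Zeno AI evaluation platform.

A sketch of a typical invocation (paths are hypothetical; ZENO_API_KEY must be
set in the environment, and the harness must have been run with
log_samples=True and an output_path so results exist on disk):

    python zeno_visualize.py --data_path output/ --project_name "My Eval"
"""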
import argparse
import json
import logging
import os
import re
from pathlib import Path
from typing import Union

import pandas as pd
from zeno_client import ZenoClient, ZenoMetric

from lm_eval.utils import (
    get_latest_filename,
    get_results_filenames,
    get_sample_results_filenames,
)


eval_logger = logging.getLogger(__name__)


def parse_args():
    parser = argparse.ArgumentParser(
        description="Upload your data to the Zeno AI evaluation platform to visualize results. This requires a ZENO_API_KEY in your environment variables. The eleuther harness must be run with log_samples=True and an output_path set for data to be written to disk."
    )
    parser.add_argument(
        "--data_path",
        required=True,
        help="Where to find the results of the benchmarks that have been run. Uses the name of each subfolder as the model name.",
    )
    parser.add_argument(
        "--project_name",
        required=True,
        help="The name of the generated Zeno project.",
    )
    return parser.parse_args()


def sanitize_string(model_args_raw: Union[str, dict]) -> str:
    """Sanitize the model_args string or dict"""
    # Convert to string if it's a dictionary
    model_args_str = (
        json.dumps(model_args_raw)
        if isinstance(model_args_raw, dict)
        else model_args_raw
    )
    # Apply the sanitization
    return re.sub(
        r"[\"<>:/|\\?*\[\]]+",
        "__",
        model_args_str,
    )


def main():
    """Upload the results of your benchmark tasks to the Zeno AI evaluation platform.

    This script expects your results to live in a data folder whose subfolders contain the results of individual models.
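
    An illustrative layout (file names are hypothetical; the harness writes
    timestamped results files and per-task sample files):

        data/
            model_a/
                results_<timestamp>.json
                samples_<task>_<timestamp>.jsonl
            model_b/
                ...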
    """
    args = parse_args()

    client = ZenoClient(os.environ["ZENO_API_KEY"])

    # Get all model subfolders from the parent data folder.
    models = [
        os.path.basename(os.path.normpath(f))
        for f in os.scandir(Path(args.data_path))
        if f.is_dir()
    ]

    assert len(models) > 0, "No model directories found in the data_path."

    # Get the tasks from the latest results file of the first model.
    tasks = set(tasks_for_model(models[0], args.data_path))

    # Get task names from the latest results file for each model,
    # then take the intersection of tasks across all models.
    for model in models:
        old_tasks = tasks.copy()
        task_count = len(tasks)
        model_tasks = set(tasks_for_model(model, args.data_path))
        # set.intersection returns a new set, so reassign rather than discard it.
        tasks = tasks.intersection(model_tasks)

        if task_count != len(tasks):
            eval_logger.warning(
                f"All models must have the same tasks. {model} has tasks: {model_tasks} but have already recorded tasks: {old_tasks}. Taking intersection {tasks}"
            )

    assert len(tasks) > 0, (
        "Must provide at least one task in common amongst models to compare."
    )

    for task in tasks:
        # Upload data for all models
        for model_index, model in enumerate(models):
            # Get latest results and sample results for a model
            model_dir = Path(args.data_path, model)
            model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
            model_results_filenames = get_results_filenames(model_files)
            model_sample_filenames = get_sample_results_filenames(model_files)
            latest_results = get_latest_filename(
                [Path(f).name for f in model_results_filenames]
            )
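            # Sample files are filtered to this task by name before taking the latest.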
            latest_sample_results = get_latest_filename(
                [Path(f).name for f in model_sample_filenames if task in f]
            )
            # Read the latest results file once; it contains both the run-level
            # config (with model_args) and the per-task configs.
            with open(
                Path(args.data_path, model, latest_results),
                encoding="utf-8",
            ) as f:
                results = json.load(f)

            # Load the model_args, which can be either a string or a dictionary
            model_args = sanitize_string(results["config"]["model_args"])
            eval_logger.info(model_args)

            data = []
            with open(
                Path(args.data_path, model, latest_sample_results),
                "r",
                encoding="utf-8",
            ) as file:
                for line in file:
                    data.append(json.loads(line.strip()))

            config = results["configs"][task]

            if model_index == 0:  # Only need to assemble data for the first model
                metrics = []
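                # Only mean-aggregated metrics are mapped to Zeno metrics here;
                # other aggregations are skipped.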
                for metric in config["metric_list"]:
                    if metric.get("aggregation") == "mean":
                        metrics.append(
                            ZenoMetric(
                                name=metric["metric"],
                                type="mean",
                                columns=[metric["metric"]],
                            )
                        )
                project = client.create_project(
                    name=args.project_name + (f"_{task}" if len(tasks) > 1 else ""),
                    view="text-classification",
                    metrics=metrics,
                )
                project.upload_dataset(
                    generate_dataset(data, config),
                    id_column="id",
                    data_column="data",
                    label_column="labels",
                )

            project.upload_system(
                generate_system_df(data, config),
                name=model,
                id_column="id",
                output_column="output",
            )


def tasks_for_model(model: str, data_path: str):
    """Get the tasks for a specific model.

    Args:
        model (str): The name of the model.
        data_path (str): The path to the data.

    Returns:
        list: A list of tasks for the model.
    """
    # Get the latest results file for the given model
    model_dir = Path(data_path, model)
    model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
    model_results_filenames = get_results_filenames(model_files)
    latest_results = get_latest_filename(model_results_filenames)
    with open(latest_results, encoding="utf-8") as f:
        configs = json.load(f)["configs"]
    return list(configs.keys())


def generate_dataset(
    data,
    config,
):
    """Generate a Zeno dataset from evaluation data.

    Args:
        data: The data to generate a dataset for.
        config: The configuration of the task.

    Returns:
        pd.DataFrame: A dataframe that is ready to be uploaded to Zeno.
    """
    ids = (
        [x["doc_id"] for x in data]
        if not config.get("filter_list")
        else [f"{x['doc_id']}.{x['filter']}" for x in data]
    )
    labels = [x["target"] for x in data]
    instance = [""] * len(ids)

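    # Each sample's "arguments" maps request keys (gen_args_0, gen_args_1, ...)
    # to positional args: arg_0 is the context/prompt, arg_1 the continuation
    # or choice.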
    if config["output_type"] == "loglikelihood":
        instance = [x["arguments"]["gen_args_0"]["arg_0"] for x in data]
        labels = [x["arguments"]["gen_args_0"]["arg_1"] for x in data]
    elif config["output_type"] == "multiple_choice":
        instance = [
            x["arguments"]["gen_args_0"]["arg_0"]
            + "\n\n"
            # "arguments" is a dict, so iterate its values; arg_1 holds each choice.
            + "\n".join([f"- {y['arg_1']}" for y in x["arguments"].values()])
            for x in data
        ]
    elif config["output_type"] == "loglikelihood_rolling":
        instance = [x["arguments"]["gen_args_0"]["arg_0"] for x in data]
    elif config["output_type"] == "generate_until":
        instance = [x["arguments"]["gen_args_0"]["arg_0"] for x in data]

    return pd.DataFrame(
        {
            "id": ids,
            "doc_id": [x["doc_id"] for x in data],
            "data": instance,
            "input_len": [len(x) for x in instance],
            "labels": labels,
            "output_type": config["output_type"],
        }
    )


def generate_system_df(data, config):
    """Generate a dataframe for a specific system to be uploaded to Zeno.

    Args:
        data: The data to generate a dataframe from.
        config: The configuration of the task.

    Returns:
        pd.DataFrame: A dataframe that is ready to be uploaded to Zeno as a system.
    """
    ids = (
        [x["doc_id"] for x in data]
        if not config.get("filter_list")
        else [f"{x['doc_id']}.{x['filter']}" for x in data]
    )
    system_dict = {"id": ids}
    system_dict["doc_id"] = [x["doc_id"] for x in data]
    if config.get("filter_list"):
        system_dict["filter"] = [x["filter"] for x in data]
    system_dict["output"] = [""] * len(ids)

    if config["output_type"] == "loglikelihood":
        system_dict["output"] = [
            "correct" if x["filtered_resps"][0][1] is True else "incorrect"
            for x in data
        ]
    elif config["output_type"] == "multiple_choice":
        system_dict["output"] = [
            ", ".join([str(y[0]) for y in x["filtered_resps"]]) for x in data
        ]
        system_dict["num_answers"] = [len(x["filtered_resps"]) for x in data]
    elif config["output_type"] == "loglikelihood_rolling":
        system_dict["output"] = [str(x["filtered_resps"][0]) for x in data]
    elif config["output_type"] == "generate_until":
        system_dict["output"] = [str(x["filtered_resps"][0]) for x in data]
        system_dict["output_length"] = [len(str(x["filtered_resps"][0])) for x in data]

    metrics = {
        metric["metric"]: [x[metric["metric"]] for x in data]
        for metric in config["metric_list"]
    }
    system_dict.update(metrics)
    system_df = pd.DataFrame(system_dict)
    return system_df


if __name__ == "__main__":
    main()