import os
import re
import json
import fnmatch
import argparse
import logging
from pathlib import Path
from typing import Union

import numpy as np

from lm_eval import evaluator, utils
from lm_eval.api.registry import ALL_TASKS
from lm_eval.logger import eval_logger, SPACING
from lm_eval.tasks import include_path

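# Example invocation (module path and task name are illustrative):
#   python -m lm_eval --model hf \
#       --model_args pretrained=EleutherAI/pythia-160m \
#       --tasks lambada_openai --batch_size auto
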
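# JSON `default=` hook: converts numpy integers and sets, which the stdlib
# json encoder cannot serialize, into plain Python types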
def _handle_non_serializable(o):
    if isinstance(o, np.int64):
        return int(o)
    elif isinstance(o, set):
        return list(o)
    raise TypeError(f"Object of type {o.__class__.__name__} is not JSON serializable")


def parse_eval_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("--model", required=True, help="Name of model e.g. `hf`")
    parser.add_argument(
        "--tasks",
        default=None,
        help="Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS))),
    )
    parser.add_argument(
        "--model_args",
        default="",
        help="String arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
    )
    parser.add_argument(
        "--num_fewshot",
        type=int,
        default=None,
        help="Number of examples in few-shot context",
    )
    parser.add_argument("--batch_size", type=str, default=1)
    parser.add_argument(
        "--max_batch_size",
        type=int,
        default=None,
        help="Maximal batch size to try with --batch_size auto",
    )
    parser.add_argument(
        "--device",
        type=str,
        default=None,
        help="Device to use (e.g. cuda, cuda:0, cpu)",
    )
    parser.add_argument(
        "--output_path",
        default=None,
        type=str,
        metavar="DIR|DIR/file.json",
        help="The path to the output file where result metrics will be saved. If the path is a directory and --log_samples is set, results will be saved in the directory; otherwise the parent directory will be used.",
    )
    parser.add_argument(
        "--limit",
        type=float,
        default=None,
        help="Limit the number of examples per task. "
        "If <1, limit is a percentage of the total number of examples.",
    )
    parser.add_argument(
        "--use_cache",
        type=str,
        default=None,
        help="A path to a sqlite db file for caching model responses. `None` if not caching.",
    )
    parser.add_argument("--decontamination_ngrams_path", default=None)  # TODO: not used
    parser.add_argument(
        "--check_integrity",
        action="store_true",
        help="Whether to run the relevant part of the test suite for the tasks",
    )
    parser.add_argument(
        "--write_out",
        action="store_true",
        default=False,
        help="Prints the prompt for the first few documents",
    )
    parser.add_argument(
        "--log_samples",
        action="store_true",
        default=False,
        help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis",
    )
    parser.add_argument(
        "--show_config",
        action="store_true",
        default=False,
        help="If True, shows the the full config of all tasks at the end of the evaluation.",
    )
    parser.add_argument(
        "--include_path",
        type=str,
        default=None,
        help="Additional path to include if there are external tasks to include.",
    )
    parser.add_argument(
        "--verbosity",
        type=str,
        default="INFO",
        help="Log error when tasks are not registered.",
    )
    parser.add_argument(
        "--huggingface_token",
        type=str,
        default=None,
        help="huggingface token for downloading some authorization datasets, like toxigen, https://huggingface.co/settings/tokens",
    )
    return parser.parse_args()


def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
    if not args:
        # we allow for args to be passed externally, else we parse them ourselves
        args = parse_eval_args()

    eval_logger.setLevel(getattr(logging, args.verbosity))
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    if args.limit:
        eval_logger.warning(
            " --limit SHOULD ONLY BE USED FOR TESTING. "
            "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )
    if args.huggingface_token:
        from huggingface_hub import login

        login(token=args.huggingface_token)
    if args.include_path is not None:
        eval_logger.info(f"Including path: {args.include_path}")
        include_path(args.include_path)

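    # resolve --tasks: default to all registered tasks, a directory of task
    # YAML configs, or a comma-separated list of task names / config paths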
    if args.tasks is None:
        task_names = ALL_TASKS
    else:
        if os.path.isdir(args.tasks):
            import glob

            task_names = []
            yaml_path = os.path.join(args.tasks, "*.yaml")
            for yaml_file in glob.glob(yaml_path):
                config = utils.load_yaml_config(yaml_file)
                task_names.append(config)
        else:
            tasks_list = args.tasks.split(",")
            task_names = utils.pattern_match(tasks_list, ALL_TASKS)
            for task in [task for task in tasks_list if task not in task_names]:
                if os.path.isfile(task):
                    config = utils.load_yaml_config(task)
                    task_names.append(config)
            task_missing = [
                task
                for task in tasks_list
                if task not in task_names and not os.path.isfile(task)
            ]  # tasks loaded from a YAML config file path are not missing

            if task_missing:
                missing = ", ".join(task_missing)
                eval_logger.error(
                    f"Tasks were not found: {missing}\n"
                    f"{SPACING}Try `lm-eval -h` for list of available tasks",
                )
                raise ValueError(
                    f"Tasks {missing} were not found. Try `lm-eval -h` for list of available tasks."
                )

    if args.output_path:
        path = Path(args.output_path)
        # warn if the file or 'dir/results.json' already exists
        if path.is_file() or path.joinpath("results.json").is_file():
            eval_logger.warning(
                f"File already exists at {path}. Results will be overwritten."
            )
        # if the path is a json/jsonl file, write results there and log
        # per-sample files to its parent directory
        if path.suffix in (".json", ".jsonl"):
            output_path_file = path
            path.parent.mkdir(parents=True, exist_ok=True)
            path = path.parent
        else:
            path.mkdir(parents=True, exist_ok=True)
            output_path_file = path.joinpath("results.json")
    elif args.log_samples:
        assert args.output_path, "Specify --output_path to use --log_samples"

    eval_logger.info(f"Selected Tasks: {task_names}")

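    # run the evaluation; CLI options are forwarded to the evaluator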
    results = evaluator.simple_evaluate(
        model=args.model,
        model_args=args.model_args,
        tasks=task_names,
        num_fewshot=args.num_fewshot,
        batch_size=args.batch_size,
        max_batch_size=args.max_batch_size,
        device=args.device,
        use_cache=args.use_cache,
        limit=args.limit,
        decontamination_ngrams_path=args.decontamination_ngrams_path,
        check_integrity=args.check_integrity,
        write_out=args.write_out,
        log_samples=args.log_samples,
    )

    if results is not None:
        if args.log_samples:
            samples = results.pop("samples")
        dumped = json.dumps(results, indent=2, default=_handle_non_serializable)
        if args.show_config:
            print(dumped)

        batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))

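        # write aggregated results (and optional per-sample logs) to disk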
        if args.output_path:
            with output_path_file.open("w") as f:
                f.write(dumped)

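            # one JSONL file per task, named from the sanitized model args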
            if args.log_samples:
                for task_name, config in results["configs"].items():
                    output_name = "{}_{}".format(
                        re.sub("/|=", "__", args.model_args), task_name
                    )
                    filename = path.joinpath(f"{output_name}.jsonl")
                    samples_dumped = json.dumps(
                        samples[task_name], indent=2, default=_handle_non_serializable
                    )
                    with filename.open("w") as f:
                        f.write(samples_dumped)

        print(
            f"{args.model} ({args.model_args}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
            f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
        )
        print(evaluator.make_table(results))
        if "groups" in results:
            print(evaluator.make_table(results, "groups"))


if __name__ == "__main__":
    cli_evaluate()