import os
import re
import json
import jsonlines
import argparse

from pathlib import Path

from lm_eval import evaluator, utils
from lm_eval.api.registry import ALL_TASKS
from lm_eval.logger import eval_logger, SPACING
from lm_eval.tasks import include_task_folder

os.environ["TOKENIZERS_PARALLELISM"] = "false"


def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("--model", required=True, help="Name of model e.g. `hf`")
    parser.add_argument(
        "--tasks",
        default=None,
        help="Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS))),
    )
    parser.add_argument(
        "--model_args",
        default="",
        help="String arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
    )
    parser.add_argument(
        "--num_fewshot",
        type=int,
        default=None,
        help="Number of examples in few-shot context",
    )
    parser.add_argument("--batch_size", type=str, default=1)
    parser.add_argument(
        "--max_batch_size",
        type=int,
        default=None,
        help="Maximal batch size to try with --batch_size auto",
    )
    parser.add_argument(
        "--device",
        type=str,
        default=None,
        help="Device to use (e.g. cuda, cuda:0, cpu)",
    )
    parser.add_argument(
        "--output_path",
        default=None,
        type=str,
        metavar="= [dir/file.jsonl] [DIR]",
        help="The path to the output file where the result metrics will be saved. "
        "If the path is a directory and log_samples is true, the results will be saved in the directory. "
        "Else the parent directory will be used.",
    )
    parser.add_argument(
        "--limit",
        type=float,
        default=None,
        help="Limit the number of examples per task. "
        "If <1, limit is a percentage of the total number of examples.",
    )
    parser.add_argument(
        "--use_cache",
        type=str,
        default=None,
        help="A path to a sqlite db file for caching model responses. `None` if not caching.",
    )
    parser.add_argument("--decontamination_ngrams_path", default=None)  # TODO: not used
    parser.add_argument(
        "--check_integrity",
        action="store_true",
        help="Whether to run the relevant part of the test suite for the tasks",
    )
    parser.add_argument(
        "--write_out",
        action="store_true",
        default=False,
        help="Prints the prompt for the first few documents",
    )
    parser.add_argument(
        "--log_samples",
        action="store_true",
        default=False,
        help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis",
    )
    parser.add_argument(
        "--show_config",
        action="store_true",
        default=False,
        help="If True, shows the full config of all tasks at the end of the evaluation.",
    )
    parser.add_argument(
        "--include_path",
        type=str,
        default=None,
        help="Additional path to include if there are external tasks to include.",
    )
    return parser.parse_args()


def main() -> None:
    args = parse_args()

    if args.limit:
        eval_logger.warning(
            " --limit SHOULD ONLY BE USED FOR TESTING. "
            "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    if args.include_path is not None:
        eval_logger.info(f"Including path: {args.include_path}")
        include_task_folder(args.include_path)

    # Resolve task names: all registered tasks, a directory of YAML configs,
    # or a comma-separated list of task names / YAML file paths.
    if args.tasks is None:
        task_names = ALL_TASKS
    else:
        if os.path.isdir(args.tasks):
            import glob

            task_names = []
            yaml_path = os.path.join(args.tasks, "*.yaml")
            for yaml_file in glob.glob(yaml_path):
                config = utils.load_yaml_config(yaml_file)
                task_names.append(config)
        else:
            tasks_list = args.tasks.split(",")
            task_names = utils.pattern_match(tasks_list, ALL_TASKS)
            task_missing = []
            for task in [task for task in tasks_list if task not in task_names]:
                if os.path.isfile(task):
                    config = utils.load_yaml_config(task)
                    task_names.append(config)
                else:
                    task_missing.append(task)

            if task_missing:
                missing = ", ".join(task_missing)
                eval_logger.error(
                    f"Tasks were not found: {missing}\n"
                    f"{SPACING}Try `lm-eval -h` for list of available tasks",
                )
                raise ValueError(f"Tasks {missing} were not found.")

    # Decide where results (and, optionally, per-sample logs) will be written.
    if args.output_path:
        path = Path(args.output_path)
        # check if file or 'dir/results.json' exists
        if path.is_file() or Path(args.output_path).joinpath("results.json").is_file():
            eval_logger.warning(
                f"File already exists at {path}. Results will be overwritten."
            )
            output_path_file = path.joinpath("results.json")
            assert not path.is_file(), "File already exists"
        # if path json then get parent dir
        elif path.suffix in (".json", ".jsonl"):
            output_path_file = path
            path.parent.mkdir(parents=True, exist_ok=True)
            path = path.parent
        else:
            path.mkdir(parents=True, exist_ok=True)
            output_path_file = path.joinpath("results.json")
    elif args.log_samples and not args.output_path:
        # per-sample logging needs somewhere to write the samples
        assert args.output_path, "Specify --output_path"

    eval_logger.info(f"Selected Tasks: {task_names}")

    results = evaluator.simple_evaluate(
        model=args.model,
        model_args=args.model_args,
        tasks=task_names,
        num_fewshot=args.num_fewshot,
        batch_size=args.batch_size,
        max_batch_size=args.max_batch_size,
        device=args.device,
        use_cache=args.use_cache,
        limit=args.limit,
        decontamination_ngrams_path=args.decontamination_ngrams_path,
        check_integrity=args.check_integrity,
        write_out=args.write_out,
        log_samples=args.log_samples,
    )

    if results is not None:
        if args.log_samples:
            samples = results.pop("samples")
        dumped = json.dumps(results, indent=2, default=lambda o: str(o))
        if args.show_config:
            print(dumped)

        batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))

        if args.output_path:
            with output_path_file.open("w") as f:
                f.write(dumped)

            if args.log_samples:
                # one JSONL file of per-sample outputs per task
                for task_name, config in results["configs"].items():
                    output_name = "{}_{}".format(
                        re.sub("/|=", "__", args.model_args), task_name
                    )
                    filename = path.joinpath(f"{output_name}.jsonl")

                    with jsonlines.open(filename, "w") as f:
                        f.write_all(samples[task_name])

        print(
            f"{args.model} ({args.model_args}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
            f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
        )
        print(evaluator.make_table(results))
        if "groups" in results:
            print(evaluator.make_table(results, "groups"))


if __name__ == "__main__":
    main()
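
# Example invocation (a sketch, not part of the script): the flags below are the ones
# defined in parse_args() above, and the `lm-eval` entry point is the one referenced in
# the missing-task error message; running this file with `python` works equivalently.
# The --model_args value is taken from the flag's own help text; the task names are
# illustrative and must exist in ALL_TASKS (or be provided via --include_path).
#
#   lm-eval --model hf \
#       --model_args pretrained=EleutherAI/pythia-160m,dtype=float32 \
#       --tasks lambada_openai,hellaswag \
#       --batch_size auto \
#       --output_path results/ \
#       --log_samples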