Commit 4cc57af4 authored by lintangsutawika

merged conflict

parents 9554f8da 9d3247be
include: pile_arxiv.yaml
task: pile_uspto
dataset_name: pile_uspto

include: pile_arxiv.yaml
task: pile_wikipedia
dataset_name: pile_wikipedia
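Both task configs use the `include` key to inherit the shared Pile settings from `pile_arxiv.yaml`, overriding only `task` and `dataset_name`. As a rough illustration of how an include-then-override merge can be resolved (a sketch only, not the harness's actual YAML loader; the filename in the usage comment is hypothetical):

```python
import yaml  # assumes PyYAML is available


def load_task_config(path):
    """Load a task YAML, resolving an `include` key by merging the base file first."""
    with open(path) as f:
        config = yaml.safe_load(f)
    base = config.pop("include", None)
    if base is not None:
        merged = load_task_config(base)  # start from the included base config
        merged.update(config)            # keys in the including file take precedence
        return merged
    return config


# e.g. load_task_config("pile_uspto.yaml") starts from pile_arxiv.yaml and
# overrides task/dataset_name with the values shown above.
```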
@@ -28,9 +28,16 @@ Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-
### Subtasks
* `wikitext`: measure perplexity on the Wikitext dataset, via rolling loglikelihoods.
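As a rough sketch of the bookkeeping behind that (standard perplexity definitions, not the harness's own metric code): each document is scored by the model in full, with no prompt/continuation split, and the summed loglikelihoods are normalized per word or per byte.

```python
import math


def rolling_perplexities(doc_loglikelihoods, docs):
    """Aggregate per-document loglikelihoods (in nats) into corpus-level perplexities."""
    total_ll = sum(doc_loglikelihoods)
    num_words = sum(len(doc.split()) for doc in docs)
    num_bytes = sum(len(doc.encode("utf-8")) for doc in docs)
    word_perplexity = math.exp(-total_ll / num_words)
    byte_perplexity = math.exp(-total_ll / num_bytes)
    bits_per_byte = -total_ll / (num_bytes * math.log(2))  # convert nats to bits, per byte
    return word_perplexity, byte_perplexity, bits_per_byte
```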
### Checklist
- [x] Is in Eval-harness v1.0?
- [x] Has been checked for regression from v1.0?
- [ ] Has been checked for equivalence with original paper methodology?
- [ ] "Main" checked variant clearly denoted?
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
import os
import re
import json
import fnmatch
import jsonlines
import argparse
import logging
from lm_eval import evaluator, utils
from lm_eval.api.registry import GROUP_REGISTRY, TASK_REGISTRY
from lm_eval.api.registry import ALL_TASKS
from lm_eval.logger import eval_logger
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# logger = logging.getLogger("main")
ALL_TASKS = sorted(list(TASK_REGISTRY.keys()) + list(GROUP_REGISTRY.keys()))
class MultiChoice:
@@ -23,9 +23,8 @@ class MultiChoice:
        if len(fnmatch.filter(self.choices, value)) == 0:
            eval_logger.warning("{} is not in task list.".format(value))
            eval_logger.info("Available tasks to choose:")
            # for choice in self.choices:
            #     eval_logger.info(f"  {choice}")
            eval_logger.info(ALL_TASKS)
            for choice in self.choices:
                eval_logger.info(f"  - {choice}")
        return True

    def __iter__(self):
@@ -37,7 +36,7 @@ def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True)
    parser.add_argument("--model_args", default="")
    parser.add_argument("--tasks", default=None, choices=MultiChoice(ALL_TASKS))
    parser.add_argument("--tasks", default=None, choices=MultiChoice(sorted(ALL_TASKS)))
    parser.add_argument("--config", default=None)
    parser.add_argument("--provide_description", action="store_true")
    parser.add_argument("--num_fewshot", type=int, default=0)
@@ -62,19 +61,7 @@ def pattern_match(patterns, source_list):
    return sorted(list(task_names))
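
# Illustrative sketch (not part of this diff): the wildcard expansion performed by
# pattern_match() above, so "--tasks pile_*" resolves to every registered pile task.
# The helper name and the example task names are hypothetical; fnmatch is already
# imported at the top of this file.
def _expand_task_patterns(patterns, available):
    selected = set()
    for pattern in patterns:
        selected.update(fnmatch.filter(available, pattern))
    return sorted(selected)

# _expand_task_patterns(["pile_*"], ["pile_arxiv", "pile_uspto", "wikitext"])
# -> ["pile_arxiv", "pile_uspto"]
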
def setup_example_logger(output_path, separator):
    """Sets up a logger that will save each example and prediction."""
    example_logger = logging.getLogger("examples")
    filename = f"./outputs/examples{separator}{output_path}.jsonl"
    formatter = logging.Formatter("%(message)s")
    handler = logging.FileHandler(filename)
    handler.setFormatter(formatter)
    example_logger.addHandler(handler)
    example_logger.setLevel(logging.INFO)
def main():
    os.makedirs("./outputs", exist_ok=True)
    args = parse_args()
    if args.limit:
@@ -83,10 +70,6 @@ def main():
"REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
)
path_separator = "."
output_path = args.output_path if args.output_path is not None else ""
setup_example_logger(output_path, path_separator)
if args.tasks is not None:
if os.path.isdir(args.tasks):
import glob
@@ -118,13 +101,31 @@
        check_integrity=args.check_integrity,
    )

    if results is not None:
        samples = results.pop("samples")
        dumped = json.dumps(results, indent=2)
        print(dumped)

        if args.output_path:
            os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
            with open(args.output_path, "w") as f:
                f.write(dumped)

            for task_name, config in results["configs"].items():
                output_name = "{}_{}".format(
                    re.sub("/", "__", args.model_args), task_name
                )
                if os.path.isdir(args.output_path):
                    filename = f"./{args.output_path}/{output_name}.jsonl"
                elif os.path.isfile(args.output_path):
                    filename = (
                        f"./{os.path.dirname(args.output_path)}/{output_name}.jsonl"
                    )
                with jsonlines.open(filename, "w") as f:
                    f.write_all(samples[task_name])

        print(
            f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
            f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
......
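The added branch above writes each task's logged samples to its own JSONL file, named from `model_args` (with `/` replaced by `__`) plus the task name. A minimal sketch of reading one of those files back with `jsonlines` (the path below is a hypothetical example):

```python
import jsonlines

# Hypothetical output file following the output_name pattern above.
with jsonlines.open("outputs/pretrained=gpt2_wikitext.jsonl") as reader:
    for sample in reader:
        # Each record is one per-example dict logged for the task.
        print(sample)
```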
@@ -32,10 +32,10 @@ def main():
    task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    description_dict = {}
    if args.description_dict_path:
        with open(args.description_dict_path, "r") as f:
            description_dict = json.load(f)
    # description_dict = {}
    # if args.description_dict_path:
    #     with open(args.description_dict_path, "r") as f:
    #         description_dict = json.load(f)

    os.makedirs(args.output_base_path, exist_ok=True)
    for task_name, task in task_dict.items():
@@ -55,11 +55,11 @@
        docs = join_iters(iters)

        description = (
            description_dict[task_name]
            if description_dict and task_name in description_dict
            else ""
        )
        # description = (
        #     description_dict[task_name]
        #     if description_dict and task_name in description_dict
        #     else ""
        # )

        with open(os.path.join(args.output_base_path, task_name), "w") as f:
            for i, doc in (
......
# Task-name
### Paper
Title: `paper title goes here`
Abstract: `link to paper PDF or arXiv abstract goes here`
`Short description of paper / benchmark goes here:`
Homepage: `homepage to the benchmark's website goes here, if applicable`
### Citation
```
BibTeX-formatted citation goes here
```
### Subtasks
List or describe tasks defined in this folder, and their names here:
* `task_name`: `1-sentence description of what this particular task does`
* `task_name2`: .....
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?