Commit 4cc57af4 authored by lintangsutawika

merged conflict

parents 9554f8da 9d3247be
include: pile_arxiv.yaml
task: pile_uspto
dataset_name: pile_uspto

include: pile_arxiv.yaml
task: pile_wikipedia
dataset_name: pile_wikipedia
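Both files build on `pile_arxiv.yaml` via the `include:` key, overriding only `task` and `dataset_name`. As a rough illustration of how such an include can be resolved (a sketch only, not the harness's actual config loader; it assumes PyYAML is available and that keys in the including file take precedence over the base):

```python
import os

import yaml  # assumes PyYAML is installed


def load_task_config(path):
    """Load a task YAML and merge in the file named by its `include:` key."""
    with open(path) as f:
        config = yaml.safe_load(f)

    base_name = config.pop("include", None)
    if base_name is None:
        return config

    # Resolve the included file relative to the including file's directory.
    base_path = os.path.join(os.path.dirname(path), base_name)
    with open(base_path) as f:
        base = yaml.safe_load(f)

    # Keys set in the including file (task, dataset_name, ...) win over the base.
    base.update(config)
    return base
```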
@@ -28,9 +28,16 @@ Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-

### Subtasks

+* `wikitext`: measure perplexity on the Wikitext dataset, via rolling loglikelihoods.
+
### Checklist

-- [x] Is in Eval-harness v1.0 ?
-- [x] Has been checked for regression from v1.0?
-- [ ] Has been checked for equivalence with original paper methodology?
-- [ ] "Main" checked variant clearly denoted?
+* [x] Is the task an existing benchmark in the literature?
+* [x] Have you referenced the original paper that introduced the task?
+* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+If other tasks on this dataset are already supported:
+* [x] Is the "Main" variant of this task clearly denoted?
+* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
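The new `wikitext` subtask line above scores perplexity from rolling loglikelihoods. For intuition only (not the harness's metric code, and with made-up numbers): word-level perplexity is the exponential of the negative summed loglikelihood divided by the total reference word count.

```python
import math


def word_perplexity(doc_loglikelihoods, doc_word_counts):
    """exp(-total loglikelihood / total reference words); illustrative only."""
    total_ll = sum(doc_loglikelihoods)
    total_words = sum(doc_word_counts)
    return math.exp(-total_ll / total_words)


# Two hypothetical documents: summed token loglikelihoods and word counts.
print(word_perplexity([-120.5, -98.2], [40, 35]))  # ~18.5
```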
import os
+import re
import json
import fnmatch
+import jsonlines
import argparse
import logging

from lm_eval import evaluator, utils
-from lm_eval.api.registry import GROUP_REGISTRY, TASK_REGISTRY
+from lm_eval.api.registry import ALL_TASKS
from lm_eval.logger import eval_logger

os.environ["TOKENIZERS_PARALLELISM"] = "false"

-# logger = logging.getLogger("main")
-ALL_TASKS = sorted(list(TASK_REGISTRY.keys()) + list(GROUP_REGISTRY.keys()))
class MultiChoice:

@@ -23,9 +23,8 @@ class MultiChoice:
            if len(fnmatch.filter(self.choices, value)) == 0:
                eval_logger.warning("{} is not in task list.".format(value))
                eval_logger.info(f"Available tasks to choose:")
-                # for choice in self.choices:
-                # eval_logger.info(f" {choice}")
-                eval_logger.info(ALL_TASKS)
+                for choice in self.choices:
+                    eval_logger.info(f" - {choice}")

        return True
    def __iter__(self):

@@ -37,7 +36,7 @@ def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True)
    parser.add_argument("--model_args", default="")
-    parser.add_argument("--tasks", default=None, choices=MultiChoice(ALL_TASKS))
+    parser.add_argument("--tasks", default=None, choices=MultiChoice(sorted(ALL_TASKS)))
    parser.add_argument("--config", default=None)
    parser.add_argument("--provide_description", action="store_true")
    parser.add_argument("--num_fewshot", type=int, default=0)

@@ -62,19 +61,7 @@ def pattern_match(patterns, source_list):
    return sorted(list(task_names))
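`MultiChoice` and `pattern_match` above both use `fnmatch`-style wildcards, so `--tasks` accepts patterns as well as exact names. A standalone sketch of that expansion, with task names made up for illustration:

```python
import fnmatch

# Hypothetical task registry contents for illustration only.
ALL_TASKS = ["pile_arxiv", "pile_uspto", "pile_wikipedia", "wikitext"]


def pattern_match(patterns, source_list):
    """Expand comma-split wildcard patterns into sorted concrete task names."""
    task_names = set()
    for pattern in patterns:
        for matching in fnmatch.filter(source_list, pattern):
            task_names.add(matching)
    return sorted(list(task_names))


print(pattern_match("pile_*,wikitext".split(","), ALL_TASKS))
# ['pile_arxiv', 'pile_uspto', 'pile_wikipedia', 'wikitext']
```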
-def setup_example_logger(output_path, separator):
-    """Sets up a logger that will save each example and prediction."""
-    example_logger = logging.getLogger("examples")
-    filename = f"./outputs/examples{separator}{output_path}.jsonl"
-    formatter = logging.Formatter("%(message)s")
-    handler = logging.FileHandler(filename)
-    handler.setFormatter(formatter)
-    example_logger.addHandler(handler)
-    example_logger.setLevel(logging.INFO)
def main():
-    os.makedirs("./outputs", exist_ok=True)
    args = parse_args()

    if args.limit:

@@ -83,10 +70,6 @@ def main():
            "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

-    path_separator = "."
-    output_path = args.output_path if args.output_path is not None else ""
-    setup_example_logger(output_path, path_separator)

    if args.tasks is not None:
        if os.path.isdir(args.tasks):
            import glob

@@ -118,13 +101,31 @@
        check_integrity=args.check_integrity,
    )
    if results is not None:
+        samples = results.pop("samples")
        dumped = json.dumps(results, indent=2)
        print(dumped)

        if args.output_path:
+            os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
            with open(args.output_path, "w") as f:
                f.write(dumped)

+            for task_name, config in results["configs"].items():
+                output_name = "{}_{}".format(
+                    re.sub("/", "__", args.model_args), task_name
+                )
+                if os.path.isdir(args.output_path):
+                    filename = f"./{args.output_path}/{output_name}.jsonl"
+                elif os.path.isfile(args.output_path):
+                    filename = (
+                        f"./{os.path.dirname(args.output_path)}/{output_name}.jsonl"
+                    )
+                with jsonlines.open(filename, "w") as f:
+                    f.write_all(samples[task_name])

    print(
        f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
        f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
...
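With the change above, each task's logged samples are written to a `<model_args>_<task_name>.jsonl` file alongside the results JSON, with `/` in `model_args` replaced by `__`. A quick way to inspect such a file afterwards; the path below is hypothetical and only follows that naming scheme:

```python
import jsonlines

# Hypothetical filename following the naming scheme in the diff above.
filename = "./outputs/pretrained=EleutherAI__pythia-70m_pile_uspto.jsonl"

with jsonlines.open(filename) as reader:
    for i, sample in enumerate(reader):
        print(sample)  # one logged example/prediction per line
        if i >= 4:     # peek at the first few only
            break
```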
@@ -32,10 +32,10 @@ def main():
    task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

-    description_dict = {}
-    if args.description_dict_path:
-        with open(args.description_dict_path, "r") as f:
-            description_dict = json.load(f)
+    # description_dict = {}
+    # if args.description_dict_path:
+    #     with open(args.description_dict_path, "r") as f:
+    #         description_dict = json.load(f)

    os.makedirs(args.output_base_path, exist_ok=True)
    for task_name, task in task_dict.items():

@@ -55,11 +55,11 @@ def main():
        docs = join_iters(iters)

-        description = (
-            description_dict[task_name]
-            if description_dict and task_name in description_dict
-            else ""
-        )
+        # description = (
+        #     description_dict[task_name]
+        #     if description_dict and task_name in description_dict
+        #     else ""
+        # )

        with open(os.path.join(args.output_base_path, task_name), "w") as f:
            for i, doc in (
...
# Task-name
### Paper
Title: `paper title goes here`
Abstract: `link to paper PDF or arXiv abstract goes here`
`Short description of paper / benchmark goes here:`
Homepage: `homepage to the benchmark's website goes here, if applicable`
### Citation
```
BibTeX-formatted citation goes here
```
### Subtasks
List or describe tasks defined in this folder, and their names here:
* `task_name`: `1-sentence description of what this particular task does`
* `task_name2`: .....
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?