Commit f66fc06f authored by haileyschoelkopf

fix merge conflicts

parents b13753cd d714fc95
@@ -50,7 +50,7 @@ def process_docs(dataset, set_answer_type="bool"):
         obs_list["abstract"].append(abstract)
         obs_list["question"].append(question)
         obs_list["answer_type"].append(answer_type)
-        if type(answer) == list:
+        if isinstance(answer, list):
            answer = ", ".join(answer)
         obs_list["answer"].append(answer)
......
 group: scrolls
 task:
-  - scrolls_qasper
-  - scrolls_quality
-  - scrolls_narrativeqa
-  - scrolls_contractnli
-  - scrolls_govreport
-  - scrolls_summscreenfd
-  - scrolls_qmsum
+  - task: scrolls_qasper
+    class: !function task.Qasper
+  - task: scrolls_quality
+    class: !function task.QuALITY
+  - task: scrolls_narrativeqa
+    class: !function task.NarrativeQA
+  - task: scrolls_contractnli
+    class: !function task.ContractNLI
+  - task: scrolls_govreport
+    class: !function task.GovReport
+  - task: scrolls_summscreenfd
+    class: !function task.SummScreenFD
+  - task: scrolls_qmsum
+    class: !function task.QMSum
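
For reference, a sketch (not part of the diff) of what these `!function` entries become once the group config is loaded with the full YAML constructor defined in `lm_eval/utils.py`: each `class` value resolves to the actual class in the adjacent `task.py`. The file path below is an assumption about where this group config lives.

    from lm_eval.utils import load_yaml_config

    # Hypothetical path to the group config shown above.
    cfg = load_yaml_config("lm_eval/tasks/scrolls/scrolls.yaml")

    first = cfg["task"][0]
    print(first["task"])   # "scrolls_qasper"
    print(first["class"])  # the Qasper class object, resolved by the !function constructor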
@@ -115,8 +115,10 @@ class _SCROLLSTask(Task):
     PRUNE_MAX_TOKENS = None
     PRUNE_NUM_PROC = None

-    def __post_init__(self):
-        self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
+    def __init__(self):
+        super().__init__()
+        if self.DATASET_NAME is not None:
+            self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)

     def has_training_docs(self):
         return True
@@ -224,9 +226,10 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
     def process_results(self, doc, results):
         gold = doc["gold"]

-        acc = 1.0 if np.argmax(results) == gold else 0.0
+        lls, _ = zip(*results)
+        acc = 1.0 if np.argmax(lls) == gold else 0.0
         completion_len = np.array([float(len(i)) for i in doc["choices"]])
-        acc_norm = 1.0 if np.argmax(results / completion_len) == gold else 0.0
+        acc_norm = 1.0 if np.argmax(lls / completion_len) == gold else 0.0

         return {
             "acc": acc,
@@ -279,7 +282,6 @@ class _SCROLLSSummaryTask(_SCROLLSTask):
         return f"{doc['input']}\n\nQuestion: What is a summary of the preceding text?\nAnswer:"


-@register_task("scrolls_qasper")
 class Qasper(_SCROLLSTask):
     """A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers
     https://arxiv.org/abs/2105.03011
@@ -337,7 +339,6 @@ class Qasper(_SCROLLSTask):
         )


-@register_task("scrolls_quality")
 class QuALITY(_SCROLLSMultipleChoiceTask):
     """QuALITY: Question Answering with Long Input Texts, Yes!
     https://arxiv.org/abs/2112.08608
@@ -366,7 +367,6 @@ class QuALITY(_SCROLLSMultipleChoiceTask):
         return [doc]


-@register_task("scrolls_narrativeqa")
 class NarrativeQA(_SCROLLSTask):
     """The NarrativeQA Reading Comprehension Challenge
     https://arxiv.org/abs/1712.07040
@@ -400,7 +400,6 @@ class NarrativeQA(_SCROLLSTask):
         )


-@register_task("scrolls_contractnli")
 class ContractNLI(_SCROLLSMultipleChoiceTask):
     """ContractNLI: A Dataset for Document-level Natural Language Inference for Contracts
     https://arxiv.org/abs/1712.07040
@@ -419,7 +418,6 @@ class ContractNLI(_SCROLLSMultipleChoiceTask):
         return f"{doc['text']}\n\nHypothesis: {doc['question']}\nConclusion:"


-@register_task("scrolls_govreport")
 class GovReport(_SCROLLSSummaryTask):
     """Efficient Attentions for Long Document Summarization
     https://arxiv.org/abs/2104.02112
@@ -433,7 +431,6 @@ class GovReport(_SCROLLSSummaryTask):
     DATASET_NAME = "gov_report"


-@register_task("scrolls_summscreenfd")
 class SummScreenFD(_SCROLLSSummaryTask):
     """SummScreen: A Dataset for Abstractive Screenplay Summarization
     https://arxiv.org/abs/2104.07091
@@ -442,7 +439,6 @@ class SummScreenFD(_SCROLLSSummaryTask):
     DATASET_NAME = "summ_screen_fd"


-@register_task("scrolls_qmsum")
 class QMSum(_SCROLLSSummaryTask):
     """QMSum: A New Benchmark for Query-based Multi-domain
     Meeting Summarization
......
+task: squadv2
+class: !function task.SQuAD2
@@ -21,7 +21,6 @@ from packaging import version
 from lm_eval.api.task import Task
 from lm_eval.api.instance import Instance
-from lm_eval.api.registry import register_task

 _CITATION = """
 @misc{rajpurkar2018know,
@@ -47,7 +46,6 @@ def _squad_agg(key, items):
     return _squad_metric(predictions=predictions, references=references).get(key, 0)


-@register_task("squadv2")
 class SQuAD2(Task):
     VERSION = 3
     DATASET_PATH = "squad_v2"
......
@@ -7,6 +7,7 @@ training_split: train
 validation_split: validation
 output_type: generate_until
 doc_to_text: !function "t5_utils.doc_to_text"
+process_results: !function "t5_utils.process_results"
 doc_to_target: label
 generation_kwargs:
   until:
@@ -15,9 +16,5 @@ metric_list:
   - metric: accuracy
     aggregation: mean
     higher_is_better: true
-filter_list:
-  - name: "wsc_postprocessor"
-    filter:
-      - function: !function t5_utils.WSCPostprocess
 metadata:
-  version: 0.0
+  version: 1.0
 import re
-from lm_eval.api.filter import Filter
+from typing import List


 def doc_to_text(x):
     text = re.sub(r" X ", " *" + x["span2_text"] + "* ", _wsc_inputs(x))
@@ -24,14 +23,14 @@ def _wsc_inputs(x):
         [
             " ".join(words[:pronoun_index]),
             "X",
-            " ".join(words[pronoun_index + 1 :]),
+            " ".join(words[pronoun_index + 1:]),
         ]
     )

     # Handle some special cases.
     if (
         x["text"]
         == 'The boy continued to whip the pony , and eventually the pony threw him over. John laughed out quite loud. "Good for him," he said. '
     ):
         return (
             "The boy continued to whip the pony , and eventually the pony threw "
@@ -40,8 +39,8 @@ def _wsc_inputs(x):
     # Using the span2_index, we get 'use' instead of 'it'.
     if (
         x["text"]
         == "When they had eventually calmed down a bit , and had gotten home, Mr. Farley put the magic pebble in an iron safe . Some day they might want to use it , but really for now, what more could they wish for?"
     ):
         return (
             "When they had eventually calmed down a bit , and had gotten home, "
@@ -52,56 +51,53 @@ def _wsc_inputs(x):
     return create_input()


-class WSCPostprocess(Filter):
-    def __init__(self, **kwargs):
-        self.determiners = {
-            "a",
-            "an",
-            "few",
-            "her",
-            "his",
-            "each",
-            "every",
-            "many",
-            "much",
-            "my",
-            "our",
-            "some",
-            "that",
-            "the",
-            "their",
-            "these",
-            "this",
-            "those",
-            "which",
-            "whose",
-            "your",
-        }
-
-    def clean(self, s):
-        """Ignore capitalization and determiners."""
-        s = s.strip().lower()
-        return " ".join([w for w in s.split(" ") if w not in self.determiners])
-
-    def apply(self, resps, docs):
-        filtered_resps = []
-        for prediction, reference in zip(*(resps, docs["span1_text"])):
-            prediction = self.clean(prediction[0])
-            reference = self.clean(reference)
-
-            if ("'" in prediction) != ("'" in reference):
-                # referent is "Bob's hat" as predicting the referent.
-                predicted_referent = False
-            else:
-                prediction_words = set(prediction.split(" "))
-                referent_words = set(reference.split(" "))
-
-                # Handle cases where the prediction is "fuzzy bunny" and the referent is
-                # "bunny".
-                predicted_referent = prediction_words.issubset(
-                    referent_words
-                ) or referent_words.issubset(prediction_words)
-
-            filtered_resps.append(predicted_referent)
-
-        return filtered_resps
+DETERMINERS = {
+    "a",
+    "an",
+    "few",
+    "her",
+    "his",
+    "each",
+    "every",
+    "many",
+    "much",
+    "my",
+    "our",
+    "some",
+    "that",
+    "the",
+    "their",
+    "these",
+    "this",
+    "those",
+    "which",
+    "whose",
+    "your",
+}
+
+
+def clean(s: str) -> str:
+    """Ignore capitalization and determiners."""
+    s = s.strip().lower()
+    return " ".join([w for w in s.split(" ") if w not in DETERMINERS])
+
+
+def process_results(docs: dict, resps: List):
+    prediction = clean(resps[0])
+    reference = clean(docs["span1_text"])
+
+    if ("'" in prediction) != ("'" in reference):
+        # referent is "Bob's hat" as predicting the referent.
+        predicted_referent = False
+    else:
+        prediction_words = set(prediction.split(" "))
+        referent_words = set(reference.split(" "))
+
+        # Handle cases where the prediction is "fuzzy bunny" and the referent is
+        # "bunny".
+        predicted_referent = prediction_words.issubset(
+            referent_words
+        ) or referent_words.issubset(prediction_words)
+
+    acc = 1.0 if predicted_referent == docs["label"] else 0.0
+    return {"accuracy": acc}
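
A quick usage sketch for the new function (the document and response values below are invented): the harness passes the doc and the model's generations, and the returned value feeds the `accuracy` entry in `metric_list` above.

    # Hypothetical WSC-style example.
    doc = {"span1_text": "The city councilmen", "label": 1}
    resps = ["the councilmen"]

    print(process_results(doc, resps))  # {"accuracy": 1.0}: "councilmen" is a subset of "city councilmen"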
@@ -51,7 +51,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
     for lang in LANGUAGES:
         file_name = f"xwinograd_{lang}.yaml"
         try:
-            with open(f"{output_dir}/{file_name}", "w" if overwrite else "x") as f:
+            with open(f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf-8") as f:
                 f.write("# Generated by utils.py\n")
                 yaml.dump(
                     {
......
@@ -472,6 +472,10 @@ def get_git_commit_hash():
     return git_hash


+def ignore_constructor(loader, node):
+    return node
+
+
 def import_function(loader, node):
     function_name = loader.construct_scalar(node)
     yaml_path = os.path.dirname(loader.name)
@@ -489,11 +493,14 @@ def import_function(loader, node):
     return function


-# Add the import_function constructor to the YAML loader
-yaml.add_constructor("!function", import_function)
-
-
-def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None):
+def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None, mode="full"):
+    if mode == "simple":
+        constructor_fn = ignore_constructor
+    elif mode == "full":
+        constructor_fn = import_function
+
+    # Add the import_function constructor to the YAML loader
+    yaml.add_constructor("!function", constructor_fn)
     if yaml_config is None:
         with open(yaml_path, "rb") as file:
             yaml_config = yaml.full_load(file)
@@ -521,7 +528,7 @@ def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None):
             path = os.path.join(yaml_dir, path)
             try:
-                included_yaml_config = load_yaml_config(path)
+                included_yaml_config = load_yaml_config(yaml_path=path, mode=mode)
                 final_yaml_config.update(included_yaml_config)
             except Exception as ex:
                 # If failed to load, ignore
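
As a usage sketch (illustrative paths, not from this commit): the new `mode` flag decides whether `!function` tags are resolved, and included configs inherit the caller's mode through the recursive call above.

    from lm_eval.utils import load_yaml_config

    # mode="simple" keeps !function nodes unresolved (ignore_constructor), which is
    # enough for listing tasks without importing their Python helpers.
    light_cfg = load_yaml_config("path/to/some_task.yaml", mode="simple")

    # mode="full" (the default) resolves !function tags to real callables, as before.
    full_cfg = load_yaml_config("path/to/some_task.yaml", mode="full")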
......
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "lm_eval"
-version = "0.4.0"
+version = "0.4.1"
 authors = [
     {name="EleutherAI", email="contact@eleuther.ai"}
 ]
@@ -56,15 +56,14 @@ Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
 [project.optional-dependencies]
 anthropic = ["anthropic"]
 dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy"]
-gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"]
+gptq = ["auto-gptq[triton]>=0.6.0"]
 ifeval = ["langdetect", "immutabledict"]
 mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
 math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"]
 multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
 openai = ["openai==1.3.9", "tiktoken"]
-promptsource = [
-    "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
-]
+optimum = ["optimum[openvino]"]
+promptsource = ["promptsource>=0.2.3"]
 sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
 testing = ["pytest", "pytest-cov", "pytest-xdist"]
 vllm = ["vllm<=0.2.5"]
......
@@ -23,7 +23,7 @@ def parse_args():

 if __name__ == "__main__":
     args = parse_args()

-    with open(args.benchmark_path) as file:
+    with open(args.benchmark_path, encoding="utf-8") as file:
         TASK_LIST = yaml.full_load(file)
     for task in tqdm(TASK_LIST):
         eval_logger.info(f"Processing {task}")
@@ -57,5 +57,5 @@ if __name__ == "__main__":
         file_save_path = os.path.join(file_path, full_file_name)
         eval_logger.info(f"Save to {file_save_path}")
-        with open(file_save_path, "w") as yaml_file:
+        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
             yaml.dump(config_dict, yaml_file)
@@ -119,7 +119,7 @@ class Buckets:


 def do_ngrams_in_buckets(n_value, working_directory, bucket_count):
-    pile_statistics = json.load(open("pile_statistics.json", "r"))
+    pile_statistics = json.load(open("pile_statistics.json", "r", encoding="utf-8"))
     pile_document_count = pile_statistics["Document Count"]
     start_offsets = pile_statistics["File Start Offsets"]
@@ -212,4 +212,4 @@ if __name__ == "__main__":
     info_dict = {"title": "dataset ngrams", "ngram_size": 13}
     info_dict_path = os.path.join(args.working_directory, "info.json")
-    json.dump(info_dict, open(info_dict_path, "w"))
+    json.dump(info_dict, open(info_dict_path, "w", encoding="utf-8"))
@@ -79,7 +79,7 @@ if __name__ == "__main__":
     stats_file_path = "pile_statistics.json"
     if os.path.exists(stats_file_path):
-        stats = json.load(open(stats_file_path, "r"))
+        stats = json.load(open(stats_file_path, "r", encoding="utf-8"))
     else:
         document_count, total_document_size_chars, start_offsets = get_stats()
         stats = {
@@ -88,7 +88,7 @@ if __name__ == "__main__":
             "Total Pile Characters": total_document_size_chars,
             "File Start Offsets": start_offsets,
         }
-        json.dump(stats, open(stats_file_path, "w"), indent=4)
+        json.dump(stats, open(stats_file_path, "w", encoding="utf-8"), indent=4)

     print(f"document_count: {stats['Document Count']}")
     print(f"total_chars: {stats['Total Pile Characters']}")
......
@@ -61,14 +61,14 @@ if __name__ == "__main__":
         if not filenames:
             continue
         path_readme = os.path.join(dirpath, "README.md")
-        with open(path_readme, "w") as f:
+        with open(path_readme, "w", encoding="utf-8") as f:
             # get path name, only last folder
             path_name = dirpath.split("/")[-1]
             f.write(f"# {path_name} \n\n")
         for filename in sorted([f for f in filenames if f.endswith(".json")]):
             path = os.path.join(dirpath, filename)
-            with open(path, "r") as f:
+            with open(path, "r", encoding="utf-8") as f:
                 result_dict = json.load(f)
-            with open(path_readme, "a") as f:
+            with open(path_readme, "a", encoding="utf-8") as f:
                 f.write(f"## {filename} \n")
                 f.write(f"{make_table(result_dict)} \n")
@@ -11,14 +11,13 @@ import datasets
 import pandas as pd

 from lm_eval import tasks
-from lm_eval.tasks import TASK_REGISTRY
 from lm_eval.utils import load_yaml_config

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

 datasets.disable_caching()
-tasks.initialize_tasks()
+task_manager = tasks.TaskManager


 def load_changed_files(file_path: str) -> List[str]:
@@ -74,11 +73,11 @@ def maketable(df):
     ]
     values = []
     if not df:
-        _tasks = tasks.TASK_REGISTRY.items()
+        _tasks = task_manager.TASK_REGISTRY.items()
         _tasks = sorted(_tasks, key=lambda x: x[0])
     else:
         task_classes = new_tasks()
-        _tasks = [(x, TASK_REGISTRY.get(x)) for x in task_classes]
+        _tasks = [(x, task_manager.TASK_REGISTRY.get(x)) for x in task_classes]
     count = 0
     for tname, Task in _tasks:
         task = Task()
......
@@ -94,7 +94,11 @@ def eval_models(args, branch=None):
         ret = os.system(command)

-        results[model] = json.load(open(output_path)) if ret == 0 else {"results": {}}
+        results[model] = (
+            json.load(open(output_path, encoding="utf-8"))
+            if ret == 0
+            else {"results": {}}
+        )

     end_time = time.time()
......
@@ -5,7 +5,7 @@ import random
 import numpy as np

 from lm_eval import tasks
-from lm_eval.tasks import include_path, initialize_tasks
+from lm_eval.tasks import TaskManager
 from lm_eval.utils import eval_logger, join_iters
@@ -39,22 +39,21 @@ def main():
     args = parse_args()
     np.random.seed(args.seed)

-    initialize_tasks(args.verbosity)
-
     if args.include_path is not None:
         eval_logger.info(f"Including path: {args.include_path}")
-        include_path(args.include_path)
+
+    task_manager = TaskManager(args.verbosity, include_path=args.include_path)

     if args.tasks == "all_tasks":
-        task_names = tasks.ALL_TASKS
+        task_names = task_manager.all_tasks
     else:
         task_names = args.tasks.split(",")

-    task_dict = tasks.get_task_dict(task_names)
+    task_dict = tasks.get_task_dict(task_names, task_manager)

     os.makedirs(args.output_base_path, exist_ok=True)
     for task_name, task in task_dict.items():
-        if type(task) == tuple:
-            group_name, task = task
+        if isinstance(task, tuple):
+            _, task = task
         rnd = random.Random()
         rnd.seed(args.seed)
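
For orientation, a minimal sketch of the TaskManager flow this script now follows (the verbosity value and task name below are placeholders):

    from lm_eval import tasks
    from lm_eval.tasks import TaskManager

    # Indexes the built-in tasks plus any external directory passed via include_path.
    task_manager = TaskManager("INFO", include_path=None)

    print(task_manager.all_tasks[:5])  # registered task/group names

    # get_task_dict now receives the manager instead of consulting a global registry.
    task_dict = tasks.get_task_dict(["wikitext"], task_manager)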
......
@@ -69,18 +69,20 @@ def main():
         model_args = re.sub(
             "/|=",
             "__",
-            json.load(open(Path(args.data_path, model, "results.json")))["config"][
-                "model_args"
-            ],
+            json.load(
+                open(Path(args.data_path, model, "results.json"), encoding="utf-8")
+            )["config"]["model_args"],
         )

         with open(
-            Path(args.data_path, model, f"{model_args}_{task}.jsonl"), "r"
+            Path(args.data_path, model, f"{model_args}_{task}.jsonl"),
+            "r",
+            encoding="utf-8",
         ) as file:
             data = json.loads(file.read())

-        configs = json.load(open(Path(args.data_path, model, "results.json")))[
-            "configs"
-        ]
+        configs = json.load(
+            open(Path(args.data_path, model, "results.json"), encoding="utf-8")
+        )["configs"]
         config = configs[task]

         if model_index == 0:  # Only need to assemble data for the first model
@@ -124,7 +126,9 @@ def tasks_for_model(model: str, data_path: str):
         list: A list of tasks for the model.
     """
     dir_path = Path(data_path, model)
-    config = (json.load(open(Path(dir_path, "results.json")))["configs"],)
+    config = (
+        json.load(open(Path(dir_path, "results.json"), encoding="utf-8"))["configs"],
+    )
     return list(config[0].keys())
......
@@ -11,20 +11,21 @@ from lm_eval.api.instance import Instance
 from lm_eval.models.huggingface import HFLM

-tasks.initialize_tasks()
+task_manager = tasks.TaskManager()


 class Test_HFLM:
     torch.use_deterministic_algorithms(True)
+    task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])
     version_minor = sys.version_info.minor
-    multiple_choice_task = tasks.TASK_REGISTRY.get("arc_easy")()  # type: ignore
+    multiple_choice_task = task_list["arc_easy"]  # type: ignore
     multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
     MULTIPLE_CH: list[Instance] = multiple_choice_task.instances
-    generate_until_task = tasks.TASK_REGISTRY.get("gsm8k")()  # type: ignore
+    generate_until_task = task_list["gsm8k"]  # type: ignore
     generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
     generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
     generate_until: list[Instance] = generate_until_task.instances
-    rolling_task = tasks.TASK_REGISTRY.get("wikitext")()  # type: ignore
+    rolling_task = task_list["wikitext"]  # type: ignore
     rolling_task.build_all_requests(limit=10, rank=0, world_size=1)
     ROLLING: list[Instance] = rolling_task.instances
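
The fixture above leans on `load_task_or_group` returning a mapping from task name to an instantiated task; a rough sketch of how that is consumed (shapes inferred from the diff, not verified):

    task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"])

    arc_easy = task_list["arc_easy"]  # an instantiated task object
    arc_easy.build_all_requests(limit=10, rank=0, world_size=1)
    print(len(arc_easy.instances))    # one Instance per request the task will send to the model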
......
import random
import tempfile

import pytest
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

import lm_eval.evaluator as evaluator
from lm_eval.api.registry import get_model

SUPPORTED_ARCHITECTURES_TASKS = {
    "facebook/opt-125m": "lambada_openai",
    "hf-internal-testing/tiny-random-gpt2": "wikitext",
}


@pytest.mark.parametrize("model_id,task", SUPPORTED_ARCHITECTURES_TASKS.items())
def test_evaluator(model_id, task):
    with tempfile.TemporaryDirectory() as tmpdirname:
        model = OVModelForCausalLM.from_pretrained(
            model_id, export=True, use_cache=True
        )
        model.save_pretrained(tmpdirname)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        tokenizer.save_pretrained(tmpdirname)

        lm = get_model("openvino").create_from_arg_string(
            f"pretrained={tmpdirname}",
            {
                "batch_size": 1,
                "device": "cpu",
            },
        )

        def ll_fn(reqs):
            for ctx, cont in [req.args for req in reqs]:
                if len(ctx) == 0:
                    continue
                # space convention
                assert ctx[-1] != " "
                assert cont[0] == " " or ctx[-1] == "\n"

            res = []
            random.seed(42)
            for _ in reqs:
                res.append((-random.random(), False))

            return res

        def ll_perp_fn(reqs):
            for (string,) in [req.args for req in reqs]:
                assert isinstance(string, str)

            res = []
            random.seed(42)
            for _ in reqs:
                res.append(-random.random())

            return res

        lm.loglikelihood = ll_fn
        lm.loglikelihood_rolling = ll_perp_fn

        limit = 10
        evaluator.simple_evaluate(
            model=lm,
            tasks=[task],
            num_fewshot=0,
            limit=limit,
            bootstrap_iters=10,
        )