Commit cb8889cc authored by lintangsutawika

merged with latest update from main

parents ec05e561 74119471
......@@ -41,3 +41,4 @@ filter_list:
- function: "take_first"
metadata:
version: 2.0
num_fewshot: 8
......@@ -24,6 +24,7 @@ generation_kwargs:
- "\n\n"
- "Question:"
do_sample: false
temperature: 0.0
repeats: 1
num_fewshot: 5
filter_list:
......
......@@ -22,3 +22,4 @@ metric_list:
num_fewshot: 0
metadata:
version: 1.0
num_fewshot: 4
......@@ -85,13 +85,13 @@ if __name__ == "__main__":
# get filename of base_yaml so we can `"include": ` it in our "other" YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path) as f:
with open(args.base_yaml_path, encoding="utf-8") as f:
base_yaml = yaml.full_load(f)
if args.cot_prompt_path is not None:
import json
with open(args.cot_prompt_path) as f:
with open(args.cot_prompt_path, encoding="utf-8") as f:
cot_file = json.load(f)
ALL_CATEGORIES = []
......@@ -120,7 +120,7 @@ if __name__ == "__main__":
file_save_path = args.save_prefix_path + f"_{subject}.yaml"
eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
......@@ -142,7 +142,7 @@ if __name__ == "__main__":
file_save_path = args.save_prefix_path + ".yaml"
eval_logger.info(f"Saving benchmark config to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(
{
"group": f"mmlu_{args.task_prefix}"
......
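Note on the encoding changes in the hunks above (and the similar hunks further down): without an explicit encoding, open() falls back to locale.getpreferredencoding(), which is often cp1252 on Windows and can garble or reject UTF-8 task configs. A minimal sketch of the failure mode, with a hypothetical file name:

# Minimal sketch (hypothetical file name) of why the explicit encoding matters.
import locale

def read_config(path: str = "example_task.yaml") -> str:
    # The implicit default is platform-dependent and frequently not UTF-8.
    print("implicit default:", locale.getpreferredencoding(False))
    # Passing encoding="utf-8" makes the read reproducible across platforms.
    with open(path, encoding="utf-8") as f:
        return f.read()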
......@@ -9,7 +9,7 @@ def main() -> None:
for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()):
file_name = f"{task}.yaml"
try:
with open(f"{file_name}", "w") as f:
with open(f"{file_name}", "w", encoding="utf-8") as f:
f.write("# Generated by _generate_configs.py\n")
yaml.dump(
{
......
......@@ -9,7 +9,7 @@ def main() -> None:
for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()):
file_name = f"{task}.yaml"
try:
with open(f"{file_name}", "w") as f:
with open(f"{file_name}", "w", encoding="utf-8") as f:
f.write("# Generated by _generate_configs.py\n")
yaml.dump(
{
......
......@@ -50,7 +50,7 @@ def process_docs(dataset, set_answer_type="bool"):
obs_list["abstract"].append(abstract)
obs_list["question"].append(question)
obs_list["answer_type"].append(answer_type)
if type(answer) == list:
if isinstance(answer, list):
answer = ", ".join(answer)
obs_list["answer"].append(answer)
......
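The isinstance() change above is the idiomatic form and also covers list subclasses, which an exact type() comparison silently misses. A quick illustration:

# Quick illustration: exact type checks miss subclasses, isinstance() does not.
class AnswerList(list):
    pass

answer = AnswerList(["yes", "no"])
print(type(answer) == list)       # False
print(isinstance(answer, list))   # True
print(", ".join(answer))          # "yes, no", mirroring process_docs above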
group: scrolls
task:
# - task: scrolls_qasper
# class: !function task.Qasper
- task: scrolls_qasper
class: !function task.Qasper
- task: scrolls_quality
class: !function task.QuALITY
# - scrolls_narrativeqa
# class: !function task.NarrativeQA
# - scrolls_contractnli
# class: !function task.ContractNLI
# - scrolls_govreport
# class: !function task.GovReport
# - scrolls_summscreenfd
# class: !function task.SummScreenFD
# - scrolls_qmsum
# class: !function task.QMSum
- task: scrolls_narrativeqa
class: !function task.NarrativeQA
- task: scrolls_contractnli
class: !function task.ContractNLI
- task: scrolls_govreport
class: !function task.GovReport
- task: scrolls_summscreenfd
class: !function task.SummScreenFD
- task: scrolls_qmsum
class: !function task.QMSum
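With the commented entries re-enabled, the scrolls group now expands to all seven subtasks. A hedged usage sketch, assuming the v0.4 simple_evaluate Python API; the checkpoint name and limit are placeholders for a quick smoke test, not a recommended setup:

# Hedged sketch, assuming the v0.4 simple_evaluate API; model and limit are placeholders.
from lm_eval import simple_evaluate

results = simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["scrolls"],  # expands to the scrolls_* subtasks listed above
    limit=8,
)
print(list(results["results"].keys()))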
......@@ -115,8 +115,10 @@ class _SCROLLSTask(Task):
PRUNE_MAX_TOKENS = None
PRUNE_NUM_PROC = None
def __post_init__(self):
self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
def __init__(self):
super().__init__()
if self.DATASET_NAME is not None:
self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
def has_training_docs(self):
return True
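The hunk above replaces __post_init__ (a dataclass-only hook that never ran on this plain class) with an explicit __init__, guarded so the abstract base with DATASET_NAME = None can still be constructed. A minimal sketch of the pattern, with stand-in names rather than the real Task hierarchy:

# Minimal sketch (stand-in names): __post_init__ only fires for dataclasses,
# so a plain base class needs an explicit __init__, and the DATASET_NAME guard
# lets abstract bases skip metric loading.
class BaseTaskSketch:
    DATASET_NAME = None

    def __init__(self):
        if self.DATASET_NAME is not None:
            self.metric = f"metric<{self.DATASET_NAME}>"  # stand-in for load_metric()

class QasperSketch(BaseTaskSketch):
    DATASET_NAME = "qasper"

assert not hasattr(BaseTaskSketch(), "metric")
assert QasperSketch().metric == "metric<qasper>"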
......@@ -224,9 +226,10 @@ class _SCROLLSMultipleChoiceTask(_SCROLLSTask):
def process_results(self, doc, results):
gold = doc["gold"]
acc = 1.0 if np.argmax(results) == gold else 0.0
lls, _ = zip(*results)
acc = 1.0 if np.argmax(lls) == gold else 0.0
completion_len = np.array([float(len(i)) for i in doc["choices"]])
acc_norm = 1.0 if np.argmax(results / completion_len) == gold else 0.0
acc_norm = 1.0 if np.argmax(lls / completion_len) == gold else 0.0
return {
"acc": acc,
......@@ -279,7 +282,6 @@ class _SCROLLSSummaryTask(_SCROLLSTask):
return f"{doc['input']}\n\nQuestion: What is a summary of the preceding text?\nAnswer:"
# @register_task("scrolls_qasper")
class Qasper(_SCROLLSTask):
"""A Dataset of Information-Seeking Questions and Answers Anchored in Research Papers
https://arxiv.org/abs/2105.03011
......@@ -337,7 +339,6 @@ class Qasper(_SCROLLSTask):
)
# @register_task("scrolls_quality")
class QuALITY(_SCROLLSMultipleChoiceTask):
"""QuALITY: Question Answering with Long Input Texts, Yes!
https://arxiv.org/abs/2112.08608
......@@ -366,7 +367,6 @@ class QuALITY(_SCROLLSMultipleChoiceTask):
return [doc]
# @register_task("scrolls_narrativeqa")
class NarrativeQA(_SCROLLSTask):
"""The NarrativeQA Reading Comprehension Challenge
https://arxiv.org/abs/1712.07040
......@@ -400,7 +400,6 @@ class NarrativeQA(_SCROLLSTask):
)
# @register_task("scrolls_contractnli")
class ContractNLI(_SCROLLSMultipleChoiceTask):
"""ContractNLI: A Dataset for Document-level Natural Language Inference for Contracts
https://arxiv.org/abs/2110.01799
......@@ -419,7 +418,6 @@ class ContractNLI(_SCROLLSMultipleChoiceTask):
return f"{doc['text']}\n\nHypothesis: {doc['question']}\nConclusion:"
# @register_task("scrolls_govreport")
class GovReport(_SCROLLSSummaryTask):
"""Efficient Attentions for Long Document Summarization
https://arxiv.org/abs/2104.02112
......@@ -433,7 +431,6 @@ class GovReport(_SCROLLSSummaryTask):
DATASET_NAME = "gov_report"
# @register_task("scrolls_summscreenfd")
class SummScreenFD(_SCROLLSSummaryTask):
"""SummScreen: A Dataset for Abstractive Screenplay Summarization
https://arxiv.org/abs/2104.07091
......@@ -442,7 +439,6 @@ class SummScreenFD(_SCROLLSSummaryTask):
DATASET_NAME = "summ_screen_fd"
# @register_task("scrolls_qmsum")
class QMSum(_SCROLLSSummaryTask):
"""QMSum: A New Benchmark for Query-based Multi-domain
Meeting Summarization
......
......@@ -19,7 +19,7 @@ from math import exp
from functools import partial
from packaging import version
from lm_eval.api.task import Task
from lm_eval.api.task import ConfigurableTask
from lm_eval.api.instance import Instance
_CITATION = """
......@@ -46,11 +46,14 @@ def _squad_agg(key, items):
return _squad_metric(predictions=predictions, references=references).get(key, 0)
class SQuAD2(Task):
class SQuAD2(ConfigurableTask):
VERSION = 3
DATASET_PATH = "squad_v2"
DATASET_NAME = None
def __init__(self):
super().__init__(config={'metadata': {'version': self.VERSION}})
# HF changed squad on us so we have to make sure we aren't running the old one
assert version.parse(datasets.__version__) >= version.parse(
"1.11.0"
......
......@@ -7,6 +7,7 @@ training_split: train
validation_split: validation
output_type: generate_until
doc_to_text: !function "t5_utils.doc_to_text"
process_results: !function "t5_utils.process_results"
doc_to_target: label
generation_kwargs:
until:
......@@ -15,9 +16,5 @@ metric_list:
- metric: accuracy
aggregation: mean
higher_is_better: true
filter_list:
- name: "wsc_postprocessor"
filter:
- function: !function t5_utils.WSCPostprocess
metadata:
version: 0.0
version: 1.0
import re
from lm_eval.api.filter import Filter
from typing import List
def doc_to_text(x):
text = re.sub(r" X ", " *" + x["span2_text"] + "* ", _wsc_inputs(x))
......@@ -24,14 +23,14 @@ def _wsc_inputs(x):
[
" ".join(words[:pronoun_index]),
"X",
" ".join(words[pronoun_index + 1 :]),
" ".join(words[pronoun_index + 1:]),
]
)
# Handle some special cases.
if (
x["text"]
== 'The boy continued to whip the pony , and eventually the pony threw him over. John laughed out quite loud. "Good for him," he said. '
x["text"]
== 'The boy continued to whip the pony , and eventually the pony threw him over. John laughed out quite loud. "Good for him," he said. '
):
return (
"The boy continued to whip the pony , and eventually the pony threw "
......@@ -40,8 +39,8 @@ def _wsc_inputs(x):
# Using the span2_index, we get 'use' instead of 'it'.
if (
x["text"]
== "When they had eventually calmed down a bit , and had gotten home, Mr. Farley put the magic pebble in an iron safe . Some day they might want to use it , but really for now, what more could they wish for?"
x["text"]
== "When they had eventually calmed down a bit , and had gotten home, Mr. Farley put the magic pebble in an iron safe . Some day they might want to use it , but really for now, what more could they wish for?"
):
return (
"When they had eventually calmed down a bit , and had gotten home, "
......@@ -52,56 +51,53 @@ def _wsc_inputs(x):
return create_input()
class WSCPostprocess(Filter):
def __init__(self, **kwargs):
self.determiners = {
"a",
"an",
"few",
"her",
"his",
"each",
"every",
"many",
"much",
"my",
"our",
"some",
"that",
"the",
"their",
"these",
"this",
"those",
"which",
"whose",
"your",
}
def clean(self, s):
"""Ignore capitalization and determiners."""
s = s.strip().lower()
return " ".join([w for w in s.split(" ") if w not in self.determiners])
def apply(self, resps, docs):
filtered_resps = []
for prediction, reference in zip(*(resps, docs["span1_text"])):
prediction = self.clean(prediction[0])
reference = self.clean(reference)
if ("'" in prediction) != ("'" in reference):
# Make sure we don't mark cases where the prediction is "Bob" and the
# referent is "Bob's hat" as predicting the referent.
predicted_referent = False
else:
prediction_words = set(prediction.split(" "))
referent_words = set(reference.split(" "))
# Handle cases where the prediction is "fuzzy bunny" and the referent is
# "bunny".
predicted_referent = prediction_words.issubset(
referent_words
) or referent_words.issubset(prediction_words)
filtered_resps.append(predicted_referent)
return filtered_resps
DETERMINERS = {
"a",
"an",
"few",
"her",
"his",
"each",
"every",
"many",
"much",
"my",
"our",
"some",
"that",
"the",
"their",
"these",
"this",
"those",
"which",
"whose",
"your",
}
def clean(s: str) -> str:
"""Ignore capitalization and determiners."""
s = s.strip().lower()
return " ".join([w for w in s.split(" ") if w not in DETERMINERS])
def process_results(docs: dict, resps: List):
prediction = clean(resps[0])
reference = clean(docs["span1_text"])
if ("'" in prediction) != ("'" in reference):
# Make sure we don't mark cases where the prediction is "Bob" and the
# referent is "Bob's hat" as predicting the referent.
predicted_referent = False
else:
prediction_words = set(prediction.split(" "))
referent_words = set(reference.split(" "))
# Handle cases where the prediction is "fuzzy bunny" and the referent is
# "bunny".
predicted_referent = prediction_words.issubset(
referent_words
) or referent_words.issubset(prediction_words)
acc = 1.0 if predicted_referent == docs["label"] else 0.0
return {"accuracy": acc}
......@@ -51,7 +51,7 @@ def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
for lang in LANGUAGES:
file_name = f"xwinograd_{lang}.yaml"
try:
with open(f"{output_dir}/{file_name}", "w" if overwrite else "x") as f:
with open(f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf-8") as f:
f.write("# Generated by utils.py\n")
yaml.dump(
{
......
......@@ -501,14 +501,14 @@ def import_function(loader, node):
return function
def load_yaml_config(mode="simple", yaml_path=None, yaml_config=None, yaml_dir=None):
def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None, mode="full"):
if mode == "simple":
constuctor_fn = ignore_constructor
constructor_fn = ignore_constructor
elif mode == "full":
constuctor_fn = import_function
constructor_fn = import_function
# Add the import_function constructor to the YAML loader
yaml.add_constructor("!function", constuctor_fn)
yaml.add_constructor("!function", constructor_fn)
if yaml_config is None:
with open(yaml_path, "rb") as file:
yaml_config = yaml.full_load(file)
......@@ -536,7 +536,7 @@ def load_yaml_config(mode="simple", yaml_path=None, yaml_config=None, yaml_dir=N
path = os.path.join(yaml_dir, path)
try:
included_yaml_config = load_yaml_config(mode=mode, yaml_path=path)
included_yaml_config = load_yaml_config(yaml_path=path, mode=mode)
final_yaml_config.update(included_yaml_config)
except Exception as ex:
# If failed to load, ignore
......
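Beyond the constuctor_fn typo fix, the signature now defaults to mode="full" so !function tags are resolved unless a caller explicitly asks for "simple", and the recursive include call passes its arguments by keyword so the parameter reordering stays safe. A hedged usage sketch (assuming the function lives in lm_eval.utils as in v0.4; the yaml path is hypothetical):

# Hedged sketch: load a task config without importing !function targets.
# The import location and the yaml path are assumptions for illustration.
from lm_eval.utils import load_yaml_config

cfg = load_yaml_config(yaml_path="lm_eval/tasks/scrolls/scrolls.yaml", mode="simple")
print(cfg.get("group"), cfg.get("task"))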
......@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "lm_eval"
version = "0.4.0"
version = "0.4.1"
authors = [
{name="EleutherAI", email="contact@eleuther.ai"}
]
......@@ -56,15 +56,14 @@ Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
[project.optional-dependencies]
anthropic = ["anthropic"]
dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy"]
gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"]
gptq = ["auto-gptq[triton]>=0.6.0"]
ifeval = ["langdetect", "immutabledict"]
mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
openai = ["openai==1.3.9", "tiktoken"]
promptsource = [
"promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
]
optimum = ["optimum[openvino]"]
promptsource = ["promptsource>=0.2.3"]
sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
testing = ["pytest", "pytest-cov", "pytest-xdist"]
vllm = ["vllm<=0.2.5"]
......
......@@ -23,7 +23,7 @@ def parse_args():
if __name__ == "__main__":
args = parse_args()
with open(args.benchmark_path) as file:
with open(args.benchmark_path, encoding="utf-8") as file:
TASK_LIST = yaml.full_load(file)
for task in tqdm(TASK_LIST):
eval_logger.info(f"Processing {task}")
......@@ -57,5 +57,5 @@ if __name__ == "__main__":
file_save_path = os.path.join(file_path, full_file_name)
eval_logger.info(f"Save to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(config_dict, yaml_file)
......@@ -119,7 +119,7 @@ class Buckets:
def do_ngrams_in_buckets(n_value, working_directory, bucket_count):
pile_statistics = json.load(open("pile_statistics.json", "r"))
pile_statistics = json.load(open("pile_statistics.json", "r", encoding="utf-8"))
pile_document_count = pile_statistics["Document Count"]
start_offsets = pile_statistics["File Start Offsets"]
......@@ -212,4 +212,4 @@ if __name__ == "__main__":
info_dict = {"title": "dataset ngrams", "ngram_size": 13}
info_dict_path = os.path.join(args.working_directory, "info.json")
json.dump(info_dict, open(info_dict_path, "w"))
json.dump(info_dict, open(info_dict_path, "w", encoding="utf-8"))
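One step further than the hunk above (not part of this diff): the bare open() inside json.dump is never closed explicitly, so a context manager would flush and close the handle deterministically. A small self-contained sketch with placeholder values mirroring the script:

# Not part of the diff: the same write with the handle closed deterministically.
# Names mirror the script above; the values here are placeholders.
import json
import os

info_dict = {"title": "dataset ngrams", "ngram_size": 13}
info_dict_path = os.path.join(".", "info.json")
with open(info_dict_path, "w", encoding="utf-8") as f:
    json.dump(info_dict, f)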
......@@ -79,7 +79,7 @@ if __name__ == "__main__":
stats_file_path = "pile_statistics.json"
if os.path.exists(stats_file_path):
stats = json.load(open(stats_file_path, "r"))
stats = json.load(open(stats_file_path, "r", encoding="utf-8"))
else:
document_count, total_document_size_chars, start_offsets = get_stats()
stats = {
......@@ -88,7 +88,7 @@ if __name__ == "__main__":
"Total Pile Characters": total_document_size_chars,
"File Start Offsets": start_offsets,
}
json.dump(stats, open(stats_file_path, "w"), indent=4)
json.dump(stats, open(stats_file_path, "w", encoding="utf-8"), indent=4)
print(f"document_count: {stats['Document Count']}")
print(f"total_chars: {stats['Total Pile Characters']}")
......
......@@ -61,14 +61,14 @@ if __name__ == "__main__":
if not filenames:
continue
path_readme = os.path.join(dirpath, "README.md")
with open(path_readme, "w") as f:
with open(path_readme, "w", encoding="utf-8") as f:
# get path name, only last folder
path_name = dirpath.split("/")[-1]
f.write(f"# {path_name} \n\n")
for filename in sorted([f for f in filenames if f.endswith(".json")]):
path = os.path.join(dirpath, filename)
with open(path, "r") as f:
with open(path, "r", encoding="utf-8") as f:
result_dict = json.load(f)
with open(path_readme, "a") as f:
with open(path_readme, "a", encoding="utf-8") as f:
f.write(f"## {filename} \n")
f.write(f"{make_table(result_dict)} \n")
......@@ -50,5 +50,5 @@ if __name__ == "__main__":
values.append(v)
writer.value_matrix = values
table = writer.dumps()
with open(args.output, "w") as f:
with open(args.output, "w", encoding="utf-8") as f:
f.write(table)