Unverified Commit da211969 authored by Jess, committed by GitHub

Merge branch 'EleutherAI:main' into main

parents 1b97e487 801322e0
include: _paloma_template
task: paloma_wikitext_103
task_alias: Wikitext-103
dataset_name: wikitext_103
@@ -19,3 +19,5 @@ metric_list:
     higher_is_better: true
 metadata:
   version: 1.0
+dataset_kwargs:
+  trust_remote_code: true
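A note on the `dataset_kwargs` lines added above: in lm-evaluation-harness, keys under `dataset_kwargs` are forwarded to `datasets.load_dataset` when the task's data is loaded, so the change is roughly equivalent to the call sketched below. The dataset path and config name are illustrative placeholders, not values taken from this diff.

```python
from datasets import load_dataset

# Rough equivalent of the YAML change: dataset_kwargs entries become keyword
# arguments to load_dataset. Path and config below are hypothetical placeholders.
ds = load_dataset(
    "some-org/some-dataset",   # hypothetical dataset_path
    name="some_config",        # hypothetical dataset_name
    trust_remote_code=True,    # allow the dataset's custom loading script to run
)
```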
 include: polemo2_in.yaml
 task: polemo2_out
 dataset_path: allegro/klej-polemo2-out
-dataset_name: klej-polemo2-out
+dataset_name: null
@@ -4,12 +4,12 @@ from functools import reduce

 import numpy as np
 import transformers.data.metrics.squad_metrics as squad_metrics
-from datasets import load_metric
+from datasets import Dataset, load_metric
 from transformers import AutoTokenizer

 from lm_eval.api.instance import Instance
 from lm_eval.api.metrics import mean
-from lm_eval.api.task import Task
+from lm_eval.api.task import ConfigurableTask


 _CITATION = """
@@ -108,7 +108,7 @@ def _num_cpu_cores():
     return len(os.sched_getaffinity(0))


-class _SCROLLSTask(Task):
+class _SCROLLSTask(ConfigurableTask):
     VERSION = 2
     DATASET_PATH = "tau/scrolls"
     DATASET_NAME = None
@@ -117,7 +117,7 @@ class _SCROLLSTask(Task):
     PRUNE_NUM_PROC = None

     def __init__(self):
-        super().__init__()
+        super().__init__(config={"metadata": {"version": self.VERSION}})
         if self.DATASET_NAME is not None:
             self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
@@ -131,12 +131,26 @@ class _SCROLLSTask(Task):
         return False

     def training_docs(self):
-        for doc in self.dataset["train"]:
-            yield from self._process_doc(doc)
+        processed_docs = list(map(self._process_doc, self.dataset["train"]))
+
+        # Flatten the list of lists since _process_doc returns a list of one element.
+        processed_docs = [item for sublist in processed_docs for item in sublist]
+        processed_dict = {
+            key: [d[key] for d in processed_docs] for key in processed_docs[0]
+        }
+        return Dataset.from_dict(processed_dict)

     def validation_docs(self):
-        for doc in self.dataset["validation"]:
-            yield from self._process_doc(doc)
+        processed_docs = list(map(self._process_doc, self.dataset["validation"]))
+
+        # Flatten the list of lists since _process_doc returns a list of one element.
+        processed_docs = [item for sublist in processed_docs for item in sublist]
+        processed_dict = {
+            key: [d[key] for d in processed_docs] for key in processed_docs[0]
+        }
+        return Dataset.from_dict(processed_dict)

     def should_decontaminate(self):
         return True
...
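The rewritten `training_docs`/`validation_docs` above return a `datasets.Dataset` instead of yielding raw documents. A minimal sketch of the flatten-and-pivot step with made-up documents, assuming `_process_doc` returns a one-element list per input document:

```python
from datasets import Dataset

# Two toy documents, each already wrapped in a one-element list by _process_doc.
processed_docs = [[{"id": "a", "label": 0}], [{"id": "b", "label": 1}]]

# Flatten the list of lists, then pivot the row dicts into column lists.
processed_docs = [item for sublist in processed_docs for item in sublist]
processed_dict = {key: [d[key] for d in processed_docs] for key in processed_docs[0]}

ds = Dataset.from_dict(processed_dict)
print(ds.num_rows)  # 2
print(ds[0])        # {'id': 'a', 'label': 0}
```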
@@ -6,10 +6,7 @@ training_split: train
 validation_split: validation
 doc_to_text: "Q: {{context}} {{question}}\nA:"
 target_delimiter: " "
-doc_to_choice:
-  - "{{answerA}}"
-  - "{{answerB}}"
-  - "{{answerC}}"
+doc_to_choice: "{{[answerA, answerB, answerC]}}"
 doc_to_target: "{{ (label|int) - 1 }}"
 metric_list:
   - metric: acc
...
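The `doc_to_choice` change above replaces a per-choice list of templates with a single Jinja expression that renders the three answer fields as one list literal. A small illustration of the template semantics (not harness code; the field values are invented), showing that the rendered string can be parsed back into a list of choices:

```python
import ast
from jinja2 import Template

doc = {"answerA": "go home", "answerB": "stay", "answerC": "call a friend"}
rendered = Template("{{[answerA, answerB, answerC]}}").render(**doc)
print(rendered)                    # "['go home', 'stay', 'call a friend']" (a string)
print(ast.literal_eval(rendered))  # back to a real Python list of the three choices
```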
"""
"""
import re import re
from typing import List from typing import List
......
@@ -13,6 +13,7 @@ also determine when no answer is supported by the paragraph and abstain from answering.
 Homepage: https://rajpurkar.github.io/SQuAD-explorer/
 """
 from functools import partial
 from math import exp
...
@@ -2,49 +2,31 @@
 ### Paper

-Title: `Few-shot Learning with Multilingual Language Models`
-Abstract: `https://arxiv.org/abs/2112.10668`
+Title: `A Corpus and Evaluation Framework for Deeper Understanding of Commonsense Stories`
+Abstract: `https://arxiv.org/abs/1604.01696`

-XStoryCloze consists of the professionally translated version of the [English StoryCloze dataset](https://cs.rochester.edu/nlp/rocstories/) (Spring 2016 version) to 10 non-English languages. This dataset is released by Meta AI.
+Homepage: https://cs.rochester.edu/nlp/rocstories/

-Homepage: https://github.com/facebookresearch/fairseq/pull/4820
+'Story Cloze Test' is a new commonsense reasoning framework for evaluating story understanding, story generation, and script learning. This test requires a system to choose the correct ending to a four-sentence story.

 ### Citation

 ```
-@article{DBLP:journals/corr/abs-2112-10668,
-  author     = {Xi Victoria Lin and
-                Todor Mihaylov and
-                Mikel Artetxe and
-                Tianlu Wang and
-                Shuohui Chen and
-                Daniel Simig and
-                Myle Ott and
-                Naman Goyal and
-                Shruti Bhosale and
-                Jingfei Du and
-                Ramakanth Pasunuru and
-                Sam Shleifer and
-                Punit Singh Koura and
-                Vishrav Chaudhary and
-                Brian O'Horo and
-                Jeff Wang and
-                Luke Zettlemoyer and
-                Zornitsa Kozareva and
-                Mona T. Diab and
-                Veselin Stoyanov and
-                Xian Li},
-  title      = {Few-shot Learning with Multilingual Language Models},
-  journal    = {CoRR},
-  volume     = {abs/2112.10668},
-  year       = {2021},
-  url        = {https://arxiv.org/abs/2112.10668},
-  eprinttype = {arXiv},
-  eprint     = {2112.10668},
-  timestamp  = {Tue, 04 Jan 2022 15:59:27 +0100},
-  biburl     = {https://dblp.org/rec/journals/corr/abs-2112-10668.bib},
-  bibsource  = {dblp computer science bibliography, https://dblp.org}
+@misc{mostafazadeh2016corpus,
+      title={A Corpus and Evaluation Framework for Deeper Understanding of Commonsense Stories},
+      author={Nasrin Mostafazadeh and
+              Nathanael Chambers and
+              Xiaodong He and
+              Devi Parikh and
+              Dhruv Batra and
+              Lucy Vanderwende and
+              Pushmeet Kohli and
+              James Allen},
+      year={2016},
+      eprint={1604.01696},
+      archivePrefix={arXiv},
+      primaryClass={cs.CL}
 }
 ```
...
""" This code mirrors the utils of the original winogrande task """ """This code mirrors the utils of the original winogrande task"""
def doc_to_text(doc): def doc_to_text(doc):
......
""" """
Take in a YAML, and output all "other" splits with this YAML Take in a YAML, and output all "other" splits with this YAML
""" """
import argparse import argparse
import os import os
......
@@ -26,6 +26,11 @@ eval_logger = logging.getLogger("lm-eval")

 SPACING = " " * 47

+HIGHER_IS_BETTER_SYMBOLS = {
+    True: "↑",
+    False: "↓",
+}
+

 def hash_string(string: str) -> str:
     return hashlib.sha256(string.encode("utf-8")).hexdigest()
@@ -76,6 +81,18 @@ def handle_non_serializable(o):
         return str(o)


+def sanitize_list(sub):
+    """
+    Takes possible nested list and recursively converts all inner component to strings
+    """
+    if isinstance(sub, list):
+        return [sanitize_list(item) for item in sub]
+    if isinstance(sub, tuple):
+        return tuple(sanitize_list(item) for item in sub)
+    else:
+        return str(sub)
+
+
 def simple_parse_args_string(args_string):
     """
     Parses something like
@@ -135,6 +152,55 @@ def general_detokenize(string):
     return string


+def get_file_task_name(filename: str) -> str:
+    """
+    Given the sample results filenames, extracts and returns the task name.
+    """
+    return filename[filename.find("_") + 1 : filename.rfind("_")]
+
+
+def get_file_datetime(filename: str) -> str:
+    """
+    Given the results and sample results filenames, extracts and returns the datetime.
+    """
+    return filename[filename.rfind("_") + 1 :].replace(".json", "")
+
+
+def sanitize_model_name(model_name: str) -> str:
+    """
+    Given the model name, returns a sanitized version of it.
+    """
+    return re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", model_name)
+
+
+def sanitize_task_name(task_name: str) -> str:
+    """
+    Given the task name, returns a sanitized version of it.
+    """
+    return re.sub(r"\W", "_", task_name)
+
+
+def get_latest_filename(filenames: List[str]) -> str:
+    """
+    Given a list of filenames, returns the filename with the latest datetime.
+    """
+    return max(filenames, key=lambda f: get_file_datetime(f))
+
+
+def get_results_filenames(filenames: List[str]) -> List[str]:
+    """
+    Extracts filenames that correspond to aggregated results.
+    """
+    return [f for f in filenames if "/results_" in f and ".json" in f]
+
+
+def get_sample_results_filenames(filenames: List[str]) -> List[str]:
+    """
+    Extracts filenames that correspond to sample results.
+    """
+    return [f for f in filenames if "/samples_" in f and ".json" in f]
+
+
 def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
     """
     - context_len allows for a rolling window context, allowing each prediction window to potentially
@@ -257,6 +323,7 @@ def make_table(result_dict, column: str = "results", sort_results: bool = True):
         "Filter",
         "n-shot",
         "Metric",
+        "",
         "Value",
         "",
         "Stderr",
@@ -277,22 +344,29 @@ def make_table(result_dict, column: str = "results", sort_results: bool = True):
         dic = result_dict[column][k]
         version = result_dict["versions"].get(k, "N/A")
         n = str(result_dict["n-shot"][k])
+        higher_is_better = result_dict.get("higher_is_better", {}).get(k, {})

         if "alias" in dic:
             k = dic.pop("alias")

-        for (mf), v in dic.items():
+        metric_items = dic.items()
+        if sort_results:
+            metric_items = sorted(metric_items)
+
+        for (mf), v in metric_items:
             m, _, f = mf.partition(",")
             if m.endswith("_stderr"):
                 continue

+            hib = HIGHER_IS_BETTER_SYMBOLS.get(higher_is_better.get(m), "")
+
             if m + "_stderr" + "," + f in dic:
                 se = dic[m + "_stderr" + "," + f]
                 if se != "N/A":
                     se = "%.4f" % se
-                values.append([k, version, f, n, m, "%.4f" % v, "±", se])
+                values.append([k, version, f, n, m, hib, "%.4f" % v, "±", se])
             else:
-                values.append([k, version, f, n, m, "%.4f" % v, "", ""])
+                values.append([k, version, f, n, m, hib, "%.4f" % v, "", ""])
             k = ""
             version = ""
     md_writer.value_matrix = values
...
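The filename helpers added in the hunk above assume output files whose names end in a datetime that sorts lexicographically, e.g. aggregated `results_<datetime>.json` files and per-task `samples_<task>_<datetime>.json` files (the exact names here are illustrative). A short usage sketch with invented paths:

```python
from lm_eval.utils import (
    get_file_datetime,
    get_file_task_name,
    get_latest_filename,
    get_results_filenames,
)

# Invented filenames, only to illustrate the naming scheme the helpers expect.
files = [
    "outputs/my-model/results_2024-05-01T10-00-00.json",
    "outputs/my-model/results_2024-05-02T09-30-00.json",
    "outputs/my-model/samples_hellaswag_2024-05-02T09-30-00.json",
]

print(get_results_filenames(files))
# both results_*.json paths, but not the samples file
print(get_latest_filename(get_results_filenames(files)))
# 'outputs/my-model/results_2024-05-02T09-30-00.json'
print(get_file_task_name("samples_hellaswag_2024-05-02T09-30-00.json"))
# 'hellaswag'
print(get_file_datetime("samples_hellaswag_2024-05-02T09-30-00.json"))
# '2024-05-02T09-30-00'
```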
@@ -19,7 +19,7 @@ classifiers = [
 requires-python = ">=3.8"
 license = { "text" = "MIT" }
 dependencies = [
-    "accelerate>=0.21.0",
+    "accelerate>=0.26.0",
     "evaluate",
     "datasets>=2.16.0",
     "evaluate>=0.4.0",
@@ -73,7 +73,7 @@ promptsource = ["promptsource>=0.2.3"]
 sentencepiece = ["sentencepiece>=0.1.98"]
 sparseml = ["sparseml-nightly[llm]>=1.8.0.20240404"]
 testing = ["pytest", "pytest-cov", "pytest-xdist"]
-vllm = ["vllm==0.3.2"]
+vllm = ["vllm>=0.4.2"]
 zeno = ["pandas", "zeno-client"]
 wandb = ["wandb>=0.16.3", "pandas", "numpy"]
 unitxt = ["unitxt"]
...
@@ -10,7 +10,7 @@ It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.1
    the match, splitting the training data into chunks
 3) Any chunks less than `minimum_slice_length` are removed
 4) Training data sets split into more than `too_dirty_cutoff` are considered
-   completey contaminated and removed
+   completely contaminated and removed

 OpenAI used:
 ```
...
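Steps 2-4 above describe the slicing behavior only in prose; a rough sketch of that logic follows, with invented names and default values (the harness's actual decontamination code differs in detail):

```python
def split_around_matches(tokens, match_spans, minimum_slice_length=200, too_dirty_cutoff=10):
    """Illustrative sketch of steps 2-4: cut a training document around contaminated
    spans, drop slices shorter than minimum_slice_length, and discard the document
    entirely if it splits into more than too_dirty_cutoff pieces.
    match_spans: list of (start, end) index pairs that overlap evaluation data.
    Default thresholds are assumptions, not values read from this repository."""
    slices, cursor = [], 0
    for start, end in sorted(match_spans):
        if start - cursor >= minimum_slice_length:  # step 3: keep only long-enough chunks
            slices.append(tokens[cursor:start])
        cursor = max(cursor, end)
    if len(tokens) - cursor >= minimum_slice_length:
        slices.append(tokens[cursor:])
    if len(slices) > too_dirty_cutoff:  # step 4: too fragmented => treat as fully contaminated
        return []
    return slices
```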
@@ -2,6 +2,7 @@
 Usage:
    python make_table_tasks.py --output <markdown_filename>
 """
 import json
 import logging
 import os
...
@@ -2,6 +2,7 @@
 Usage:
    python make_table_tasks.py --output <markdown_filename>
 """
 import argparse
 import logging
...
@@ -70,6 +70,11 @@ def main():
         if docs is not None:
             iters.append(docs)

+    if len(iters) == 0:
+        raise ValueError(
+            f"Passed --sets '{args.sets}' but this task has no splits which match. Please specify a different --sets value."
+        )
+
     docs = join_iters(iters)

     with open(
...
@@ -7,7 +7,12 @@ from pathlib import Path

 import pandas as pd
 from zeno_client import ZenoClient, ZenoMetric

-from lm_eval.utils import eval_logger
+from lm_eval.utils import (
+    eval_logger,
+    get_latest_filename,
+    get_results_filenames,
+    get_sample_results_filenames,
+)


 def parse_args():
@@ -45,13 +50,15 @@ def main():

     assert len(models) > 0, "No model directories found in the data_path."

+    # Get the tasks from the latest results file of the first model.
     tasks = set(tasks_for_model(models[0], args.data_path))

-    for model in models:  # Make sure that all models have the same tasks.
+    # Get tasks names from the latest results file for each model
+    # Get intersection of tasks for all models
+    for model in models:
         old_tasks = tasks.copy()
         task_count = len(tasks)
-        model_tasks = tasks_for_model(model, args.data_path)
+        model_tasks = set(tasks_for_model(model, args.data_path))
         tasks.intersection(set(model_tasks))

         if task_count != len(tasks):
@@ -66,22 +73,36 @@ def main():
     for task in tasks:
         # Upload data for all models
         for model_index, model in enumerate(models):
+            # Get latest results and sample results for a model
+            model_dir = Path(args.data_path, model)
+            model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
+            model_results_filenames = get_results_filenames(model_files)
+            model_sample_filenames = get_sample_results_filenames(model_files)
+            latest_results = get_latest_filename(
+                [Path(f).name for f in model_results_filenames]
+            )
+            latest_sample_results = get_latest_filename(
+                [Path(f).name for f in model_sample_filenames if task in f]
+            )
             model_args = re.sub(
                 r"[\"<>:/\|\\?\*\[\]]+",
                 "__",
                 json.load(
-                    open(Path(args.data_path, model, "results.json"), encoding="utf-8")
+                    open(Path(args.data_path, model, latest_results), encoding="utf-8")
                 )["config"]["model_args"],
             )
+            print(model_args)
+            data = []
             with open(
-                Path(args.data_path, model, f"{model_args}_{task}.jsonl"),
+                Path(args.data_path, model, latest_sample_results),
                 "r",
                 encoding="utf-8",
             ) as file:
-                data = json.loads(file.read())
+                for line in file:
+                    data.append(json.loads(line.strip()))

             configs = json.load(
-                open(Path(args.data_path, model, "results.json"), encoding="utf-8")
+                open(Path(args.data_path, model, latest_results), encoding="utf-8")
             )["configs"]
             config = configs[task]
@@ -125,10 +146,12 @@ def tasks_for_model(model: str, data_path: str):
     Returns:
         list: A list of tasks for the model.
     """
-    dir_path = Path(data_path, model)
-    config = (
-        json.load(open(Path(dir_path, "results.json"), encoding="utf-8"))["configs"],
-    )
+    # get latest model results for a given name
+    model_dir = Path(data_path, model)
+    model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
+    model_results_filenames = get_results_filenames(model_files)
+    latest_results = get_latest_filename(model_results_filenames)
+    config = (json.load(open(latest_results, encoding="utf-8"))["configs"],)
     return list(config[0].keys())
...
@@ -15,11 +15,11 @@ base_url = "https://matthoffner-ggml-llm-api.hf.space"
 def gguf_completion_mock(base_url=None, **kwargs):
     # Generate a hash from the parameters
     hash_kwargs = {"base_url": base_url, **kwargs}
-    hash = hashlib.sha256(
+    parameters_hash = hashlib.sha256(
         json.dumps(hash_kwargs, sort_keys=True).encode("utf-8")
     ).hexdigest()

-    fname = f"./tests/testdata/gguf_test_{hash}.pkl"
+    fname = f"./tests/testdata/gguf_test_{parameters_hash}.pkl"

     if os.path.exists(fname):
         with open(fname, "rb") as fh:
...
 from __future__ import annotations

+import os
 import sys
 from pathlib import Path

 import numpy as np
 import torch

-import lm_eval.tasks as tasks
+from lm_eval import tasks
 from lm_eval.api.instance import Instance
 from lm_eval.models.huggingface import HFLM

+os.environ["TOKENIZERS_PARALLELISM"] = "false"

 task_manager = tasks.TaskManager()

+TEST_STRING = "foo bar"


 class Test_HFLM:
     torch.use_deterministic_algorithms(True)
@@ -107,7 +111,7 @@ class Test_HFLM:
         file_path = dir_path / f"outputs_log_{self.version_minor}.txt"
         file_path = file_path.resolve()
-        with open(file_path, "w") as f:
+        with open(file_path, "w", encoding="utf-8") as f:
             f.write("\n".join(str(x) for x in _res))
         assert np.allclose(_res, _RES, atol=1e-2)
         # check indices for Multiple Choice
@@ -126,19 +130,19 @@ class Test_HFLM:
         assert np.allclose(res, self.ROLLING_RES, atol=1e-1)

     def test_toc_encode(self) -> None:
-        res = self.LM.tok_encode("foo bar")
+        res = self.LM.tok_encode(TEST_STRING)
         assert res == [12110, 2534]

     def test_toc_decode(self) -> None:
         res = self.LM.tok_decode([12110, 2534])
-        assert res == "foo bar"
+        assert res == TEST_STRING

     def test_batch_encode(self) -> None:
-        res = self.LM.tok_batch_encode(["foo bar", "bar foo"])[0].tolist()
+        res = self.LM.tok_batch_encode([TEST_STRING, "bar foo"])[0].tolist()
         assert res == [[12110, 2534], [2009, 17374]]

     def test_model_generate(self) -> None:
-        context = self.LM.tok_batch_encode(["foo bar"])[0]
+        context = self.LM.tok_batch_encode([TEST_STRING])[0]
         res = self.LM._model_generate(context, max_length=10, stop=["\n\n"])
         res = self.LM.tok_decode(res[0])
         assert res == "foo bar\n<bazhang>!info bar"
 import pytest

-import lm_eval.evaluator as evaluator
+from lm_eval import evaluator
 from lm_eval.api.registry import get_model
@@ -23,6 +23,7 @@ DEEPSPARSE_MODELS_TASKS = [
 ]


+@pytest.mark.skip(reason="test failing")
 @pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS)
 def test_sparseml_eval(model_id, task):
     lm = get_model("sparseml").create_from_arg_string(
...