Unverified Commit da211969 authored by Jess, committed by GitHub

Merge branch 'EleutherAI:main' into main

parents 1b97e487 801322e0
include: _paloma_template
task: paloma_wikitext_103
task_alias: Wikitext-103
dataset_name: wikitext_103
......@@ -19,3 +19,5 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
include: polemo2_in.yaml
task: polemo2_out
dataset_path: allegro/klej-polemo2-out
dataset_name: klej-polemo2-out
dataset_name: null
......@@ -4,12 +4,12 @@ from functools import reduce
import numpy as np
import transformers.data.metrics.squad_metrics as squad_metrics
from datasets import load_metric
from datasets import Dataset, load_metric
from transformers import AutoTokenizer
from lm_eval.api.instance import Instance
from lm_eval.api.metrics import mean
from lm_eval.api.task import Task
from lm_eval.api.task import ConfigurableTask
_CITATION = """
......@@ -108,7 +108,7 @@ def _num_cpu_cores():
return len(os.sched_getaffinity(0))
class _SCROLLSTask(Task):
class _SCROLLSTask(ConfigurableTask):
VERSION = 2
DATASET_PATH = "tau/scrolls"
DATASET_NAME = None
......@@ -117,7 +117,7 @@ class _SCROLLSTask(Task):
PRUNE_NUM_PROC = None
def __init__(self):
super().__init__()
super().__init__(config={"metadata": {"version": self.VERSION}})
if self.DATASET_NAME is not None:
self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME)
......@@ -131,12 +131,26 @@ class _SCROLLSTask(Task):
return False
def training_docs(self):
for doc in self.dataset["train"]:
yield from self._process_doc(doc)
processed_docs = list(map(self._process_doc, self.dataset["train"]))
# Flatten the list of lists, since _process_doc returns a single-element list.
processed_docs = [item for sublist in processed_docs for item in sublist]
processed_dict = {
key: [d[key] for d in processed_docs] for key in processed_docs[0]
}
return Dataset.from_dict(processed_dict)
def validation_docs(self):
for doc in self.dataset["validation"]:
yield from self._process_doc(doc)
processed_docs = list(map(self._process_doc, self.dataset["validation"]))
# Flatten the list of lists, since _process_doc returns a single-element list.
processed_docs = [item for sublist in processed_docs for item in sublist]
processed_dict = {
key: [d[key] for d in processed_docs] for key in processed_docs[0]
}
return Dataset.from_dict(processed_dict)
def should_decontaminate(self):
return True
......
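For readers skimming this hunk: the refactored `training_docs`/`validation_docs` now materialize the processed rows into a Hugging Face `Dataset` instead of yielding them lazily. A minimal sketch of that list-of-dicts to columnar conversion follows; the field names are placeholders, not the real SCROLLS columns.
```python
# Minimal sketch of the conversion used above; "id" and "text" are placeholder
# fields, not the actual SCROLLS columns.
from datasets import Dataset

def _process_doc(doc):
    # Mirrors the real method's contract: returns a single-element list.
    return [{"id": doc["id"], "text": doc["text"].strip()}]

rows = [{"id": "a", "text": " first "}, {"id": "b", "text": " second "}]
processed = [item for sublist in map(_process_doc, rows) for item in sublist]
columns = {key: [d[key] for d in processed] for key in processed[0]}
ds = Dataset.from_dict(columns)
print(ds[0])  # {'id': 'a', 'text': 'first'}
```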
......@@ -6,10 +6,7 @@ training_split: train
validation_split: validation
doc_to_text: "Q: {{context}} {{question}}\nA:"
target_delimiter: " "
doc_to_choice:
- "{{answerA}}"
- "{{answerB}}"
- "{{answerC}}"
doc_to_choice: "{{[answerA, answerB, answerC]}}"
doc_to_target: "{{ (label|int) - 1 }}"
metric_list:
- metric: acc
......
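A side note on the `doc_to_choice` change above: the three per-choice templates collapse into a single Jinja expression that renders to a string representation of the list. A quick illustration of the rendering; how the harness converts that rendered string back into a list of choices is assumed here, not shown in this diff.
```python
# Illustration of the Jinja rendering only; the harness-side parsing of the
# rendered string into a list is an assumption.
from jinja2 import Template

doc = {"answerA": "go home", "answerB": "stay put", "answerC": "call a friend"}
rendered = Template("{{[answerA, answerB, answerC]}}").render(**doc)
print(rendered)  # ['go home', 'stay put', 'call a friend']
```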
"""
"""
import re
from typing import List
......
......@@ -13,6 +13,7 @@ also determine when no answer is supported by the paragraph and abstain from ans
Homepage: https://rajpurkar.github.io/SQuAD-explorer/
"""
from functools import partial
from math import exp
......
......@@ -2,49 +2,31 @@
### Paper
Title: `Few-shot Learning with Multilingual Language Models`
Abstract: `https://arxiv.org/abs/2112.10668`
Title: `A Corpus and Evaluation Framework for Deeper Understanding of Commonsense Stories`
Abstract: `https://arxiv.org/abs/1604.01696`
XStoryCloze consists of professional translations of the [English StoryCloze dataset](https://cs.rochester.edu/nlp/rocstories/) (Spring 2016 version) into 10 non-English languages. This dataset is released by Meta AI.
Homepage: https://cs.rochester.edu/nlp/rocstories/
Homepage: https://github.com/facebookresearch/fairseq/pull/4820
'Story Cloze Test' is a new commonsense reasoning framework for evaluating story understanding, story generation, and script learning. This test requires a system to choose the correct ending to a four-sentence story.
### Citation
```
@article{DBLP:journals/corr/abs-2112-10668,
author = {Xi Victoria Lin and
Todor Mihaylov and
Mikel Artetxe and
Tianlu Wang and
Shuohui Chen and
Daniel Simig and
Myle Ott and
Naman Goyal and
Shruti Bhosale and
Jingfei Du and
Ramakanth Pasunuru and
Sam Shleifer and
Punit Singh Koura and
Vishrav Chaudhary and
Brian O'Horo and
Jeff Wang and
Luke Zettlemoyer and
Zornitsa Kozareva and
Mona T. Diab and
Veselin Stoyanov and
Xian Li},
title = {Few-shot Learning with Multilingual Language Models},
journal = {CoRR},
volume = {abs/2112.10668},
year = {2021},
url = {https://arxiv.org/abs/2112.10668},
eprinttype = {arXiv},
eprint = {2112.10668},
timestamp = {Tue, 04 Jan 2022 15:59:27 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2112-10668.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
@misc{mostafazadeh2016corpus,
title={A Corpus and Evaluation Framework for Deeper Understanding of Commonsense Stories},
author={Nasrin Mostafazadeh and
Nathanael Chambers and
Xiaodong He and
Devi Parikh and
Dhruv Batra and
Lucy Vanderwende and
Pushmeet Kohli and
James Allen},
year={2016},
eprint={1604.01696},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
```
......
""" This code mirrors the utils of the original winogrande task """
"""This code mirrors the utils of the original winogrande task"""
def doc_to_text(doc):
......
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import argparse
import os
......
......@@ -26,6 +26,11 @@ eval_logger = logging.getLogger("lm-eval")
SPACING = " " * 47
HIGHER_IS_BETTER_SYMBOLS = {
True: "↑",
False: "↓",
}
def hash_string(string: str) -> str:
return hashlib.sha256(string.encode("utf-8")).hexdigest()
......@@ -76,6 +81,18 @@ def handle_non_serializable(o):
return str(o)
def sanitize_list(sub):
"""
Takes a possibly nested list and recursively converts all inner components to strings
"""
if isinstance(sub, list):
return [sanitize_list(item) for item in sub]
if isinstance(sub, tuple):
return tuple(sanitize_list(item) for item in sub)
else:
return str(sub)
def simple_parse_args_string(args_string):
"""
Parses something like
......@@ -135,6 +152,55 @@ def general_detokenize(string):
return string
def get_file_task_name(filename: str) -> str:
"""
Given a sample results filename, extracts and returns the task name.
"""
return filename[filename.find("_") + 1 : filename.rfind("_")]
def get_file_datetime(filename: str) -> str:
"""
Given a results or sample results filename, extracts and returns the datetime.
"""
return filename[filename.rfind("_") + 1 :].replace(".json", "")
def sanitize_model_name(model_name: str) -> str:
"""
Given the model name, returns a sanitized version of it.
"""
return re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", model_name)
def sanitize_task_name(task_name: str) -> str:
"""
Given the task name, returns a sanitized version of it.
"""
return re.sub(r"\W", "_", task_name)
def get_latest_filename(filenames: List[str]) -> str:
"""
Given a list of filenames, returns the filename with the latest datetime.
"""
return max(filenames, key=lambda f: get_file_datetime(f))
def get_results_filenames(filenames: List[str]) -> List[str]:
"""
Extracts filenames that correspond to aggregated results.
"""
return [f for f in filenames if "/results_" in f and ".json" in f]
def get_sample_results_filenames(filenames: List[str]) -> List[str]:
"""
Extracts filenames that correspond to sample results.
"""
return [f for f in filenames if "/samples_" in f and ".json" in f]
def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
"""
- context_len allows for a rolling window context, allowing each prediction window to potentially
......@@ -257,6 +323,7 @@ def make_table(result_dict, column: str = "results", sort_results: bool = True):
"Filter",
"n-shot",
"Metric",
"",
"Value",
"",
"Stderr",
......@@ -277,22 +344,29 @@ def make_table(result_dict, column: str = "results", sort_results: bool = True):
dic = result_dict[column][k]
version = result_dict["versions"].get(k, "N/A")
n = str(result_dict["n-shot"][k])
higher_is_better = result_dict.get("higher_is_better", {}).get(k, {})
if "alias" in dic:
k = dic.pop("alias")
for (mf), v in dic.items():
metric_items = dic.items()
if sort_results:
metric_items = sorted(metric_items)
for (mf), v in metric_items:
m, _, f = mf.partition(",")
if m.endswith("_stderr"):
continue
hib = HIGHER_IS_BETTER_SYMBOLS.get(higher_is_better.get(m), "")
if m + "_stderr" + "," + f in dic:
se = dic[m + "_stderr" + "," + f]
if se != "N/A":
se = "%.4f" % se
values.append([k, version, f, n, m, "%.4f" % v, "±", se])
values.append([k, version, f, n, m, hib, "%.4f" % v, "±", se])
else:
values.append([k, version, f, n, m, "%.4f" % v, "", ""])
values.append([k, version, f, n, m, hib, "%.4f" % v, "", ""])
k = ""
version = ""
md_writer.value_matrix = values
......
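The new filename helpers above encode a simple naming convention (a `results_<datetime>.json` aggregate plus per-task `samples_<task>_<datetime>` files). A small usage sketch, with hypothetical filenames chosen to match that convention:
```python
# Hypothetical filenames following the convention the helpers above assume;
# the real files are written by the harness's output-path logic.
from lm_eval.utils import (
    get_file_datetime,
    get_file_task_name,
    get_latest_filename,
    get_results_filenames,
)

filenames = [
    "outputs/my-model/results_2024-05-01T10-00-00.json",
    "outputs/my-model/results_2024-05-02T09-30-00.json",
    "outputs/my-model/samples_hellaswag_2024-05-02T09-30-00.json",
]

results = get_results_filenames(filenames)   # keeps the two results_*.json paths
latest = get_latest_filename(results)        # picks the latest datetime suffix
print(get_file_datetime(latest))             # 2024-05-02T09-30-00
print(get_file_task_name(filenames[2]))      # hellaswag
```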
......@@ -19,7 +19,7 @@ classifiers = [
requires-python = ">=3.8"
license = { "text" = "MIT" }
dependencies = [
"accelerate>=0.21.0",
"accelerate>=0.26.0",
"evaluate",
"datasets>=2.16.0",
"evaluate>=0.4.0",
......@@ -73,7 +73,7 @@ promptsource = ["promptsource>=0.2.3"]
sentencepiece = ["sentencepiece>=0.1.98"]
sparseml = ["sparseml-nightly[llm]>=1.8.0.20240404"]
testing = ["pytest", "pytest-cov", "pytest-xdist"]
vllm = ["vllm==0.3.2"]
vllm = ["vllm>=0.4.2"]
zeno = ["pandas", "zeno-client"]
wandb = ["wandb>=0.16.3", "pandas", "numpy"]
unitxt = ["unitxt"]
......
......@@ -10,7 +10,7 @@ It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.1
the match, splitting the training data into chunks
3) Any chunks less than `minimum_slice_length` are removed
4) Training data sets split into more than `too_dirty_cutoff` are considered
completey contaminated and removed
completely contaminated and removed
OpenAI used:
```
......
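To make the slicing rules above concrete, here is a simplified sketch, not the harness's actual decontamination code; it assumes the n-gram matching step has already produced character spans of contaminated matches, and the parameter names mirror the README. The thresholds in the usage line are illustrative only.
```python
# Simplified sketch of the slicing rules described above; not the actual
# decontamination code. `dirty_spans` is assumed to come from the n-gram
# matching step as (start, end) character offsets.
def clean_slices(doc, dirty_spans, minimum_slice_length, too_dirty_cutoff):
    slices, cursor = [], 0
    for start, end in sorted(dirty_spans):
        slices.append(doc[cursor:start])  # text between matches becomes a candidate chunk
        cursor = end
    slices.append(doc[cursor:])
    if len(slices) > too_dirty_cutoff:
        return []  # considered completely contaminated and removed
    return [s for s in slices if len(s) >= minimum_slice_length]  # drop short chunks

# Illustrative thresholds, not the values elided after "OpenAI used:" above.
kept = clean_slices("a" * 500 + "LEAK" + "b" * 500, [(500, 504)],
                    minimum_slice_length=200, too_dirty_cutoff=10)
print([len(s) for s in kept])  # [500, 500]
```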
......@@ -2,6 +2,7 @@
Usage:
python make_table_tasks.py --output <markdown_filename>
"""
import json
import logging
import os
......
......@@ -2,6 +2,7 @@
Usage:
python make_table_tasks.py --output <markdown_filename>
"""
import argparse
import logging
......
......@@ -70,6 +70,11 @@ def main():
if docs is not None:
iters.append(docs)
if len(iters) == 0:
raise ValueError(
f"Passed --sets '{args.sets}' but this task has no splits which match. Please specify a different --sets value."
)
docs = join_iters(iters)
with open(
......
......@@ -7,7 +7,12 @@ from pathlib import Path
import pandas as pd
from zeno_client import ZenoClient, ZenoMetric
from lm_eval.utils import eval_logger
from lm_eval.utils import (
eval_logger,
get_latest_filename,
get_results_filenames,
get_sample_results_filenames,
)
def parse_args():
......@@ -45,13 +50,15 @@ def main():
assert len(models) > 0, "No model directories found in the data_path."
# Get the tasks from the latest results file of the first model.
tasks = set(tasks_for_model(models[0], args.data_path))
for model in models: # Make sure that all models have the same tasks.
# Get task names from the latest results file for each model
# Get intersection of tasks for all models
for model in models:
old_tasks = tasks.copy()
task_count = len(tasks)
model_tasks = tasks_for_model(model, args.data_path)
model_tasks = set(tasks_for_model(model, args.data_path))
tasks.intersection(set(model_tasks))
if task_count != len(tasks):
......@@ -66,22 +73,36 @@ def main():
for task in tasks:
# Upload data for all models
for model_index, model in enumerate(models):
# Get latest results and sample results for a model
model_dir = Path(args.data_path, model)
model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
model_results_filenames = get_results_filenames(model_files)
model_sample_filenames = get_sample_results_filenames(model_files)
latest_results = get_latest_filename(
[Path(f).name for f in model_results_filenames]
)
latest_sample_results = get_latest_filename(
[Path(f).name for f in model_sample_filenames if task in f]
)
model_args = re.sub(
r"[\"<>:/\|\\?\*\[\]]+",
"__",
json.load(
open(Path(args.data_path, model, "results.json"), encoding="utf-8")
open(Path(args.data_path, model, latest_results), encoding="utf-8")
)["config"]["model_args"],
)
print(model_args)
data = []
with open(
Path(args.data_path, model, f"{model_args}_{task}.jsonl"),
Path(args.data_path, model, latest_sample_results),
"r",
encoding="utf-8",
) as file:
data = json.loads(file.read())
for line in file:
data.append(json.loads(line.strip()))
configs = json.load(
open(Path(args.data_path, model, "results.json"), encoding="utf-8")
open(Path(args.data_path, model, latest_results), encoding="utf-8")
)["configs"]
config = configs[task]
......@@ -125,10 +146,12 @@ def tasks_for_model(model: str, data_path: str):
Returns:
list: A list of tasks for the model.
"""
dir_path = Path(data_path, model)
config = (
json.load(open(Path(dir_path, "results.json"), encoding="utf-8"))["configs"],
)
# get latest model results for a given name
model_dir = Path(data_path, model)
model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
model_results_filenames = get_results_filenames(model_files)
latest_results = get_latest_filename(model_results_filenames)
config = (json.load(open(latest_results, encoding="utf-8"))["configs"],)
return list(config[0].keys())
......
......@@ -15,11 +15,11 @@ base_url = "https://matthoffner-ggml-llm-api.hf.space"
def gguf_completion_mock(base_url=None, **kwargs):
# Generate a hash from the parameters
hash_kwargs = {"base_url": base_url, **kwargs}
hash = hashlib.sha256(
parameters_hash = hashlib.sha256(
json.dumps(hash_kwargs, sort_keys=True).encode("utf-8")
).hexdigest()
fname = f"./tests/testdata/gguf_test_{hash}.pkl"
fname = f"./tests/testdata/gguf_test_{parameters_hash}.pkl"
if os.path.exists(fname):
with open(fname, "rb") as fh:
......
from __future__ import annotations
import os
import sys
from pathlib import Path
import numpy as np
import torch
import lm_eval.tasks as tasks
from lm_eval import tasks
from lm_eval.api.instance import Instance
from lm_eval.models.huggingface import HFLM
os.environ["TOKENIZERS_PARALLELISM"] = "false"
task_manager = tasks.TaskManager()
TEST_STRING = "foo bar"
class Test_HFLM:
torch.use_deterministic_algorithms(True)
......@@ -107,7 +111,7 @@ class Test_HFLM:
file_path = dir_path / f"outputs_log_{self.version_minor}.txt"
file_path = file_path.resolve()
with open(file_path, "w") as f:
with open(file_path, "w", encoding="utf-8") as f:
f.write("\n".join(str(x) for x in _res))
assert np.allclose(_res, _RES, atol=1e-2)
# check indices for Multiple Choice
......@@ -126,19 +130,19 @@ class Test_HFLM:
assert np.allclose(res, self.ROLLING_RES, atol=1e-1)
def test_toc_encode(self) -> None:
res = self.LM.tok_encode("foo bar")
res = self.LM.tok_encode(TEST_STRING)
assert res == [12110, 2534]
def test_toc_decode(self) -> None:
res = self.LM.tok_decode([12110, 2534])
assert res == "foo bar"
assert res == TEST_STRING
def test_batch_encode(self) -> None:
res = self.LM.tok_batch_encode(["foo bar", "bar foo"])[0].tolist()
res = self.LM.tok_batch_encode([TEST_STRING, "bar foo"])[0].tolist()
assert res == [[12110, 2534], [2009, 17374]]
def test_model_generate(self) -> None:
context = self.LM.tok_batch_encode(["foo bar"])[0]
context = self.LM.tok_batch_encode([TEST_STRING])[0]
res = self.LM._model_generate(context, max_length=10, stop=["\n\n"])
res = self.LM.tok_decode(res[0])
assert res == "foo bar\n<bazhang>!info bar"
import pytest
import lm_eval.evaluator as evaluator
from lm_eval import evaluator
from lm_eval.api.registry import get_model
......@@ -23,6 +23,7 @@ DEEPSPARSE_MODELS_TASKS = [
]
@pytest.mark.skip(reason="test failing")
@pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS)
def test_sparseml_eval(model_id, task):
lm = get_model("sparseml").create_from_arg_string(
......