"vllm_flash_attn/flash_attn_interface.py" did not exist on "54e80a3829c6d2337570d01e78ebd9529c02d342"
Commit 3e1301bb authored by lintangsutawika

resolved merge conflict from latest version

parents fd9cd80f 070d31df
group:
- unitxt
dataset_path: unitxt/data
output_type: generate_until
training_split: train
validation_split: test
doc_to_text: '{{source}}'
doc_to_target: target
process_results: !function 'unitxt_wrapper.process_results'
generation_kwargs:
until:
- </s>
metric_list:
- metric: unitxt_ner
aggregation: unitxt
higher_is_better: true
metadata:
version: 1.0
group:
- unitxt
dataset_path: unitxt/data
output_type: generate_until
training_split: train
validation_split: test
doc_to_text: '{{source}}'
doc_to_target: target
process_results: !function 'unitxt_wrapper.process_results'
generation_kwargs:
until:
- </s>
metric_list:
- metric: unitxt_rouge
aggregation: unitxt
higher_is_better: true
metadata:
version: 1.0
try:
from unitxt import evaluate
except ImportError:
raise ImportError(
"Package 'unitxt' is not installed. To install it, use `pip install 'lm_eval[unitxt]'`"
)
from lm_eval.api.registry import AGGREGATION_REGISTRY, METRIC_REGISTRY, register_metric
def unitxt_agg_metric(items):
preds = [pred[0] for pred, _, _ in items]
refs = [ref for _, ref, _ in items]
metric_name = items[0][2].replace("unitxt_", "metrics.")
for ref in refs:
ref["metrics"] = [metric_name]
result_metrics = evaluate(preds, refs)
return result_metrics[0]["score"]["global"]["score"]
AGGREGATION_REGISTRY["unitxt"] = unitxt_agg_metric
def unitxt_metric(items): # This is a passthrough function
return items
def process_results(doc, results):
metrics = doc["metrics"]
scores = {}
for metric in metrics:
metric = metric.replace("metrics.", "unitxt_")
scores[metric] = (results, doc, metric)
if metric not in METRIC_REGISTRY:
register_metric(
metric=metric,
higher_is_better=True,
output_type="generate_until",
aggregation="unitxt",
)(unitxt_metric)
return scores
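For context, a minimal usage sketch (not part of this commit) of how the wrapper above fits together; the doc and results values are hypothetical and assume unitxt and lm_eval are installed:
from unitxt_wrapper import process_results
# Hypothetical document and model output; real docs come from unitxt/data.
doc = {"metrics": ["metrics.rouge"], "source": "Summarize: ...", "target": "..."}
results = ["a model-generated summary"]
scores = process_results(doc, results)
# scores == {"unitxt_rouge": (results, doc, "unitxt_rouge")}
# The harness collects one such tuple per document and hands the list to
# unitxt_agg_metric, which maps "unitxt_rouge" back to "metrics.rouge" and
# delegates scoring to unitxt.evaluate.
print(list(scores.keys()))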
#
include: unitxt_tasks.summarization.abstractive
task: xsum
dataset_name: card=cards.xsum,template=templates.summarization.abstractive.full
include: unitxt_tasks.classification.multi_class
task: yahoo_answers_topics
dataset_name: card=cards.yahoo_answers_topics,template=templates.classification.multi_class.title
......@@ -26,6 +26,11 @@ eval_logger = logging.getLogger("lm-eval")
SPACING = " " * 47
HIGHER_IS_BETTER_SYMBOLS = {
True: "↑",
False: "↓",
}
def hash_string(string: str) -> str:
return hashlib.sha256(string.encode("utf-8")).hexdigest()
......@@ -76,6 +81,18 @@ def handle_non_serializable(o):
return str(o)
def sanitize_list(sub):
"""
Takes a possibly nested list and recursively converts all inner components to strings
"""
if isinstance(sub, list):
return [sanitize_list(item) for item in sub]
if isinstance(sub, tuple):
return tuple(sanitize_list(item) for item in sub)
else:
return str(sub)
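A hypothetical example of the intended behaviour of sanitize_list (not part of the diff):
# Nested containers keep their shape; every leaf value is converted to a string.
assert sanitize_list([1, (2, None), ["a", 3.5]]) == ["1", ("2", "None"), ["a", "3.5"]]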
def simple_parse_args_string(args_string):
"""
Parses something like
......@@ -257,6 +274,7 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False)
"Filter",
"n-shot",
"Metric",
"",
"Value",
"",
"Stderr",
......@@ -276,10 +294,8 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False)
for k in keys:
dic = result_dict[column][k]
version = result_dict["versions"].get(k, " N/A")
if k in result_dict["n-shot"]:
n = str(result_dict["n-shot"][k])
else:
n = " "
n = str(result_dict.get("n-shot", {}).get(k, " "))
higher_is_better = result_dict.get("higher_is_better", {}).get(k, {})
if "alias" in dic:
k = dic.pop("alias")
......@@ -290,13 +306,16 @@ def make_table(result_dict, column: str = "results", sort_results: bool = False)
continue
if v != " ":
v = "%.4f" % v
hib = HIGHER_IS_BETTER_SYMBOLS.get(higher_is_better.get(m), "")
if m + "_stderr" + "," + f in dic:
se = dic[m + "_stderr" + "," + f]
if se != "N/A":
se = "%.4f" % se
values.append([k, version, f, n, m, v, "±", se])
values.append([k, version, f, n, m, hib, v, "±", se])
else:
values.append([k, version, f, n, m, v, "", ""])
values.append([k, version, f, n, m, hib, v, "", ""])
k = ""
version = ""
md_writer.value_matrix = values
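For illustration, a hedged sketch of the new arrow column (all values below are invented):
# Reproduce the lookup the updated make_table loop performs; "acc" and the
# numbers here are hypothetical.
HIGHER_IS_BETTER_SYMBOLS = {True: "↑", False: "↓"}
higher_is_better = {"acc": True}
hib = HIGHER_IS_BETTER_SYMBOLS.get(higher_is_better.get("acc"), "")
row = ["arc_easy", "1.0", "none", "0", "acc", hib, "0.6500", "±", "0.0100"]
print(row)  # the arrow column sits between the metric name and its value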
......
......@@ -19,7 +19,7 @@ classifiers = [
requires-python = ">=3.8"
license = { "text" = "MIT" }
dependencies = [
"accelerate>=0.21.0",
"accelerate>=0.26.0",
"evaluate",
"datasets>=2.16.0",
"evaluate>=0.4.0",
......@@ -39,6 +39,7 @@ dependencies = [
"dill",
"word2number",
"more_itertools",
"shortuuid",
]
[tool.setuptools.packages.find]
......@@ -73,9 +74,10 @@ promptsource = ["promptsource>=0.2.3"]
sentencepiece = ["sentencepiece>=0.1.98"]
sparseml = ["sparseml-nightly[llm]>=1.8.0.20240404"]
testing = ["pytest", "pytest-cov", "pytest-xdist"]
vllm = ["vllm==0.3.2"]
vllm = ["vllm>=0.4.2"]
zeno = ["pandas", "zeno-client"]
wandb = ["wandb>=0.16.3", "pandas", "numpy"]
unitxt = ["unitxt"]
all = [
"lm_eval[anthropic]",
"lm_eval[dev]",
......@@ -94,6 +96,7 @@ all = [
"lm_eval[vllm]",
"lm_eval[zeno]",
"lm_eval[wandb]",
"lm_eval[unitxt]"
]
[tool.ruff.lint]
......
......@@ -15,11 +15,11 @@ base_url = "https://matthoffner-ggml-llm-api.hf.space"
def gguf_completion_mock(base_url=None, **kwargs):
# Generate a hash from the parameters
hash_kwargs = {"base_url": base_url, **kwargs}
hash = hashlib.sha256(
parameters_hash = hashlib.sha256(
json.dumps(hash_kwargs, sort_keys=True).encode("utf-8")
).hexdigest()
fname = f"./tests/testdata/gguf_test_{hash}.pkl"
fname = f"./tests/testdata/gguf_test_{parameters_hash}.pkl"
if os.path.exists(fname):
with open(fname, "rb") as fh:
......
from __future__ import annotations
import os
import sys
from pathlib import Path
import numpy as np
import torch
import lm_eval.tasks as tasks
from lm_eval import tasks
from lm_eval.api.instance import Instance
from lm_eval.models.huggingface import HFLM
os.environ["TOKENIZERS_PARALLELISM"] = "false"
task_manager = tasks.TaskManager()
TEST_STRING = "foo bar"
class Test_HFLM:
torch.use_deterministic_algorithms(True)
......@@ -107,7 +111,7 @@ class Test_HFLM:
file_path = dir_path / f"outputs_log_{self.version_minor}.txt"
file_path = file_path.resolve()
with open(file_path, "w") as f:
with open(file_path, "w", encoding="utf-8") as f:
f.write("\n".join(str(x) for x in _res))
assert np.allclose(_res, _RES, atol=1e-2)
# check indices for Multiple Choice
......@@ -126,19 +130,19 @@ class Test_HFLM:
assert np.allclose(res, self.ROLLING_RES, atol=1e-1)
def test_toc_encode(self) -> None:
res = self.LM.tok_encode("foo bar")
res = self.LM.tok_encode(TEST_STRING)
assert res == [12110, 2534]
def test_toc_decode(self) -> None:
res = self.LM.tok_decode([12110, 2534])
assert res == "foo bar"
assert res == TEST_STRING
def test_batch_encode(self) -> None:
res = self.LM.tok_batch_encode(["foo bar", "bar foo"])[0].tolist()
res = self.LM.tok_batch_encode([TEST_STRING, "bar foo"])[0].tolist()
assert res == [[12110, 2534], [2009, 17374]]
def test_model_generate(self) -> None:
context = self.LM.tok_batch_encode(["foo bar"])[0]
context = self.LM.tok_batch_encode([TEST_STRING])[0]
res = self.LM._model_generate(context, max_length=10, stop=["\n\n"])
res = self.LM.tok_decode(res[0])
assert res == "foo bar\n<bazhang>!info bar"
import pytest
import lm_eval.evaluator as evaluator
from lm_eval import evaluator
from lm_eval.api.registry import get_model
......
......@@ -6,7 +6,7 @@ import pytest
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer
import lm_eval.evaluator as evaluator
from lm_eval import evaluator
from lm_eval.api.registry import get_model
......@@ -46,7 +46,7 @@ def test_evaluator(model_id, task):
random.seed(42)
for _ in reqs:
res.append((-random.random(), False))
res.extend([(-random.random(), False)])
return res
......@@ -57,7 +57,7 @@ def test_evaluator(model_id, task):
res = []
random.seed(42)
for _ in reqs:
res.append(-random.random())
res.extend([-random.random()])
return res
......@@ -79,7 +79,7 @@ def test_ov_config():
model_id = "hf-internal-testing/tiny-random-gpt2"
with tempfile.TemporaryDirectory() as tmpdirname:
config_file = str(Path(tmpdirname) / "ov_config.json")
with open(Path(config_file), "w") as f:
with open(Path(config_file), "w", encoding="utf-8") as f:
f.write('{"DYNAMIC_QUANTIZATION_GROUP_SIZE" : "32"}')
lm = get_model("openvino").create_from_arg_string(
f"pretrained={model_id},ov_config={config_file}"
......
......@@ -3,7 +3,7 @@ from typing import List
import pytest
import torch
import lm_eval.tasks as tasks
from lm_eval import tasks
from lm_eval.api.instance import Instance
......
# import lm_eval.base as base
import os
from typing import List
import pytest
# import lm_eval.models as models
import lm_eval.api as api
import lm_eval.evaluator as evaluator
from lm_eval import tasks
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# TODO: more fine grained unit tests rather than this big honking integration
# test once we break evaluator into smaller, more manageable pieces
@pytest.mark.parametrize(
"task_name,limit,model,model_args",
"task_name,limit,model,model_args,bootstrap_iters",
[
(
["arc_easy"],
10,
"hf",
"pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
0,
),
(
["mmlu_abstract_algebra"],
None,
"hf",
"pretrained=EleutherAI/pythia-160m,dtype=float32,device=cpu",
10000,
),
],
)
def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str):
# task_name = task_name
# limit = 10
def test_evaluator(
task_name: List[str], limit: int, model: str, model_args: str, bootstrap_iters: int
):
e1 = evaluator.simple_evaluate(
model=model,
tasks=task_name,
limit=limit,
model_args=model_args,
bootstrap_iters=bootstrap_iters,
)
assert e1 is not None
......@@ -57,6 +59,7 @@ def test_evaluator(task_name: List[str], limit: int, model: str, model_args: str
lm=lm,
task_dict=task_dict,
limit=limit,
bootstrap_iters=bootstrap_iters,
)
assert e2 is not None
......
import os
from collections import defaultdict
from lm_eval.decontamination.janitor import (
......@@ -9,23 +10,41 @@ from lm_eval.decontamination.janitor import (
)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
TEST_SEQUENCE = (
"Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
" more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
)
JANITOR_EXPECTED = (
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing "
" characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
)
JANITOR_FILTH1 = "filth lots of dirty filthy filth"
JANITOR_FILTH2 = "filth lots of filthy dirty filth"
def simple_ngram(sequence, n):
ngrams = list()
ngram = []
for x in sequence:
ngram.append(x)
ngram.extend([x])
if len(ngram) == n:
ngrams.append(tuple(ngram))
ngrams.extend([tuple(ngram)])
ngram = ngram[1:]
return ngrams
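A quick worked example of the helper above (assuming the extend-based version; inputs are hypothetical):
# Bigrams of a four-token sequence, produced by simple_ngram.
assert simple_ngram(["a", "b", "c", "d"], 2) == [("a", "b"), ("b", "c"), ("c", "d")]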
def test_form_ngrams():
sequence = (
"Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
" more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
)
sequence = TEST_SEQUENCE
n_values = [1, 2, 3, 5, 13]
for n in n_values:
......@@ -36,10 +55,7 @@ def test_form_ngrams():
def test_word_ngrams():
sequence = (
"Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
" more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
)
sequence = TEST_SEQUENCE
words = sequence.split()
......@@ -53,10 +69,7 @@ def test_word_ngrams():
def test_split_indices():
sequence = (
"Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
" more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
)
sequence = TEST_SEQUENCE
comparison = []
current_word = ""
......@@ -65,12 +78,18 @@ def test_split_indices():
current_word += c
else:
if current_word:
comparison.append((current_word, (i - len(current_word), i - 1)))
comparison.extend([(current_word, (i - len(current_word), i - 1))])
current_word = ""
if current_word:
comparison.append(
(current_word, (len(sequence) - len(current_word), len(sequence) - 1))
len_sequence = len(sequence)
comparison.extend(
[
(
current_word,
(len_sequence - len(current_word), len_sequence - 1),
)
]
)
current_word = ""
......@@ -80,10 +99,7 @@ def test_split_indices():
def test_word_ngrams_indices():
sequence = (
"Hello my name is Bob, I like eating pizza, chicken, chips and ice cream. Maybe I should eat some"
" more salad but it's so booooring. I just... like eating pizza, chicken, chips and ice cream so much."
)
sequence = TEST_SEQUENCE
n_values = [1, 2, 3, 5, 13]
......@@ -100,14 +116,13 @@ def test_word_ngrams_indices():
tracker[ngram] = end + 1
# ignore partial word matches
if (start != 0 and sequence[start - 1] != " ") or (
end != len(sequence) - 1 and sequence[end + 1] != " "
if not (
(start != 0 and sequence[start - 1] != " ")
or (end != len(sequence) - 1 and sequence[end + 1] != " ")
):
pass
else:
break
comparison.append((ngram, (start, end)))
comparison.extend([(ngram, (start, end))])
result_to_test = list(word_ngrams_indices(sequence, n))
assert len(result_to_test) == len(comparison)
......@@ -184,17 +199,6 @@ def test_janitor2():
filth = "filth"
expected_result = (
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing "
" characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
)
janitor = Janitor(
ngram_n=1, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200
)
......@@ -207,7 +211,7 @@ def test_janitor2():
result = janitor.clean_python(sequence)
result = "".join(result)
assert result == expected_result
assert result == JANITOR_EXPECTED
def test_janitor3():
......@@ -229,19 +233,6 @@ def test_janitor3():
"This is a @line #containing a certain number of characters, 76 to be exact. "
)
filth = "filth lots of dirty filthy filth"
expected_result = (
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing "
" characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
)
janitor = Janitor(
ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200
)
......@@ -249,12 +240,12 @@ def test_janitor3():
result = "".join(result)
assert result == sequence
janitor.register_contaminant(filth)
assert janitor.dirt_ngrams == {filth}
janitor.register_contaminant(JANITOR_FILTH1)
assert janitor.dirt_ngrams == {JANITOR_FILTH1}
result = janitor.clean_python(sequence)
result = "".join(result)
assert result == expected_result
assert result == JANITOR_EXPECTED
def test_janitor4():
......@@ -284,19 +275,6 @@ def test_janitor4():
"This is a @line #containing a certain number of characters, 76 to be exact. "
)
filth = "filth lots of dirty filthy filth"
expected_result = (
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing "
" characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
)
janitor = Janitor(
ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200
)
......@@ -304,12 +282,12 @@ def test_janitor4():
result = "".join(result)
assert result == sequence
janitor.register_contaminant(filth)
assert janitor.dirt_ngrams == {filth}
janitor.register_contaminant(JANITOR_FILTH1)
assert janitor.dirt_ngrams == {JANITOR_FILTH1}
result = janitor.clean_python(sequence)
result = "".join(result)
assert result == expected_result
assert result == JANITOR_EXPECTED
def test_janitor5():
......@@ -338,18 +316,7 @@ def test_janitor5():
"This is a @line #containing a certain number of characters, 76 to be exact. "
)
filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"]
expected_result = (
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing "
" characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
)
filths = [JANITOR_FILTH1, JANITOR_FILTH2]
janitor = Janitor(
ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200
......@@ -364,7 +331,7 @@ def test_janitor5():
result = janitor.clean_python(sequence)
result = "".join(result)
assert result == expected_result
assert result == JANITOR_EXPECTED
def test_janitor6():
......@@ -401,18 +368,7 @@ def test_janitor6():
"This is a @line #containing a certain number of characters, 76 to be exact. "
)
filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"]
expected_result = (
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing "
" characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
"This is a @line #containing a certain number of characters, 76 to be exact. "
)
filths = [JANITOR_FILTH1, JANITOR_FILTH2]
janitor = Janitor(
ngram_n=6, window_to_remove=200, too_dirty_cutoff=10, minimum_slice_length=200
......@@ -427,7 +383,7 @@ def test_janitor6():
result = janitor.clean_python(sequence)
result = "".join(result)
assert result == expected_result
assert result == JANITOR_EXPECTED
def test_janitor7():
......@@ -465,7 +421,7 @@ def test_janitor7():
"This is a @line #containing a certain number of characters, 76 to be exact. "
)
filths = ["filth lots of dirty filthy filth", "filth lots of filthy dirty filth"]
filths = [JANITOR_FILTH1, JANITOR_FILTH2]
expected_result = ""
......@@ -488,20 +444,3 @@ def test_janitor7():
def test_janitor8():
# This will test the save and load contams
pass
# source = """ ,, I'm a very !dirty,, ,, dirty boy. Clean me daddy. \n\nhe he he hehe heh. lastword """ * 2
# contaminant = "dirty boy. Clean he he"
# jan = Janitor(ngram_n=3)
# jan.register_contaminant(contaminant)
# cleaned = " ".join(jan.clean(source))
# for contam in jan.dirt_ngrams:
# assert contam not in cleaned, contam
# filename = "data/saved_contam"
# jan.save_contamination_ngrams(filename)
# jan = Janitor(ngram_n=3)
# jan.load_contamination_ngrams(filename)
# cleaned = " ".join(jan.clean(source))
# for contam in jan.dirt_ngrams:
# assert contam not in cleaned, contam
# import lm_eval.base as base
import importlib
import os
import sys
from datetime import datetime
from typing import List, Tuple
from typing import List, Optional, Tuple
import pytest
import torch
# import lm_eval.models as models
from lm_eval.caching.cache import PATH
......@@ -43,7 +41,7 @@ def clear_cache():
# leaving tasks here to allow for the option to select specific task files
def get_cache_files(tasks: List[str] = None) -> Tuple[List[str], List[str]]:
def get_cache_files(tasks: Optional[List[str]] = None) -> Tuple[List[str], List[str]]:
cache_files = os.listdir(PATH)
file_task_names = []
......@@ -51,7 +49,7 @@ def get_cache_files(tasks: List[str] = None) -> Tuple[List[str], List[str]]:
for file in cache_files:
file_without_prefix = file.split("-")[1]
file_without_prefix_and_suffix = file_without_prefix.split(".")[0]
file_task_names.append(file_without_prefix_and_suffix)
file_task_names.extend([file_without_prefix_and_suffix])
return cache_files, file_task_names
......@@ -113,10 +111,11 @@ if __name__ == "__main__":
# test_requests_caching_refresh,
# test_requests_caching_delete,
]
# Lookups of global names within a loop are inefficient, so copy to a local variable outside of the loop first
default_tasks = DEFAULT_TASKS
for test_func in tests:
clear_cache()
test_func(tasks=DEFAULT_TASKS)
test_func(tasks=default_tasks)
print("Tests pass")
......
import os
from itertools import islice
import pytest
......@@ -8,6 +9,7 @@ from lm_eval.api.task import ConfigurableTask
from .utils import new_tasks
os.environ["TOKENIZERS_PARALLELISM"] = "false"
task_manager = tasks.TaskManager()
# Default Task
TASKS = ["arc_easy"]
......@@ -87,7 +89,6 @@ class TestNewTasks:
)
if "multiple_choice" in task._config.output_type:
_array = [task.doc_to_choice(doc) for doc in arr]
# assert all(len(x) == 4 for x in _array)
assert all(isinstance(x, list) for x in _array)
assert all(isinstance(x[0], str) for x in _array)
......@@ -101,9 +102,6 @@ class TestNewTasks:
_array_target = [task.doc_to_target(doc) for doc in arr]
if task._config.output_type == "multiple_choice":
assert all(isinstance(label, int) for label in _array_target)
# _array_text = [task.doc_to_text(doc) for doc in arr]
# Not working
# assert all(tgt[0] == " " or txt[-1] == "\n" if len(txt) != 0 else True for txt, tgt in zip(_array_text, _array_target))
def test_build_all_requests(self, task_class, limit):
task_class.build_all_requests(rank=1, limit=limit, world_size=1)
......@@ -118,5 +116,4 @@ class TestNewTasks:
else list(islice(task.validation_docs(), limit))
)
requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
# assert all(isinstance(doc, list) for doc in requests)
assert len(requests) == limit if limit else True
......@@ -41,7 +41,7 @@ def test_get_rolling_token_windows_v1():
pred_length = 0
output = []
for input_tokens, pred_tokens in generator:
output.append((input_tokens, pred_tokens))
output.extend([(input_tokens, pred_tokens)])
pred_length += len(pred_tokens)
assert pred_length == len(x)
assert gold == output
......@@ -70,7 +70,7 @@ def test_get_rolling_token_windows_v2():
pred_length = 0
output = []
for input_tokens, pred_tokens in generator:
output.append((input_tokens, pred_tokens))
output.extend([(input_tokens, pred_tokens)])
pred_length += len(pred_tokens)
assert pred_length == len(x)
assert gold == output
......@@ -115,7 +115,7 @@ def test_get_rolling_token_windows_v3():
pred_length = 0
output = []
for input_tokens, pred_tokens in generator:
output.append((input_tokens, pred_tokens))
output.extend([(input_tokens, pred_tokens)])
pred_length += len(pred_tokens)
assert pred_length == len(x)
assert gold == output
......@@ -156,7 +156,7 @@ def test_get_rolling_token_windows_v4():
pred_length = 0
output = []
for input_tokens, pred_tokens in generator:
output.append((input_tokens, pred_tokens))
output.extend([(input_tokens, pred_tokens)])
pred_length += len(pred_tokens)
assert pred_length == len(x)
assert gold == output
......@@ -185,7 +185,7 @@ def test_get_rolling_token_windows_v5():
pred_length = 0
output = []
for input_tokens, pred_tokens in generator:
output.append((input_tokens, pred_tokens))
output.extend([(input_tokens, pred_tokens)])
pred_length += len(pred_tokens)
assert pred_length == len(x)
assert gold == output
......@@ -210,7 +210,7 @@ def test_get_rolling_token_windows_v6():
pred_length = 0
output = []
for input_tokens, pred_tokens in generator:
output.append((input_tokens, pred_tokens))
output.extend([(input_tokens, pred_tokens)])
pred_length += len(pred_tokens)
assert pred_length == len(x)
assert gold == output
......@@ -273,26 +273,26 @@ class TestCollator:
generation_samples = self.make_generate_sample(int(end))
gens = Collator(generation_samples, _collate_gen, group_by="gen_kwargs")
chunks = gens.get_batched(n=int(batch_size), batch_fn=None)
chunks_gen = gens.get_batched(n=int(batch_size), batch_fn=None)
output = []
for chunks in chunks:
group_one = end // 2
group_two = end - end // 2
is_batch = batch_size != 0
for chunks in chunks_gen:
# check batching
group_one = end // 2
group_two = end - end // 2
assert (
len(chunks) <= batch_size
if batch_size != 0
if is_batch
else len(chunks) in [group_one, group_two]
)
# check that reordering is working correctly
assert all(
len(chunks[i][0]) <= len(chunks[i - 1][0])
for i in range(1, len(chunks))
)
chunk_lengths = [len(chunk[0]) for chunk in chunks]
assert chunk_lengths == sorted(chunk_lengths, reverse=True)
# check that grouping is working correctly
assert all(x[1] == chunks[0][1] for x in chunks)
chunk_to_compare = chunks[0][1]
assert all(x[1] == chunk_to_compare for x in chunks)
for x in chunks:
output.append(x)
output.extend([x])
reordered_output = gens.get_original(output)
# check get original
assert reordered_output == generation_samples
......@@ -305,18 +305,17 @@ class TestCollator:
loglikelihood_samples,
_collate_log,
)
chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
chunks_gen = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
output = []
for chunks in chunks:
is_batch = batch_size != 0
for chunks in chunks_gen:
# check batching
assert len(chunks) <= batch_size if batch_size != 0 else len(chunks) == end
assert len(chunks) <= batch_size if is_batch else len(chunks) == end
# check reorder
assert all(
len(chunks[i][1]) <= len(chunks[i - 1][1])
for i in range(1, len(chunks))
)
chunk_lengths = [len(chunk[1]) for chunk in chunks]
assert chunk_lengths == sorted(chunk_lengths, reverse=True)
for x in chunks:
output.append(x[1])
output.extend([x[1]])
# check indices
reordered_output = loglikelihoods.get_original(output)
assert reordered_output == [x[1] for x in loglikelihood_samples]
......@@ -335,18 +334,17 @@ class TestCollator:
group_fn=lambda a: a[-2] + a[-1][:-1],
group_by="contexts",
)
chunks = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
chunks_gen = loglikelihoods.get_batched(n=int(batch_size), batch_fn=None)
output = []
outputs_ = []
for chunks in chunks:
is_batch = batch_size != 0
for chunks in chunks_gen:
# check batching
if batch_size != 0:
if is_batch:
assert len(chunks) <= batch_size
# check reorder
assert all(
len(chunks[i][1]) <= len(chunks[i - 1][1])
for i in range(1, len(chunks))
)
chunk_lengths = [len(chunk[1]) for chunk in chunks]
assert chunk_lengths == sorted(chunk_lengths, reverse=True)
for x in chunks:
for request_str, cont_toks, logits in loglikelihoods.get_cache(
req_str="".join(x[0]),
......@@ -356,8 +354,8 @@ class TestCollator:
.unsqueeze(0)
.unsqueeze(0),
):
output.append(x[1])
outputs_.append(cont_toks)
output.extend([x[1]])
outputs_.extend([cont_toks])
assert len(output) == len(outputs_)
# check indices
reordered_output = loglikelihoods.get_original(output)
......
......@@ -3,12 +3,12 @@ group_alias: test 1
task:
- piqa # string task
- ai2_arc # string tag
- task: super-glue-lm-eval-v1 # Should this be spread out?
num_fewshot: 3
# - task: super-glue-lm-eval-v1 # Should this be spread out?
# num_fewshot: 3
- task: swag # dict registered task
num_fewshot: 2
- task: mmlu
num_fewshot: 5
# - task: mmlu
# num_fewshot: 5
- group: nli-tasks # dict group
task:
- anli
......@@ -17,29 +17,31 @@ task:
num_fewshot: 4
metric_list:
- metric: brier_score
- task: sciq # dict registered task duplicate
task_alias: sciq 2-shot
num_fewshot: 2
- task: sciq # dict registered task duplicate
task_alias: sciq 4-shot
num_fewshot: 4
- task: sciq # dict registered task duplicate
task_alias: sciq 6-shot
num_fewshot: 6
- task: siqa_custom # dict task
dataset_path: social_i_qa
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: "Question: {{context}} {{question}}\nAnswer:"
target_delimiter: " "
doc_to_choice:
- "{{answerA}}"
- "{{answerB}}"
- "{{answerC}}"
doc_to_target: "{{ (label|int) - 1 }}"
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
aggregate_metric: true
# - task: sciq # dict registered task duplicate
# task_alias: sciq 2-shot
# num_fewshot: 2
# - task: sciq # dict registered task duplicate
# task_alias: sciq 4-shot
# num_fewshot: 4
# - task: sciq # dict registered task duplicate
# task_alias: sciq 6-shot
# num_fewshot: 6
# - task: siqa_custom # dict task
# dataset_path: social_i_qa
# dataset_name: null
# output_type: multiple_choice
# training_split: train
# validation_split: validation
# doc_to_text: "Question: {{context}} {{question}}\nAnswer:"
# target_delimiter: " "
# doc_to_choice:
# - "{{answerA}}"
# - "{{answerB}}"
# - "{{answerC}}"
# doc_to_target: "{{ (label|int) - 1 }}"
# metric_list:
# - metric: acc
# aggregation: mean
# higher_is_better: true
......@@ -12,9 +12,9 @@ from lm_eval.utils import load_yaml_config
# reads a text file and returns a list of words
# used to read the txt output of tj-actions/changed-files
def load_changed_files(file_path: str) -> List[str]:
with open(file_path, "r") as f:
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()
words_list = [x for x in content.split()]
words_list = list(content.split())
return words_list
......@@ -25,7 +25,7 @@ def load_changed_files(file_path: str) -> List[str]:
def parser(full_path: List[str]) -> List[str]:
_output = set()
for x in full_path:
if os.path.exists(x) and x.endswith(".yaml"):
if x.endswith(".yaml") and os.path.exists(x):
config = load_yaml_config(x, mode="simple")
if isinstance(config["task"], str):
_output.add(config["task"])
......@@ -40,10 +40,9 @@ def new_tasks() -> Union[List[str], None]:
# If tasks folder has changed then we get the list of files from FILENAME
# and parse the yaml files to get the task names.
return parser(load_changed_files(FILENAME))
elif os.getenv("API") is not None:
if os.getenv("API") is not None:
# Or if API has changed then we set the ENV variable API to True
# and run given tasks.
return ["arc_easy", "hellaswag", "piqa", "wikitext"]
# if neither is true, just do arc_easy
else:
return
return None