Commit 2106fbeb authored by Baber's avatar Baber

Merge branch 'main' into mathvista

# Conflicts:
#	lm_eval/models/openai_completions.py
parents 4354fe46 703fbffd
include: xquad_common_yaml
task: xquad_hi
dataset_name: xquad.hi
doc_to_text: "प्रसंग: {{context}}\n\nसवाल: {{question}}\n\nउत्तर:"
include: xquad_common_yaml
task: xquad_ro
dataset_name: xquad.ro
doc_to_text: "Context: {{context}}\n\nÎntrebare: {{question}}\n\nRăspuns:"
include: xquad_common_yaml
task: xquad_ru
dataset_name: xquad.ru
doc_to_text: "Контекст: {{context}}\n\nВопрос: {{question}}\n\nОтвет:"
include: xquad_common_yaml
task: xquad_th
dataset_name: xquad.th
doc_to_text: "บริบท: {{context}}\n\nคำถาม: {{question}}\n\nคำตอบ:"
include: xquad_common_yaml
task: xquad_tr
dataset_name: xquad.tr
doc_to_text: "Bağlam: {{context}}\n\nSoru: {{question}}\n\nCevap:"
include: xquad_common_yaml
task: xquad_vi
dataset_name: xquad.vi
doc_to_text: "Bối cảnh: {{context}}\n\nCâu hỏi: {{question}}\n\nTrả lời:"
include: xquad_common_yaml
task: xquad_zh
dataset_name: xquad.zh
doc_to_text: "语境: {{context}}\n\n问题: {{question}}\n\n回答:"
@@ -10,7 +10,7 @@ import os
import re
from dataclasses import asdict, is_dataclass
from itertools import islice
from typing import Any, Callable, List
from typing import Any, Callable, Generator, List, Tuple
import numpy as np
import yaml
@@ -104,7 +104,8 @@ def simple_parse_args_string(args_string):
return {}
arg_list = [arg for arg in args_string.split(",") if arg]
args_dict = {
k: handle_arg_string(v) for k, v in [arg.split("=") for arg in arg_list]
kv[0]: handle_arg_string("=".join(kv[1:]))
for kv in [arg.split("=") for arg in arg_list]
}
return args_dict
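As a rough illustration of the revised parsing (not part of the diff itself): splitting on the first "=" only means values may themselves contain "=", which previously broke the k, v unpacking. This assumes handle_arg_string still coerces booleans and numbers as before; the second argument string is purely hypothetical.

from lm_eval.utils import simple_parse_args_string

print(simple_parse_args_string("pretrained=gpt2,trust_remote_code=True"))
# {'pretrained': 'gpt2', 'trust_remote_code': True}

# A value containing "=" is now kept intact instead of raising
# "too many values to unpack" as the old comprehension did.
print(simple_parse_args_string("some_arg=a=b"))
# {'some_arg': 'a=b'}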
@@ -201,7 +202,9 @@ def get_sample_results_filenames(filenames: List[str]) -> List[str]:
return [f for f in filenames if "/samples_" in f and ".json" in f]
def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
def get_rolling_token_windows(
token_list: List[int], prefix_token: int, max_seq_len: int, context_len: int
) -> Generator[Tuple[List[int], List[int]], None, None]:
"""
- context_len allows for a rolling window context, allowing each prediction window to potentially
condition on some context
@@ -228,7 +231,7 @@ def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len
# Special handling for first window: predict all tokens
first_seq_len = min(max_seq_len, len(token_list))
yield ([prefix_token] + token_list[: first_seq_len - 1], token_list[:first_seq_len])
yield [prefix_token] + token_list[: first_seq_len - 1], token_list[:first_seq_len]
predicted += first_seq_len
while predicted < len(token_list):
@@ -242,7 +245,9 @@ def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len
predicted += window_pred_len
def make_disjoint_window(pair):
def make_disjoint_window(
pair: Tuple[List[int], List[int]],
) -> Tuple[List[int], List[int]]:
"""Takes output from get_rolling_token_windows and makes the context not overlap with the continuation"""
a, b = pair
return a[: len(a) - (len(b) - 1)], b
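A small usage sketch of the two helpers annotated above, assuming they are still imported from lm_eval.utils; the commented pairs are what the current implementation yields for a 10-token document:

from lm_eval.utils import get_rolling_token_windows, make_disjoint_window

tokens = list(range(10))  # stand-in for a tokenized document
windows = get_rolling_token_windows(
    token_list=tokens,
    prefix_token=-1,  # e.g. an EOT/BOS token id
    max_seq_len=4,
    context_len=2,
)
for context, continuation in map(make_disjoint_window, windows):
    print(context, continuation)
# [-1] [0, 1, 2, 3]   <- first window predicts every token it covers
# [2, 3] [4, 5, 6]
# [5, 6] [7, 8, 9]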
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
[project]
name = "lm_eval"
version = "0.4.5"
version = "0.4.7"
authors = [
{name="EleutherAI", email="contact@eleuther.ai"}
]
@@ -16,7 +16,7 @@ classifiers = [
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
]
requires-python = ">=3.8"
requires-python = ">=3.9"
license = { "text" = "MIT" }
dependencies = [
"accelerate>=0.26.0",
@@ -62,6 +62,7 @@ dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy"]
deepsparse = ["deepsparse-nightly[llm]>=1.8.0.20240404"]
gptq = ["auto-gptq[triton]>=0.6.0"]
hf_transfer = ["hf_transfer"]
ibm_watsonx_ai = ["ibm_watsonx_ai>=1.1.22"]
ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"]
neuronx = ["optimum[neuronx]"]
mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
@@ -75,12 +76,15 @@ testing = ["pytest", "pytest-cov", "pytest-xdist"]
vllm = ["vllm>=0.4.2"]
zeno = ["pandas", "zeno-client"]
wandb = ["wandb>=0.16.3", "pandas", "numpy"]
gptqmodel = ["gptqmodel>=1.0.9"]
japanese_leaderboard = ["emoji==2.14.0", "neologdn==0.5.3", "fugashi[unidic-lite]", "rouge_score>=0.1.2"]
all = [
"lm_eval[anthropic]",
"lm_eval[dev]",
"lm_eval[deepsparse]",
"lm_eval[gptq]",
"lm_eval[hf_transfer]",
"lm_eval[ibm_watsonx_ai]",
"lm_eval[ifeval]",
"lm_eval[mamba]",
"lm_eval[math]",
@@ -93,6 +97,7 @@ all = [
"lm_eval[vllm]",
"lm_eval[zeno]",
"lm_eval[wandb]",
"lm_eval[japanese_leaderboard]",
]
[tool.ruff.lint]
@@ -55,7 +55,7 @@ def yield_pile(start_offsets=None, checkpoint_offset=None):
print(
"We expect the pile archives to be in the 'pile' directory, but this was not found."
)
raise Exception("Pile directory not found.")
raise FileNotFoundError("Pile directory not found.")
files = list(sorted(glob.glob(os.path.join(directory, "*.jsonl.zst*"))))
@@ -109,13 +109,14 @@ def main():
if model_index == 0: # Only need to assemble data for the first model
metrics = []
for metric in config["metric_list"]:
metrics.append(
ZenoMetric(
name=metric["metric"],
type="mean",
columns=[metric["metric"]],
if metric.get("aggregation") == "mean":
metrics.append(
ZenoMetric(
name=metric["metric"],
type="mean",
columns=[metric["metric"]],
)
)
)
project = client.create_project(
name=args.project_name + (f"_{task}" if len(tasks) > 1 else ""),
view="text-classification",
@@ -168,7 +169,11 @@ generate_dataset(
Returns:
pd.Dataframe: A dataframe that is ready to be uploaded to Zeno.
"""
ids = [x["doc_id"] for x in data]
ids = (
[x["doc_id"] for x in data]
if not config.get("filter_list")
else [f"{x['doc_id']}.{x['filter']}" for x in data]
)
labels = [x["target"] for x in data]
instance = [""] * len(ids)
@@ -190,6 +195,7 @@ def generate_dataset(
return pd.DataFrame(
{
"id": ids,
"doc_id": [x["doc_id"] for x in data],
"data": instance,
"input_len": [len(x) for x in instance],
"labels": labels,
@@ -208,8 +214,15 @@ def generate_system_df(data, config):
Returns:
pd.Dataframe: A dataframe that is ready to be uploaded to Zeno as a system.
"""
ids = [x["doc_id"] for x in data]
ids = (
[x["doc_id"] for x in data]
if not config.get("filter_list")
else [f"{x['doc_id']}.{x['filter']}" for x in data]
)
system_dict = {"id": ids}
system_dict["doc_id"] = [x["doc_id"] for x in data]
if config.get("filter_list"):
system_dict["filter"] = [x["filter"] for x in data]
system_dict["output"] = [""] * len(ids)
if config["output_type"] == "loglikelihood":
@@ -228,11 +241,10 @@ def generate_system_df(data, config):
system_dict["output"] = [str(x["filtered_resps"][0]) for x in data]
system_dict["output_length"] = [len(str(x["filtered_resps"][0])) for x in data]
metrics = {}
for metric in config["metric_list"]:
if "aggregation" in metric and metric["aggregation"] == "mean":
metrics[metric["metric"]] = [x[metric["metric"]] for x in data]
metrics = {
metric["metric"]: [x[metric["metric"]] for x in data]
for metric in config["metric_list"]
}
system_dict.update(metrics)
system_df = pd.DataFrame(system_dict)
return system_df
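To make the new id scheme concrete (illustrative data only, not from the diff): when a task config defines a filter_list, both dataframes now key rows by doc_id plus filter name, so the same document can appear once per filter without id collisions in Zeno.

data = [
    {"doc_id": 0, "filter": "none"},
    {"doc_id": 0, "filter": "strict-match"},
]
config = {"filter_list": [{"name": "none"}, {"name": "strict-match"}]}

ids = (
    [x["doc_id"] for x in data]
    if not config.get("filter_list")
    else [f"{x['doc_id']}.{x['filter']}" for x in data]
)
print(ids)  # ['0.none', '0.strict-match']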
@@ -63,13 +63,13 @@ def test_create_payload_loglikelihood(api):
(
["Hello, how are"],
True,
{"max_gen_toks": 100, "temperature": 0.7},
{"max_gen_toks": 100, "temperature": 0.7, "until": ["hi"]},
{
"prompt": "Hello, how are",
"model": "gpt-3.5-turbo",
"max_tokens": 100,
"temperature": 0.7,
"stop": ["<|endoftext|>"],
"stop": ["hi"],
"seed": 1234,
},
),
@@ -82,7 +82,7 @@ test_create_payload_loglikelihood(api):
"model": "gpt-3.5-turbo",
"max_tokens": 256,
"temperature": 0,
"stop": ["<|endoftext|>"],
"stop": [],
"seed": 1234,
},
),
from typing import List
import pytest
import lm_eval
def assert_less_than(value, threshold, desc):
    if value is not None:
        assert float(value) < threshold, f"{desc} should be less than {threshold}"


@pytest.mark.skip(reason="requires CUDA")
class Test_GPTQModel:
    gptqmodel = pytest.importorskip("gptqmodel", minversion="1.0.9")
    MODEL_ID = "ModelCloud/Opt-125-GPTQ-4bit-10-25-2024"

    def test_gptqmodel(self) -> None:
        acc = "acc"
        acc_norm = "acc_norm"
        acc_value = None
        acc_norm_value = None

        task = "arc_easy"
        model_args = f"pretrained={self.MODEL_ID},gptqmodel=True"
        tasks: List[str] = [task]
        results = lm_eval.simple_evaluate(
            model="hf",
            model_args=model_args,
            tasks=tasks,
            device="cuda",
        )

        column = "results"
        dic = results.get(column, {}).get(task)
        if dic is not None:
            if "alias" in dic:
                _ = dic.pop("alias")

            items = sorted(dic.items())
            for k, v in items:
                m, _, f = k.partition(",")
                if m.endswith("_stderr"):
                    continue

                if m == acc:
                    acc_value = "%.4f" % v if isinstance(v, float) else v

                if m == acc_norm:
                    acc_norm_value = "%.4f" % v if isinstance(v, float) else v

        assert_less_than(acc_value, 0.43, "acc")
        assert_less_than(acc_norm_value, 0.39, "acc_norm")
import os
from itertools import islice
import datasets
import pytest
import lm_eval.tasks as tasks
@@ -10,6 +11,7 @@ from lm_eval.evaluator_utils import get_task_list
from .utils import new_tasks
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
os.environ["TOKENIZERS_PARALLELISM"] = "false"
task_manager = tasks.TaskManager()
# Default Task
@@ -77,10 +79,17 @@ class TestNewTasks:
)
_array = [task.doc_to_text(doc) for doc in arr]
# space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
assert all(
isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
for x in _array
)
target_delimiter: str = task.config.target_delimiter
if not task.multiple_input:
for x in _array:
assert isinstance(x, str)
assert (
(x[-1].isspace() is False if len(x) > 0 else True)
if target_delimiter.isspace()
else True
), "doc_to_text ends in a whitespace and target delimiter also a whitespace"
else:
pass
def test_create_choices(self, task_class, limit):
task = task_class
@@ -121,5 +130,11 @@ class TestNewTasks:
if task.has_test_docs()
else list(islice(task.validation_docs(), limit))
)
requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
# ctx is "" for multiple input tasks
requests = [
task.construct_requests(
doc=doc, ctx="" if task.multiple_input else task.doc_to_text(doc)
)
for doc in arr
]
assert len(requests) == limit if limit else True