Commit 2106fbeb authored by Baber

Merge branch 'main' into mathvista

# Conflicts:
#	lm_eval/models/openai_completions.py
parents 4354fe46 703fbffd
include: xquad_common_yaml
task: xquad_hi
dataset_name: xquad.hi
doc_to_text: "प्रसंग: {{context}}\n\nसवाल: {{question}}\n\nउत्तर:"

include: xquad_common_yaml
task: xquad_ro
dataset_name: xquad.ro
doc_to_text: "Context: {{context}}\n\nÎntrebare: {{question}}\n\nRăspuns:"

include: xquad_common_yaml
task: xquad_ru
dataset_name: xquad.ru
doc_to_text: "Контекст: {{context}}\n\nВопрос: {{question}}\n\nОтвет:"

include: xquad_common_yaml
task: xquad_th
dataset_name: xquad.th
doc_to_text: "บริบท: {{context}}\n\nคำถาม: {{question}}\n\nคำตอบ:"

include: xquad_common_yaml
task: xquad_tr
dataset_name: xquad.tr
doc_to_text: "Bağlam: {{context}}\n\nSoru: {{question}}\n\nCevap:"

include: xquad_common_yaml
task: xquad_vi
dataset_name: xquad.vi
doc_to_text: "Bối cảnh: {{context}}\n\nCâu hỏi: {{question}}\n\nTrả lời:"

include: xquad_common_yaml
task: xquad_zh
dataset_name: xquad.zh
doc_to_text: "语境: {{context}}\n\n问题: {{question}}\n\n回答:"
@@ -10,7 +10,7 @@ import os
 import re
 from dataclasses import asdict, is_dataclass
 from itertools import islice
-from typing import Any, Callable, List
+from typing import Any, Callable, Generator, List, Tuple

 import numpy as np
 import yaml
@@ -104,7 +104,8 @@ def simple_parse_args_string(args_string):
         return {}
     arg_list = [arg for arg in args_string.split(",") if arg]
     args_dict = {
-        k: handle_arg_string(v) for k, v in [arg.split("=") for arg in arg_list]
+        kv[0]: handle_arg_string("=".join(kv[1:]))
+        for kv in [arg.split("=") for arg in arg_list]
     }
     return args_dict
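The rewritten comprehension splits each key=value pair only on the first "=", so values that themselves contain "=" are no longer mangled. A minimal standalone sketch of that parsing logic, with the handle_arg_string type coercion left out:

```python
# Standalone sketch of the revised parsing; the real helper additionally runs
# each value through handle_arg_string to coerce bools and numbers.
def parse_args_string(args_string: str) -> dict:
    if not args_string:
        return {}
    arg_list = [arg for arg in args_string.split(",") if arg]
    return {
        kv[0]: "=".join(kv[1:])  # everything after the first "=" stays intact
        for kv in [arg.split("=") for arg in arg_list]
    }


print(parse_args_string("a=1,b=x=y"))
# {'a': '1', 'b': 'x=y'}
```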
@@ -201,7 +202,9 @@ def get_sample_results_filenames(filenames: List[str]) -> List[str]:
     return [f for f in filenames if "/samples_" in f and ".json" in f]


-def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
+def get_rolling_token_windows(
+    token_list: List[int], prefix_token: int, max_seq_len: int, context_len: int
+) -> Generator[Tuple[List[int], List[int]], None, None]:
     """
     - context_len allows for a rolling window context, allowing each prediction window to potentially
       condition on some context
@@ -228,7 +231,7 @@ def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len
     # Special handling for first window: predict all tokens
     first_seq_len = min(max_seq_len, len(token_list))
-    yield ([prefix_token] + token_list[: first_seq_len - 1], token_list[:first_seq_len])
+    yield [prefix_token] + token_list[: first_seq_len - 1], token_list[:first_seq_len]
     predicted += first_seq_len

     while predicted < len(token_list):
@@ -242,7 +245,9 @@ def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len
         predicted += window_pred_len


-def make_disjoint_window(pair):
+def make_disjoint_window(
+    pair: Tuple[List[int], List[int]],
+) -> Tuple[List[int], List[int]]:
     """Takes output from get_rolling_token_windows and makes the context not overlap with the continuation"""
     a, b = pair
     return a[: len(a) - (len(b) - 1)], b
...
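The two helpers are typically chained: get_rolling_token_windows produces overlapping (input, prediction) windows and make_disjoint_window trims the overlap out of the context. A small usage sketch; the token ids and window sizes are arbitrary illustrative values:

```python
# Usage sketch for the now-annotated helpers from lm_eval.utils; the printed
# window contents depend on max_seq_len and context_len.
from lm_eval.utils import get_rolling_token_windows, make_disjoint_window

tokens = list(range(10))  # pretend token ids 0..9
windows = get_rolling_token_windows(
    token_list=tokens,
    prefix_token=-1,  # stand-in for a BOS/EOT token id
    max_seq_len=4,
    context_len=2,
)
for context, continuation in map(make_disjoint_window, windows):
    print(context, "->", continuation)
```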
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "lm_eval"
-version = "0.4.5"
+version = "0.4.7"
 authors = [
     {name="EleutherAI", email="contact@eleuther.ai"}
 ]
@@ -16,7 +16,7 @@ classifiers = [
     "License :: OSI Approved :: MIT License",
     "Operating System :: OS Independent",
 ]
-requires-python = ">=3.8"
+requires-python = ">=3.9"
 license = { "text" = "MIT" }
 dependencies = [
     "accelerate>=0.26.0",
@@ -62,6 +62,7 @@ dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy"]
 deepsparse = ["deepsparse-nightly[llm]>=1.8.0.20240404"]
 gptq = ["auto-gptq[triton]>=0.6.0"]
 hf_transfer = ["hf_transfer"]
+ibm_watsonx_ai = ["ibm_watsonx_ai>=1.1.22"]
 ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"]
 neuronx = ["optimum[neuronx]"]
 mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
@@ -75,12 +76,15 @@ testing = ["pytest", "pytest-cov", "pytest-xdist"]
 vllm = ["vllm>=0.4.2"]
 zeno = ["pandas", "zeno-client"]
 wandb = ["wandb>=0.16.3", "pandas", "numpy"]
+gptqmodel = ["gptqmodel>=1.0.9"]
+japanese_leaderboard = ["emoji==2.14.0", "neologdn==0.5.3", "fugashi[unidic-lite]", "rouge_score>=0.1.2"]
 all = [
     "lm_eval[anthropic]",
     "lm_eval[dev]",
     "lm_eval[deepsparse]",
     "lm_eval[gptq]",
     "lm_eval[hf_transfer]",
+    "lm_eval[ibm_watsonx_ai]",
     "lm_eval[ifeval]",
     "lm_eval[mamba]",
     "lm_eval[math]",
@@ -93,6 +97,7 @@ all = [
     "lm_eval[vllm]",
     "lm_eval[zeno]",
     "lm_eval[wandb]",
+    "lm_eval[japanese_leaderboard]",
 ]

 [tool.ruff.lint]
...
@@ -55,7 +55,7 @@ def yield_pile(start_offsets=None, checkpoint_offset=None):
         print(
             "We expect the pile archives to be in the 'pile' directory, but this was not found."
         )
-        raise Exception("Pile directory not found.")
+        raise FileNotFoundError("Pile directory not found.")
     files = list(sorted(glob.glob(os.path.join(directory, "*.jsonl.zst*"))))
...
@@ -109,13 +109,14 @@ def main():
         if model_index == 0:  # Only need to assemble data for the first model
             metrics = []
             for metric in config["metric_list"]:
-                metrics.append(
-                    ZenoMetric(
-                        name=metric["metric"],
-                        type="mean",
-                        columns=[metric["metric"]],
-                    )
-                )
+                if metric.get("aggregation") == "mean":
+                    metrics.append(
+                        ZenoMetric(
+                            name=metric["metric"],
+                            type="mean",
+                            columns=[metric["metric"]],
+                        )
+                    )
             project = client.create_project(
                 name=args.project_name + (f"_{task}" if len(tasks) > 1 else ""),
                 view="text-classification",
@@ -168,7 +169,11 @@ def generate_dataset(
     Returns:
         pd.Dataframe: A dataframe that is ready to be uploaded to Zeno.
     """
-    ids = [x["doc_id"] for x in data]
+    ids = (
+        [x["doc_id"] for x in data]
+        if not config.get("filter_list")
+        else [f"{x['doc_id']}.{x['filter']}" for x in data]
+    )
     labels = [x["target"] for x in data]
     instance = [""] * len(ids)
@@ -190,6 +195,7 @@ def generate_dataset(
     return pd.DataFrame(
         {
             "id": ids,
+            "doc_id": [x["doc_id"] for x in data],
             "data": instance,
             "input_len": [len(x) for x in instance],
             "labels": labels,
@@ -208,8 +214,15 @@ def generate_system_df(data, config):
     Returns:
         pd.Dataframe: A dataframe that is ready to be uploaded to Zeno as a system.
     """
-    ids = [x["doc_id"] for x in data]
+    ids = (
+        [x["doc_id"] for x in data]
+        if not config.get("filter_list")
+        else [f"{x['doc_id']}.{x['filter']}" for x in data]
+    )
     system_dict = {"id": ids}
+    system_dict["doc_id"] = [x["doc_id"] for x in data]
+    if config.get("filter_list"):
+        system_dict["filter"] = [x["filter"] for x in data]
     system_dict["output"] = [""] * len(ids)

     if config["output_type"] == "loglikelihood":
@@ -228,11 +241,10 @@ def generate_system_df(data, config):
         system_dict["output"] = [str(x["filtered_resps"][0]) for x in data]
         system_dict["output_length"] = [len(str(x["filtered_resps"][0])) for x in data]
-    metrics = {}
-    for metric in config["metric_list"]:
-        if "aggregation" in metric and metric["aggregation"] == "mean":
-            metrics[metric["metric"]] = [x[metric["metric"]] for x in data]
+    metrics = {
+        metric["metric"]: [x[metric["metric"]] for x in data]
+        for metric in config["metric_list"]
+    }
     system_dict.update(metrics)
     system_df = pd.DataFrame(system_dict)
     return system_df
...
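When a task defines filters, the same doc_id now appears once per filter, so the row ids are disambiguated by appending the filter name while the raw doc_id is kept in its own column. A minimal sketch of that id scheme (the sample dicts are illustrative values, not real harness output):

```python
# Illustrative only: how ids are disambiguated when a filter_list is configured.
data = [
    {"doc_id": 0, "filter": "strict-match"},
    {"doc_id": 0, "filter": "flexible-extract"},
]
config = {"filter_list": [{"name": "strict-match"}, {"name": "flexible-extract"}]}

ids = (
    [x["doc_id"] for x in data]
    if not config.get("filter_list")
    else [f"{x['doc_id']}.{x['filter']}" for x in data]
)
print(ids)  # ['0.strict-match', '0.flexible-extract']
```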
@@ -63,13 +63,13 @@ def test_create_payload_loglikelihood(api):
         (
             ["Hello, how are"],
             True,
-            {"max_gen_toks": 100, "temperature": 0.7},
+            {"max_gen_toks": 100, "temperature": 0.7, "until": ["hi"]},
             {
                 "prompt": "Hello, how are",
                 "model": "gpt-3.5-turbo",
                 "max_tokens": 100,
                 "temperature": 0.7,
-                "stop": ["<|endoftext|>"],
+                "stop": ["hi"],
                 "seed": 1234,
             },
         ),
@@ -82,7 +82,7 @@ def test_create_payload_loglikelihood(api):
                 "model": "gpt-3.5-turbo",
                 "max_tokens": 256,
                 "temperature": 0,
-                "stop": ["<|endoftext|>"],
+                "stop": [],
                 "seed": 1234,
             },
         ),
...
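The updated expectations reflect that generation payloads now take their stop sequences from the caller-supplied "until" list and default to an empty list instead of a hard-coded "<|endoftext|>". A hedged sketch of that mapping (build_stop is a stand-in name, not the harness's API):

```python
# Stand-in helper illustrating the expected until -> stop mapping; not the
# harness's actual implementation.
def build_stop(gen_kwargs: dict) -> list:
    return list(gen_kwargs.get("until", []))


print(build_stop({"max_gen_toks": 100, "temperature": 0.7, "until": ["hi"]}))  # ['hi']
print(build_stop({"max_gen_toks": 256}))  # []
```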
from typing import List

import pytest

import lm_eval


def assert_less_than(value, threshold, desc):
    if value is not None:
        assert float(value) < threshold, f"{desc} should be less than {threshold}"


@pytest.mark.skip(reason="requires CUDA")
class Test_GPTQModel:
    gptqmodel = pytest.importorskip("gptqmodel", minversion="1.0.9")
    MODEL_ID = "ModelCloud/Opt-125-GPTQ-4bit-10-25-2024"

    def test_gptqmodel(self) -> None:
        acc = "acc"
        acc_norm = "acc_norm"
        acc_value = None
        acc_norm_value = None

        task = "arc_easy"
        model_args = f"pretrained={self.MODEL_ID},gptqmodel=True"
        tasks: List[str] = [task]
        results = lm_eval.simple_evaluate(
            model="hf",
            model_args=model_args,
            tasks=tasks,
            device="cuda",
        )

        column = "results"
        dic = results.get(column, {}).get(task)
        if dic is not None:
            if "alias" in dic:
                _ = dic.pop("alias")

            items = sorted(dic.items())
            for k, v in items:
                # result keys look like "acc,none": metric name, then filter
                m, _, f = k.partition(",")
                if m.endswith("_stderr"):
                    continue

                if m == acc:
                    acc_value = "%.4f" % v if isinstance(v, float) else v
                if m == acc_norm:
                    acc_norm_value = "%.4f" % v if isinstance(v, float) else v

        assert_less_than(acc_value, 0.43, "acc")
        assert_less_than(acc_norm_value, 0.39, "acc_norm")
 import os
 from itertools import islice

+import datasets
 import pytest

 import lm_eval.tasks as tasks
@@ -10,6 +11,7 @@ from lm_eval.evaluator_utils import get_task_list
 from .utils import new_tasks

+datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
 os.environ["TOKENIZERS_PARALLELISM"] = "false"

 task_manager = tasks.TaskManager()

 # Default Task
@@ -77,10 +79,17 @@ class TestNewTasks:
         )
         _array = [task.doc_to_text(doc) for doc in arr]
         # space convention; allow txt to have length 0 for perplexity-like tasks since the model tacks an <|endoftext|> on
-        assert all(
-            isinstance(x, str) and (x[-1] != " " if len(x) != 0 else True)
-            for x in _array
-        )
+        target_delimiter: str = task.config.target_delimiter
+        if not task.multiple_input:
+            for x in _array:
+                assert isinstance(x, str)
+                assert (
+                    (x[-1].isspace() is False if len(x) > 0 else True)
+                    if target_delimiter.isspace()
+                    else True
+                ), "doc_to_text ends in a whitespace and target delimiter also a whitespace"
+        else:
+            pass

     def test_create_choices(self, task_class, limit):
         task = task_class
@@ -121,5 +130,11 @@ class TestNewTasks:
             if task.has_test_docs()
             else list(islice(task.validation_docs(), limit))
         )
-        requests = [task.construct_requests(doc, task.doc_to_text(doc)) for doc in arr]
+        # ctx is "" for multiple input tasks
+        requests = [
+            task.construct_requests(
+                doc=doc, ctx="" if task.multiple_input else task.doc_to_text(doc)
+            )
+            for doc in arr
+        ]
         assert len(requests) == limit if limit else True
...
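The tightened check only rejects trailing whitespace in doc_to_text when the task's target delimiter is itself whitespace, since otherwise the delimiter would double up. A standalone restatement of that rule (the helper name and prompts are illustrative, not part of the test suite):

```python
# Standalone restatement of the whitespace convention the updated test enforces.
def violates_convention(prompt: str, target_delimiter: str = " ") -> bool:
    if not target_delimiter.isspace() or len(prompt) == 0:
        return False
    return prompt[-1].isspace()


print(violates_convention("Question: 2+2=?\nAnswer:"))   # False
print(violates_convention("Question: 2+2=?\nAnswer: "))  # True
```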