Commit 51f27158 authored by lintangsutawika

update with merge

parents 924c9790 f5408b6b
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "lm_eval"
-version = "0.4.0"
+version = "0.4.1"
 authors = [
     {name="EleutherAI", email="contact@eleuther.ai"}
 ]
@@ -56,15 +56,14 @@ Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
 [project.optional-dependencies]
 anthropic = ["anthropic"]
 dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy"]
-gptq = ["auto-gptq[triton] @ git+https://github.com/PanQiWei/AutoGPTQ"]
+gptq = ["auto-gptq[triton]>=0.6.0"]
 ifeval = ["langdetect", "immutabledict"]
 mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
 math = ["sympy>=1.12", "antlr4-python3-runtime==4.11"]
 multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
 openai = ["openai==1.3.9", "tiktoken"]
-promptsource = [
-    "promptsource @ git+https://github.com/bigscience-workshop/promptsource.git#egg=promptsource"
-]
+optimum = ["optimum[openvino]"]
+promptsource = ["promptsource>=0.2.3"]
 sentencepiece = ["sentencepiece>=0.1.98", "protobuf>=4.22.1"]
 testing = ["pytest", "pytest-cov", "pytest-xdist"]
 vllm = ["vllm<=0.2.5"]
......
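Note on the dependency changes above: gptq and promptsource now resolve from PyPI releases instead of git URLs, and the new optimum extra ("optimum[openvino]") backs the OpenVINO test added at the end of this commit. A minimal sketch of how such an optional extra is commonly guarded at import time; the guard placement and error text are illustrative, not code from this diff:

# Illustrative only: guard the optional OpenVINO dependency installed via
# `pip install lm_eval[optimum]` so users get an actionable message instead
# of a bare ImportError. Not part of this commit.
try:
    from optimum.intel import OVModelForCausalLM  # provided by the "optimum" extra
except ImportError as err:
    raise ImportError(
        "OpenVINO support needs the optional dependencies: pip install lm_eval[optimum]"
    ) from err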
@@ -23,7 +23,7 @@ def parse_args():
 if __name__ == "__main__":
     args = parse_args()
-    with open(args.benchmark_path) as file:
+    with open(args.benchmark_path, encoding="utf-8") as file:
         TASK_LIST = yaml.full_load(file)
     for task in tqdm(TASK_LIST):
         eval_logger.info(f"Processing {task}")
@@ -57,5 +57,5 @@ if __name__ == "__main__":
         file_save_path = os.path.join(file_path, full_file_name)
         eval_logger.info(f"Save to {file_save_path}")
-        with open(file_save_path, "w") as yaml_file:
+        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
             yaml.dump(config_dict, yaml_file)
@@ -119,7 +119,7 @@ class Buckets:
 def do_ngrams_in_buckets(n_value, working_directory, bucket_count):
-    pile_statistics = json.load(open("pile_statistics.json", "r"))
+    pile_statistics = json.load(open("pile_statistics.json", "r", encoding="utf-8"))
     pile_document_count = pile_statistics["Document Count"]
     start_offsets = pile_statistics["File Start Offsets"]
@@ -212,4 +212,4 @@ if __name__ == "__main__":
     info_dict = {"title": "dataset ngrams", "ngram_size": 13}
     info_dict_path = os.path.join(args.working_directory, "info.json")
-    json.dump(info_dict, open(info_dict_path, "w"))
+    json.dump(info_dict, open(info_dict_path, "w", encoding="utf-8"))
@@ -79,7 +79,7 @@ if __name__ == "__main__":
     stats_file_path = "pile_statistics.json"
     if os.path.exists(stats_file_path):
-        stats = json.load(open(stats_file_path, "r"))
+        stats = json.load(open(stats_file_path, "r", encoding="utf-8"))
     else:
         document_count, total_document_size_chars, start_offsets = get_stats()
         stats = {
@@ -88,7 +88,7 @@ if __name__ == "__main__":
             "Total Pile Characters": total_document_size_chars,
             "File Start Offsets": start_offsets,
         }
-        json.dump(stats, open(stats_file_path, "w"), indent=4)
+        json.dump(stats, open(stats_file_path, "w", encoding="utf-8"), indent=4)
     print(f"document_count: {stats['Document Count']}")
     print(f"total_chars: {stats['Total Pile Characters']}")
......
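The encoding="utf-8" arguments added in the hunks above and below pin down behavior that otherwise depends on the platform: open() without an explicit encoding falls back to the locale's preferred encoding, which is not UTF-8 on every system. A minimal sketch of the difference, reusing the pile_statistics.json name from this diff:

import json
import locale

# What open() falls back to when no encoding is given; e.g. cp1252 on many
# Windows installs, UTF-8 on most Linux/macOS setups.
print(locale.getpreferredencoding(False))

# The pattern applied throughout this commit: state the encoding explicitly so
# non-ASCII content in the JSON round-trips identically on every platform.
with open("pile_statistics.json", "r", encoding="utf-8") as f:
    stats = json.load(f)
print(stats["Document Count"])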
@@ -61,14 +61,14 @@ if __name__ == "__main__":
         if not filenames:
             continue
         path_readme = os.path.join(dirpath, "README.md")
-        with open(path_readme, "w") as f:
+        with open(path_readme, "w", encoding="utf-8") as f:
             # get path name, only last folder
             path_name = dirpath.split("/")[-1]
             f.write(f"# {path_name} \n\n")
         for filename in sorted([f for f in filenames if f.endswith(".json")]):
             path = os.path.join(dirpath, filename)
-            with open(path, "r") as f:
+            with open(path, "r", encoding="utf-8") as f:
                 result_dict = json.load(f)
-            with open(path_readme, "a") as f:
+            with open(path_readme, "a", encoding="utf-8") as f:
                 f.write(f"## {filename} \n")
                 f.write(f"{make_table(result_dict)} \n")
@@ -50,5 +50,5 @@ if __name__ == "__main__":
         values.append(v)
     writer.value_matrix = values
     table = writer.dumps()
-    with open(args.output, "w") as f:
+    with open(args.output, "w", encoding="utf-8") as f:
         f.write(table)
@@ -94,7 +94,11 @@ def eval_models(args, branch=None):
         ret = os.system(command)
-        results[model] = json.load(open(output_path)) if ret == 0 else {"results": {}}
+        results[model] = (
+            json.load(open(output_path, encoding="utf-8"))
+            if ret == 0
+            else {"results": {}}
+        )
         end_time = time.time()
......
@@ -53,7 +53,7 @@ def main():
     os.makedirs(args.output_base_path, exist_ok=True)
     for task_name, task in task_dict.items():
-        if type(task) == tuple:
+        if isinstance(task, tuple):
             group_name, task = task
         rnd = random.Random()
         rnd.seed(args.seed)
......
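The type(task) == tuple to isinstance(task, tuple) swap above is behavior-preserving for plain tuples and additionally accepts tuple subclasses; it is also the form linters such as flake8 (E721) push toward. A small standalone illustration, using a hypothetical namedtuple that is not a real harness type:

from collections import namedtuple

# Hypothetical stand-in for a (group_name, task) pair; illustrative only.
GroupedTask = namedtuple("GroupedTask", ["group_name", "task"])
item = GroupedTask("my_group", object())

print(type(item) == tuple)      # False: exact-type check rejects subclasses
print(isinstance(item, tuple))  # True: isinstance also accepts subclasses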
@@ -69,18 +69,20 @@ def main():
         model_args = re.sub(
             "/|=",
             "__",
-            json.load(open(Path(args.data_path, model, "results.json")))["config"][
-                "model_args"
-            ],
+            json.load(
+                open(Path(args.data_path, model, "results.json"), encoding="utf-8")
+            )["config"]["model_args"],
         )
         with open(
-            Path(args.data_path, model, f"{model_args}_{task}.jsonl"), "r"
+            Path(args.data_path, model, f"{model_args}_{task}.jsonl"),
+            "r",
+            encoding="utf-8",
         ) as file:
             data = json.loads(file.read())
-        configs = json.load(open(Path(args.data_path, model, "results.json")))[
-            "configs"
-        ]
+        configs = json.load(
+            open(Path(args.data_path, model, "results.json"), encoding="utf-8")
+        )["configs"]
         config = configs[task]
         if model_index == 0:  # Only need to assemble data for the first model
@@ -124,7 +126,9 @@ def tasks_for_model(model: str, data_path: str):
         list: A list of tasks for the model.
     """
     dir_path = Path(data_path, model)
-    config = (json.load(open(Path(dir_path, "results.json")))["configs"],)
+    config = (
+        json.load(open(Path(dir_path, "results.json"), encoding="utf-8"))["configs"],
+    )
     return list(config[0].keys())
......
import random
import tempfile

import pytest
from optimum.intel import OVModelForCausalLM
from transformers import AutoTokenizer

import lm_eval.evaluator as evaluator
import lm_eval.tasks as tasks
from lm_eval.api.registry import get_model


tasks.initialize_tasks()

SUPPORTED_ARCHITECTURES_TASKS = {
    "facebook/opt-125m": "lambada_openai",
    "hf-internal-testing/tiny-random-gpt2": "wikitext",
}


@pytest.mark.parametrize("model_id,task", SUPPORTED_ARCHITECTURES_TASKS.items())
def test_evaluator(model_id, task):
    with tempfile.TemporaryDirectory() as tmpdirname:
        model = OVModelForCausalLM.from_pretrained(
            model_id, export=True, use_cache=True
        )
        model.save_pretrained(tmpdirname)
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        tokenizer.save_pretrained(tmpdirname)

        lm = get_model("openvino").create_from_arg_string(
            f"pretrained={tmpdirname}",
            {
                "batch_size": 1,
                "device": "cpu",
            },
        )

        def ll_fn(reqs):
            for ctx, cont in [req.args for req in reqs]:
                if len(ctx) == 0:
                    continue
                # space convention
                assert ctx[-1] != " "
                assert cont[0] == " " or ctx[-1] == "\n"

            res = []
            random.seed(42)
            for _ in reqs:
                res.append((-random.random(), False))

            return res

        def ll_perp_fn(reqs):
            for (string,) in [req.args for req in reqs]:
                assert isinstance(string, str)

            res = []
            random.seed(42)
            for _ in reqs:
                res.append(-random.random())

            return res

        lm.loglikelihood = ll_fn
        lm.loglikelihood_rolling = ll_perp_fn

        limit = 10
        evaluator.simple_evaluate(
            model=lm,
            tasks=[task],
            num_fewshot=0,
            limit=limit,
            bootstrap_iters=10,
        )
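For orientation, a hedged sketch of exercising the same OpenVINO backend through the public API rather than pytest; the pretrained path is a placeholder, and the registry name "openvino" mirrors the get_model("openvino") call in the test above:

import lm_eval.evaluator as evaluator
import lm_eval.tasks as tasks

tasks.initialize_tasks()

# Placeholder path: point this at a locally exported OpenVINO model directory.
results = evaluator.simple_evaluate(
    model="openvino",
    model_args="pretrained=/path/to/exported-ov-model",
    tasks=["lambada_openai"],
    num_fewshot=0,
    limit=10,
    bootstrap_iters=10,
)
print(results["results"])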