"src/lib/vscode:/vscode.git/clone" did not exist on "a382e82dec1bf2631303f72490d5b716272f944e"
Commit f5d763dc authored by Nathan Habib

cleanup

parent 4619f7c1
@@ -5,7 +5,6 @@ import os
import sys
from functools import partial
from typing import Union
from accelerate import Accelerator
from lm_eval import evaluator, utils
from lm_eval.evaluator import request_caching_arg_to_dict
@@ -293,6 +292,13 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
"If fewshot_as_multiturn is set, apply_chat_template must be set to True."
)
if (
args.num_fewshot is None or args.num_fewshot == 0
) and args.fewshot_as_multiturn:
raise ValueError(
"If fewshot_as_multiturn is set, num_fewshot must be greater than 0."
)
if args.include_path is not None:
eval_logger.info(f"Including path: {args.include_path}")
task_manager = TaskManager(args.verbosity, include_path=args.include_path)
@@ -394,9 +400,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
**request_caching_args,
)
accelerator = Accelerator()
if results is not None and accelerator.is_main_process:
if results is not None:
if args.log_samples:
samples = results.pop("samples")
dumped = json.dumps(
......
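For context, the validation added to cli_evaluate above reduces to the following standalone check. This is a minimal sketch: validate_fewshot_args is a hypothetical helper, and args stands in for the parsed argparse namespace produced by the CLI.

import argparse

def validate_fewshot_args(args: argparse.Namespace) -> None:
    # Multiturn few-shot formatting only works when a chat template is applied.
    if args.fewshot_as_multiturn and not args.apply_chat_template:
        raise ValueError(
            "If fewshot_as_multiturn is set, apply_chat_template must be set to True."
        )
    # It also needs at least one few-shot example to build the extra turns from.
    if (args.num_fewshot is None or args.num_fewshot == 0) and args.fewshot_as_multiturn:
        raise ValueError(
            "If fewshot_as_multiturn is set, num_fewshot must be greater than 0."
        )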
@@ -376,7 +376,8 @@ class Task(abc.ABC):
system_instruction: Optional[str] = None,
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
lm=None,
chat_template: Optional[Callable] = None,
tokenizer_name: str = "",
) -> None:
"""Build a set of Instances for a task, and store them in task.instances"""
@@ -391,7 +392,7 @@ class Task(abc.ABC):
if system_instruction is not None
else ""
)
cache_key += f"-tokenizer{lm.tokenizer_name}" if apply_chat_template else ""
cache_key += f"-tokenizer{tokenizer_name}"
cached_instances = load_from_cache(file_name=cache_key)
@@ -436,7 +437,7 @@ class Task(abc.ABC):
system_instruction,
apply_chat_template,
fewshot_as_multiturn,
lm,
chat_template,
)
# TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
@@ -444,7 +445,6 @@ class Task(abc.ABC):
doc=doc,
ctx=fewshot_ctx,
metadata=(self.config["task"], doc_id, self.config.repeats),
apply_chat_template=apply_chat_template
)
if not isinstance(inst, list):
@@ -987,28 +987,6 @@ class ConfigurableTask(Task):
return super().fewshot_docs()
@staticmethod
def append_target_question(
labeled_examples: List[Dict[str, str]],
question: str,
fewshot_as_multiturn: bool = False,
) -> None:
"""Adds a target question to the labeled examples list.
If fewshot_as_multiturn is True, or labeled_examples is empty, or the last entry is a system turn, appends the question as a new user entry.
Otherwise, it is appended to the last user entry, ensuring that the conversation alternates between the user and the assistant.
"""
if not fewshot_as_multiturn:
# if no messages or last message is system, append as new user entry
if len(labeled_examples) == 0 or labeled_examples[-1]["role"] == "system":
labeled_examples.append({"role": "user", "content": question})
# if last message is user, append to it to avoid two user messages in a row
else:
labeled_examples[-1]["content"] += question
else:
return self.sampler.fewshot_delimiter + "".join(
f"{s['role']}: {s['content']}" + self.sampler.fewshot_delimiter
for s in chat_history
)
@staticmethod
def append_target_question(
labeled_examples: List[Dict[str, str]],
question: str,
@@ -1037,7 +1015,7 @@ class ConfigurableTask(Task):
system_instruction: Optional[str] = None,
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
lm=None,
chat_template: Optional[Callable] = None,
) -> str:
"""Returns a fewshot context string that is made up of a prepended description
(if provided), the `num_fewshot` number of examples, and an appended prompt example.
@@ -1052,8 +1030,8 @@ class ConfigurableTask(Task):
Whether to apply the chat template to the fewshot context.
:param fewshot_as_multiturn: bool
Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
:param lm:
Language model with definition of the tokenizer/function to use for applying the chat template.
:param chat_template: Callable
Chat template to be applied to the fewshot context.
:returns: str
The fewshot context.
"""
@@ -1100,7 +1078,7 @@ class ConfigurableTask(Task):
example = self.doc_to_text(doc)
if apply_chat_template:
if self.multiple_input:
return lm.apply_chat_template(labeled_examples)
return chat_template(labeled_examples)
if isinstance(example, str):
self.append_target_question(
labeled_examples, example, fewshot_as_multiturn
@@ -1112,7 +1090,7 @@ class ConfigurableTask(Task):
for ex in example:
chat = deepcopy(labeled_examples)
self.append_target_question(chat, ex, fewshot_as_multiturn)
labeled_examples_list.append(lm.apply_chat_template(chat))
labeled_examples_list.append(chat_template(chat))
return labeled_examples_list
# if example is an integer, append the choice or convert to string
elif isinstance(example, int):
@@ -1126,7 +1104,7 @@ class ConfigurableTask(Task):
labeled_examples, str(example), fewshot_as_multiturn
)
# return lm.apply_chat_template(labeled_examples)
return lm.apply_chat_template(labeled_examples)
return chat_template(labeled_examples)
else:
if self.multiple_input:
return labeled_examples
@@ -1293,8 +1271,6 @@ class ConfigurableTask(Task):
elif self.OUTPUT_TYPE == "multiple_choice":
choices = self.doc_to_choice(doc)
target_delimiter = self.config.target_delimiter
if kwargs.get("apply_chat_template", False) is True:
target_delimiter = ""
if self.multiple_input:
# If there are multiple inputs, choices are placed in the ctx
cont = self.doc_to_target(doc)
@@ -1304,7 +1280,6 @@ class ConfigurableTask(Task):
else:
# Otherwise they are placed in the continuation
arguments = [(ctx, f"{target_delimiter}{cont}") for cont in choices]
kwargs.pop("apply_chat_template")
request_list = [
Instance(
@@ -1341,7 +1316,6 @@ class ConfigurableTask(Task):
elif self.OUTPUT_TYPE == "generate_until":
arguments = (ctx, deepcopy(self.config.generation_kwargs))
kwargs.pop("apply_chat_template")
return Instance(
request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs
)
......
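The task-side hunks above drop the full lm object from build_all_requests and fewshot_context, passing a bare chat_template callable (plus a tokenizer_name string for the cache key) instead. A minimal sketch of the new call pattern under that assumption; render_prompt is a hypothetical, heavily simplified stand-in for the real method:

from typing import Callable, Dict, List, Optional

def render_prompt(
    labeled_examples: List[Dict[str, str]],
    chat_template: Optional[Callable[[List[Dict[str, str]]], str]] = None,
) -> str:
    # Where the old code called lm.apply_chat_template(labeled_examples),
    # the callable is now invoked directly, so the task never touches the model object.
    if chat_template is not None:
        return chat_template(labeled_examples)
    # Without a chat template, fall back to joining the raw message contents.
    return "".join(message["content"] for message in labeled_examples)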
@@ -399,7 +399,12 @@ def evaluate(
system_instruction=system_instruction,
apply_chat_template=apply_chat_template,
fewshot_as_multiturn=fewshot_as_multiturn,
lm=lm,
chat_template=getattr(lm, "apply_chat_template")
if apply_chat_template
else None,
tokenizer_name=getattr(lm, "tokenizer_name", "")
if apply_chat_template
else "",
)
eval_logger.debug(
f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}"
......
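On the evaluator side, the chat template and tokenizer name are only looked up on the model when chat templating was requested, as in the hunk above. A standalone sketch of that guarded lookup; resolve_chat_template is a hypothetical helper and the default fallbacks are an assumption:

def resolve_chat_template(lm, apply_chat_template: bool):
    # Reach into the model only when --apply_chat_template was passed;
    # otherwise hand the task None / "" so plain string prompts are built.
    chat_template = getattr(lm, "apply_chat_template", None) if apply_chat_template else None
    tokenizer_name = getattr(lm, "tokenizer_name", "") if apply_chat_template else ""
    return chat_template, tokenizer_name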
import json
import os
import re
import time
from collections import defaultdict
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status
from datasets import load_dataset
from datasets.utils.metadata import MetadataConfigs
@@ -212,17 +210,21 @@ class EvaluationTracker:
file_results_aggregated.open("w", encoding="utf-8").write(dumped)
if self.api and self.push_results_to_hub:
repo_id = "open-llm-leaderboard/results_v2"
repo_id = (
self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private
)
self.api.create_repo(
repo_id=repo_id,
repo_type="dataset",
private=not self.public_repo,
exist_ok=True,
)
self.api.upload_file(
self.api.upload_folder(
repo_id=repo_id,
path_or_fileobj=str(path.joinpath(f"results_{self.date_id}.json")),
path_in_repo=os.path.join(self.general_config_tracker.model_name, f"results_{self.date_id}.json"),
folder_path=str(path),
path_in_repo=self.general_config_tracker.model_name_sanitized,
repo_type="dataset",
commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
)
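The tracker now pushes the whole results directory with upload_folder instead of a single file with upload_file, keyed by the sanitized model name. A minimal sketch of the equivalent direct call to huggingface_hub; the repository id, local path, and model name below are placeholders:

from huggingface_hub import HfApi

api = HfApi()  # assumes a token is available (HF_TOKEN or a prior login)
api.upload_folder(
    repo_id="my-org/results",                # placeholder results dataset
    folder_path="output/my-org__my-model",   # local folder holding results_<date>.json files
    path_in_repo="my-org__my-model",         # sanitized model name used as the folder inside the repo
    repo_type="dataset",
    commit_message="Adding aggregated results for my-org/my-model",
)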
@@ -276,7 +278,6 @@ class EvaluationTracker:
sample["resps"] = sanitize_list(sample["resps"])
sample["filtered_resps"] = sanitize_list(sample["filtered_resps"])
sample["arguments"] = arguments
sample["target"] = str(sample["target"])
sample_dump = (
json.dumps(
@@ -302,13 +303,6 @@ class EvaluationTracker:
private=not self.public_repo,
exist_ok=True,
)
headers = build_hf_headers()
r = get_session().put(
url=f"https://huggingface.co/api/datasets/{repo_id}/settings",
headers=headers,
json={"gated": "auto"},
)
hf_raise_for_status(r)
self.api.upload_folder(
repo_id=repo_id,
folder_path=str(path),
@@ -366,10 +360,7 @@ class EvaluationTracker:
results_datetime,
)
latest_task_results_datetime[samples_key] = latest_datetime
latest_task_results_datetime[results_key] = max(
latest_task_results_datetime[results_key],
latest_datetime,
)
latest_task_results_datetime[results_key] = latest_datetime
# Create metadata card
card_metadata = MetadataConfigs()
@@ -386,15 +377,14 @@ class EvaluationTracker:
sanitized_last_eval_date_results = re.sub(
r"[^\w\.]", "_", latest_task_results_datetime[config_name]
)
# Ensure that all results files are listed in the metadata card
current_results = card_metadata.get(config_name, {"data_files": []})
current_results["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_results
# If the results file is the newest, update the "latest" field in the metadata card
if eval_date_sanitized == sanitized_last_eval_date_results:
# Ensure that all results files are listed in the metadata card
current_results = card_metadata.get(config_name, {"data_files": []})
current_results["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_results
# If the results file is the newest, update the "latest" field in the metadata card
card_metadata[config_name]["data_files"].append(
{"split": "latest", "path": [str(results_filename)]}
)
@@ -413,20 +403,65 @@ class EvaluationTracker:
sanitized_last_eval_date_results = re.sub(
r"[^\w\.]", "_", latest_task_results_datetime[config_name]
)
# Ensure that all sample results files are listed in the metadata card
current_details_for_task = card_metadata.get(
config_name, {"data_files": []}
)
current_details_for_task["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_details_for_task
# If the samples results file is the newest, update the "latest" field in the metadata card
if eval_date_sanitized == sanitized_last_eval_date_results:
# Ensure that all sample results files are listed in the metadata card
current_details_for_task = card_metadata.get(
config_name, {"data_files": []}
)
current_details_for_task["data_files"].append(
{"split": eval_date_sanitized, "path": [str(results_filename)]}
)
card_metadata[config_name] = current_details_for_task
# If the samples results file is the newest, update the "latest" field in the metadata card
card_metadata[config_name]["data_files"].append(
{"split": "latest", "path": [str(results_filename)]}
)
# Special case for MMLU with a single split covering it all
# We add another config with all MMLU splits results together for easy inspection
SPECIAL_TASKS = ["mmlu", "gpqa", "minerva_math"]
for special_task in SPECIAL_TASKS:
if special_task in config_name:
special_task = f"{model_name}__{special_task}"
former_entry = card_metadata.get(special_task, {"data_files": []})
former_split = [
(i, entry)
for i, entry in enumerate(former_entry["data_files"])
if entry.get("split", None) == eval_date_sanitized
]
if len(former_split) == 0:
former_entry["data_files"].append(
{
"split": eval_date_sanitized,
"path": [str(results_filename)],
}
)
else:
split_index, _ = former_split[0]
former_entry["data_files"][split_index]["path"].append(
str(results_filename)
)
if eval_date_sanitized == sanitized_last_eval_date_results:
latest_split = [
(i, entry)
for i, entry in enumerate(former_entry["data_files"])
if entry.get("split", None) == "latest"
]
if len(latest_split) == 0:
former_entry["data_files"].append(
{"split": "latest", "path": [str(results_filename)]}
)
else:
latest_index, _ = latest_split[0]
former_entry["data_files"][latest_index]["path"].append(
str(results_filename)
)
card_metadata[special_task] = former_entry
# Get latest results and extract info to update metadata card examples
latest_datetime = max(latest_task_results_datetime.values())
latest_model_name = max(
......
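The special-task branch above (mmlu, gpqa, minerva_math) keeps one metadata config per model-and-task pair and either creates a split entry or appends to an existing path list, so all subtask files for a run land in the same split. A minimal sketch of that append-or-create logic; add_results_file and the file names are illustrative:

from typing import Dict, List

def add_results_file(entry: Dict[str, List[dict]], split: str, filename: str) -> None:
    # Append `filename` to the data_files entry for `split`, creating the entry if missing.
    for data_file in entry["data_files"]:
        if data_file.get("split") == split:
            data_file["path"].append(filename)
            return
    entry["data_files"].append({"split": split, "path": [filename]})

entry = {"data_files": []}
add_results_file(entry, "2024_05_01T10_00_00", "samples_mmlu_anatomy_2024-05-01T10-00-00.json")
add_results_file(entry, "latest", "samples_mmlu_anatomy_2024-05-01T10-00-00.json")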
@@ -40,6 +40,7 @@ from lm_eval.models.utils import (
eval_logger = utils.eval_logger
def _get_accelerate_args(
device_map_option: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
......