Commit 8d59330b authored by lintangsutawika

resolved merge conflict

parents 110e5a28 d4a913c4
......@@ -301,10 +301,23 @@ lm_eval --model hf \
We support wildcards in task names: for example, you can run all of the machine-translated lambada tasks via `--tasks lambada_openai_mt_*`.
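For instance (the model here is just an example; quoting the pattern keeps your shell from expanding the glob):
```bash
lm_eval --model hf \
    --model_args pretrained=EleutherAI/pythia-160m \
    --tasks "lambada_openai_mt_*"
```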
## Saving Results
To save evaluation results, provide an `--output_path`. We also support logging model responses with the `--log_samples` flag for post-hoc analysis.
Additionally, one can provide a directory with `--use_cache` to cache the results of prior runs. This allows you to avoid repeated execution of the same (model, task) pairs for re-scoring.
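For example, the following run caches model responses under an illustrative `lm_cache` path, so an identical re-run can re-score without repeating inference:
```bash
lm_eval --model hf \
    --model_args pretrained=EleutherAI/pythia-160m \
    --tasks hellaswag \
    --use_cache lm_cache \
    --output_path results
```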
To push results and samples to the Hugging Face Hub, first ensure an access token with write access is set in the `HF_TOKEN` environment variable. Then, use the `--hf_hub_log_args` flag to specify the organization, repository name, repository visibility, and whether to push results and samples to the Hub (see an [example output](https://huggingface.co/datasets/KonradSzafer/lm-eval-results-demo/tree/main/microsoft__phi-2)). For instance:
```bash
lm_eval --model hf \
--model_args pretrained=model-name-or-path,autogptq=model.safetensors,gptq_use_triton=True \
--tasks hellaswag \
--log_samples \
--output_path results \
--hf_hub_log_args hub_results_org=EleutherAI,hub_repo_name=lm-eval-results,push_results_to_hub=True,push_samples_to_hub=True,public_repo=False
```
For a full list of supported arguments, check out the [interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md) guide in our documentation!
## Visualizing Results
......
......@@ -14,7 +14,7 @@ This mode supports a number of command-line arguments, the details of which can
- `--model_args` : Controls parameters passed to the model constructor. Accepts a string containing comma-separated keyword arguments to the model class of the format `"arg1=val1,arg2=val2,..."`, for example `--model_args pretrained=EleutherAI/pythia-160m,dtype=float32`. For a full list of supported keyword arguments, see the initialization of the `lm_eval.api.model.LM` subclass, e.g. [`HFLM`](https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/models/huggingface.py#L66)
- `--tasks` : Determines which tasks or task groups are evaluated. Accepts a comma-separated list of task names or task group names. Must be solely comprised of valid tasks/groups.
- `--tasks` : Determines which tasks or task groups are evaluated. Accepts a comma-separated list of task names or task group names. Must be solely comprised of valid tasks/groups. A list of supported tasks can be viewed with `--tasks list`.
- `--num_fewshot` : Sets the number of few-shot examples to place in context. Must be an integer.
......
......@@ -155,6 +155,21 @@ Our final filter pipeline, "maj@8", does majority voting across the first 8 of t
Thus, given the 64 responses from our LM on each document, we can report metrics on these responses in these 3 different ways, as defined by our filter pipelines.
### Adding a custom filter
Just as a custom model can be registered with the `register_model` decorator, a custom filter can be registered too, for example:
```python
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
@register_filter("new_filter")
class NewFilter(Filter):
...
```
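Once registered, the filter can be retrieved by name from the registry. A minimal end-to-end sketch (the lowercasing behavior is purely illustrative; the `apply(resps, docs)` signature follows the `Filter` base class):
```python
from lm_eval.api.filter import Filter
from lm_eval.api.registry import get_filter, register_filter


@register_filter("lowercase_demo")
class LowercaseDemoFilter(Filter):
    def apply(self, resps, docs):
        # resps holds one list of model responses per document; return the
        # same structure with every response lowercased.
        return [[r.lower() for r in doc_resps] for doc_resps in resps]


filter_cls = get_filter("lowercase_demo")
print(filter_cls().apply([["Hello", "WORLD"]], [{}]))
# [['hello', 'world']]
```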
## Embedded Python Code
You can use Python functions for certain arguments by using the `!function` operator after the argument name, followed by `<filename>.<pythonfunctionname>`. This feature can be used for the following arguments:
......
......@@ -2,34 +2,20 @@ import argparse
import json
import logging
import os
import re
import sys
from functools import partial
from pathlib import Path
from typing import Union
import numpy as np
from lm_eval import evaluator, utils
from lm_eval.evaluator import request_caching_arg_to_dict
from lm_eval.logging_utils import WandbLogger
from lm_eval.logging import EvaluationTracker, WandbLogger
from lm_eval.tasks import TaskManager
from lm_eval.utils import make_table, simple_parse_args_string
from lm_eval.utils import handle_non_serializable, make_table, simple_parse_args_string
DEFAULT_RESULTS_FILE = "results.json"
def _handle_non_serializable(o):
if isinstance(o, np.int64) or isinstance(o, np.int32):
return int(o)
elif isinstance(o, set):
return list(o)
else:
return str(o)
def _int_or_none_list_arg_type(max_len: int, value: str, split_char: str = ","):
def _int_or_none_list_arg_type(
min_len: int, max_len: int, defaults: str, value: str, split_char: str = ","
):
def parse_value(item):
item = item.strip().lower()
if item == "none":
......@@ -45,10 +31,19 @@ def _int_or_none_list_arg_type(max_len: int, value: str, split_char: str = ","):
if num_items == 1:
# Makes downstream handling the same for single and multiple values
items = items * max_len
elif num_items != max_len:
elif num_items < min_len or num_items > max_len:
raise argparse.ArgumentTypeError(
f"Argument requires {max_len} integers or None, separated by '{split_char}'"
)
elif num_items != max_len:
logging.warning(
f"Argument requires {max_len} integers or None, separated by '{split_char}'. "
"Missing values will be filled with defaults."
)
default_items = [parse_value(v) for v in defaults.split(split_char)]
items.extend(
default_items[num_items:]
) # extend items list with missing defaults
return items
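# Illustrative behavior of the new (min_len, max_len, defaults) signature,
# using the partial binding wired up for `--seed` below:
#   parse_seed = partial(_int_or_none_list_arg_type, 3, 4, "0,1234,1234,1234")
#   parse_seed("42")        -> [42, 42, 42, 42]    (single value broadcast to max_len)
#   parse_seed("0,None,8")  -> [0, None, 8, 1234]  (missing 4th value back-filled from defaults)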
......@@ -203,6 +198,12 @@ def setup_parser() -> argparse.ArgumentParser:
default="",
help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval",
)
parser.add_argument(
"--hf_hub_log_args",
type=str,
default="",
help="Comma separated string arguments passed to Hugging Face Hub's log function, e.g. `hub_results_org=EleutherAI,hub_repo_name=lm-eval-results`",
)
parser.add_argument(
"--predict_only",
"-x",
......@@ -210,17 +211,20 @@ def setup_parser() -> argparse.ArgumentParser:
default=False,
help="Use with --log_samples. Only model outputs will be saved and metrics will not be evaluated.",
)
default_seed_string = "0,1234,1234,1234"
parser.add_argument(
"--seed",
type=partial(_int_or_none_list_arg_type, 3),
default="0,1234,1234", # for backward compatibility
type=partial(_int_or_none_list_arg_type, 3, 4, default_seed_string),
default=default_seed_string, # for backward compatibility
help=(
"Set seed for python's random, numpy and torch.\n"
"Accepts a comma-separated list of 3 values for python's random, numpy, and torch seeds, respectively, "
"or a single integer to set the same seed for all three.\n"
"The values are either an integer or 'None' to not set the seed. Default is `0,1234,1234` (for backward compatibility).\n"
"E.g. `--seed 0,None,8` sets `random.seed(0)` and `torch.manual_seed(8)`. Here numpy's seed is not set since the second value is `None`.\n"
"E.g, `--seed 42` sets all three seeds to 42."
"Set seed for python's random, numpy, torch, and fewshot sampling.\n"
"Accepts a comma-separated list of 4 values for python's random, numpy, torch, and fewshot sampling seeds, "
"respectively, or a single integer to set the same seed for all three.\n"
f"The values are either an integer or 'None' to not set the seed. Default is `{default_seed_string}` "
"(for backward compatibility).\n"
"E.g. `--seed 0,None,8,52` sets `random.seed(0)`, `torch.manual_seed(8)`, and fewshot sampling seed to 52. "
"Here numpy's seed is not set since the second value is `None`.\n"
"E.g, `--seed 42` sets all four seeds to 42."
),
)
parser.add_argument(
......@@ -228,7 +232,6 @@ def setup_parser() -> argparse.ArgumentParser:
action="store_true",
help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
)
return parser
......@@ -251,6 +254,15 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
eval_logger.info(f"Verbosity set to {args.verbosity}")
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# update the evaluation tracker args with the output path and the HF token
args.hf_hub_log_args = f"output_path={args.output_path},token={os.environ.get('HF_TOKEN')},{args.hf_hub_log_args}"
evaluation_tracker_args = simple_parse_args_string(args.hf_hub_log_args)
evaluation_tracker = EvaluationTracker(**evaluation_tracker_args)
evaluation_tracker.general_config_tracker.log_experiment_args(
model_source=args.model,
model_args=args.model_args,
)
if args.predict_only:
args.log_samples = True
if (args.log_samples or args.predict_only) and not args.output_path:
......@@ -262,6 +274,18 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
eval_logger.info(f"Including path: {args.include_path}")
task_manager = TaskManager(args.verbosity, include_path=args.include_path)
if (
"push_results_to_hub" in evaluation_tracker_args
or "push_samples_to_hub" in evaluation_tracker_args
) and "hub_results_org" not in evaluation_tracker_args:
raise ValueError(
"If push_results_to_hub or push_samples_to_hub is set, results_org must be specified."
)
if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples:
eval_logger.warning(
"Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub."
)
if args.limit:
eval_logger.warning(
" --limit SHOULD ONLY BE USED FOR TESTING."
......@@ -306,24 +330,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
f"Tasks not found: {missing}. Try `lm-eval --tasks list` for list of available tasks, or '--verbosity DEBUG' to troubleshoot task registration issues."
)
if args.output_path:
path = Path(args.output_path)
# check if file or 'dir/results.json' exists
if path.is_file():
raise FileExistsError(f"File already exists at {path}")
output_path_file = path.joinpath(DEFAULT_RESULTS_FILE)
if output_path_file.is_file():
eval_logger.warning(
f"File {output_path_file} already exists. Results will be overwritten."
)
# if path json then get parent dir
elif path.suffix in (".json", ".jsonl"):
output_path_file = path
path.parent.mkdir(parents=True, exist_ok=True)
path = path.parent
else:
path.mkdir(parents=True, exist_ok=True)
# Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args
if args.trust_remote_code:
os.environ["HF_DATASETS_TRUST_REMOTE_CODE"] = str(args.trust_remote_code)
......@@ -358,6 +364,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
random_seed=args.seed[0],
numpy_random_seed=args.seed[1],
torch_random_seed=args.seed[2],
fewshot_random_seed=args.seed[3],
**request_caching_args,
)
......@@ -365,7 +372,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
if args.log_samples:
samples = results.pop("samples")
dumped = json.dumps(
results, indent=2, default=_handle_non_serializable, ensure_ascii=False
results, indent=2, default=handle_non_serializable, ensure_ascii=False
)
if args.show_config:
print(dumped)
......@@ -382,23 +389,15 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
except Exception as e:
eval_logger.info(f"Logging to Weights and Biases failed due to {e}")
if args.output_path:
output_path_file.open("w", encoding="utf-8").write(dumped)
if args.log_samples:
for task_name, config in results["configs"].items():
output_name = "{}_{}".format(
re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", args.model_args),
task_name,
)
filename = path.joinpath(f"{output_name}.jsonl")
samples_dumped = json.dumps(
samples[task_name],
indent=2,
default=_handle_non_serializable,
ensure_ascii=False,
)
filename.write_text(samples_dumped, encoding="utf-8")
evaluation_tracker.save_results_aggregated(
results=results, samples=samples if args.log_samples else None
)
if args.log_samples:
for task_name, config in results["configs"].items():
evaluation_tracker.save_results_samples(
task_name=task_name, samples=samples[task_name]
)
print(
f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
......
......@@ -78,6 +78,7 @@ METRIC_REGISTRY = {}
METRIC_AGGREGATION_REGISTRY = {}
AGGREGATION_REGISTRY: Dict[str, Callable[[], Dict[str, Callable]]] = {}
HIGHER_IS_BETTER_REGISTRY = {}
FILTER_REGISTRY = {}
DEFAULT_METRIC_REGISTRY = {
"loglikelihood": [
......@@ -170,3 +171,22 @@ def is_higher_better(metric_name) -> bool:
eval_logger.warning(
f"higher_is_better not specified for metric '{metric_name}'!"
)
def register_filter(name):
def decorate(cls):
if name in FILTER_REGISTRY:
eval_logger.info(
f"Registering filter `{name}` that is already in Registry {FILTER_REGISTRY}"
)
FILTER_REGISTRY[name] = cls
return cls
return decorate
def get_filter(filter_name: str) -> type:
try:
return FILTER_REGISTRY[filter_name]
except KeyError:
eval_logger.warning(f"filter `{filter_name}` is not registered!")
class ContextSampler:
def __init__(self, docs, task, fewshot_indices=None, rnd=None) -> None:
self.rnd = rnd
assert self.rnd, "must pass rnd to FewShotSampler!"
if not self.rnd:
raise ValueError(
"A `random.Random` generator argument must be provided to `rnd` of FewShotSampler!"
)
self.task = task
self.config = task._config
......
......@@ -312,6 +312,9 @@ class Task(abc.ABC):
self._config: TaskConfig = TaskConfig({**config}) if config else TaskConfig()
self._filters = [build_filter_ensemble("none", [["take_first", None]])]
self.fewshot_rnd: Optional[
random.Random
] = None # purposely induce errors in case of improper usage
def download(
self,
......@@ -603,7 +606,7 @@ class Task(abc.ABC):
self,
doc,
num_fewshot,
rnd=random.Random(1234),
rnd=None,
description=None,
):
"""Returns a fewshot context string that is made up of a prepended description
......@@ -622,9 +625,12 @@ class Task(abc.ABC):
The fewshot context.
"""
if rnd is None:
raise ValueError(
"A `random.Random` generator argument must be provided to `rnd`"
)
if self.fewshot_rnd is not None:
rnd = self.fewshot_rnd
else:
raise ValueError(
"A `random.Random` generator argument must be provided to `rnd`"
)
description = description if description else ""
......@@ -715,6 +721,11 @@ class Task(abc.ABC):
setattr(self._config, "metric_list", [{"metric": metric_name}])
setattr(self._config, "process_results", None)
def set_fewshot_seed(self, seed: Optional[int] = None) -> None:
self.fewshot_rnd = random.Random(seed)
if hasattr(self, "sampler"):
self.sampler.rnd = self.fewshot_rnd
@property
def eval_docs(self) -> Union[datasets.Dataset, List[dict]]:
if self.has_test_docs():
......@@ -891,11 +902,29 @@ class ConfigurableTask(Task):
self.prompt = None
if self.fewshot_docs() is not None:
self.sampler = samplers.get_sampler(
self.fewshot_rnd = (
random.Random()
) # setting with no seed, to be overridden at a later time
config_sampler: Union[str, Callable] = (
self.config.fewshot_config.get("sampler", "default")
if self.config.fewshot_config
else "default"
)(list(self.fewshot_docs()), self, rnd=random.Random(1234))
)
if isinstance(config_sampler, str):
self.sampler = samplers.get_sampler(config_sampler)(
list(self.fewshot_docs()), self, rnd=self.fewshot_rnd
)
elif callable(config_sampler) and issubclass(
config_sampler, samplers.ContextSampler
):
self.sampler = config_sampler(
docs=list(self.fewshot_docs()), task=self, rnd=self.fewshot_rnd
)
else:
raise TypeError(
f"fewshot_config.sampler should be a string or callable of ContextSampler type, "
f"not {type(config_sampler)}"
)
self.task_docs = self.eval_docs
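# A custom sampler can now be supplied through a task's fewshot_config, e.g.
# (hypothetical `utils.py` referenced from the task YAML as
# `fewshot_config: {sampler: !function utils.FirstNSampler}`):
#
#   class FirstNSampler(samplers.ContextSampler):
#       def sample(self, n):
#           return self.docs[:n]  # deterministic: first n fewshot docs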
......
import itertools
import json
import logging
import random
import time
......@@ -28,7 +29,13 @@ from lm_eval.tasks import (
TaskManager,
get_task_dict,
)
from lm_eval.utils import eval_logger, positional_deprecated, simple_parse_args_string
from lm_eval.utils import (
eval_logger,
handle_non_serializable,
hash_string,
positional_deprecated,
simple_parse_args_string,
)
if TYPE_CHECKING:
......@@ -61,6 +68,7 @@ def simple_evaluate(
random_seed: int = 0,
numpy_random_seed: int = 1234,
torch_random_seed: int = 1234,
fewshot_random_seed: int = 1234,
):
"""Instantiate and evaluate a model on a list of tasks.
......@@ -108,6 +116,8 @@ def simple_evaluate(
Random seed for numpy. If set to None, the seed will not be set.
:param torch_random_seed: int
Random seed for torch. If set to None, the seed will not be set.
:param fewshot_random_seed: int
Random seed for fewshot sampler random generator. If set to None, the seed of generator will be set to None.
:return
Dictionary of results
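# Illustrative Python-API call exercising the new argument (model and task
# choices are examples only):
#   lm_eval.simple_evaluate(model="hf", model_args="pretrained=EleutherAI/pythia-160m",
#                           tasks=["hellaswag"], num_fewshot=5, fewshot_random_seed=1234)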
......@@ -156,15 +166,6 @@ def simple_evaluate(
if model_args is None:
eval_logger.warning("model_args not specified. Using defaults.")
model_args = ""
if "pretrained" not in model_args and model in [
"hf-auto",
"hf",
"huggingface",
"vllm",
]:
eval_logger.warning(
"pretrained not specified. Using default pretrained=gpt2."
)
if isinstance(model_args, dict):
eval_logger.info(
......@@ -217,7 +218,7 @@ def simple_evaluate(
task_dict = get_task_dict(tasks, task_manager)
def _adjust_config(task_dict):
def _adjust_config(task_dict, predict_only):
adjusted_task_dict = {}
for task_name, task_obj in task_dict.items():
if isinstance(task_obj, dict):
......@@ -252,6 +253,10 @@ def simple_evaluate(
f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
)
task_obj.set_config(key="num_fewshot", value=num_fewshot)
task_obj.set_fewshot_seed(seed=fewshot_random_seed)
eval_logger.info(
f"Setting fewshot random generator seed to {fewshot_random_seed}"
)
else:
# if num_fewshot not provided, and the task does not define a default one, default to 0
if (
......@@ -263,7 +268,7 @@ def simple_evaluate(
return adjusted_task_dict
task_dict = _adjust_config(task_dict)
task_dict = _adjust_config(task_dict, predict_only)
results = evaluate(
lm=lm,
task_dict=task_dict,
......@@ -288,16 +293,28 @@ def simple_evaluate(
results["config"] = {
"model": model_name,
"model_args": model_args,
"batch_size": batch_size,
"batch_sizes": (
list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else []
),
"device": device,
"use_cache": use_cache,
"limit": limit,
"bootstrap_iters": bootstrap_iters,
"gen_kwargs": gen_kwargs,
}
# add more detailed model info if available
if isinstance(lm, lm_eval.models.huggingface.HFLM):
results["config"].update(lm.get_model_info())
# add info about execution
results["config"].update(
{
"batch_size": batch_size,
"batch_sizes": (
list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else []
),
"device": device,
"use_cache": use_cache,
"limit": limit,
"bootstrap_iters": bootstrap_iters,
"gen_kwargs": gen_kwargs,
"random_seed": random_seed,
"numpy_seed": numpy_random_seed,
"torch_seed": torch_random_seed,
"fewshot_seed": fewshot_random_seed,
}
)
results["git_hash"] = get_git_commit_hash()
results["date"] = start_date
add_env_info(results) # additional environment info to results
......@@ -365,7 +382,6 @@ def evaluate(
eval_logger.debug(
f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}"
)
if write_out:
print_writeout(task)
# aggregate Instances by LM method requested to get output.
......@@ -451,6 +467,16 @@ def evaluate(
"filtered_resps": [
req.filtered_resps[filter_key] for req in requests
],
"doc_hash": hash_string(
json.dumps(
requests[0].doc,
indent=2,
default=handle_non_serializable,
ensure_ascii=False,
)
),
"prompt_hash": hash_string(requests[0].arguments[0]),
"target_hash": hash_string(str(target)),
}
example.update(metrics)
task_output.logged_samples.append(example)
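# `hash_string` is imported from lm_eval.utils; a plausible implementation
# (a sketch, assuming a SHA-256 hex digest for the fingerprints above):
#   def hash_string(string: str) -> str:
#       return hashlib.sha256(string.encode("utf-8")).hexdigest()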
......@@ -612,6 +638,16 @@ def evaluate(
"configs": dict(sorted(configs.items())),
"versions": dict(sorted(versions.items())),
"n-shot": dict(sorted(num_fewshot.items())),
"n-samples": {
task_output.task_name: {
"original": len(task_output.task.eval_docs),
"effective": min(
limit if limit else len(task_output.task.eval_docs),
len(task_output.task.eval_docs),
),
}
for task_output in eval_tasks
},
}
if log_samples:
results_dict["samples"] = dict(samples)
......
......@@ -5,8 +5,9 @@ import sys
from typing import List, Optional, Tuple, Union
from lm_eval.api import metrics
from lm_eval.utils import eval_logger, positional_deprecated
from lm_eval.tasks import ConfigurableGroup
from lm_eval.utils import eval_logger, positional_deprecated
class TaskOutput:
"""
......@@ -198,9 +199,7 @@ def prepare_print_tasks(
task_agg[name].pop("samples")
if from_configurable_group and (" " not in results[name]):
group_tab_string = (
" " * group_depth + "- " if group_depth > 0 else ""
)
group_tab_string = " " * group_depth + "- " if group_depth > 0 else ""
group_agg[name] = results[name].copy()
group_agg[name]["alias"] = group_tab_string + alias
if "samples" in group_agg[name]:
......
from functools import partial
from typing import List, Union
from typing import List
from lm_eval.api.filter import FilterEnsemble
from lm_eval.api.registry import get_filter
from . import extraction, selection, transformation
FILTER_REGISTRY = {
"take_first": selection.TakeFirstFilter,
"regex": extraction.RegexFilter,
"majority_vote": selection.MajorityVoteFilter,
"take_first_k": selection.TakeKFilter,
"remove_whitespace": extraction.WhitespaceFilter,
"lowercase": transformation.LowercaseFilter,
"uppercase": transformation.UppercaseFilter,
"map": transformation.MapFilter,
"multi_choice_regex": extraction.MultiChoiceRegexFilter,
# TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function
# that takes an input and returns a scalar and then should select the max reward,
# or should implement different filters for different ways of handling a reward model's inference.
# "arg_max": selection.ArgMaxFilter,
}
def get_filter(filter_name: str) -> Union[type, str]:
if filter_name in FILTER_REGISTRY:
return FILTER_REGISTRY[filter_name]
else:
return filter_name
def build_filter_ensemble(
filter_name: str, components: List[List[str]]
) -> FilterEnsemble:
......
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
@register_filter("decontaminate")
class DecontaminationFilter(Filter):
"""
......
......@@ -3,8 +3,10 @@ import sys
import unicodedata
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
@register_filter("regex")
class RegexFilter(Filter):
""" """
......@@ -49,6 +51,7 @@ class RegexFilter(Filter):
return filtered_resps
@register_filter("remove_whitespace")
class WhitespaceFilter(Filter):
""" """
......@@ -71,6 +74,7 @@ class WhitespaceFilter(Filter):
return filtered_resps
@register_filter("multi_choice_regex")
class MultiChoiceRegexFilter(RegexFilter):
"""
A filter used to extract a model's answer on multiple choice questions with
......
from collections import Counter
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
# TODO: implement "arg_max" filter. either it should take in an arbitrary "scoring"/reward function
# that takes an input and returns a scalar and then should select the max reward,
# or should implement different filters for different ways of handling a reward model's inference.
@register_filter("take_first")
class TakeFirstFilter(Filter):
def __init__(self) -> None:
"""
......@@ -16,6 +23,7 @@ class TakeFirstFilter(Filter):
return map(lambda r: r[0], resps)
@register_filter("take_first_k")
class TakeKFilter(Filter):
def __init__(self, **kwargs) -> None:
self.k = kwargs.pop("k")
......@@ -32,6 +40,7 @@ class TakeKFilter(Filter):
return map(lambda r: r[: self.k], resps)
@register_filter("majority_vote")
class MajorityVoteFilter(Filter):
def __init__(self) -> None:
"""
......
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
@register_filter("lowercase")
class LowercaseFilter(Filter):
def __init__(self) -> None:
pass
......@@ -12,6 +14,7 @@ class LowercaseFilter(Filter):
return [filter_set(resp) for resp in resps]
@register_filter("uppercase")
class UppercaseFilter(Filter):
def __init__(self) -> None:
pass
......@@ -23,6 +26,7 @@ class UppercaseFilter(Filter):
return [filter_set(resp) for resp in resps]
@register_filter("map")
class MapFilter(Filter):
def __init__(self, mapping_dict: dict = None, default_value=None) -> None:
"""
......
from .evaluation_tracker import EvaluationTracker
from .wandb_logger import WandbLogger
import json
import re
import time
from dataclasses import asdict, dataclass
from datetime import datetime
from pathlib import Path
from huggingface_hub import HfApi
from lm_eval.utils import (
eval_logger,
handle_non_serializable,
hash_string,
)
@dataclass(init=False)
class GeneralConfigTracker:
"""
Tracker for the evaluation parameters.
Attributes:
model_source (str): Source of the model (e.g. Hugging Face, GGUF, etc.)
model_name (str): Name of the model.
model_name_sanitized (str): Sanitized model name for directory creation.
start_time (float): Start time of the experiment. Logged at class init.
end_time (float): End time of the experiment. Logged when calling [`GeneralConfigTracker.log_end_time`].
total_evaluation_time_seconds (str): Inferred total evaluation time in seconds (from the start and end times).
"""
model_source: str = None
model_name: str = None
model_name_sanitized: str = None
start_time: float = None
end_time: float = None
total_evaluation_time_seconds: str = None
def __init__(self) -> None:
"""Starts the evaluation timer."""
self.start_time = time.perf_counter()
@staticmethod
def _get_model_name(model_args: str) -> str:
"""Extracts the model name from the model arguments."""
def extract_model_name(model_args: str, key: str) -> str:
"""Extracts the model name from the model arguments using a key."""
args_after_key = model_args.split(key)[1]
return args_after_key.split(",")[0]
# order does matter, e.g. peft and delta are provided together with pretrained
prefixes = ["peft=", "delta=", "pretrained=", "model=", "path=", "engine="]
for prefix in prefixes:
if prefix in model_args:
return extract_model_name(model_args, prefix)
return ""
def log_experiment_args(
self,
model_source: str,
model_args: str,
) -> None:
"""Logs model parameters and job ID."""
self.model_source = model_source
self.model_name = GeneralConfigTracker._get_model_name(model_args)
self.model_name_sanitized = re.sub(
r"[\"<>:/\|\\?\*\[\]]+", "__", self.model_name
)
def log_end_time(self) -> None:
"""Logs the end time of the evaluation and calculates the total evaluation time."""
self.end_time = time.perf_counter()
self.total_evaluation_time_seconds = str(self.end_time - self.start_time)
class EvaluationTracker:
"""
Keeps track and saves relevant information of the evaluation process.
Compiles the data from trackers and writes it to files, which can be published to the Hugging Face hub if requested.
"""
def __init__(
self,
output_path: str = None,
hub_results_org: str = "",
hub_repo_name: str = "",
push_results_to_hub: bool = False,
push_samples_to_hub: bool = False,
public_repo: bool = False,
token: str = "",
) -> None:
"""
Creates all the necessary loggers for evaluation tracking.
Args:
output_path (str): Path to save the results. If not provided, the results won't be saved.
hub_results_org (str): The Hugging Face organisation to push the results to. If not provided, the results won't be pushed.
hub_repo_name (str): The name of the Hugging Face repository to push the results to. If not provided, the results will be pushed to `lm-eval-results`.
push_results_to_hub (bool): Whether to push the results to the Hugging Face hub.
push_samples_to_hub (bool): Whether to push the samples to the Hugging Face hub.
public_repo (bool): Whether to push the results to a public or private repository.
token (str): Token to use when pushing to the Hugging Face hub. This token should have write access to `hub_results_org`.
"""
self.general_config_tracker = GeneralConfigTracker()
self.output_path = output_path
self.hub_results_org = hub_results_org
hub_repo_name = hub_repo_name if hub_repo_name else "lm-eval-results"
self.hub_results_repo = f"{hub_results_org}/{hub_repo_name}"
self.hub_results_repo_private = f"{hub_results_org}/{hub_repo_name}-private"
self.push_results_to_hub = push_results_to_hub
self.push_samples_to_hub = push_samples_to_hub
self.public_repo = public_repo
self.api = HfApi(token=token) if token else None
def save_results_aggregated(
self,
results: dict,
samples: dict,
) -> None:
"""
Saves the aggregated results and samples to the output path and pushes them to the Hugging Face hub if requested.
Args:
results (dict): The aggregated results to save.
samples (dict): The samples results to save.
"""
self.general_config_tracker.log_end_time()
if self.output_path:
try:
eval_logger.info("Saving results aggregated")
# calculate cumulative hash for each task - only if samples are provided
task_hashes = {}
if samples:
for task_name, task_samples in samples.items():
sample_hashes = [
s["doc_hash"] + s["prompt_hash"] + s["target_hash"]
for s in task_samples
]
task_hashes[task_name] = hash_string("".join(sample_hashes))
# update initial results dict
results.update({"task_hashes": task_hashes})
results.update(asdict(self.general_config_tracker))
dumped = json.dumps(
results,
indent=2,
default=handle_non_serializable,
ensure_ascii=False,
)
path = Path(self.output_path if self.output_path else Path.cwd())
path = path.joinpath(self.general_config_tracker.model_name_sanitized)
path.mkdir(parents=True, exist_ok=True)
self.date_id = datetime.now().isoformat().replace(":", "-")
file_results_aggregated = path.joinpath(f"results_{self.date_id}.json")
file_results_aggregated.open("w", encoding="utf-8").write(dumped)
if self.api and self.push_results_to_hub:
self.api.create_repo(
repo_id=self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
repo_type="dataset",
private=not self.public_repo,
exist_ok=True,
)
self.api.upload_folder(
repo_id=self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
folder_path=str(path),
path_in_repo=self.general_config_tracker.model_name_sanitized,
repo_type="dataset",
commit_message=f"Adding aggregated results for {self.general_config_tracker.model_name}",
)
except Exception as e:
eval_logger.warning("Could not save results aggregated")
eval_logger.info(repr(e))
else:
eval_logger.info(
"Output path not provided, skipping saving results aggregated"
)
def save_results_samples(
self,
task_name: str,
samples: dict,
) -> None:
"""
Saves the samples results to the output path and pushes them to the Hugging Face hub if requested.
Args:
task_name (str): The task name to save the samples for.
samples (dict): The samples results to save.
"""
if self.output_path:
try:
eval_logger.info("Saving samples results")
samples_dumped = json.dumps(
samples,
indent=2,
default=handle_non_serializable,
ensure_ascii=False,
)
path = Path(self.output_path if self.output_path else Path.cwd())
path = path.joinpath(self.general_config_tracker.model_name_sanitized)
path.mkdir(parents=True, exist_ok=True)
file_results_samples = path.joinpath(
f"samples_{task_name}_{self.date_id}.json"
)
file_results_samples.write_text(samples_dumped, encoding="utf-8")
if self.api and self.push_samples_to_hub:
self.api.create_repo(
self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
repo_type="dataset",
private=not self.public_repo,
exist_ok=True,
)
self.api.upload_folder(
repo_id=self.hub_results_repo
if self.public_repo
else self.hub_results_repo_private,
folder_path=str(path),
path_in_repo=self.general_config_tracker.model_name_sanitized,
repo_type="dataset",
commit_message=f"Adding samples results for {task_name} to {self.general_config_tracker.model_name}",
)
except Exception as e:
eval_logger.warning("Could not save sample results")
eval_logger.info(repr(e))
else:
eval_logger.info("Output path not provided, skipping saving sample results")
import logging
import os
import re
import subprocess
from pathlib import Path
from typing import Any, Dict, Optional, Tuple, Union
import numpy as np
from torch.utils.collect_env import get_pretty_env_info
from transformers import __version__ as trans_version
logger = logging.getLogger(__name__)
def remove_none_pattern(input_string: str) -> Tuple[str, bool]:
"""Remove the ',none' substring from the input_string if it exists at the end.
Args:
input_string (str): The input string from which to remove the ',none' substring.
Returns:
Tuple[str, bool]: A tuple containing the modified input_string with the ',none' substring removed
and a boolean indicating whether the modification was made (True) or not (False).
"""
# Define the pattern to match ',none' at the end of the string
pattern = re.compile(r",none$")
# Use sub() to replace ',none' with an empty string
result = re.sub(pattern, "", input_string)
# check if the input_string changed
removed = result != input_string
return result, removed
def _handle_non_serializable(o: Any) -> Union[int, str, list]:
"""Handle non-serializable objects by converting them to serializable types.
Args:
o (Any): The object to be handled.
Returns:
Union[int, str, list]: The converted object. If the object is of type np.int64 or np.int32,
it will be converted to int. If the object is of type set, it will be converted
to a list. Otherwise, it will be converted to str.
"""
if isinstance(o, np.int64) or isinstance(o, np.int32):
return int(o)
elif isinstance(o, set):
return list(o)
else:
return str(o)
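# Typical use is as a `json.dumps` fallback, mirroring the calls elsewhere in
# this commit (illustrative; set ordering in the output may vary):
#   json.dumps({"count": np.int64(3), "tags": {"a", "b"}},
#              default=_handle_non_serializable)
#   # -> '{"count": 3, "tags": ["a", "b"]}'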
def get_commit_from_path(repo_path: Union[Path, str]) -> Optional[str]:
try:
git_folder = Path(repo_path, ".git")
if git_folder.is_file():
git_folder = Path(
git_folder.parent,
git_folder.read_text(encoding="utf-8").split("\n")[0].split(" ")[-1],
)
if Path(git_folder, "HEAD").exists():
head_name = (
Path(git_folder, "HEAD")
.read_text(encoding="utf-8")
.split("\n")[0]
.split(" ")[-1]
)
head_ref = Path(git_folder, head_name)
git_hash = head_ref.read_text(encoding="utf-8").replace("\n", "")
else:
git_hash = None
except Exception as err:
logger.debug(
f"Failed to retrieve a Git commit hash from path: {str(repo_path)}. Error: {err}"
)
return None
return git_hash
def get_git_commit_hash():
"""
Gets the git commit hash of your current repo (if it exists).
Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42
"""
try:
git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
git_hash = git_hash.decode()
except (subprocess.CalledProcessError, FileNotFoundError):
# FileNotFoundError occurs when git not installed on system
git_hash = get_commit_from_path(os.getcwd()) # git hash of repo if exists
return git_hash
def add_env_info(storage: Dict[str, Any]):
try:
pretty_env_info = get_pretty_env_info()
except Exception as err:
pretty_env_info = str(err)
transformers_version = trans_version
upper_dir_commit = get_commit_from_path(
Path(os.getcwd(), "..")
) # git hash of upper repo if exists
added_info = {
"pretty_env_info": pretty_env_info,
"transformers_version": transformers_version,
"upper_git_hash": upper_dir_commit, # in case this repo is submodule
}
storage.update(added_info)
import copy
import json
import logging
import os
import re
import subprocess
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Tuple, Union
from typing import Any, Dict, List, Literal, Tuple
import numpy as np
import pandas as pd
from packaging.version import Version
from torch.utils.collect_env import get_pretty_env_info
from transformers import __version__ as trans_version
logger = logging.getLogger(__name__)
def remove_none_pattern(input_string: str) -> Tuple[str, bool]:
"""Remove the ',none' substring from the input_string if it exists at the end.
Args:
input_string (str): The input string from which to remove the ',none' substring.
Returns:
Tuple[str, bool]: A tuple containing the modified input_string with the ',none' substring removed
and a boolean indicating whether the modification was made (True) or not (False).
"""
# Define the pattern to match ',none' at the end of the string
pattern = re.compile(r",none$")
# Use sub() to replace ',none' with an empty string
result = re.sub(pattern, "", input_string)
# check if the input_string changed
removed = result != input_string
return result, removed
from lm_eval.logging.utils import _handle_non_serializable, remove_none_pattern
def _handle_non_serializable(o: Any) -> Union[int, str, list]:
"""Handle non-serializable objects by converting them to serializable types.
Args:
o (Any): The object to be handled.
Returns:
Union[int, str, list]: The converted object. If the object is of type np.int64 or np.int32,
it will be converted to int. If the object is of type set, it will be converted
to a list. Otherwise, it will be converted to str.
"""
if isinstance(o, np.int64) or isinstance(o, np.int32):
return int(o)
elif isinstance(o, set):
return list(o)
else:
return str(o)
logger = logging.getLogger(__name__)
def get_wandb_printer() -> Literal["Printer"]:
......@@ -395,61 +350,3 @@ class WandbLogger:
self._log_samples_as_artifact(eval_preds, task_name)
self.run.log({f"{group}_eval_results": grouped_df})
def get_commit_from_path(repo_path: Union[Path, str]) -> Optional[str]:
try:
git_folder = Path(repo_path, ".git")
if git_folder.is_file():
git_folder = Path(
git_folder.parent,
git_folder.read_text(encoding="utf-8").split("\n")[0].split(" ")[-1],
)
if Path(git_folder, "HEAD").exists():
head_name = (
Path(git_folder, "HEAD")
.read_text(encoding="utf-8")
.split("\n")[0]
.split(" ")[-1]
)
head_ref = Path(git_folder, head_name)
git_hash = head_ref.read_text(encoding="utf-8").replace("\n", "")
else:
git_hash = None
except Exception as err:
logger.debug(
f"Failed to retrieve a Git commit hash from path: {str(repo_path)}. Error: {err}"
)
return None
return git_hash
def get_git_commit_hash():
"""
Gets the git commit hash of your current repo (if it exists).
Source: https://github.com/EleutherAI/gpt-neox/blob/b608043be541602170bfcfb8ec9bf85e8a0799e0/megatron/neox_arguments/neox_args.py#L42
"""
try:
git_hash = subprocess.check_output(["git", "describe", "--always"]).strip()
git_hash = git_hash.decode()
except (subprocess.CalledProcessError, FileNotFoundError):
# FileNotFoundError occurs when git not installed on system
git_hash = get_commit_from_path(os.getcwd()) # git hash of repo if exists
return git_hash
def add_env_info(storage: Dict[str, Any]):
try:
pretty_env_info = get_pretty_env_info()
except Exception as err:
pretty_env_info = str(err)
transformers_version = trans_version
upper_dir_commit = get_commit_from_path(
Path(os.getcwd(), "..")
) # git hash of upper repo if exists
added_info = {
"pretty_env_info": pretty_env_info,
"transformers_version": transformers_version,
"upper_git_hash": upper_dir_commit, # in case this repo is submodule
}
storage.update(added_info)
......@@ -13,6 +13,7 @@ from accelerate import (
InitProcessGroupKwargs,
find_executable_batch_size,
)
from huggingface_hub import HfApi
from packaging import version
from peft import PeftModel
from peft import __version__ as PEFT_VERSION
......@@ -77,7 +78,7 @@ class HFLM(TemplateLM):
def __init__(
self,
pretrained: Optional[Union[str, transformers.PreTrainedModel]] = "gpt2",
pretrained: Union[str, transformers.PreTrainedModel],
backend: Optional[Literal["default", "causal", "seq2seq"]] = "default",
# override whether the model should be treated as decoder-only (causal) or encoder-decoder (seq2seq)
revision: Optional[str] = "main",
......@@ -278,7 +279,10 @@ class HFLM(TemplateLM):
)
self._max_length = max_length
self.pretrained = pretrained
self.delta = delta
self.peft = peft
self.revision = revision
self.batch_schedule = 1
self.batch_sizes = {}
self.max_batch_size = max_batch_size
......@@ -663,6 +667,8 @@ class HFLM(TemplateLM):
max_cont_enc = len(continuation_enc[-(self.max_length + 1) :])
else:
max_length = self.max_length
max_context_enc = max_length
max_cont_enc = max_length
# if OOM, then halves batch_size and tries again
@find_executable_batch_size(starting_batch_size=self.max_batch_size)
......@@ -1272,3 +1278,44 @@ class HFLM(TemplateLM):
pbar.close()
return res
def get_model_info(self) -> dict:
"""
Method to get Hugging Face model information for experiment reproducibility.
"""
def get_model_num_params(model) -> int:
if hasattr(model, "num_parameters"):
return model.num_parameters()
if hasattr(model, "parameters"):
return sum(p.numel() for p in model.parameters())
else:
return -1
def get_model_dtype(model) -> str:
if hasattr(model, "dtype"):
return model.dtype
else:
return ""
def get_model_sha(pretrained: str, revision: str) -> str:
try:
model_info = HfApi().model_info(repo_id=pretrained, revision=revision)
return model_info.sha
except Exception as e:
eval_logger.warning(
f"Failed to get model SHA for {pretrained} at revision {revision}. Error: {e}"
)
return ""
model_info = {
"model_num_parameters": get_model_num_params(self._model),
"model_dtype": get_model_dtype(self._model),
"model_revision": self.revision,
"model_sha": get_model_sha(self.pretrained, self.revision),
}
if self.peft:
model_info["peft_sha"] = get_model_sha(self.peft, self.revision)
if self.delta:
model_info["delta_sha"] = get_model_sha(self.delta, self.revision)
return model_info
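# Shape of the returned dict (values are placeholders, not real output):
#   {"model_num_parameters": <int>, "model_dtype": torch.float16,
#    "model_revision": "main", "model_sha": "<commit sha>"}
# plus "peft_sha" / "delta_sha" when peft or delta checkpoints are used.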
......@@ -14,13 +14,11 @@ from lm_eval.models.utils import retry_on_specific_exceptions
from lm_eval.utils import eval_logger
def get_result(response, ctxlen: int) -> Tuple[float, bool]:
def get_result(response) -> Tuple[float, bool]:
"""Process results from OpenAI API response.
:param response: dict
OpenAI API Response
:param ctxlen: int
Length of context (so we can slice them away and only keep the predictions)
:return:
continuation_logprobs: np.array
Log probabilities of continuation tokens
......@@ -29,9 +27,9 @@ def get_result(response, ctxlen: int) -> Tuple[float, bool]:
"""
is_greedy = True
logprobs = response.logprobs.token_logprobs
continuation_logprobs = sum(logprobs[ctxlen:])
continuation_logprobs = sum(logprobs)
for i in range(ctxlen, len(response.logprobs.token_logprobs)):
for i in range(len(response.logprobs.token_logprobs)):
token = response.logprobs.token_logprobs[i]
top_tokens = response.logprobs.top_logprobs[i]
top_token = max(top_tokens.keys(), key=lambda x: top_tokens[x])
......@@ -212,7 +210,6 @@ class OpenaiCompletionsLM(TemplateLM):
client=self.client,
model=self.model,
prompt=inps,
echo=True,
max_tokens=0,
temperature=0.0,
logprobs=10,
......@@ -222,7 +219,7 @@ class OpenaiCompletionsLM(TemplateLM):
for resp, ctxlen, (cache_key, context_enc, continuation_enc) in zip(
response.choices, ctxlens, chunk
):
answer = get_result(resp, ctxlen)
answer = get_result(resp)
res.append(answer)
......@@ -433,7 +430,7 @@ class OpenaiChatCompletionsLM(LM):
if "until" in kwargs.keys():
until = kwargs.pop("until")
if isinstance(until, str):
until = [kwargs]
until = [until]
elif not isinstance(until, list):
raise ValueError(
f"Expected repr(kwargs['until']) to be of type Union[str, list] but got {until}"
......