Unverified Commit 1ba35e62 authored by Lintang Sutawika, committed by GitHub

Logging (#2203)



* changed source of eval_logger

* allow eval_logger to be set from args

* removed verbosity arg from non-main methods

* fix logging

* pre-commit

* set verbosity in eval logger

* replace utils.eval_logger

* fix logging in main

* add logging to docs

* add logging message

* nit

* add logging to docs

* refactor setup_logging to utils

---------
Co-authored-by: Baber <baber@hey.com>
parent 358adaf7
......@@ -82,8 +82,10 @@ We also support using the library's external API for use within model training l
```python
import lm_eval
from lm_eval.utils import setup_logging
...
# initialize logging
setup_logging("DEBUG") # optional, but recommended; or you can set up logging yourself
my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code)
...
# instantiate an LM subclass that takes your initialized model and can run
......
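A hedged continuation of the snippet above (the `HFLM` wrapper and the task name are illustrative choices, not prescribed by the docs excerpt):

```python
from lm_eval.models.huggingface import HFLM

lm_obj = HFLM(pretrained=my_model, batch_size=16)  # wrap the in-memory model
results = lm_eval.simple_evaluate(model=lm_obj, tasks=["hellaswag"])
```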
......@@ -37,7 +37,8 @@ and rename the folders and YAML file(s) as desired.
All data downloading and management is handled through the HuggingFace (**HF**) [`datasets`](https://github.com/huggingface/datasets) API. So, the first thing you should do is check whether your task's dataset is already provided in their catalog [here](https://huggingface.co/datasets). If it's not there, please consider adding it to their Hub to make it accessible to a wider user base by following their [new dataset guide](https://github.com/huggingface/datasets/blob/main/ADD_NEW_DATASET.md).
> [!TIP]
> To test your task, we recommend enabling verbose logging by running `export LOGLEVEL=DEBUG` in your shell before running the evaluation script. This will help you debug any issues that may arise.
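> For example (`my_new_task` is a placeholder for your task's name):
>
> ```sh
> export LOGLEVEL=DEBUG
> lm_eval --model hf --model_args pretrained=EleutherAI/pythia-160m --tasks my_new_task --limit 10
> ```
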
Once you have a HuggingFace dataset prepared for your task, we want to point our new YAML at this dataset:
```yaml
......
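A minimal sketch of the YAML fields involved (the dataset path is a placeholder; see the rest of the task guide for the full schema):

```yaml
task: my_new_task
dataset_path: username/my-dataset  # HF Hub identifier
dataset_name: null                 # subset/config name, if the dataset has one
```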
......@@ -79,48 +79,48 @@
" Switched to a new branch 'big-refactor'\n",
" Branch 'big-refactor' set up to track remote branch 'big-refactor' from 'origin'.\n",
" Resolved https://github.com/EleutherAI/lm-evaluation-harness.git to commit 42f486ee49b65926a444cb0620870a39a5b4b0a8\n",
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
" Installing build dependencies ... \u001B[?25l\u001B[?25hdone\n",
" Getting requirements to build wheel ... \u001B[?25l\u001B[?25hdone\n",
" Preparing metadata (pyproject.toml) ... \u001B[?25l\u001B[?25hdone\n",
"Collecting accelerate>=0.21.0 (from lm-eval==1.0.0)\n",
" Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m261.4/261.4 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting evaluate (from lm-eval==1.0.0)\n",
"\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m261.4/261.4 kB\u001B[0m \u001B[31m4.1 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001B[?25hCollecting evaluate (from lm-eval==1.0.0)\n",
" Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.1/84.1 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting datasets>=2.0.0 (from lm-eval==1.0.0)\n",
"\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m84.1/84.1 kB\u001B[0m \u001B[31m5.9 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001B[?25hCollecting datasets>=2.0.0 (from lm-eval==1.0.0)\n",
" Downloading datasets-2.15.0-py3-none-any.whl (521 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m521.2/521.2 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting jsonlines (from lm-eval==1.0.0)\n",
"\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m521.2/521.2 kB\u001B[0m \u001B[31m9.5 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001B[?25hCollecting jsonlines (from lm-eval==1.0.0)\n",
" Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)\n",
"Requirement already satisfied: numexpr in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (2.8.7)\n",
"Collecting peft>=0.2.0 (from lm-eval==1.0.0)\n",
" Downloading peft-0.6.2-py3-none-any.whl (174 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m174.7/174.7 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting pybind11>=2.6.2 (from lm-eval==1.0.0)\n",
"\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m174.7/174.7 kB\u001B[0m \u001B[31m7.2 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001B[?25hCollecting pybind11>=2.6.2 (from lm-eval==1.0.0)\n",
" Downloading pybind11-2.11.1-py3-none-any.whl (227 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m227.7/227.7 kB\u001b[0m \u001b[31m12.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting pytablewriter (from lm-eval==1.0.0)\n",
"\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m227.7/227.7 kB\u001B[0m \u001B[31m12.9 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001B[?25hCollecting pytablewriter (from lm-eval==1.0.0)\n",
" Downloading pytablewriter-1.2.0-py3-none-any.whl (111 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m111.1/111.1 kB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hCollecting rouge-score>=0.0.4 (from lm-eval==1.0.0)\n",
"\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m111.1/111.1 kB\u001B[0m \u001B[31m8.3 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001B[?25hCollecting rouge-score>=0.0.4 (from lm-eval==1.0.0)\n",
" Downloading rouge_score-0.1.2.tar.gz (17 kB)\n",
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Preparing metadata (setup.py) ... \u001B[?25l\u001B[?25hdone\n",
"Collecting sacrebleu>=1.5.0 (from lm-eval==1.0.0)\n",
" Downloading sacrebleu-2.3.2-py3-none-any.whl (119 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m119.7/119.7 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: scikit-learn>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (1.2.2)\n",
"\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m119.7/119.7 kB\u001B[0m \u001B[31m8.7 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001B[?25hRequirement already satisfied: scikit-learn>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (1.2.2)\n",
"Collecting sqlitedict (from lm-eval==1.0.0)\n",
" Downloading sqlitedict-2.1.0.tar.gz (21 kB)\n",
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Preparing metadata (setup.py) ... \u001B[?25l\u001B[?25hdone\n",
"Requirement already satisfied: torch>=1.8 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (2.1.0+cu118)\n",
"Collecting tqdm-multiprocess (from lm-eval==1.0.0)\n",
" Downloading tqdm_multiprocess-0.0.11-py3-none-any.whl (9.8 kB)\n",
"Requirement already satisfied: transformers>=4.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (4.35.2)\n",
"Collecting zstandard (from lm-eval==1.0.0)\n",
" Downloading zstandard-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m29.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (1.23.5)\n",
"\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m5.4/5.4 MB\u001B[0m \u001B[31m29.2 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001B[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (1.23.5)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (23.2)\n",
"Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (5.9.5)\n",
"Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (6.0.1)\n",
......@@ -130,15 +130,15 @@
" Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n",
"Collecting dill<0.3.8,>=0.3.0 (from datasets>=2.0.0->lm-eval==1.0.0)\n",
" Downloading dill-0.3.7-py3-none-any.whl (115 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (1.5.3)\n",
"\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m115.3/115.3 kB\u001B[0m \u001B[31m14.4 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001B[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (1.5.3)\n",
"Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2.31.0)\n",
"Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (4.66.1)\n",
"Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (3.4.1)\n",
"Collecting multiprocess (from datasets>=2.0.0->lm-eval==1.0.0)\n",
" Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m19.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25hRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2023.6.0)\n",
"\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m134.8/134.8 kB\u001B[0m \u001B[31m19.9 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001B[?25hRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2023.6.0)\n",
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (3.8.6)\n",
"Collecting responses<0.19 (from evaluate->lm-eval==1.0.0)\n",
" Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n",
......@@ -193,13 +193,13 @@
"Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->rouge-score>=0.0.4->lm-eval==1.0.0) (8.1.7)\n",
"Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.8->lm-eval==1.0.0) (1.3.0)\n",
"Building wheels for collected packages: lm-eval, rouge-score, sqlitedict\n",
" Building wheel for lm-eval (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
" Building wheel for lm-eval (pyproject.toml) ... \u001B[?25l\u001B[?25hdone\n",
" Created wheel for lm-eval: filename=lm_eval-1.0.0-py3-none-any.whl size=994254 sha256=88356155b19f2891981ecef948326ad6ce8ca40a6009378410ec20d0e225995a\n",
" Stored in directory: /tmp/pip-ephem-wheel-cache-9v6ye7h3/wheels/17/01/26/599c0779e9858a70a73fa8a306699b5b9a868f820c225457b0\n",
" Building wheel for rouge-score (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Building wheel for rouge-score (setup.py) ... \u001B[?25l\u001B[?25hdone\n",
" Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=6bb0d44e4881972c43ce194e7cb65233d309758cb15f0dec54590d3d2efcfc36\n",
" Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4\n",
" Building wheel for sqlitedict (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Building wheel for sqlitedict (setup.py) ... \u001B[?25l\u001B[?25hdone\n",
" Created wheel for sqlitedict: filename=sqlitedict-2.1.0-py3-none-any.whl size=16863 sha256=5747f7dd73ddf3d8fbcebf51b5e4f718fabe1e94bccdf16d2f22a2e65ee7fdf4\n",
" Stored in directory: /root/.cache/pip/wheels/79/d6/e7/304e0e6cb2221022c26d8161f7c23cd4f259a9e41e8bbcfabd\n",
"Successfully built lm-eval rouge-score sqlitedict\n",
......@@ -361,6 +361,7 @@
}
],
"source": [
"%env LOGLEVEL=DEBUG\n",
"!lm_eval \\\n",
" --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
......@@ -462,6 +463,7 @@
],
"source": [
"# !accelerate launch --no_python\n",
"%env LOGLEVEL=DEBUG\n",
"!lm_eval \\\n",
" --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
......@@ -561,6 +563,7 @@
],
"source": [
"# !accelerate launch --no_python\n",
"%env LOGLEVEL=DEBUG\n",
"!lm_eval \\\n",
" --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
......@@ -637,6 +640,7 @@
],
"source": [
"# !accelerate launch --no_python\n",
"%env LOGLEVEL=DEBUG\n",
"!lm_eval \\\n",
" --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
......
import logging
import os
from .evaluator import evaluate, simple_evaluate
......@@ -213,9 +213,9 @@ def setup_parser() -> argparse.ArgumentParser:
"--verbosity",
"-v",
type=str.upper,
default="INFO",
default=None,
metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG",
help="Controls the reported logging error level. Set to DEBUG when testing + adding new task configurations for comprehensive log output.",
help="(Deprecated) Controls logging verbosity level. Use the `LOGLEVEL` environment variable instead. Set to DEBUG for detailed output when testing or adding new task configurations.",
)
parser.add_argument(
"--wandb_args",
......@@ -279,9 +279,8 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
if args.wandb_args:
wandb_logger = WandbLogger(**simple_parse_args_string(args.wandb_args))
eval_logger = utils.eval_logger
eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
eval_logger.info(f"Verbosity set to {args.verbosity}")
utils.setup_logging(args.verbosity)
eval_logger = logging.getLogger(__name__)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# update the evaluation tracker args with the output path and the HF token
......@@ -306,7 +305,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
if args.include_path is not None:
eval_logger.info(f"Including path: {args.include_path}")
task_manager = TaskManager(args.verbosity, include_path=args.include_path)
task_manager = TaskManager(include_path=args.include_path)
if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples:
eval_logger.warning(
......@@ -377,8 +376,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
args.model_args = args.model_args + ",trust_remote_code=True"
eval_logger.info(f"Selected Tasks: {task_names}")
eval_logger.info(
f"Selected Tasks: {task_names}"
) if eval_logger.level >= logging.INFO else print(f"Selected Tasks: {task_names}")
request_caching_args = request_caching_arg_to_dict(
cache_requests=args.cache_requests
......@@ -403,7 +404,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
fewshot_as_multiturn=args.fewshot_as_multiturn,
gen_kwargs=args.gen_kwargs,
task_manager=task_manager,
verbosity=args.verbosity,
predict_only=args.predict_only,
random_seed=args.seed[0],
numpy_random_seed=args.seed[1],
......
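Taken together, `__main__.py` now delegates log configuration to `utils.setup_logging`. A minimal sketch of what such a helper might look like, assuming an explicit `--verbosity` value takes precedence over the `LOGLEVEL` environment variable (the actual implementation lives in `lm_eval/utils.py`):

```python
import logging
import os


def setup_logging(verbosity=None):
    # an explicit verbosity argument wins; otherwise fall back to the
    # LOGLEVEL environment variable, then to INFO
    level = (verbosity or os.environ.get("LOGLEVEL", "INFO")).upper()
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        level=getattr(logging, level, logging.INFO),
    )
```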
......@@ -12,7 +12,7 @@ import sacrebleu
from lm_eval.api.registry import register_aggregation, register_metric
eval_logger = logging.getLogger("lm-eval")
eval_logger = logging.getLogger(__name__)
# Register Aggregations First
......
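This one-line change recurs across the codebase: replacing the shared `"lm-eval"` logger with `logging.getLogger(__name__)` puts each module's logger under the `lm_eval.*` hierarchy, so verbosity can be tuned for the whole package or per module. An illustrative snippet (the module picked here is just an example):

```python
import logging

# keep the package at INFO, but silence one chatty module
logging.getLogger("lm_eval").setLevel(logging.INFO)
logging.getLogger("lm_eval.models.api_models").setLevel(logging.WARNING)
```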
......@@ -12,7 +12,7 @@ from tqdm import tqdm
from lm_eval import utils
eval_logger = logging.getLogger("lm-eval")
eval_logger = logging.getLogger(__name__)
T = TypeVar("T", bound="LM")
......
......@@ -6,7 +6,7 @@ import evaluate as hf_evaluate
from lm_eval.api.model import LM
eval_logger = logging.getLogger("lm-eval")
eval_logger = logging.getLogger(__name__)
MODEL_REGISTRY = {}
......
......@@ -48,7 +48,7 @@ ALL_OUTPUT_TYPES = [
"generate_until",
]
eval_logger = logging.getLogger("lm-eval")
eval_logger = logging.getLogger(__name__)
@dataclass
......
import hashlib
import logging
import os
import dill
from lm_eval.utils import eval_logger
eval_logger = logging.getLogger(__name__)
MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
......
......@@ -31,7 +31,6 @@ from lm_eval.tasks import (
get_task_dict,
)
from lm_eval.utils import (
eval_logger,
handle_non_serializable,
hash_string,
positional_deprecated,
......@@ -43,6 +42,8 @@ if TYPE_CHECKING:
from lm_eval.api.model import LM
from lm_eval.api.task import Task
eval_logger = logging.getLogger(__name__)
@positional_deprecated
def simple_evaluate(
......@@ -68,7 +69,7 @@ def simple_evaluate(
fewshot_as_multiturn: bool = False,
gen_kwargs: Optional[str] = None,
task_manager: Optional[TaskManager] = None,
verbosity: str = "INFO",
verbosity: Optional[str] = None,
predict_only: bool = False,
random_seed: int = 0,
numpy_random_seed: int = 1234,
......@@ -123,6 +124,8 @@ def simple_evaluate(
:param gen_kwargs: str
String arguments for model generation
Ignored for all tasks with loglikelihood output_type
:param verbosity: str
Verbosity level for logging
:param predict_only: bool
If true only model outputs will be generated and returned. Metrics will not be evaluated
:param random_seed: int
......@@ -137,7 +140,8 @@ def simple_evaluate(
:return
Dictionary of results
"""
eval_logger.setLevel(getattr(logging, f"{verbosity}"))
if verbosity is not None:
lm_eval.setup_logging(verbosity=verbosity)
start_date = time.time()
if delete_requests_cache:
......@@ -231,7 +235,7 @@ def simple_evaluate(
)
if task_manager is None:
task_manager = TaskManager(verbosity)
task_manager = TaskManager()
task_dict = get_task_dict(tasks, task_manager)
......@@ -313,9 +317,11 @@ def simple_evaluate(
system_instruction=system_instruction,
apply_chat_template=apply_chat_template,
fewshot_as_multiturn=fewshot_as_multiturn,
verbosity=verbosity,
confirm_run_unsafe_code=confirm_run_unsafe_code,
)
if verbosity is not None:
lm_eval.setup_logging(verbosity=verbosity)
if lm.rank == 0:
if isinstance(model, str):
......@@ -411,8 +417,6 @@ def evaluate(
Dictionary of results
"""
eval_logger.setLevel(getattr(logging, f"{verbosity}"))
if apply_chat_template:
eval_logger.warning(
"Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details."
......
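For library users, the net effect is that `simple_evaluate` no longer forces a log level itself; passing `verbosity` still works during the deprecation window because it is routed through `setup_logging`. A sketch (model and task choices are illustrative):

```python
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["lambada_openai"],
    limit=10,
    verbosity="DEBUG",  # deprecated; prefer setup_logging() or LOGLEVEL
)
```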
import collections
import logging
import math
import pathlib
import sys
......@@ -12,7 +13,10 @@ from lm_eval.api.metrics import (
stderr_for_metric,
)
from lm_eval.api.task import Task
from lm_eval.utils import eval_logger, positional_deprecated
from lm_eval.utils import positional_deprecated
eval_logger = logging.getLogger(__name__)
class TaskOutput:
......
import json
import logging
import os
import re
import time
......@@ -18,7 +19,6 @@ from huggingface_hub import (
from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status
from lm_eval.utils import (
eval_logger,
get_file_datetime,
get_file_task_name,
get_results_filenames,
......@@ -31,6 +31,9 @@ from lm_eval.utils import (
)
eval_logger = logging.getLogger(__name__)
@dataclass(init=False)
class GeneralConfigTracker:
"""
......
import logging
import os
from functools import cached_property
from typing import Any, Dict, List, Tuple, Union
from tqdm import tqdm
from lm_eval import utils
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.openai_completions import LocalCompletionsAPI
from lm_eval.models.utils import handle_stop_sequences, retry_on_specific_exceptions
eval_logger = utils.eval_logger
eval_logger = logging.getLogger(__name__)
def anthropic_completion(
......
......@@ -3,6 +3,7 @@ import asyncio
import copy
import itertools
import json
import logging
from functools import cached_property
from typing import (
Any,
......@@ -37,6 +38,8 @@ from lm_eval.api.model import TemplateLM
from lm_eval.models.utils import Collator, chunks, configure_pad_token
eval_logger = logging.getLogger(__name__)
LogLikelihoodInputs = Tuple[Tuple[str, str], List[int], List[int]]
......@@ -48,9 +51,6 @@ class JsonChatStr(NamedTuple):
return self.prompt.encode(encoding)
eval_logger = utils.eval_logger
class TemplateAPI(TemplateLM):
def __init__(
self,
......
import copy
import logging
from typing import Dict, List, Optional, Tuple, Union
import torch
......@@ -7,7 +8,6 @@ import transformers
from tqdm import tqdm
from transformers import BatchEncoding
from lm_eval import utils
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
......@@ -24,7 +24,7 @@ from lm_eval.models.utils import (
DEFAULT_IMAGE_PLACEHOLDER = "<image>"
eval_logger = utils.eval_logger
eval_logger = logging.getLogger(__name__)
@register_model("hf-multimodal")
......
import copy
import logging
import os
from datetime import timedelta
from pathlib import Path
......@@ -39,7 +40,7 @@ from lm_eval.models.utils import (
)
eval_logger = utils.eval_logger
eval_logger = logging.getLogger(__name__)
@register_model("hf-auto", "hf", "huggingface")
......
import copy
import json
import logging
import os
from functools import lru_cache
from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Type, cast
......@@ -10,7 +11,10 @@ from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.api_models import JsonChatStr
from lm_eval.utils import eval_logger, simple_parse_args_string
from lm_eval.utils import simple_parse_args_string
eval_logger = logging.getLogger(__name__)
class LogLikelihoodResult(NamedTuple):
......
......@@ -13,6 +13,7 @@
# limitations under the License.
import importlib
import logging
import pathlib
from copy import deepcopy
from typing import List, Literal
......@@ -27,13 +28,15 @@ from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import Collator
from lm_eval.utils import (
eval_logger,
get_rolling_token_windows,
make_disjoint_window,
simple_parse_args_string,
)
eval_logger = logging.getLogger(__name__)
def _patch_pretrained_cfg(
pretrained_cfg, trainer, tensor_model_parallel_size, pipeline_model_parallel_size
):
......
import copy
import logging
from typing import List, Optional, Tuple, Union
import numpy
......@@ -13,7 +14,7 @@ from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
eval_logger = utils.eval_logger
eval_logger = logging.getLogger(__name__)
@register_model("sparseml")
......