Unverified Commit 1ba35e62 authored by Lintang Sutawika, committed by GitHub

Logging (#2203)



* changed source of eval_logger

* allow eval_logger to be set from args

* removed verbosity arg from non-main methods

* fix logging

* pre-commit

* set verbosity in eval logger

* replace utils.eval_logger

* fix logging in main

* add logging to docs

* add logging message

* nit

* add logging to docs

* refactor setup_logging to utils

---------
Co-authored-by: Baber <baber@hey.com>
parent 358adaf7
...@@ -82,8 +82,10 @@ We also support using the library's external API for use within model training l ...@@ -82,8 +82,10 @@ We also support using the library's external API for use within model training l
```python ```python
import lm_eval import lm_eval
from lm_eval.utils import setup_logging
... ...
# initialize logging
setup_logging("DEBUG") # optional, but recommended; or you can set up logging yourself
my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code) my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code)
... ...
# instantiate an LM subclass that takes your initialized model and can run # instantiate an LM subclass that takes your initialized model and can run
......
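Taken together with the new `setup_logging` import, the snippet above might be completed as follows. This is a hedged sketch, not part of the diff: the `HFLM` wrapper and the `hellaswag` task are illustrative assumptions, and `initialize_my_model` is the docs' own placeholder.

```python
import lm_eval
from lm_eval.models.huggingface import HFLM
from lm_eval.utils import setup_logging

setup_logging("DEBUG")  # optional, but recommended; or set up logging yourself

my_model = initialize_my_model()  # placeholder from the docs above

# Wrap the trained model in an LM subclass so the harness can query it
# (HFLM shown here as one plausible choice for a transformers model).
lm_obj = HFLM(pretrained=my_model)

results = lm_eval.simple_evaluate(
    model=lm_obj,
    tasks=["hellaswag"],  # illustrative task choice
)
```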
...@@ -37,7 +37,8 @@ and rename the folders and YAML file(s) as desired. ...@@ -37,7 +37,8 @@ and rename the folders and YAML file(s) as desired.
All data downloading and management is handled through the HuggingFace (**HF**) [`datasets`](https://github.com/huggingface/datasets) API. So, the first thing you should do is check to see if your task's dataset is already provided in their catalog [here](https://huggingface.co/datasets). If it's not in there, please consider adding it to their Hub to make it accessible to a wider user base by following their [new dataset guide](https://github.com/huggingface/datasets/blob/main/ADD_NEW_DATASET.md) All data downloading and management is handled through the HuggingFace (**HF**) [`datasets`](https://github.com/huggingface/datasets) API. So, the first thing you should do is check to see if your task's dataset is already provided in their catalog [here](https://huggingface.co/datasets). If it's not in there, please consider adding it to their Hub to make it accessible to a wider user base by following their [new dataset guide](https://github.com/huggingface/datasets/blob/main/ADD_NEW_DATASET.md)
. .
> [!TIP]
> When testing your task, we recommend enabling verbose logging by running `export LOGLEVEL=DEBUG` in your shell before launching the evaluation script. This will help you debug any issues that arise.
Once you have a HuggingFace dataset prepared for your task, we want to assign our new YAML to use this dataset: Once you have a HuggingFace dataset prepared for your task, we want to assign our new YAML to use this dataset:
```yaml ```yaml
......
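The diff does not show the body of `setup_logging`, but the tip above implies it honors the `LOGLEVEL` environment variable. A minimal sketch of that assumed behavior follows; the precedence order and format string are guesses, not the actual implementation.

```python
import logging
import os
from typing import Optional


def setup_logging(verbosity: Optional[str] = None) -> None:
    # Assumed precedence: explicit argument, then LOGLEVEL, then INFO.
    level = (verbosity or os.environ.get("LOGLEVEL", "INFO")).upper()
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        level=getattr(logging, level, logging.INFO),
    )
```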
...@@ -79,48 +79,48 @@ ...@@ -79,48 +79,48 @@
" Switched to a new branch 'big-refactor'\n", " Switched to a new branch 'big-refactor'\n",
" Branch 'big-refactor' set up to track remote branch 'big-refactor' from 'origin'.\n", " Branch 'big-refactor' set up to track remote branch 'big-refactor' from 'origin'.\n",
" Resolved https://github.com/EleutherAI/lm-evaluation-harness.git to commit 42f486ee49b65926a444cb0620870a39a5b4b0a8\n", " Resolved https://github.com/EleutherAI/lm-evaluation-harness.git to commit 42f486ee49b65926a444cb0620870a39a5b4b0a8\n",
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", " Installing build dependencies ... \u001B[?25l\u001B[?25hdone\n",
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", " Getting requirements to build wheel ... \u001B[?25l\u001B[?25hdone\n",
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (pyproject.toml) ... \u001B[?25l\u001B[?25hdone\n",
"Collecting accelerate>=0.21.0 (from lm-eval==1.0.0)\n", "Collecting accelerate>=0.21.0 (from lm-eval==1.0.0)\n",
" Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)\n", " Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m261.4/261.4 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m261.4/261.4 kB\u001B[0m \u001B[31m4.1 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001b[?25hCollecting evaluate (from lm-eval==1.0.0)\n", "\u001B[?25hCollecting evaluate (from lm-eval==1.0.0)\n",
" Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)\n", " Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.1/84.1 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m84.1/84.1 kB\u001B[0m \u001B[31m5.9 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001b[?25hCollecting datasets>=2.0.0 (from lm-eval==1.0.0)\n", "\u001B[?25hCollecting datasets>=2.0.0 (from lm-eval==1.0.0)\n",
" Downloading datasets-2.15.0-py3-none-any.whl (521 kB)\n", " Downloading datasets-2.15.0-py3-none-any.whl (521 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m521.2/521.2 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m521.2/521.2 kB\u001B[0m \u001B[31m9.5 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001b[?25hCollecting jsonlines (from lm-eval==1.0.0)\n", "\u001B[?25hCollecting jsonlines (from lm-eval==1.0.0)\n",
" Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)\n", " Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)\n",
"Requirement already satisfied: numexpr in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (2.8.7)\n", "Requirement already satisfied: numexpr in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (2.8.7)\n",
"Collecting peft>=0.2.0 (from lm-eval==1.0.0)\n", "Collecting peft>=0.2.0 (from lm-eval==1.0.0)\n",
" Downloading peft-0.6.2-py3-none-any.whl (174 kB)\n", " Downloading peft-0.6.2-py3-none-any.whl (174 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m174.7/174.7 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m174.7/174.7 kB\u001B[0m \u001B[31m7.2 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001b[?25hCollecting pybind11>=2.6.2 (from lm-eval==1.0.0)\n", "\u001B[?25hCollecting pybind11>=2.6.2 (from lm-eval==1.0.0)\n",
" Downloading pybind11-2.11.1-py3-none-any.whl (227 kB)\n", " Downloading pybind11-2.11.1-py3-none-any.whl (227 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m227.7/227.7 kB\u001b[0m \u001b[31m12.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m227.7/227.7 kB\u001B[0m \u001B[31m12.9 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001b[?25hCollecting pytablewriter (from lm-eval==1.0.0)\n", "\u001B[?25hCollecting pytablewriter (from lm-eval==1.0.0)\n",
" Downloading pytablewriter-1.2.0-py3-none-any.whl (111 kB)\n", " Downloading pytablewriter-1.2.0-py3-none-any.whl (111 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m111.1/111.1 kB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m111.1/111.1 kB\u001B[0m \u001B[31m8.3 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001b[?25hCollecting rouge-score>=0.0.4 (from lm-eval==1.0.0)\n", "\u001B[?25hCollecting rouge-score>=0.0.4 (from lm-eval==1.0.0)\n",
" Downloading rouge_score-0.1.2.tar.gz (17 kB)\n", " Downloading rouge_score-0.1.2.tar.gz (17 kB)\n",
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (setup.py) ... \u001B[?25l\u001B[?25hdone\n",
"Collecting sacrebleu>=1.5.0 (from lm-eval==1.0.0)\n", "Collecting sacrebleu>=1.5.0 (from lm-eval==1.0.0)\n",
" Downloading sacrebleu-2.3.2-py3-none-any.whl (119 kB)\n", " Downloading sacrebleu-2.3.2-py3-none-any.whl (119 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m119.7/119.7 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m119.7/119.7 kB\u001B[0m \u001B[31m8.7 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001b[?25hRequirement already satisfied: scikit-learn>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (1.2.2)\n", "\u001B[?25hRequirement already satisfied: scikit-learn>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (1.2.2)\n",
"Collecting sqlitedict (from lm-eval==1.0.0)\n", "Collecting sqlitedict (from lm-eval==1.0.0)\n",
" Downloading sqlitedict-2.1.0.tar.gz (21 kB)\n", " Downloading sqlitedict-2.1.0.tar.gz (21 kB)\n",
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (setup.py) ... \u001B[?25l\u001B[?25hdone\n",
"Requirement already satisfied: torch>=1.8 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (2.1.0+cu118)\n", "Requirement already satisfied: torch>=1.8 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (2.1.0+cu118)\n",
"Collecting tqdm-multiprocess (from lm-eval==1.0.0)\n", "Collecting tqdm-multiprocess (from lm-eval==1.0.0)\n",
" Downloading tqdm_multiprocess-0.0.11-py3-none-any.whl (9.8 kB)\n", " Downloading tqdm_multiprocess-0.0.11-py3-none-any.whl (9.8 kB)\n",
"Requirement already satisfied: transformers>=4.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (4.35.2)\n", "Requirement already satisfied: transformers>=4.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (4.35.2)\n",
"Collecting zstandard (from lm-eval==1.0.0)\n", "Collecting zstandard (from lm-eval==1.0.0)\n",
" Downloading zstandard-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)\n", " Downloading zstandard-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m29.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m5.4/5.4 MB\u001B[0m \u001B[31m29.2 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (1.23.5)\n", "\u001B[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (1.23.5)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (23.2)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (23.2)\n",
"Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (5.9.5)\n", "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (5.9.5)\n",
"Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (6.0.1)\n", "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (6.0.1)\n",
...@@ -130,15 +130,15 @@ ...@@ -130,15 +130,15 @@
" Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n", " Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n",
"Collecting dill<0.3.8,>=0.3.0 (from datasets>=2.0.0->lm-eval==1.0.0)\n", "Collecting dill<0.3.8,>=0.3.0 (from datasets>=2.0.0->lm-eval==1.0.0)\n",
" Downloading dill-0.3.7-py3-none-any.whl (115 kB)\n", " Downloading dill-0.3.7-py3-none-any.whl (115 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m115.3/115.3 kB\u001B[0m \u001B[31m14.4 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (1.5.3)\n", "\u001B[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (1.5.3)\n",
"Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2.31.0)\n", "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2.31.0)\n",
"Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (4.66.1)\n", "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (4.66.1)\n",
"Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (3.4.1)\n", "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (3.4.1)\n",
"Collecting multiprocess (from datasets>=2.0.0->lm-eval==1.0.0)\n", "Collecting multiprocess (from datasets>=2.0.0->lm-eval==1.0.0)\n",
" Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n", " Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m19.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m134.8/134.8 kB\u001B[0m \u001B[31m19.9 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001b[?25hRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2023.6.0)\n", "\u001B[?25hRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2023.6.0)\n",
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (3.8.6)\n", "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (3.8.6)\n",
"Collecting responses<0.19 (from evaluate->lm-eval==1.0.0)\n", "Collecting responses<0.19 (from evaluate->lm-eval==1.0.0)\n",
" Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n", " Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n",
...@@ -193,13 +193,13 @@ ...@@ -193,13 +193,13 @@
"Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->rouge-score>=0.0.4->lm-eval==1.0.0) (8.1.7)\n", "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->rouge-score>=0.0.4->lm-eval==1.0.0) (8.1.7)\n",
"Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.8->lm-eval==1.0.0) (1.3.0)\n", "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.8->lm-eval==1.0.0) (1.3.0)\n",
"Building wheels for collected packages: lm-eval, rouge-score, sqlitedict\n", "Building wheels for collected packages: lm-eval, rouge-score, sqlitedict\n",
" Building wheel for lm-eval (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", " Building wheel for lm-eval (pyproject.toml) ... \u001B[?25l\u001B[?25hdone\n",
" Created wheel for lm-eval: filename=lm_eval-1.0.0-py3-none-any.whl size=994254 sha256=88356155b19f2891981ecef948326ad6ce8ca40a6009378410ec20d0e225995a\n", " Created wheel for lm-eval: filename=lm_eval-1.0.0-py3-none-any.whl size=994254 sha256=88356155b19f2891981ecef948326ad6ce8ca40a6009378410ec20d0e225995a\n",
" Stored in directory: /tmp/pip-ephem-wheel-cache-9v6ye7h3/wheels/17/01/26/599c0779e9858a70a73fa8a306699b5b9a868f820c225457b0\n", " Stored in directory: /tmp/pip-ephem-wheel-cache-9v6ye7h3/wheels/17/01/26/599c0779e9858a70a73fa8a306699b5b9a868f820c225457b0\n",
" Building wheel for rouge-score (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Building wheel for rouge-score (setup.py) ... \u001B[?25l\u001B[?25hdone\n",
" Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=6bb0d44e4881972c43ce194e7cb65233d309758cb15f0dec54590d3d2efcfc36\n", " Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=6bb0d44e4881972c43ce194e7cb65233d309758cb15f0dec54590d3d2efcfc36\n",
" Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4\n", " Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4\n",
" Building wheel for sqlitedict (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Building wheel for sqlitedict (setup.py) ... \u001B[?25l\u001B[?25hdone\n",
" Created wheel for sqlitedict: filename=sqlitedict-2.1.0-py3-none-any.whl size=16863 sha256=5747f7dd73ddf3d8fbcebf51b5e4f718fabe1e94bccdf16d2f22a2e65ee7fdf4\n", " Created wheel for sqlitedict: filename=sqlitedict-2.1.0-py3-none-any.whl size=16863 sha256=5747f7dd73ddf3d8fbcebf51b5e4f718fabe1e94bccdf16d2f22a2e65ee7fdf4\n",
" Stored in directory: /root/.cache/pip/wheels/79/d6/e7/304e0e6cb2221022c26d8161f7c23cd4f259a9e41e8bbcfabd\n", " Stored in directory: /root/.cache/pip/wheels/79/d6/e7/304e0e6cb2221022c26d8161f7c23cd4f259a9e41e8bbcfabd\n",
"Successfully built lm-eval rouge-score sqlitedict\n", "Successfully built lm-eval rouge-score sqlitedict\n",
...@@ -361,6 +361,7 @@ ...@@ -361,6 +361,7 @@
} }
], ],
"source": [ "source": [
"%env LOGLEVEL=DEBUG\n",
"!lm_eval \\\n", "!lm_eval \\\n",
" --model hf \\\n", " --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n", " --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
...@@ -462,6 +463,7 @@ ...@@ -462,6 +463,7 @@
], ],
"source": [ "source": [
"# !accelerate launch --no_python\n", "# !accelerate launch --no_python\n",
"%env LOGLEVEL=DEBUG\n",
"!lm_eval \\\n", "!lm_eval \\\n",
" --model hf \\\n", " --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n", " --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
...@@ -561,6 +563,7 @@ ...@@ -561,6 +563,7 @@
], ],
"source": [ "source": [
"# !accelerate launch --no_python\n", "# !accelerate launch --no_python\n",
"%env LOGLEVEL=DEBUG\n",
"!lm_eval \\\n", "!lm_eval \\\n",
" --model hf \\\n", " --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n", " --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
...@@ -637,6 +640,7 @@ ...@@ -637,6 +640,7 @@
], ],
"source": [ "source": [
"# !accelerate launch --no_python\n", "# !accelerate launch --no_python\n",
"%env LOGLEVEL=DEBUG\n",
"!lm_eval \\\n", "!lm_eval \\\n",
" --model hf \\\n", " --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n", " --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
......
import logging
import os
from .evaluator import evaluate, simple_evaluate from .evaluator import evaluate, simple_evaluate
...@@ -213,9 +213,9 @@ def setup_parser() -> argparse.ArgumentParser: ...@@ -213,9 +213,9 @@ def setup_parser() -> argparse.ArgumentParser:
"--verbosity", "--verbosity",
"-v", "-v",
type=str.upper, type=str.upper,
default="INFO", default=None,
metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG", metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG",
help="Controls the reported logging error level. Set to DEBUG when testing + adding new task configurations for comprehensive log output.", help="(Deprecated) Controls logging verbosity level. Use the `LOGLEVEL` environment variable instead. Set to DEBUG for detailed output when testing or adding new task configurations.",
) )
parser.add_argument( parser.add_argument(
"--wandb_args", "--wandb_args",
...@@ -279,9 +279,8 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: ...@@ -279,9 +279,8 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
if args.wandb_args: if args.wandb_args:
wandb_logger = WandbLogger(**simple_parse_args_string(args.wandb_args)) wandb_logger = WandbLogger(**simple_parse_args_string(args.wandb_args))
eval_logger = utils.eval_logger utils.setup_logging(args.verbosity)
eval_logger.setLevel(getattr(logging, f"{args.verbosity}")) eval_logger = logging.getLogger(__name__)
eval_logger.info(f"Verbosity set to {args.verbosity}")
os.environ["TOKENIZERS_PARALLELISM"] = "false" os.environ["TOKENIZERS_PARALLELISM"] = "false"
# update the evaluation tracker args with the output path and the HF token # update the evaluation tracker args with the output path and the HF token
...@@ -306,7 +305,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: ...@@ -306,7 +305,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
if args.include_path is not None: if args.include_path is not None:
eval_logger.info(f"Including path: {args.include_path}") eval_logger.info(f"Including path: {args.include_path}")
task_manager = TaskManager(args.verbosity, include_path=args.include_path) task_manager = TaskManager(include_path=args.include_path)
if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples: if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples:
eval_logger.warning( eval_logger.warning(
...@@ -377,8 +376,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: ...@@ -377,8 +376,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
args.model_args = args.model_args + ",trust_remote_code=True" args.model_args = args.model_args + ",trust_remote_code=True"
eval_logger.info(f"Selected Tasks: {task_names}") if eval_logger.getEffectiveLevel() <= logging.INFO:
eval_logger.info(f"Selected Tasks: {task_names}")
else:
# fall back to print so the task list stays visible at quieter log levels
print(f"Selected Tasks: {task_names}")
request_caching_args = request_caching_arg_to_dict( request_caching_args = request_caching_arg_to_dict(
cache_requests=args.cache_requests cache_requests=args.cache_requests
...@@ -403,7 +404,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: ...@@ -403,7 +404,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
fewshot_as_multiturn=args.fewshot_as_multiturn, fewshot_as_multiturn=args.fewshot_as_multiturn,
gen_kwargs=args.gen_kwargs, gen_kwargs=args.gen_kwargs,
task_manager=task_manager, task_manager=task_manager,
verbosity=args.verbosity,
predict_only=args.predict_only, predict_only=args.predict_only,
random_seed=args.seed[0], random_seed=args.seed[0],
numpy_random_seed=args.seed[1], numpy_random_seed=args.seed[1],
......
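For CLI users the practical upshot is that `--verbosity`/`-v` still works but is deprecated in favor of `LOGLEVEL`. A hypothetical side-by-side of the two invocations, with model arguments borrowed from the notebook above:

```python
import os
import subprocess

# Deprecated, but still honored through utils.setup_logging:
subprocess.run(
    ["lm_eval", "--model", "hf",
     "--model_args", "pretrained=EleutherAI/pythia-2.8b",
     "--tasks", "lambada_openai", "-v", "DEBUG"],
    check=True,
)

# Preferred: drive verbosity through the environment instead.
subprocess.run(
    ["lm_eval", "--model", "hf",
     "--model_args", "pretrained=EleutherAI/pythia-2.8b",
     "--tasks", "lambada_openai"],
    env={**os.environ, "LOGLEVEL": "DEBUG"},
    check=True,
)
```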
...@@ -12,7 +12,7 @@ import sacrebleu ...@@ -12,7 +12,7 @@ import sacrebleu
from lm_eval.api.registry import register_aggregation, register_metric from lm_eval.api.registry import register_aggregation, register_metric
eval_logger = logging.getLogger("lm-eval") eval_logger = logging.getLogger(__name__)
# Register Aggregations First # Register Aggregations First
......
...@@ -12,7 +12,7 @@ from tqdm import tqdm ...@@ -12,7 +12,7 @@ from tqdm import tqdm
from lm_eval import utils from lm_eval import utils
eval_logger = logging.getLogger("lm-eval") eval_logger = logging.getLogger(__name__)
T = TypeVar("T", bound="LM") T = TypeVar("T", bound="LM")
......
...@@ -6,7 +6,7 @@ import evaluate as hf_evaluate ...@@ -6,7 +6,7 @@ import evaluate as hf_evaluate
from lm_eval.api.model import LM from lm_eval.api.model import LM
eval_logger = logging.getLogger("lm-eval") eval_logger = logging.getLogger(__name__)
MODEL_REGISTRY = {} MODEL_REGISTRY = {}
......
...@@ -48,7 +48,7 @@ ALL_OUTPUT_TYPES = [ ...@@ -48,7 +48,7 @@ ALL_OUTPUT_TYPES = [
"generate_until", "generate_until",
] ]
eval_logger = logging.getLogger("lm-eval") eval_logger = logging.getLogger(__name__)
@dataclass @dataclass
......
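The repeated rename above is more than cosmetic: the old hard-coded `"lm-eval"` name did not match the `lm_eval` package, so configuring the package logger never reached these modules. With `logging.getLogger(__name__)`, every module logger becomes a child of `lm_eval`, as this self-contained snippet demonstrates:

```python
import logging

logging.basicConfig(level=logging.WARNING)

# One configuration on the package root...
logging.getLogger("lm_eval").setLevel(logging.DEBUG)

# ...is inherited by every module logger created via getLogger(__name__):
child = logging.getLogger("lm_eval.api.metrics")
assert child.getEffectiveLevel() == logging.DEBUG

# The old hyphenated name sat outside the hierarchy and was unaffected:
legacy = logging.getLogger("lm-eval")
assert legacy.getEffectiveLevel() == logging.WARNING
```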
import hashlib import hashlib
import logging
import os import os
import dill import dill
from lm_eval.utils import eval_logger
eval_logger = logging.getLogger(__name__)
MODULE_DIR = os.path.dirname(os.path.realpath(__file__)) MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
......
...@@ -31,7 +31,6 @@ from lm_eval.tasks import ( ...@@ -31,7 +31,6 @@ from lm_eval.tasks import (
get_task_dict, get_task_dict,
) )
from lm_eval.utils import ( from lm_eval.utils import (
eval_logger,
handle_non_serializable, handle_non_serializable,
hash_string, hash_string,
positional_deprecated, positional_deprecated,
setup_logging,
...@@ -43,6 +42,8 @@ if TYPE_CHECKING: ...@@ -43,6 +42,8 @@ if TYPE_CHECKING:
from lm_eval.api.model import LM from lm_eval.api.model import LM
from lm_eval.api.task import Task from lm_eval.api.task import Task
eval_logger = logging.getLogger(__name__)
@positional_deprecated @positional_deprecated
def simple_evaluate( def simple_evaluate(
...@@ -68,7 +69,7 @@ def simple_evaluate( ...@@ -68,7 +69,7 @@ def simple_evaluate(
fewshot_as_multiturn: bool = False, fewshot_as_multiturn: bool = False,
gen_kwargs: Optional[str] = None, gen_kwargs: Optional[str] = None,
task_manager: Optional[TaskManager] = None, task_manager: Optional[TaskManager] = None,
verbosity: str = "INFO", verbostiy=None,
predict_only: bool = False, predict_only: bool = False,
random_seed: int = 0, random_seed: int = 0,
numpy_random_seed: int = 1234, numpy_random_seed: int = 1234,
...@@ -123,6 +124,8 @@ def simple_evaluate( ...@@ -123,6 +124,8 @@ def simple_evaluate(
:param gen_kwargs: str :param gen_kwargs: str
String arguments for model generation String arguments for model generation
Ignored for all tasks with loglikelihood output_type Ignored for all tasks with loglikelihood output_type
:param verbosity: Optional[str]
(Deprecated) Verbosity level for logging. Prefer setting the `LOGLEVEL` environment variable instead.
:param predict_only: bool :param predict_only: bool
If true only model outputs will be generated and returned. Metrics will not be evaluated If true only model outputs will be generated and returned. Metrics will not be evaluated
:param random_seed: int :param random_seed: int
...@@ -137,7 +140,8 @@ def simple_evaluate( ...@@ -137,7 +140,8 @@ def simple_evaluate(
:return :return
Dictionary of results Dictionary of results
""" """
eval_logger.setLevel(getattr(logging, f"{verbosity}")) if verbosity is not None:
setup_logging(verbosity=verbosity)
start_date = time.time() start_date = time.time()
if delete_requests_cache: if delete_requests_cache:
...@@ -231,7 +235,7 @@ def simple_evaluate( ...@@ -231,7 +235,7 @@ def simple_evaluate(
) )
if task_manager is None: if task_manager is None:
task_manager = TaskManager(verbosity) task_manager = TaskManager()
task_dict = get_task_dict(tasks, task_manager) task_dict = get_task_dict(tasks, task_manager)
...@@ -313,9 +317,11 @@ def simple_evaluate( ...@@ -313,9 +317,11 @@ def simple_evaluate(
system_instruction=system_instruction, system_instruction=system_instruction,
apply_chat_template=apply_chat_template, apply_chat_template=apply_chat_template,
fewshot_as_multiturn=fewshot_as_multiturn, fewshot_as_multiturn=fewshot_as_multiturn,
verbosity=verbosity, verbosity=verbosity,
confirm_run_unsafe_code=confirm_run_unsafe_code, confirm_run_unsafe_code=confirm_run_unsafe_code,
) )
if verbosity is not None:
setup_logging(verbosity=verbosity)
if lm.rank == 0: if lm.rank == 0:
if isinstance(model, str): if isinstance(model, str):
...@@ -411,8 +417,6 @@ def evaluate( ...@@ -411,8 +417,6 @@ def evaluate(
Dictionary of results Dictionary of results
""" """
eval_logger.setLevel(getattr(logging, f"{verbosity}"))
if apply_chat_template: if apply_chat_template:
eval_logger.warning( eval_logger.warning(
"Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details." "Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details."
......
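For callers of the Python API, the migration mirrors the CLI: configure logging yourself up front, or pass the now-deprecated `verbosity` argument and let `simple_evaluate` forward it to `setup_logging`. A hedged sketch of both styles, with illustrative model and task choices:

```python
import lm_eval
from lm_eval.utils import setup_logging

# Preferred: configure logging once at the entry point.
setup_logging("DEBUG")
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-2.8b",
    tasks=["lambada_openai"],
)

# Deprecated but supported: simple_evaluate calls setup_logging for you.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-2.8b",
    tasks=["lambada_openai"],
    verbosity="DEBUG",
)
```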
import collections import collections
import logging
import math import math
import pathlib import pathlib
import sys import sys
...@@ -12,7 +13,10 @@ from lm_eval.api.metrics import ( ...@@ -12,7 +13,10 @@ from lm_eval.api.metrics import (
stderr_for_metric, stderr_for_metric,
) )
from lm_eval.api.task import Task from lm_eval.api.task import Task
from lm_eval.utils import eval_logger, positional_deprecated from lm_eval.utils import positional_deprecated
eval_logger = logging.getLogger(__name__)
class TaskOutput: class TaskOutput:
......
import json import json
import logging
import os import os
import re import re
import time import time
...@@ -18,7 +19,6 @@ from huggingface_hub import ( ...@@ -18,7 +19,6 @@ from huggingface_hub import (
from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status
from lm_eval.utils import ( from lm_eval.utils import (
eval_logger,
get_file_datetime, get_file_datetime,
get_file_task_name, get_file_task_name,
get_results_filenames, get_results_filenames,
...@@ -31,6 +31,9 @@ from lm_eval.utils import ( ...@@ -31,6 +31,9 @@ from lm_eval.utils import (
) )
eval_logger = logging.getLogger(__name__)
@dataclass(init=False) @dataclass(init=False)
class GeneralConfigTracker: class GeneralConfigTracker:
""" """
......
import logging
import os import os
from functools import cached_property from functools import cached_property
from typing import Any, Dict, List, Tuple, Union from typing import Any, Dict, List, Tuple, Union
from tqdm import tqdm from tqdm import tqdm
from lm_eval import utils
from lm_eval.api.model import LM from lm_eval.api.model import LM
from lm_eval.api.registry import register_model from lm_eval.api.registry import register_model
from lm_eval.models.openai_completions import LocalCompletionsAPI from lm_eval.models.openai_completions import LocalCompletionsAPI
from lm_eval.models.utils import handle_stop_sequences, retry_on_specific_exceptions from lm_eval.models.utils import handle_stop_sequences, retry_on_specific_exceptions
eval_logger = utils.eval_logger eval_logger = logging.getLogger(__name__)
def anthropic_completion( def anthropic_completion(
......
...@@ -3,6 +3,7 @@ import asyncio ...@@ -3,6 +3,7 @@ import asyncio
import copy import copy
import itertools import itertools
import json import json
import logging
from functools import cached_property from functools import cached_property
from typing import ( from typing import (
Any, Any,
...@@ -37,6 +38,8 @@ from lm_eval.api.model import TemplateLM ...@@ -37,6 +38,8 @@ from lm_eval.api.model import TemplateLM
from lm_eval.models.utils import Collator, chunks, configure_pad_token from lm_eval.models.utils import Collator, chunks, configure_pad_token
eval_logger = logging.getLogger(__name__)
LogLikelihoodInputs = Tuple[Tuple[str, str], List[int], List[int]] LogLikelihoodInputs = Tuple[Tuple[str, str], List[int], List[int]]
...@@ -48,9 +51,6 @@ class JsonChatStr(NamedTuple): ...@@ -48,9 +51,6 @@ class JsonChatStr(NamedTuple):
return self.prompt.encode(encoding) return self.prompt.encode(encoding)
eval_logger = utils.eval_logger
class TemplateAPI(TemplateLM): class TemplateAPI(TemplateLM):
def __init__( def __init__(
self, self,
......
import copy import copy
import logging
from typing import Dict, List, Optional, Tuple, Union from typing import Dict, List, Optional, Tuple, Union
import torch import torch
...@@ -7,7 +8,6 @@ import transformers ...@@ -7,7 +8,6 @@ import transformers
from tqdm import tqdm from tqdm import tqdm
from transformers import BatchEncoding from transformers import BatchEncoding
from lm_eval import utils
from lm_eval.api.instance import Instance from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_model from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM from lm_eval.models.huggingface import HFLM
...@@ -24,7 +24,7 @@ from lm_eval.models.utils import ( ...@@ -24,7 +24,7 @@ from lm_eval.models.utils import (
DEFAULT_IMAGE_PLACEHOLDER = "<image>" DEFAULT_IMAGE_PLACEHOLDER = "<image>"
eval_logger = utils.eval_logger eval_logger = logging.getLogger(__name__)
@register_model("hf-multimodal") @register_model("hf-multimodal")
......
import copy import copy
import logging
import os import os
from datetime import timedelta from datetime import timedelta
from pathlib import Path from pathlib import Path
...@@ -39,7 +40,7 @@ from lm_eval.models.utils import ( ...@@ -39,7 +40,7 @@ from lm_eval.models.utils import (
) )
eval_logger = utils.eval_logger eval_logger = logging.getLogger(__name__)
@register_model("hf-auto", "hf", "huggingface") @register_model("hf-auto", "hf", "huggingface")
......
import copy import copy
import json import json
import logging
import os import os
from functools import lru_cache from functools import lru_cache
from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Type, cast from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Type, cast
...@@ -10,7 +11,10 @@ from lm_eval.api.instance import Instance ...@@ -10,7 +11,10 @@ from lm_eval.api.instance import Instance
from lm_eval.api.model import LM from lm_eval.api.model import LM
from lm_eval.api.registry import register_model from lm_eval.api.registry import register_model
from lm_eval.models.api_models import JsonChatStr from lm_eval.models.api_models import JsonChatStr
from lm_eval.utils import eval_logger, simple_parse_args_string from lm_eval.utils import simple_parse_args_string
eval_logger = logging.getLogger(__name__)
class LogLikelihoodResult(NamedTuple): class LogLikelihoodResult(NamedTuple):
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
import importlib import importlib
import logging
import pathlib import pathlib
from copy import deepcopy from copy import deepcopy
from typing import List, Literal from typing import List, Literal
...@@ -27,13 +28,15 @@ from lm_eval.api.model import LM ...@@ -27,13 +28,15 @@ from lm_eval.api.model import LM
from lm_eval.api.registry import register_model from lm_eval.api.registry import register_model
from lm_eval.models.utils import Collator from lm_eval.models.utils import Collator
from lm_eval.utils import ( from lm_eval.utils import (
eval_logger,
get_rolling_token_windows, get_rolling_token_windows,
make_disjoint_window, make_disjoint_window,
simple_parse_args_string, simple_parse_args_string,
) )
eval_logger = logging.getLogger(__name__)
def _patch_pretrained_cfg( def _patch_pretrained_cfg(
pretrained_cfg, trainer, tensor_model_parallel_size, pipeline_model_parallel_size pretrained_cfg, trainer, tensor_model_parallel_size, pipeline_model_parallel_size
): ):
......
import copy import copy
import logging
from typing import List, Optional, Tuple, Union from typing import List, Optional, Tuple, Union
import numpy import numpy
...@@ -13,7 +14,7 @@ from lm_eval.api.registry import register_model ...@@ -13,7 +14,7 @@ from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM from lm_eval.models.huggingface import HFLM
eval_logger = utils.eval_logger eval_logger = logging.getLogger(__name__)
@register_model("sparseml") @register_model("sparseml")
......