Unverified Commit 1ba35e62 authored by Lintang Sutawika, committed by GitHub

Logging (#2203)



* changed source of eval_logger

* allow eval_logger to be set from args

* removed verbosity arg from non-main methods

* fix logging

* pre-commit

* set verbosity in eval logger

* replace utils.eval_logger

* fix logging in main

* add logging to docs

* add logging message

* nit

* add logging to docs

* refactor setup_logging to utils

---------
Co-authored-by: Baber <baber@hey.com>
parent 358adaf7
...@@ -82,8 +82,10 @@ We also support using the library's external API for use within model training l ...@@ -82,8 +82,10 @@ We also support using the library's external API for use within model training l
```python ```python
import lm_eval import lm_eval
from lm_eval.utils import setup_logging
... ...
# initialize logging
setup_logging("DEBUG") # optional, but recommended; or you can set up logging yourself
my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code) my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code)
... ...
# instantiate an LM subclass that takes your initialized model and can run # instantiate an LM subclass that takes your initialized model and can run
......
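Taken together with the new `setup_logging` import, the snippet above might be completed as follows. This is a hedged sketch, not part of the diff: the `HFLM` wrapper and the `hellaswag` task are illustrative assumptions, and `initialize_my_model` is the docs' own placeholder.

```python
import lm_eval
from lm_eval.models.huggingface import HFLM
from lm_eval.utils import setup_logging

setup_logging("DEBUG")  # optional, but recommended; or set up logging yourself

my_model = initialize_my_model()  # placeholder from the docs above

# Wrap the trained model in an LM subclass so the harness can query it
# (HFLM shown here as one plausible choice for a transformers model).
lm_obj = HFLM(pretrained=my_model)

results = lm_eval.simple_evaluate(
    model=lm_obj,
    tasks=["hellaswag"],  # illustrative task choice
)
```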
...@@ -37,7 +37,8 @@ and rename the folders and YAML file(s) as desired. ...@@ -37,7 +37,8 @@ and rename the folders and YAML file(s) as desired.
All data downloading and management is handled through the HuggingFace (**HF**) [`datasets`](https://github.com/huggingface/datasets) API. So, the first thing you should do is check to see if your task's dataset is already provided in their catalog [here](https://huggingface.co/datasets). If it's not in there, please consider adding it to their Hub to make it accessible to a wider user base by following their [new dataset guide](https://github.com/huggingface/datasets/blob/main/ADD_NEW_DATASET.md) All data downloading and management is handled through the HuggingFace (**HF**) [`datasets`](https://github.com/huggingface/datasets) API. So, the first thing you should do is check to see if your task's dataset is already provided in their catalog [here](https://huggingface.co/datasets). If it's not in there, please consider adding it to their Hub to make it accessible to a wider user base by following their [new dataset guide](https://github.com/huggingface/datasets/blob/main/ADD_NEW_DATASET.md)
. .
> [!TIP]
> When testing your task, we recommend enabling verbose logging by running `export LOGLEVEL=DEBUG` in your shell before launching the evaluation script. This will help you debug any issues that arise.
Once you have a HuggingFace dataset prepared for your task, we want to assign our new YAML to use this dataset: Once you have a HuggingFace dataset prepared for your task, we want to assign our new YAML to use this dataset:
```yaml ```yaml
......
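The diff does not show the body of `setup_logging`, but the tip above implies it honors the `LOGLEVEL` environment variable. A minimal sketch of that assumed behavior follows; the precedence order and format string are guesses, not the actual implementation.

```python
import logging
import os
from typing import Optional


def setup_logging(verbosity: Optional[str] = None) -> None:
    # Assumed precedence: explicit argument, then LOGLEVEL, then INFO.
    level = (verbosity or os.environ.get("LOGLEVEL", "INFO")).upper()
    logging.basicConfig(
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        level=getattr(logging, level, logging.INFO),
    )
```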
...@@ -79,48 +79,48 @@ ...@@ -79,48 +79,48 @@
" Switched to a new branch 'big-refactor'\n", " Switched to a new branch 'big-refactor'\n",
" Branch 'big-refactor' set up to track remote branch 'big-refactor' from 'origin'.\n", " Branch 'big-refactor' set up to track remote branch 'big-refactor' from 'origin'.\n",
" Resolved https://github.com/EleutherAI/lm-evaluation-harness.git to commit 42f486ee49b65926a444cb0620870a39a5b4b0a8\n", " Resolved https://github.com/EleutherAI/lm-evaluation-harness.git to commit 42f486ee49b65926a444cb0620870a39a5b4b0a8\n",
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", " Installing build dependencies ... \u001B[?25l\u001B[?25hdone\n",
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", " Getting requirements to build wheel ... \u001B[?25l\u001B[?25hdone\n",
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (pyproject.toml) ... \u001B[?25l\u001B[?25hdone\n",
"Collecting accelerate>=0.21.0 (from lm-eval==1.0.0)\n", "Collecting accelerate>=0.21.0 (from lm-eval==1.0.0)\n",
" Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)\n", " Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m261.4/261.4 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m261.4/261.4 kB\u001B[0m \u001B[31m4.1 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001b[?25hCollecting evaluate (from lm-eval==1.0.0)\n", "\u001B[?25hCollecting evaluate (from lm-eval==1.0.0)\n",
" Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)\n", " Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.1/84.1 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m84.1/84.1 kB\u001B[0m \u001B[31m5.9 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001b[?25hCollecting datasets>=2.0.0 (from lm-eval==1.0.0)\n", "\u001B[?25hCollecting datasets>=2.0.0 (from lm-eval==1.0.0)\n",
" Downloading datasets-2.15.0-py3-none-any.whl (521 kB)\n", " Downloading datasets-2.15.0-py3-none-any.whl (521 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m521.2/521.2 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m521.2/521.2 kB\u001B[0m \u001B[31m9.5 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001b[?25hCollecting jsonlines (from lm-eval==1.0.0)\n", "\u001B[?25hCollecting jsonlines (from lm-eval==1.0.0)\n",
" Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)\n", " Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)\n",
"Requirement already satisfied: numexpr in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (2.8.7)\n", "Requirement already satisfied: numexpr in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (2.8.7)\n",
"Collecting peft>=0.2.0 (from lm-eval==1.0.0)\n", "Collecting peft>=0.2.0 (from lm-eval==1.0.0)\n",
" Downloading peft-0.6.2-py3-none-any.whl (174 kB)\n", " Downloading peft-0.6.2-py3-none-any.whl (174 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m174.7/174.7 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m174.7/174.7 kB\u001B[0m \u001B[31m7.2 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001b[?25hCollecting pybind11>=2.6.2 (from lm-eval==1.0.0)\n", "\u001B[?25hCollecting pybind11>=2.6.2 (from lm-eval==1.0.0)\n",
" Downloading pybind11-2.11.1-py3-none-any.whl (227 kB)\n", " Downloading pybind11-2.11.1-py3-none-any.whl (227 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m227.7/227.7 kB\u001b[0m \u001b[31m12.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m227.7/227.7 kB\u001B[0m \u001B[31m12.9 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001b[?25hCollecting pytablewriter (from lm-eval==1.0.0)\n", "\u001B[?25hCollecting pytablewriter (from lm-eval==1.0.0)\n",
" Downloading pytablewriter-1.2.0-py3-none-any.whl (111 kB)\n", " Downloading pytablewriter-1.2.0-py3-none-any.whl (111 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m111.1/111.1 kB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m111.1/111.1 kB\u001B[0m \u001B[31m8.3 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001b[?25hCollecting rouge-score>=0.0.4 (from lm-eval==1.0.0)\n", "\u001B[?25hCollecting rouge-score>=0.0.4 (from lm-eval==1.0.0)\n",
" Downloading rouge_score-0.1.2.tar.gz (17 kB)\n", " Downloading rouge_score-0.1.2.tar.gz (17 kB)\n",
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (setup.py) ... \u001B[?25l\u001B[?25hdone\n",
"Collecting sacrebleu>=1.5.0 (from lm-eval==1.0.0)\n", "Collecting sacrebleu>=1.5.0 (from lm-eval==1.0.0)\n",
" Downloading sacrebleu-2.3.2-py3-none-any.whl (119 kB)\n", " Downloading sacrebleu-2.3.2-py3-none-any.whl (119 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m119.7/119.7 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m119.7/119.7 kB\u001B[0m \u001B[31m8.7 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001b[?25hRequirement already satisfied: scikit-learn>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (1.2.2)\n", "\u001B[?25hRequirement already satisfied: scikit-learn>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (1.2.2)\n",
"Collecting sqlitedict (from lm-eval==1.0.0)\n", "Collecting sqlitedict (from lm-eval==1.0.0)\n",
" Downloading sqlitedict-2.1.0.tar.gz (21 kB)\n", " Downloading sqlitedict-2.1.0.tar.gz (21 kB)\n",
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (setup.py) ... \u001B[?25l\u001B[?25hdone\n",
"Requirement already satisfied: torch>=1.8 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (2.1.0+cu118)\n", "Requirement already satisfied: torch>=1.8 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (2.1.0+cu118)\n",
"Collecting tqdm-multiprocess (from lm-eval==1.0.0)\n", "Collecting tqdm-multiprocess (from lm-eval==1.0.0)\n",
" Downloading tqdm_multiprocess-0.0.11-py3-none-any.whl (9.8 kB)\n", " Downloading tqdm_multiprocess-0.0.11-py3-none-any.whl (9.8 kB)\n",
"Requirement already satisfied: transformers>=4.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (4.35.2)\n", "Requirement already satisfied: transformers>=4.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (4.35.2)\n",
"Collecting zstandard (from lm-eval==1.0.0)\n", "Collecting zstandard (from lm-eval==1.0.0)\n",
" Downloading zstandard-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)\n", " Downloading zstandard-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m29.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m5.4/5.4 MB\u001B[0m \u001B[31m29.2 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (1.23.5)\n", "\u001B[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (1.23.5)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (23.2)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (23.2)\n",
"Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (5.9.5)\n", "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (5.9.5)\n",
"Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (6.0.1)\n", "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (6.0.1)\n",
...@@ -130,15 +130,15 @@ ...@@ -130,15 +130,15 @@
" Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n", " Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n",
"Collecting dill<0.3.8,>=0.3.0 (from datasets>=2.0.0->lm-eval==1.0.0)\n", "Collecting dill<0.3.8,>=0.3.0 (from datasets>=2.0.0->lm-eval==1.0.0)\n",
" Downloading dill-0.3.7-py3-none-any.whl (115 kB)\n", " Downloading dill-0.3.7-py3-none-any.whl (115 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m115.3/115.3 kB\u001B[0m \u001B[31m14.4 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (1.5.3)\n", "\u001B[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (1.5.3)\n",
"Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2.31.0)\n", "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2.31.0)\n",
"Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (4.66.1)\n", "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (4.66.1)\n",
"Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (3.4.1)\n", "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (3.4.1)\n",
"Collecting multiprocess (from datasets>=2.0.0->lm-eval==1.0.0)\n", "Collecting multiprocess (from datasets>=2.0.0->lm-eval==1.0.0)\n",
" Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n", " Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m19.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001B[2K \u001B[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001B[0m \u001B[32m134.8/134.8 kB\u001B[0m \u001B[31m19.9 MB/s\u001B[0m eta \u001B[36m0:00:00\u001B[0m\n",
"\u001b[?25hRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2023.6.0)\n", "\u001B[?25hRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2023.6.0)\n",
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (3.8.6)\n", "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (3.8.6)\n",
"Collecting responses<0.19 (from evaluate->lm-eval==1.0.0)\n", "Collecting responses<0.19 (from evaluate->lm-eval==1.0.0)\n",
" Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n", " Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n",
...@@ -193,13 +193,13 @@ ...@@ -193,13 +193,13 @@
"Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->rouge-score>=0.0.4->lm-eval==1.0.0) (8.1.7)\n", "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->rouge-score>=0.0.4->lm-eval==1.0.0) (8.1.7)\n",
"Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.8->lm-eval==1.0.0) (1.3.0)\n", "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.8->lm-eval==1.0.0) (1.3.0)\n",
"Building wheels for collected packages: lm-eval, rouge-score, sqlitedict\n", "Building wheels for collected packages: lm-eval, rouge-score, sqlitedict\n",
" Building wheel for lm-eval (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", " Building wheel for lm-eval (pyproject.toml) ... \u001B[?25l\u001B[?25hdone\n",
" Created wheel for lm-eval: filename=lm_eval-1.0.0-py3-none-any.whl size=994254 sha256=88356155b19f2891981ecef948326ad6ce8ca40a6009378410ec20d0e225995a\n", " Created wheel for lm-eval: filename=lm_eval-1.0.0-py3-none-any.whl size=994254 sha256=88356155b19f2891981ecef948326ad6ce8ca40a6009378410ec20d0e225995a\n",
" Stored in directory: /tmp/pip-ephem-wheel-cache-9v6ye7h3/wheels/17/01/26/599c0779e9858a70a73fa8a306699b5b9a868f820c225457b0\n", " Stored in directory: /tmp/pip-ephem-wheel-cache-9v6ye7h3/wheels/17/01/26/599c0779e9858a70a73fa8a306699b5b9a868f820c225457b0\n",
" Building wheel for rouge-score (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Building wheel for rouge-score (setup.py) ... \u001B[?25l\u001B[?25hdone\n",
" Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=6bb0d44e4881972c43ce194e7cb65233d309758cb15f0dec54590d3d2efcfc36\n", " Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=6bb0d44e4881972c43ce194e7cb65233d309758cb15f0dec54590d3d2efcfc36\n",
" Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4\n", " Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4\n",
" Building wheel for sqlitedict (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Building wheel for sqlitedict (setup.py) ... \u001B[?25l\u001B[?25hdone\n",
" Created wheel for sqlitedict: filename=sqlitedict-2.1.0-py3-none-any.whl size=16863 sha256=5747f7dd73ddf3d8fbcebf51b5e4f718fabe1e94bccdf16d2f22a2e65ee7fdf4\n", " Created wheel for sqlitedict: filename=sqlitedict-2.1.0-py3-none-any.whl size=16863 sha256=5747f7dd73ddf3d8fbcebf51b5e4f718fabe1e94bccdf16d2f22a2e65ee7fdf4\n",
" Stored in directory: /root/.cache/pip/wheels/79/d6/e7/304e0e6cb2221022c26d8161f7c23cd4f259a9e41e8bbcfabd\n", " Stored in directory: /root/.cache/pip/wheels/79/d6/e7/304e0e6cb2221022c26d8161f7c23cd4f259a9e41e8bbcfabd\n",
"Successfully built lm-eval rouge-score sqlitedict\n", "Successfully built lm-eval rouge-score sqlitedict\n",
...@@ -361,6 +361,7 @@ ...@@ -361,6 +361,7 @@
} }
], ],
"source": [ "source": [
"%env LOGLEVEL=DEBUG\n",
"!lm_eval \\\n", "!lm_eval \\\n",
" --model hf \\\n", " --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n", " --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
...@@ -462,6 +463,7 @@ ...@@ -462,6 +463,7 @@
], ],
"source": [ "source": [
"# !accelerate launch --no_python\n", "# !accelerate launch --no_python\n",
"%env LOGLEVEL=DEBUG\n",
"!lm_eval \\\n", "!lm_eval \\\n",
" --model hf \\\n", " --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n", " --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
...@@ -561,6 +563,7 @@ ...@@ -561,6 +563,7 @@
], ],
"source": [ "source": [
"# !accelerate launch --no_python\n", "# !accelerate launch --no_python\n",
"%env LOGLEVEL=DEBUG\n",
"!lm_eval \\\n", "!lm_eval \\\n",
" --model hf \\\n", " --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n", " --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
...@@ -637,6 +640,7 @@ ...@@ -637,6 +640,7 @@
], ],
"source": [ "source": [
"# !accelerate launch --no_python\n", "# !accelerate launch --no_python\n",
"%env LOGLEVEL=DEBUG\n",
"!lm_eval \\\n", "!lm_eval \\\n",
" --model hf \\\n", " --model hf \\\n",
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n", " --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
......
import logging
import os
from .evaluator import evaluate, simple_evaluate from .evaluator import evaluate, simple_evaluate
...@@ -213,9 +213,9 @@ def setup_parser() -> argparse.ArgumentParser: ...@@ -213,9 +213,9 @@ def setup_parser() -> argparse.ArgumentParser:
"--verbosity", "--verbosity",
"-v", "-v",
type=str.upper, type=str.upper,
default="INFO", default=None,
metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG", metavar="CRITICAL|ERROR|WARNING|INFO|DEBUG",
help="Controls the reported logging error level. Set to DEBUG when testing + adding new task configurations for comprehensive log output.", help="(Deprecated) Controls logging verbosity level. Use the `LOGLEVEL` environment variable instead. Set to DEBUG for detailed output when testing or adding new task configurations.",
) )
parser.add_argument( parser.add_argument(
"--wandb_args", "--wandb_args",
...@@ -279,9 +279,8 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: ...@@ -279,9 +279,8 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
if args.wandb_args: if args.wandb_args:
wandb_logger = WandbLogger(**simple_parse_args_string(args.wandb_args)) wandb_logger = WandbLogger(**simple_parse_args_string(args.wandb_args))
eval_logger = utils.eval_logger utils.setup_logging(args.verbosity)
eval_logger.setLevel(getattr(logging, f"{args.verbosity}")) eval_logger = logging.getLogger(__name__)
eval_logger.info(f"Verbosity set to {args.verbosity}")
os.environ["TOKENIZERS_PARALLELISM"] = "false" os.environ["TOKENIZERS_PARALLELISM"] = "false"
# update the evaluation tracker args with the output path and the HF token # update the evaluation tracker args with the output path and the HF token
...@@ -306,7 +305,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: ...@@ -306,7 +305,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
if args.include_path is not None: if args.include_path is not None:
eval_logger.info(f"Including path: {args.include_path}") eval_logger.info(f"Including path: {args.include_path}")
task_manager = TaskManager(args.verbosity, include_path=args.include_path) task_manager = TaskManager(include_path=args.include_path)
if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples: if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples:
eval_logger.warning( eval_logger.warning(
...@@ -377,8 +376,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: ...@@ -377,8 +376,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
args.model_args = args.model_args + ",trust_remote_code=True" args.model_args = args.model_args + ",trust_remote_code=True"
eval_logger.info(f"Selected Tasks: {task_names}") if eval_logger.getEffectiveLevel() <= logging.INFO:
eval_logger.info(f"Selected Tasks: {task_names}")
else:
# fall back to print so the task list stays visible at quieter log levels
print(f"Selected Tasks: {task_names}")
request_caching_args = request_caching_arg_to_dict( request_caching_args = request_caching_arg_to_dict(
cache_requests=args.cache_requests cache_requests=args.cache_requests
...@@ -403,7 +404,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: ...@@ -403,7 +404,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
fewshot_as_multiturn=args.fewshot_as_multiturn, fewshot_as_multiturn=args.fewshot_as_multiturn,
gen_kwargs=args.gen_kwargs, gen_kwargs=args.gen_kwargs,
task_manager=task_manager, task_manager=task_manager,
verbosity=args.verbosity,
predict_only=args.predict_only, predict_only=args.predict_only,
random_seed=args.seed[0], random_seed=args.seed[0],
numpy_random_seed=args.seed[1], numpy_random_seed=args.seed[1],
......
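For CLI users the practical upshot is that `--verbosity`/`-v` still works but is deprecated in favor of `LOGLEVEL`. A hypothetical side-by-side of the two invocations, with model arguments borrowed from the notebook above:

```python
import os
import subprocess

# Deprecated, but still honored through utils.setup_logging:
subprocess.run(
    ["lm_eval", "--model", "hf",
     "--model_args", "pretrained=EleutherAI/pythia-2.8b",
     "--tasks", "lambada_openai", "-v", "DEBUG"],
    check=True,
)

# Preferred: drive verbosity through the environment instead.
subprocess.run(
    ["lm_eval", "--model", "hf",
     "--model_args", "pretrained=EleutherAI/pythia-2.8b",
     "--tasks", "lambada_openai"],
    env={**os.environ, "LOGLEVEL": "DEBUG"},
    check=True,
)
```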
...@@ -12,7 +12,7 @@ import sacrebleu ...@@ -12,7 +12,7 @@ import sacrebleu
from lm_eval.api.registry import register_aggregation, register_metric from lm_eval.api.registry import register_aggregation, register_metric
eval_logger = logging.getLogger("lm-eval") eval_logger = logging.getLogger(__name__)
# Register Aggregations First # Register Aggregations First
......
...@@ -12,7 +12,7 @@ from tqdm import tqdm ...@@ -12,7 +12,7 @@ from tqdm import tqdm
from lm_eval import utils from lm_eval import utils
eval_logger = logging.getLogger("lm-eval") eval_logger = logging.getLogger(__name__)
T = TypeVar("T", bound="LM") T = TypeVar("T", bound="LM")
......
...@@ -6,7 +6,7 @@ import evaluate as hf_evaluate ...@@ -6,7 +6,7 @@ import evaluate as hf_evaluate
from lm_eval.api.model import LM from lm_eval.api.model import LM
eval_logger = logging.getLogger("lm-eval") eval_logger = logging.getLogger(__name__)
MODEL_REGISTRY = {} MODEL_REGISTRY = {}
......
...@@ -48,7 +48,7 @@ ALL_OUTPUT_TYPES = [ ...@@ -48,7 +48,7 @@ ALL_OUTPUT_TYPES = [
"generate_until", "generate_until",
] ]
eval_logger = logging.getLogger("lm-eval") eval_logger = logging.getLogger(__name__)
@dataclass @dataclass
......
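The repeated rename above is more than cosmetic: the old hard-coded `"lm-eval"` name did not match the `lm_eval` package, so configuring the package logger never reached these modules. With `logging.getLogger(__name__)`, every module logger becomes a child of `lm_eval`, as this self-contained snippet demonstrates:

```python
import logging

logging.basicConfig(level=logging.WARNING)

# One configuration on the package root...
logging.getLogger("lm_eval").setLevel(logging.DEBUG)

# ...is inherited by every module logger created via getLogger(__name__):
child = logging.getLogger("lm_eval.api.metrics")
assert child.getEffectiveLevel() == logging.DEBUG

# The old hyphenated name sat outside the hierarchy and was unaffected:
legacy = logging.getLogger("lm-eval")
assert legacy.getEffectiveLevel() == logging.WARNING
```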
import hashlib import hashlib
import logging
import os import os
import dill import dill
from lm_eval.utils import eval_logger
eval_logger = logging.getLogger(__name__)
MODULE_DIR = os.path.dirname(os.path.realpath(__file__)) MODULE_DIR = os.path.dirname(os.path.realpath(__file__))
......
...@@ -31,7 +31,6 @@ from lm_eval.tasks import ( ...@@ -31,7 +31,6 @@ from lm_eval.tasks import (
get_task_dict, get_task_dict,
) )
from lm_eval.utils import ( from lm_eval.utils import (
eval_logger,
handle_non_serializable, handle_non_serializable,
hash_string, hash_string,
positional_deprecated, positional_deprecated,
setup_logging,
...@@ -43,6 +42,8 @@ if TYPE_CHECKING: ...@@ -43,6 +42,8 @@ if TYPE_CHECKING:
from lm_eval.api.model import LM from lm_eval.api.model import LM
from lm_eval.api.task import Task from lm_eval.api.task import Task
eval_logger = logging.getLogger(__name__)
@positional_deprecated @positional_deprecated
def simple_evaluate( def simple_evaluate(
...@@ -68,7 +69,7 @@ def simple_evaluate( ...@@ -68,7 +69,7 @@ def simple_evaluate(
fewshot_as_multiturn: bool = False, fewshot_as_multiturn: bool = False,
gen_kwargs: Optional[str] = None, gen_kwargs: Optional[str] = None,
task_manager: Optional[TaskManager] = None, task_manager: Optional[TaskManager] = None,
verbosity: str = "INFO", verbostiy=None,
predict_only: bool = False, predict_only: bool = False,
random_seed: int = 0, random_seed: int = 0,
numpy_random_seed: int = 1234, numpy_random_seed: int = 1234,
...@@ -123,6 +124,8 @@ def simple_evaluate( ...@@ -123,6 +124,8 @@ def simple_evaluate(
:param gen_kwargs: str :param gen_kwargs: str
String arguments for model generation String arguments for model generation
Ignored for all tasks with loglikelihood output_type Ignored for all tasks with loglikelihood output_type
:param verbosity: Optional[str]
(Deprecated) Verbosity level for logging. Prefer setting the `LOGLEVEL` environment variable instead.
:param predict_only: bool :param predict_only: bool
If true only model outputs will be generated and returned. Metrics will not be evaluated If true only model outputs will be generated and returned. Metrics will not be evaluated
:param random_seed: int :param random_seed: int
...@@ -137,7 +140,8 @@ def simple_evaluate( ...@@ -137,7 +140,8 @@ def simple_evaluate(
:return :return
Dictionary of results Dictionary of results
""" """
eval_logger.setLevel(getattr(logging, f"{verbosity}")) if verbosity is not None:
setup_logging(verbosity=verbosity)
start_date = time.time() start_date = time.time()
if delete_requests_cache: if delete_requests_cache:
...@@ -231,7 +235,7 @@ def simple_evaluate( ...@@ -231,7 +235,7 @@ def simple_evaluate(
) )
if task_manager is None: if task_manager is None:
task_manager = TaskManager(verbosity) task_manager = TaskManager()
task_dict = get_task_dict(tasks, task_manager) task_dict = get_task_dict(tasks, task_manager)
...@@ -313,9 +317,11 @@ def simple_evaluate( ...@@ -313,9 +317,11 @@ def simple_evaluate(
system_instruction=system_instruction, system_instruction=system_instruction,
apply_chat_template=apply_chat_template, apply_chat_template=apply_chat_template,
fewshot_as_multiturn=fewshot_as_multiturn, fewshot_as_multiturn=fewshot_as_multiturn,
verbosity=verbosity, verbosity=verbosity,
confirm_run_unsafe_code=confirm_run_unsafe_code, confirm_run_unsafe_code=confirm_run_unsafe_code,
) )
if verbosity is not None:
setup_logging(verbosity=verbosity)
if lm.rank == 0: if lm.rank == 0:
if isinstance(model, str): if isinstance(model, str):
...@@ -411,8 +417,6 @@ def evaluate( ...@@ -411,8 +417,6 @@ def evaluate(
Dictionary of results Dictionary of results
""" """
eval_logger.setLevel(getattr(logging, f"{verbosity}"))
if apply_chat_template: if apply_chat_template:
eval_logger.warning( eval_logger.warning(
"Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details." "Chat template formatting change affects loglikelihood and multiple-choice tasks. See docs/chat-template-readme.md for details."
......
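For callers of the Python API, the migration mirrors the CLI: configure logging yourself up front, or pass the now-deprecated `verbosity` argument and let `simple_evaluate` forward it to `setup_logging`. A hedged sketch of both styles, with illustrative model and task choices:

```python
import lm_eval
from lm_eval.utils import setup_logging

# Preferred: configure logging once at the entry point.
setup_logging("DEBUG")
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-2.8b",
    tasks=["lambada_openai"],
)

# Deprecated but supported: simple_evaluate calls setup_logging for you.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-2.8b",
    tasks=["lambada_openai"],
    verbosity="DEBUG",
)
```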
import collections import collections
import logging
import math import math
import pathlib import pathlib
import sys import sys
...@@ -12,7 +13,10 @@ from lm_eval.api.metrics import ( ...@@ -12,7 +13,10 @@ from lm_eval.api.metrics import (
stderr_for_metric, stderr_for_metric,
) )
from lm_eval.api.task import Task from lm_eval.api.task import Task
from lm_eval.utils import eval_logger, positional_deprecated from lm_eval.utils import positional_deprecated
eval_logger = logging.getLogger(__name__)
class TaskOutput: class TaskOutput:
......
import json import json
import logging
import os import os
import re import re
import time import time
...@@ -18,7 +19,6 @@ from huggingface_hub import ( ...@@ -18,7 +19,6 @@ from huggingface_hub import (
from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status from huggingface_hub.utils import build_hf_headers, get_session, hf_raise_for_status
from lm_eval.utils import ( from lm_eval.utils import (
eval_logger,
get_file_datetime, get_file_datetime,
get_file_task_name, get_file_task_name,
get_results_filenames, get_results_filenames,
...@@ -31,6 +31,9 @@ from lm_eval.utils import ( ...@@ -31,6 +31,9 @@ from lm_eval.utils import (
) )
eval_logger = logging.getLogger(__name__)
@dataclass(init=False) @dataclass(init=False)
class GeneralConfigTracker: class GeneralConfigTracker:
""" """
......
import logging
import os import os
from functools import cached_property from functools import cached_property
from typing import Any, Dict, List, Tuple, Union from typing import Any, Dict, List, Tuple, Union
from tqdm import tqdm from tqdm import tqdm
from lm_eval import utils
from lm_eval.api.model import LM from lm_eval.api.model import LM
from lm_eval.api.registry import register_model from lm_eval.api.registry import register_model
from lm_eval.models.openai_completions import LocalCompletionsAPI from lm_eval.models.openai_completions import LocalCompletionsAPI
from lm_eval.models.utils import handle_stop_sequences, retry_on_specific_exceptions from lm_eval.models.utils import handle_stop_sequences, retry_on_specific_exceptions
eval_logger = utils.eval_logger eval_logger = logging.getLogger(__name__)
def anthropic_completion( def anthropic_completion(
......
...@@ -3,6 +3,7 @@ import asyncio ...@@ -3,6 +3,7 @@ import asyncio
import copy import copy
import itertools import itertools
import json import json
import logging
from functools import cached_property from functools import cached_property
from typing import ( from typing import (
Any, Any,
...@@ -37,6 +38,8 @@ from lm_eval.api.model import TemplateLM ...@@ -37,6 +38,8 @@ from lm_eval.api.model import TemplateLM
from lm_eval.models.utils import Collator, chunks, configure_pad_token from lm_eval.models.utils import Collator, chunks, configure_pad_token
eval_logger = logging.getLogger(__name__)
LogLikelihoodInputs = Tuple[Tuple[str, str], List[int], List[int]] LogLikelihoodInputs = Tuple[Tuple[str, str], List[int], List[int]]
...@@ -48,9 +51,6 @@ class JsonChatStr(NamedTuple): ...@@ -48,9 +51,6 @@ class JsonChatStr(NamedTuple):
return self.prompt.encode(encoding) return self.prompt.encode(encoding)
eval_logger = utils.eval_logger
class TemplateAPI(TemplateLM): class TemplateAPI(TemplateLM):
def __init__( def __init__(
self, self,
......
import copy import copy
import logging
from typing import Dict, List, Optional, Tuple, Union from typing import Dict, List, Optional, Tuple, Union
import torch import torch
...@@ -7,7 +8,6 @@ import transformers ...@@ -7,7 +8,6 @@ import transformers
from tqdm import tqdm from tqdm import tqdm
from transformers import BatchEncoding from transformers import BatchEncoding
from lm_eval import utils
from lm_eval.api.instance import Instance from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_model from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM from lm_eval.models.huggingface import HFLM
...@@ -24,7 +24,7 @@ from lm_eval.models.utils import ( ...@@ -24,7 +24,7 @@ from lm_eval.models.utils import (
DEFAULT_IMAGE_PLACEHOLDER = "<image>" DEFAULT_IMAGE_PLACEHOLDER = "<image>"
eval_logger = utils.eval_logger eval_logger = logging.getLogger(__name__)
@register_model("hf-multimodal") @register_model("hf-multimodal")
......
import copy import copy
import logging
import os import os
from datetime import timedelta from datetime import timedelta
from pathlib import Path from pathlib import Path
...@@ -39,7 +40,7 @@ from lm_eval.models.utils import ( ...@@ -39,7 +40,7 @@ from lm_eval.models.utils import (
) )
eval_logger = utils.eval_logger eval_logger = logging.getLogger(__name__)
@register_model("hf-auto", "hf", "huggingface") @register_model("hf-auto", "hf", "huggingface")
......
import copy import copy
import json import json
import logging
import os import os
from functools import lru_cache from functools import lru_cache
from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Type, cast from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Type, cast
...@@ -10,7 +11,10 @@ from lm_eval.api.instance import Instance ...@@ -10,7 +11,10 @@ from lm_eval.api.instance import Instance
from lm_eval.api.model import LM from lm_eval.api.model import LM
from lm_eval.api.registry import register_model from lm_eval.api.registry import register_model
from lm_eval.models.api_models import JsonChatStr from lm_eval.models.api_models import JsonChatStr
from lm_eval.utils import eval_logger, simple_parse_args_string from lm_eval.utils import simple_parse_args_string
eval_logger = logging.getLogger(__name__)
class LogLikelihoodResult(NamedTuple): class LogLikelihoodResult(NamedTuple):
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
# limitations under the License. # limitations under the License.
import importlib import importlib
import logging
import pathlib import pathlib
from copy import deepcopy from copy import deepcopy
from typing import List, Literal from typing import List, Literal
...@@ -27,13 +28,15 @@ from lm_eval.api.model import LM ...@@ -27,13 +28,15 @@ from lm_eval.api.model import LM
from lm_eval.api.registry import register_model from lm_eval.api.registry import register_model
from lm_eval.models.utils import Collator from lm_eval.models.utils import Collator
from lm_eval.utils import ( from lm_eval.utils import (
eval_logger,
get_rolling_token_windows, get_rolling_token_windows,
make_disjoint_window, make_disjoint_window,
simple_parse_args_string, simple_parse_args_string,
) )
eval_logger = logging.getLogger(__name__)
def _patch_pretrained_cfg( def _patch_pretrained_cfg(
pretrained_cfg, trainer, tensor_model_parallel_size, pipeline_model_parallel_size pretrained_cfg, trainer, tensor_model_parallel_size, pipeline_model_parallel_size
): ):
......
import copy import copy
import logging
from typing import List, Optional, Tuple, Union from typing import List, Optional, Tuple, Union
import numpy import numpy
...@@ -13,7 +14,7 @@ from lm_eval.api.registry import register_model ...@@ -13,7 +14,7 @@ from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM from lm_eval.models.huggingface import HFLM
eval_logger = utils.eval_logger eval_logger = logging.getLogger(__name__)
@register_model("sparseml") @register_model("sparseml")
......