Unverified commit afda6551, authored by Hailey Schoelkopf and committed by GitHub

Merge pull request #958 from EleutherAI/verbosity-rework

[Refactor] Verbosity rework
parents 2c0c345a 9b596e8f
@@ -59,6 +59,8 @@ my_model = initialize_my_model() # create your model (could be running finetunin
...
lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.generate_until()`
lm_eval.tasks.initialize_tasks() # register all tasks from the `lm_eval/tasks` subdirectory. Alternatively, can call `lm_eval.tasks.include_path("path/to/my/custom/task/configs")` to only register a set of tasks in a separate directory.
results = lm_eval.simple_evaluate( # call simple_evaluate
model=lm_obj,
tasks=["taskname1", "taskname2"],
@@ -85,7 +87,7 @@ my_model = initialize_my_model() # create your model (could be running finetunin
...
lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.generate_until()`
lm_eval.tasks.initialize_tasks() # register all tasks from the `lm_eval/tasks` subdirectory. Alternatively, can call `lm_eval.tasks.include_path("path/to/my/custom/task/configs")` to only register a set of tasks in a separate directory.
def evaluate(
lm=lm_obj,
......
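For reference, the interface snippets above now register tasks explicitly via `lm_eval.tasks.initialize_tasks()` before any evaluation call. A minimal sketch of the full documented flow, reusing the placeholder names `initialize_my_model`, `Your_LM`, `taskname1`, and `taskname2` from those snippets; the exact `simple_evaluate` keyword arguments may vary between versions:

import lm_eval
import lm_eval.tasks

my_model = initialize_my_model()                 # placeholder from the snippet above
lm_obj = Your_LM(model=my_model, batch_size=16)  # your LM subclass, as in the snippet

# New requirement: task configs must be registered before they can be resolved by name.
lm_eval.tasks.initialize_tasks()
# lm_eval.tasks.include_path("path/to/my/custom/task/configs")  # optional extra configs

results = lm_eval.simple_evaluate(
    model=lm_obj,
    tasks=["taskname1", "taskname2"],
    num_fewshot=0,
)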
import os
import re
import sys
import json
import fnmatch
import argparse
import logging
from pathlib import Path
import argparse
import numpy as np
from lm_eval import evaluator, utils
from lm_eval.api.registry import ALL_TASKS
from lm_eval.logger import eval_logger, SPACING
from lm_eval.tasks import include_path
from pathlib import Path
from typing import Union
from lm_eval import evaluator, utils
from lm_eval.tasks import initialize_tasks, include_path
from lm_eval.api.registry import ALL_TASKS
def _handle_non_serializable(o):
if isinstance(o, np.int64) or isinstance(o, np.int32):
@@ -25,11 +25,11 @@ def _handle_non_serializable(o):
def parse_eval_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument("--model", required=True, help="Name of model e.g. `hf`")
parser.add_argument("--model", default="hf", help="Name of model e.g. `hf`")
parser.add_argument(
"--tasks",
default=None,
help="Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS))),
help="To get full list of tasks, use the command lm-eval --tasks list",
)
parser.add_argument(
"--model_args",
@@ -119,9 +119,13 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
# we allow for args to be passed externally, else we parse them ourselves
args = parse_eval_args()
eval_logger = utils.eval_logger
eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
eval_logger.info(f"Verbosity set to {args.verbosity}")
os.environ["TOKENIZERS_PARALLELISM"] = "false"
initialize_tasks(args.verbosity)
if args.limit:
eval_logger.warning(
" --limit SHOULD ONLY BE USED FOR TESTING."
@@ -133,6 +137,11 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
if args.tasks is None:
task_names = ALL_TASKS
elif args.tasks == "list":
eval_logger.info(
"Available Tasks:\n - {}".format(f"\n - ".join(sorted(ALL_TASKS)))
)
sys.exit()
else:
if os.path.isdir(args.tasks):
import glob
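The new `--tasks list` branch prints the contents of the task registry, which is only populated once `initialize_tasks()` has run (hence the call added earlier in `cli_evaluate`). Roughly the same listing can be produced from Python; a short sketch:

from lm_eval.tasks import initialize_tasks
from lm_eval.api.registry import ALL_TASKS

initialize_tasks()  # fills the registry; ALL_TASKS is empty before this call
print("Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS))))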
@@ -159,10 +168,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
missing = ", ".join(task_missing)
eval_logger.error(
f"Tasks were not found: {missing}\n"
f"{SPACING}Try `lm-eval -h` for list of available tasks",
f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
)
raise ValueError(
f"Tasks {missing} were not found. Try `lm-eval -h` for list of available tasks."
f"Tasks {missing} were not found. Try `lm-eval --tasks list` for list of available tasks."
)
if args.output_path:
......
@@ -9,6 +9,9 @@ import evaluate
from lm_eval.api.registry import register_metric, register_aggregation
import logging
eval_logger = logging.getLogger("lm-eval")
# Register Aggregations First
@register_aggregation("mean")
......
@@ -10,7 +10,10 @@ import hashlib
from tqdm import tqdm
from lm_eval import utils
from lm_eval.logger import eval_logger
import logging
eval_logger = logging.getLogger("lm-eval")
T = TypeVar("T", bound="LM")
......
import os
import evaluate
from lm_eval.api.model import LM
from lm_eval.logger import eval_logger
import logging
eval_logger = logging.getLogger("lm-eval")
MODEL_REGISTRY = {}
......
@@ -4,6 +4,7 @@ from dataclasses import dataclass, field, asdict
import re
import ast
import yaml
import logging
import evaluate
import random
import itertools
@@ -21,7 +22,6 @@ from lm_eval.api import samplers
from lm_eval.api.instance import Instance
from lm_eval.api.filter import FilterEnsemble
from lm_eval.logger import eval_logger
from lm_eval.prompts import get_prompt
from lm_eval.filters import build_filter_ensemble
from lm_eval.api.metrics import (
@@ -48,6 +48,9 @@ ALL_OUTPUT_TYPES = [
]
eval_logger = logging.getLogger("lm-eval")
@dataclass
class TaskConfig(dict):
# task naming/registry
......
@@ -20,10 +20,9 @@ from lm_eval.utils import (
make_table,
create_iterator,
get_git_commit_hash,
eval_logger,
)
from lm_eval.logger import eval_logger
@positional_deprecated
def simple_evaluate(
@@ -256,7 +255,7 @@ def evaluate(
task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size)
eval_logger.info(
eval_logger.debug(
f"Task: {task_name}; number of requests on this rank: {len(task.instances)}"
)
......
import logging
logging.basicConfig(
format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
datefmt="%Y-%m-%d:%H:%M:%S",
level=logging.INFO,
)
eval_logger = logging.getLogger("lm-eval")
SPACING = " " * 47
@@ -2,9 +2,11 @@ from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from tqdm import tqdm
import time
from lm_eval.logger import eval_logger
from lm_eval import utils
from typing import List, Any, Tuple
eval_logger = utils.eval_logger
def anthropic_completion(
client, #: anthropic.Anthropic,
......
@@ -16,7 +16,6 @@ from pathlib import Path
import torch.nn.functional as F
from lm_eval import utils
from lm_eval.logger import eval_logger
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
@@ -25,6 +24,8 @@ from lm_eval.utils import MultiTokenEOSCriteria, stop_sequences_criteria
from accelerate import Accelerator, find_executable_batch_size, DistributedType
from typing import List, Optional, Union
eval_logger = utils.eval_logger
def _get_accelerate_args(
device_map_option: Optional[str] = "auto",
......
@@ -3,7 +3,7 @@ import ast
from typing import Dict
from lm_eval import utils
from lm_eval.logger import eval_logger
from lm_eval.utils import eval_logger
# Prompt library.
# Stores prompts in a dictionary indexed by 2 levels:
......
@@ -26,7 +26,7 @@ from .scrolls.task import (
QMSum,
)
eval_logger = logging.getLogger("lm-eval")
eval_logger = utils.eval_logger
def register_configurable_task(config: Dict[str, str]) -> int:
@@ -152,8 +152,11 @@ def include_task_folder(task_dir: str, register_task: bool = True) -> None:
else:
if type(config["task"]) == list:
register_configurable_group(config, yaml_path)
# Log this silently and show it only when
# the user defines the appropriate verbosity.
except ModuleNotFoundError as e:
eval_logger.warning(
eval_logger.debug(
f"{yaml_path}: {e}. Config will not be added to registry."
)
except Exception as error:
@@ -176,8 +179,12 @@ def include_path(task_dir):
return 0
task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_path(task_dir)
def initialize_tasks(verbosity="INFO"):
eval_logger.setLevel(getattr(logging, f"{verbosity}"))
task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_path(task_dir)
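A short usage sketch of the new `initialize_tasks` entry point; the custom-config directory below is hypothetical:

from lm_eval.tasks import initialize_tasks, include_path

# Register the built-in task configs. Configs skipped because of missing optional
# dependencies are now logged at DEBUG, so pass "DEBUG" to see them.
initialize_tasks(verbosity="DEBUG")

# Optionally register extra task configs from a user-supplied directory (hypothetical path).
include_path("path/to/my/custom/task/configs")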
def get_task(task_name, config):
......
import datasets
import re
import signal
from lm_eval.logger import eval_logger
from lm_eval.utils import eval_logger
from typing import Optional, List, Dict
try:
......
@@ -3,7 +3,7 @@ import json
import requests
import numpy as np
from lm_eval.logger import eval_logger
from lm_eval.utils import eval_logger
def toxicity_perspective_api(references, predictions, **kwargs):
......
@@ -19,7 +19,16 @@ import transformers
from jinja2 import BaseLoader, Environment, StrictUndefined
from itertools import islice
from lm_eval.logger import eval_logger
import logging
logging.basicConfig(
format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
datefmt="%Y-%m-%d:%H:%M:%S",
level=logging.INFO,
)
eval_logger = logging.getLogger("lm-eval")
SPACING = " " * 47
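The logging setup that previously lived in `lm_eval.logger` is now defined in `lm_eval.utils`. Because `logging.getLogger(name)` returns the same object for a given name, modules that call `logging.getLogger("lm-eval")` directly and modules that import `utils.eval_logger` share one logger, so the single `setLevel` call in `cli_evaluate` controls verbosity everywhere. A small, self-contained sketch of that stdlib behaviour (the `verbosity` value stands in for `args.verbosity`):

import logging

from lm_eval.utils import eval_logger  # the module-level logger configured above

# Same name -> same logger object, wherever it is looked up.
assert eval_logger is logging.getLogger("lm-eval")

# The CLI maps the --verbosity string onto a stdlib level constant with getattr,
# as in cli_evaluate; "DEBUG" here is a hypothetical choice.
verbosity = "DEBUG"
eval_logger.setLevel(getattr(logging, verbosity))

eval_logger.debug("now visible, e.g. per-task request counts and skipped task configs")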
def escaped_split(text, sep_char, maxsplit=-1):
......
@@ -4,9 +4,8 @@ import json
import os
import random
from lm_eval import tasks
from lm_eval.utils import join_iters
from lm_eval.tasks import include_path
from lm_eval.logger import eval_logger
from lm_eval.utils import join_iters, eval_logger
from lm_eval.tasks import initialize_tasks, include_path
EXAMPLE_DIVIDER = "!!@@##@@!! -- Example {i}\n"
@@ -25,6 +24,12 @@ def parse_args():
default=None,
help="Additional path to include if there are external tasks to include.",
)
parser.add_argument(
"--verbosity",
type=str,
default="INFO",
help="Log error when tasks are not registered.",
)
return parser.parse_args()
@@ -32,6 +37,8 @@ def main():
args = parse_args()
np.random.seed(args.seed)
initialize_tasks(args.verbosity)
if args.include_path is not None:
eval_logger.info(f"Including path: {args.include_path}")
include_path(args.include_path)
......
@@ -8,6 +8,8 @@ import lm_eval.tasks as tasks
import sys
import torch
tasks.initialize_tasks()
class Test_HFLM:
torch.use_deterministic_algorithms(True)
......
@@ -11,6 +11,7 @@ from typing import List
import random
import pytest
tasks.initialize_tasks()
# TODO: more fine grained unit tests rather than this big honking integration
# test once we break evaluator into smaller, more manageable pieces
......
@@ -4,7 +4,7 @@ from .utils import new_tasks
import lm_eval.tasks as tasks
from lm_eval.api.task import ConfigurableTask
tasks.initialize_tasks()
# Default Task
TASKS = ["arc_easy"]
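The test modules now call `tasks.initialize_tasks()` at import time because looking tasks up by name fails until the registry has been filled. A hypothetical smoke test making that dependency explicit:

import lm_eval.tasks as tasks
from lm_eval.api.registry import ALL_TASKS


def test_registry_is_populated_after_initialize_tasks():
    # Before this change tasks were registered as a side effect of importing
    # lm_eval.tasks; now the explicit call below is what fills the registry.
    tasks.initialize_tasks()
    assert "arc_easy" in ALL_TASKS  # arc_easy is the default task used in these tests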
......