Unverified Commit afda6551 authored by Hailey Schoelkopf, committed by GitHub

Merge pull request #958 from EleutherAI/verbosity-rework

[Refactor] Verbosity rework
parents 2c0c345a 9b596e8f
@@ -59,6 +59,8 @@ my_model = initialize_my_model() # create your model (could be running finetuning
 ...

 lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.generate_until()`

+lm_eval.tasks.initialize_tasks() # register all tasks from the `lm_eval/tasks` subdirectory. Alternatively, can call `lm_eval.tasks.include_path("path/to/my/custom/task/configs")` to only register a set of tasks in a separate directory.
+
 results = lm_eval.simple_evaluate( # call simple_evaluate
     model=lm_obj,
     tasks=["taskname1", "taskname2"],
@@ -85,7 +87,7 @@ my_model = initialize_my_model() # create your model (could be running finetuning
 ...

 lm_obj = Your_LM(model=my_model, batch_size=16) # instantiate an LM subclass that takes your initialized model and can run `Your_LM.loglikelihood()`, `Your_LM.loglikelihood_rolling()`, `Your_LM.generate_until()`

+lm_eval.tasks.initialize_tasks() # register all tasks from the `lm_eval/tasks` subdirectory. Alternatively, can call `lm_eval.tasks.include_path("path/to/my/custom/task/configs")` to only register a set of tasks in a separate directory.
+
 def evaluate(
     lm=lm_obj,
...
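
Taken together, the two README hunks above describe the new programmatic flow: register tasks explicitly, then evaluate. A minimal sketch of that flow (not code from this diff), reusing the README's placeholder `initialize_my_model()` / `Your_LM` names and hypothetical task names:

```python
import lm_eval
import lm_eval.tasks

my_model = initialize_my_model()                  # placeholder from the README snippet
lm_obj = Your_LM(model=my_model, batch_size=16)   # placeholder LM subclass

# New in this change: task registration is an explicit call.
lm_eval.tasks.initialize_tasks()
# or, to register only custom configs:
# lm_eval.tasks.include_path("path/to/my/custom/task/configs")

results = lm_eval.simple_evaluate(
    model=lm_obj,
    tasks=["taskname1", "taskname2"],  # hypothetical task names
)
```
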
 import os
 import re
+import sys
 import json
-import fnmatch
-import argparse
 import logging
-from pathlib import Path
+import argparse
 import numpy as np
-from lm_eval import evaluator, utils
-from lm_eval.api.registry import ALL_TASKS
-from lm_eval.logger import eval_logger, SPACING
-from lm_eval.tasks import include_path
+from pathlib import Path
 from typing import Union
+from lm_eval import evaluator, utils
+from lm_eval.tasks import initialize_tasks, include_path
+from lm_eval.api.registry import ALL_TASKS

 def _handle_non_serializable(o):
     if isinstance(o, np.int64) or isinstance(o, np.int32):
@@ -25,11 +25,11 @@ def _handle_non_serializable(o):

 def parse_eval_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
-    parser.add_argument("--model", required=True, help="Name of model e.g. `hf`")
+    parser.add_argument("--model", default="hf", help="Name of model e.g. `hf`")
     parser.add_argument(
         "--tasks",
         default=None,
-        help="Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS))),
+        help="To get full list of tasks, use the command lm-eval --tasks list",
     )
     parser.add_argument(
         "--model_args",
@@ -119,9 +119,13 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
         # we allow for args to be passed externally, else we parse them ourselves
         args = parse_eval_args()

+    eval_logger = utils.eval_logger
     eval_logger.setLevel(getattr(logging, f"{args.verbosity}"))
+    eval_logger.info(f"Verbosity set to {args.verbosity}")
     os.environ["TOKENIZERS_PARALLELISM"] = "false"

+    initialize_tasks(args.verbosity)
+
     if args.limit:
         eval_logger.warning(
             " --limit SHOULD ONLY BE USED FOR TESTING."
@@ -133,6 +137,11 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
     if args.tasks is None:
         task_names = ALL_TASKS
+    elif args.tasks == "list":
+        eval_logger.info(
+            "Available Tasks:\n - {}".format(f"\n - ".join(sorted(ALL_TASKS)))
+        )
+        sys.exit()
     else:
         if os.path.isdir(args.tasks):
             import glob
@@ -159,10 +168,10 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
             missing = ", ".join(task_missing)
             eval_logger.error(
                 f"Tasks were not found: {missing}\n"
-                f"{SPACING}Try `lm-eval -h` for list of available tasks",
+                f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks",
             )
             raise ValueError(
-                f"Tasks {missing} were not found. Try `lm-eval -h` for list of available tasks."
+                f"Tasks {missing} were not found. Try `lm-eval --tasks list` for list of available tasks."
             )

     if args.output_path:
...
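
With the `--tasks list` branch above, the full task listing moves out of `--help` and behind `lm-eval --tasks list`. A rough programmatic equivalent of what that branch logs (a sketch, not code from this diff) would be:

```python
from lm_eval.api.registry import ALL_TASKS
from lm_eval.tasks import initialize_tasks

initialize_tasks()  # the registry is empty until tasks have been registered
print("Available Tasks:\n - {}".format("\n - ".join(sorted(ALL_TASKS))))
```
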
@@ -9,6 +9,9 @@ import evaluate
 from lm_eval.api.registry import register_metric, register_aggregation

+import logging
+
+eval_logger = logging.getLogger("lm-eval")

 # Register Aggregations First
 @register_aggregation("mean")
...
@@ -10,7 +10,10 @@ import hashlib
 from tqdm import tqdm

 from lm_eval import utils
-from lm_eval.logger import eval_logger
+
+import logging
+
+eval_logger = logging.getLogger("lm-eval")

 T = TypeVar("T", bound="LM")
...
 import os
 import evaluate

 from lm_eval.api.model import LM
-from lm_eval.logger import eval_logger
+
+import logging
+
+eval_logger = logging.getLogger("lm-eval")

 MODEL_REGISTRY = {}
...
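
Several files in this diff (metrics, model, registry, task) switch from `from lm_eval.logger import eval_logger` to fetching the shared logger by name. A small sketch of that pattern; the `basicConfig` formatting itself now lives in `lm_eval.utils`, per the utils hunk further down:

```python
import logging

# Any module can obtain the shared project logger by name; no dedicated
# logger module is imported, and the level can be raised per run.
eval_logger = logging.getLogger("lm-eval")
eval_logger.setLevel(logging.DEBUG)  # e.g. derived from a --verbosity flag
eval_logger.debug("visible only when verbosity is DEBUG")
```
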
@@ -4,6 +4,7 @@ from dataclasses import dataclass, field, asdict
 import re
 import ast
 import yaml
+import logging
 import evaluate
 import random
 import itertools
@@ -21,7 +22,6 @@ from lm_eval.api import samplers
 from lm_eval.api.instance import Instance
 from lm_eval.api.filter import FilterEnsemble
-from lm_eval.logger import eval_logger
 from lm_eval.prompts import get_prompt
 from lm_eval.filters import build_filter_ensemble
 from lm_eval.api.metrics import (
@@ -48,6 +48,9 @@ ALL_OUTPUT_TYPES = [
 ]

+eval_logger = logging.getLogger("lm-eval")
+

 @dataclass
 class TaskConfig(dict):
     # task naming/registry
...
@@ -20,10 +20,9 @@ from lm_eval.utils import (
     make_table,
     create_iterator,
     get_git_commit_hash,
+    eval_logger,
 )
-from lm_eval.logger import eval_logger

 @positional_deprecated
 def simple_evaluate(
@@ -256,7 +255,7 @@ def evaluate(
         task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size)
-        eval_logger.info(
+        eval_logger.debug(
             f"Task: {task_name}; number of requests on this rank: {len(task.instances)}"
         )
...
-import logging
-
-logging.basicConfig(
-    format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
-    datefmt="%Y-%m-%d:%H:%M:%S",
-    level=logging.INFO,
-)
-
-eval_logger = logging.getLogger("lm-eval")
-
-SPACING = " " * 47
@@ -2,9 +2,11 @@ from lm_eval.api.model import LM
 from lm_eval.api.registry import register_model
 from tqdm import tqdm
 import time
-from lm_eval.logger import eval_logger
+from lm_eval import utils
 from typing import List, Any, Tuple

+eval_logger = utils.eval_logger
+

 def anthropic_completion(
     client, #: anthropic.Anthropic,
...
@@ -16,7 +16,6 @@ from pathlib import Path
 import torch.nn.functional as F

 from lm_eval import utils
-from lm_eval.logger import eval_logger
 from lm_eval.api.model import LM
 from lm_eval.api.registry import register_model
@@ -25,6 +24,8 @@ from lm_eval.utils import MultiTokenEOSCriteria, stop_sequences_criteria
 from accelerate import Accelerator, find_executable_batch_size, DistributedType
 from typing import List, Optional, Union

+eval_logger = utils.eval_logger
+

 def _get_accelerate_args(
     device_map_option: Optional[str] = "auto",
...
@@ -3,7 +3,7 @@ import ast
 from typing import Dict

 from lm_eval import utils
-from lm_eval.logger import eval_logger
+from lm_eval.utils import eval_logger

 # Prompt library.
 # Stores prompts in a dictionary indexed by 2 levels:
...
@@ -26,7 +26,7 @@ from .scrolls.task import (
     QMSum,
 )

-eval_logger = logging.getLogger("lm-eval")
+eval_logger = utils.eval_logger

 def register_configurable_task(config: Dict[str, str]) -> int:
@@ -152,8 +152,11 @@ def include_task_folder(task_dir: str, register_task: bool = True) -> None:
                 else:
                     if type(config["task"]) == list:
                         register_configurable_group(config, yaml_path)
+            # Log this silently and show it only when
+            # the user defines the appropriate verbosity.
             except ModuleNotFoundError as e:
-                eval_logger.warning(
+                eval_logger.debug(
                     f"{yaml_path}: {e}. Config will not be added to registry."
                 )
             except Exception as error:
@@ -176,8 +179,12 @@ def include_path(task_dir):
     return 0

-task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
-include_path(task_dir)
+
+def initialize_tasks(verbosity="INFO"):
+    eval_logger.setLevel(getattr(logging, f"{verbosity}"))
+
+    task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
+    include_path(task_dir)

 def get_task(task_name, config):
...
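
With `initialize_tasks()` defined above, registering the built-in tasks is no longer an import-time side effect of `lm_eval.tasks`; callers invoke it explicitly and may pass a verbosity level. A brief usage sketch (hypothetical custom config directory), reflecting the hunks above:

```python
from lm_eval.tasks import initialize_tasks, include_path

# Register the built-in tasks; "DEBUG" surfaces configs that fail to import,
# since those are now logged via eval_logger.debug instead of a warning.
initialize_tasks(verbosity="DEBUG")

# Optionally register extra YAML task configs from a separate directory.
include_path("path/to/my/custom/task/configs")
```
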
 import datasets
 import re
 import signal
-from lm_eval.logger import eval_logger
+from lm_eval.utils import eval_logger
 from typing import Optional, List, Dict

 try:
...
@@ -3,7 +3,7 @@ import json
 import requests
 import numpy as np
-from lm_eval.logger import eval_logger
+from lm_eval.utils import eval_logger

 def toxicity_perspective_api(references, predictions, **kwargs):
...
@@ -19,7 +19,16 @@ import transformers
 from jinja2 import BaseLoader, Environment, StrictUndefined
 from itertools import islice

-from lm_eval.logger import eval_logger
+import logging
+
+logging.basicConfig(
+    format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
+    datefmt="%Y-%m-%d:%H:%M:%S",
+    level=logging.INFO,
+)
+
+eval_logger = logging.getLogger("lm-eval")
+
+SPACING = " " * 47

 def escaped_split(text, sep_char, maxsplit=-1):
...
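
After the utils hunk above, the configured `eval_logger` and the `SPACING` constant live in `lm_eval.utils`, which is where the rest of the diff imports them from. A minimal sketch of the two import styles seen in this change (not code from the diff itself):

```python
from lm_eval import utils
from lm_eval.utils import eval_logger, SPACING

eval_logger.info("imported directly from lm_eval.utils")
utils.eval_logger.info("or accessed as an attribute of the utils module")
# SPACING pads continuation lines to line up under the log prefix:
eval_logger.error("first line\n" f"{SPACING}aligned continuation line")
```
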
@@ -4,9 +4,8 @@ import json
 import os
 import random
 from lm_eval import tasks
-from lm_eval.utils import join_iters
-from lm_eval.tasks import include_path
-from lm_eval.logger import eval_logger
+from lm_eval.utils import join_iters, eval_logger
+from lm_eval.tasks import initialize_tasks, include_path

 EXAMPLE_DIVIDER = "!!@@##@@!! -- Example {i}\n"
@@ -25,6 +24,12 @@ def parse_args():
         default=None,
         help="Additional path to include if there are external tasks to include.",
     )
+    parser.add_argument(
+        "--verbosity",
+        type=str,
+        default="INFO",
+        help="Log error when tasks are not registered.",
+    )
     return parser.parse_args()
@@ -32,6 +37,8 @@ def main():
     args = parse_args()
     np.random.seed(args.seed)

+    initialize_tasks(args.verbosity)
+
     if args.include_path is not None:
         eval_logger.info(f"Including path: {args.include_path}")
         include_path(args.include_path)
...
@@ -8,6 +8,8 @@ import lm_eval.tasks as tasks
 import sys
 import torch

+tasks.initialize_tasks()
+

 class Test_HFLM:
     torch.use_deterministic_algorithms(True)
...
@@ -11,6 +11,7 @@ from typing import List
 import random
 import pytest

+tasks.initialize_tasks()

 # TODO: more fine grained unit tests rather than this big honking integration
 # test once we break evaluator into smaller, more manageable pieces
...
@@ -4,7 +4,7 @@ from .utils import new_tasks
 import lm_eval.tasks as tasks
 from lm_eval.api.task import ConfigurableTask

+tasks.initialize_tasks()

 # Default Task
 TASKS = ["arc_easy"]
...