Commit 2106fbeb authored by Baber

Merge branch 'main' into mathvista

# Conflicts:
#	lm_eval/models/openai_completions.py
parents 4354fe46 703fbffd
......@@ -4,7 +4,7 @@ from typing import List
from lm_eval.api.filter import FilterEnsemble
from lm_eval.api.registry import get_filter
from . import extraction, selection, transformation
from . import custom, extraction, selection, transformation
def build_filter_ensemble(
......
from lm_eval.api.filter import Filter
from lm_eval.api.registry import register_filter
@register_filter("custom")
class CustomFilter(Filter):
"""
Custom filter that applies a custom, user-defined function to the model responses.
"""
def __init__(self, **kwargs) -> None:
self.filter_fn = kwargs.pop("filter_fn")
super().__init__(**kwargs)
def apply(self, resps, docs):
return self.filter_fn(resps, docs)
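For orientation, a minimal standalone sketch of driving this filter directly; the `last_line_only` function and the example inputs are made up, and the import path is inferred from the `from . import custom, ...` line above:

from lm_eval.filters.custom import CustomFilter  # import path assumed

# Hypothetical user-defined function: keep only the last non-empty line of each response.
def last_line_only(resps, docs):
    return [
        [resp.strip().splitlines()[-1] if resp.strip() else resp for resp in inst]
        for inst in resps
    ]

custom_filter = CustomFilter(filter_fn=last_line_only)
filtered = custom_filter.apply(
    resps=[["Step 1: 6 * 7\nAnswer: 42"]],  # one list of responses per document
    docs=[{"question": "What is 6 * 7?"}],
)
print(filtered)  # [['Answer: 42']]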
......@@ -8,12 +8,17 @@ from lm_eval.api.registry import register_filter
@register_filter("regex")
class RegexFilter(Filter):
""" """
"""A filter that extracts values from text using regex pattern matching.
This filter applies a regex pattern to each model response and extracts matched values.
If no match is found, returns a fallback value. Useful for extracting structured data
(like numbers) from unstructured model outputs.
"""
def __init__(
self,
regex_pattern: str = r"#### (\-?[0-9\.\,]+)",
group_select=0,
group_select: int = 0,
fallback: str = "[invalid]",
) -> None:
"""
......@@ -25,7 +30,7 @@ class RegexFilter(Filter):
self.group_select = group_select
self.fallback = fallback
def apply(self, resps, docs):
def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
# here, we assume we have a list, in which each element is
# a list of model responses for some particular input/target pair.
# so we process each of these (same input/target response sets)
......@@ -37,28 +42,27 @@ class RegexFilter(Filter):
if match:
match = match[self.group_select]
if isinstance(match, tuple):
match = [m for m in match if m][0]
match = [m for m in match if m]
if match:
match = match[0]
else:
match = self.fallback
match = match.strip()
else:
match = self.fallback
filtered.append(match)
return filtered
# print(resps)
filtered_resps = list(map(lambda x: filter_set(x), resps))
# print(filtered_resps)
return filtered_resps
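# Illustration (standalone, not part of the module): the default pattern above applied by hand.
# The response text is made up.
import re

pattern = re.compile(r"#### (\-?[0-9\.\,]+)")
resp = "Let's work it out step by step.\n#### 1,234"
matches = pattern.findall(resp)
print(matches[0].strip() if matches else "[invalid]")  # 1,234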
@register_filter("remove_whitespace")
class WhitespaceFilter(Filter):
""" """
def __init__(self) -> None:
pass
"""Filters out leading whitespace from responses."""
def apply(self, resps, docs):
def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
def filter_set(inst):
filtered_resp = []
for resp in inst:
......@@ -103,7 +107,7 @@ class MultiChoiceRegexFilter(RegexFilter):
self.ignore_punctuation = ignore_punctuation
self.regexes_to_ignore = regexes_to_ignore
def apply(self, resps, docs):
def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
# here, we assume we have a list, in which each element is
# a list of model responses for some particular input/target pair.
# so we process each of these (same input/target response sets)
......@@ -162,7 +166,7 @@ class MultiChoiceRegexFilter(RegexFilter):
fallback_regex = re.compile("|".join(fallback_regexes))
without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
without_paren_fallback_regex = re.compile(
f":[\s]*({without_paren_fallback_regex})"
rf":[\s]*({without_paren_fallback_regex})"
)
filtered = []
......
......@@ -15,10 +15,9 @@ logger = logging.getLogger(__name__)
def get_wandb_printer() -> Literal["Printer"]:
"""Returns a wandb printer instance for pretty stdout."""
from wandb.sdk.lib.printer import get_printer
from wandb.sdk.wandb_settings import Settings
from wandb.sdk.lib.printer import new_printer
printer = get_printer(Settings()._jupyter)
printer = new_printer()
return printer
......@@ -49,6 +48,9 @@ class WandbLogger:
self.wandb_args: Dict[str, Any] = kwargs
# pop the step key from the args to save for all logging calls
self.step = self.wandb_args.pop("step", None)
# initialize a W&B run
if wandb.run is None:
self.run = wandb.init(**self.wandb_args)
......@@ -153,11 +155,11 @@ class WandbLogger:
# log the complete eval result to W&B Table
table = make_table(["Tasks"] + columns, "results")
self.run.log({"evaluation/eval_results": table})
self.run.log({"evaluation/eval_results": table}, step=self.step)
if "groups" in self.results.keys():
table = make_table(["Groups"] + columns, "groups")
self.run.log({"evaluation/group_eval_results": table})
self.run.log({"evaluation/group_eval_results": table}, step=self.step)
def _log_results_as_artifact(self) -> None:
"""Log results as JSON artifact to W&B."""
......@@ -175,13 +177,13 @@ class WandbLogger:
"""Log evaluation results to W&B."""
# Log configs to wandb
configs = self._get_config()
self.run.config.update(configs)
self.run.config.update(configs, allow_val_change=self.step is not None)
wandb_summary, self.wandb_results = self._sanitize_results_dict()
# update wandb.run.summary with items that were removed
self.run.summary.update(wandb_summary)
# Log the evaluation metrics to wandb
self.run.log(self.wandb_results)
self.run.log(self.wandb_results, step=self.step)
# Log the evaluation metrics as W&B Table
self._log_results_as_table()
# Log the results dict as json to W&B Artifacts
......@@ -330,7 +332,7 @@ class WandbLogger:
# log the samples as a W&B Table
df = self._generate_dataset(eval_preds, self.task_configs.get(task_name))
self.run.log({f"{task_name}_eval_results": df})
self.run.log({f"{task_name}_eval_results": df}, step=self.step)
# log the samples as a json file as W&B Artifact
self._log_samples_as_artifact(eval_preds, task_name)
......@@ -349,4 +351,4 @@ class WandbLogger:
# log the samples as a json file as W&B Artifact
self._log_samples_as_artifact(eval_preds, task_name)
self.run.log({f"{group}_eval_results": grouped_df})
self.run.log({f"{group}_eval_results": grouped_df}, step=self.step)
......@@ -5,11 +5,13 @@ from . import (
gguf,
hf_vlms,
huggingface,
ibm_watsonx_ai,
mamba_lm,
nemo_lm,
neuralmagic,
neuron_optimum,
openai_completions,
optimum_ipex,
optimum_lm,
textsynth,
vllm_causallms,
......
......@@ -8,7 +8,7 @@ from lm_eval import utils
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.openai_completions import LocalCompletionsAPI
from lm_eval.models.utils import retry_on_specific_exceptions
from lm_eval.models.utils import handle_stop_sequences, retry_on_specific_exceptions
eval_logger = utils.eval_logger
......@@ -45,8 +45,8 @@ def anthropic_completion(
try:
import anthropic
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
)
......@@ -108,8 +108,8 @@ def anthropic_chat(
try:
import anthropic
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
)
......@@ -168,8 +168,8 @@ class AnthropicLM(LM):
try:
import anthropic
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
)
......@@ -217,8 +217,8 @@ please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install
def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
try:
import anthropic
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"attempted to use 'anthropic' LM type, but package `anthropic` is not installed. \
please install anthropic via `pip install 'lm-eval[anthropic]'` or `pip install -e '.[anthropic]'`",
)
......@@ -311,7 +311,12 @@ class AnthropicChat(LocalCompletionsAPI):
}
def _create_payload(
self, messages: List[Dict], generate=True, gen_kwargs: dict = None, **kwargs
self,
messages: List[Dict],
generate=True,
gen_kwargs: dict = None,
eos="\n\nHuman:",
**kwargs,
) -> dict:
system = (
messages[0].get("content") if messages[0].get("role") == "system" else None
......@@ -321,7 +326,7 @@ class AnthropicChat(LocalCompletionsAPI):
gen_kwargs.pop("do_sample", False)
max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
temperature = gen_kwargs.pop("temperature", 0)
stop = gen_kwargs.pop("until", ["\n\nHuman:"])
stop = handle_stop_sequences(gen_kwargs.pop("until", ["\n\nHuman:"]), eos=eos)
if not isinstance(stop, list):
stop = [stop]
out = {
......
......@@ -21,7 +21,7 @@ from typing import (
try:
import requests
from aiohttp import ClientSession, TCPConnector
from aiohttp import ClientSession, ClientTimeout, TCPConnector
from tenacity import RetryError, retry, stop_after_attempt, wait_exponential
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio
......@@ -58,11 +58,11 @@ class TemplateAPI(TemplateLM):
pretrained: str = None, # `model` takes precedence over `pretrained` when passed.
base_url: str = None,
tokenizer: Optional[str] = None,
# Logliklehood tasks require a tokenizer to calculate context lengths,
# Loglikelihood tasks require a tokenizer to calculate context lengths,
# however the requests can be sent as a string if the API doesn't support token inputs.
# use tokenized_requests=False
tokenizer_backend: Optional[
Literal["tiktoken", "huggingface", None]
Literal["tiktoken", "huggingface", "None", "none"]
] = "huggingface",
truncate: bool = False,
# number of concurrent requests. More useful if not batching
......@@ -79,6 +79,10 @@ class TemplateAPI(TemplateLM):
trust_remote_code: bool = False,
revision: Optional[str] = "main",
use_fast_tokenizer: bool = True,
verify_certificate: bool = True,
eos_string: str = None,
# timeout in seconds
timeout: int = 300,
**kwargs,
) -> None:
super().__init__()
......@@ -115,11 +119,16 @@ class TemplateAPI(TemplateLM):
"Concurrent requests are disabled. To enable concurrent requests, set `num_concurrent` > 1."
)
self._concurrent = int(num_concurrent)
self.tokenizer_backend = tokenizer_backend
self.tokenizer_backend = (
None if tokenizer_backend in ("None", "none") else tokenizer_backend
)
self.add_bos_token = add_bos_token
self.custom_prefix_token_id = custom_prefix_token_id
self.tokenized_requests = tokenized_requests
self.max_retries = int(max_retries)
self.verify_certificate = verify_certificate
self._eos_string = eos_string
self.timeout = int(timeout)
eval_logger.info(f"Using tokenizer {self.tokenizer_backend}")
if self.tokenizer_backend is None:
......@@ -144,7 +153,7 @@ class TemplateAPI(TemplateLM):
self.tokenizer = tiktoken.encoding_for_model(self.model)
except ModuleNotFoundError as e:
raise Exception(
raise ModuleNotFoundError(
"Attempted to use 'openai' LM type, but the package `tiktoken` is not installed. "
"Please install it via `pip install lm-eval[api]` or `pip install -e .[api]`."
) from e
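For illustration, the new constructor options might be exercised like this; `LocalCompletionsAPI` (a `TemplateAPI` subclass that appears later in this diff) is used, and the model name and base URL are placeholders:

from lm_eval.models.openai_completions import LocalCompletionsAPI

# Hypothetical direct construction; in practice these usually arrive via --model_args.
api_lm = LocalCompletionsAPI(
    model="my-model",
    base_url="http://localhost:8000/v1/completions",
    tokenizer_backend="none",   # normalized to None by the code above
    eos_string="</s>",          # overrides the tokenizer lookup in the eos_string property
    timeout=600,                # client timeout in seconds (requests / ClientTimeout)
    verify_certificate=False,   # forwarded as requests.post(..., verify=...)
    num_concurrent=8,
)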
......@@ -172,6 +181,7 @@ class TemplateAPI(TemplateLM):
generate: bool = True,
gen_kwargs: Optional[dict] = None,
seed: int = 1234,
eos: str = None,
**kwargs,
) -> dict:
"""This method is responsible for creating the json payload that will be sent to the API."""
......@@ -194,7 +204,7 @@ class TemplateAPI(TemplateLM):
if not self.tokenized_requests:
# if messages are tokenized:
if isinstance(messages[0][0], int):
# assuming decoding is lossless. However, this is only for logliklehood requests
# assuming decoding is lossless. However, this is only for loglikelihood requests
# as we need to compute the context length. For generations, we don't need to tokenize.
messages = self.decode_batch(messages)
if self._batch_size <= 1:
......@@ -243,12 +253,15 @@ class TemplateAPI(TemplateLM):
return ""
def apply_chat_template(
self, chat_history: List[Dict[str, str]]
self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
) -> Union[str, JsonChatStr]:
"""Applies a chat template to a list of chat history between user and model."""
if self.tokenizer_backend == "huggingface" and self.tokenized_requests:
return self.tokenizer.apply_chat_template(
chat_history, tokenize=False, add_generation_prompt=True
chat_history,
tokenize=False,
add_generation_prompt=add_generation_prompt,
continue_final_message=not add_generation_prompt,
)
else:
# bit of a hack. We'll load back before sending to the API
......@@ -264,6 +277,21 @@ class TemplateAPI(TemplateLM):
elif self.tokenizer_backend == "tiktoken":
return self.tokenizer.eot_token
@cached_property
def eos_string(self) -> Optional[str]:
if self._eos_string:
return self._eos_string
elif self.tokenizer is not None:
if self.tokenizer_backend == "huggingface":
return self.tokenizer.eos_token
elif self.tokenizer_backend == "tiktoken":
return self.tokenizer.decode([self.tokenizer.eot_token])
else:
eval_logger.warning(
"Cannot determine EOS string to pass to stop sequence. Manually set by passing `eos_string` to model_args."
)
return None
@cached_property
def prefix_token_id(self) -> Optional[int]:
if self.tokenizer is None:
......@@ -339,9 +367,11 @@ class TemplateAPI(TemplateLM):
generate=generate,
gen_kwargs=gen_kwargs,
seed=self._seed,
eos=self.eos_string,
**kwargs,
),
headers=self.header,
verify=self.verify_certificate,
)
if not response.ok:
eval_logger.warning(
......@@ -412,7 +442,7 @@ class TemplateAPI(TemplateLM):
)
return None
def batch_logliklehood_requests(
def batch_loglikelihood_requests(
self, chunks: Iterable[List[LogLikelihoodInputs]]
) -> Tuple[List[List[int]], List[int], List[Tuple[str, str]]]:
inputs = []
......@@ -421,9 +451,13 @@ class TemplateAPI(TemplateLM):
for chunk in chunks:
for cache_key, context_enc, continuation_enc in chunk:
# max_length - 1 as we always have 1 token for generation
inp = (context_enc + continuation_enc)[-(self.max_length) :]
inp = (context_enc + continuation_enc)[-self.max_length :]
if len(inp) < len(context_enc + continuation_enc):
eval_logger.warning(
f"Context length ({len(context_enc)}) + continuation length ({len(continuation_enc)}) > max_length ({self.max_length}). Left truncating context."
)
ctxlen = len(context_enc) - max(
0, len(context_enc) + len(continuation_enc) - (self.max_length)
0, len(context_enc) + len(continuation_enc) - self.max_length
)
inputs.append(inp)
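A worked example of the truncation arithmetic above, with made-up token counts:

max_length = 2048
context_enc = list(range(2000))      # 2000 context token ids
continuation_enc = list(range(100))  # 100 continuation token ids

inp = (context_enc + continuation_enc)[-max_length:]
ctxlen = len(context_enc) - max(0, len(context_enc) + len(continuation_enc) - max_length)

print(len(inp), ctxlen)  # 2048 1948 -> the 52 oldest context tokens were left-truncated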
......@@ -442,7 +476,9 @@ class TemplateAPI(TemplateLM):
) -> Union[List[List[str]], List[List[Tuple[float, bool]]]]:
ctxlens = ctxlens if ctxlens else [None] * len(requests)
conn = TCPConnector(limit=self._concurrent)
async with ClientSession(connector=conn) as session:
async with ClientSession(
connector=conn, timeout=ClientTimeout(total=self.timeout)
) as session:
retry_: Callable[..., Awaitable[Any]] = retry(
stop=stop_after_attempt(self.max_retries),
wait=wait_exponential(multiplier=0.5, min=1, max=10),
......@@ -497,7 +533,7 @@ class TemplateAPI(TemplateLM):
if self._concurrent <= 1:
pbar = tqdm(desc="Requesting API", total=len(requests))
for chunk in chunked:
inputs, ctxlens, cache_keys = self.batch_logliklehood_requests([chunk])
inputs, ctxlens, cache_keys = self.batch_loglikelihood_requests([chunk])
outputs = retry(
stop=stop_after_attempt(self.max_retries),
......@@ -521,7 +557,7 @@ class TemplateAPI(TemplateLM):
)
pbar.update(1)
else:
inputs, ctxlens, cache_keys = self.batch_logliklehood_requests(chunked)
inputs, ctxlens, cache_keys = self.batch_loglikelihood_requests(chunked)
res = itertools.chain.from_iterable(
asyncio.run(
self.get_batched_requests(
......@@ -565,6 +601,24 @@ class TemplateAPI(TemplateLM):
pbar = tqdm(desc="Requesting API", total=len(requests))
for chunk in chunked:
contexts, all_gen_kwargs, encodings_list = zip(*chunk)
if self.tokenized_requests:
max_gen_toks = all_gen_kwargs[0].get(
"max_gen_toks", self._max_gen_toks
)
max_context_len = self.max_length - max_gen_toks
encodings_list = [x[-max_context_len:] for x in encodings_list]
if any(
len(x) + max_gen_toks > self.max_length for x in encodings_list
):
eval_logger.warning(
f"Some contexts exceeded (max length: ({self.max_length}) - max_gen_toks: ({max_gen_toks}). They were left truncated."
)
else:
eval_logger.info(
"Tokenized requests are disabled. Context + generation length is not checked."
)
req = encodings_list if self.tokenized_requests else contexts
outputs = retry(
stop=stop_after_attempt(self.max_retries),
......@@ -596,6 +650,24 @@ class TemplateAPI(TemplateLM):
else:
for chunk in chunked:
contexts, all_gen_kwargs, encodings_list = zip(*chunk)
if self.tokenized_requests:
max_gen_toks = all_gen_kwargs[0].get(
"max_gen_toks", self._max_gen_toks
)
max_context_len = self.max_length - max_gen_toks
encodings_list = [x[-max_context_len:] for x in encodings_list]
if any(
len(x) + max_gen_toks > self.max_length for x in encodings_list
):
eval_logger.warning(
f"Some contexts exceeded (max length: ({self.max_length}) - max_gen_toks ({max_gen_toks}). They were left truncated."
)
else:
eval_logger.info(
"Tokenized requests are disabled. Context + generation length is not checked."
)
req = encodings_list if self.tokenized_requests else contexts
results = itertools.chain.from_iterable(
asyncio.run(
......
......@@ -68,7 +68,9 @@ class GGUFLM(LM):
logger.error(f"RequestException: {e}")
time.sleep(delay) # wait before retrying
else:
raise Exception(f"Failed to get a valid response after {retries} retries.")
raise RuntimeError(
f"Failed to get a valid response after {retries} retries."
)
def loglikelihood(self, requests, disable_tqdm: bool = False):
if not requests:
......
......@@ -14,6 +14,7 @@ from lm_eval.models.huggingface import HFLM
from lm_eval.models.utils import (
Collator,
flatten_image_list,
handle_stop_sequences,
pad_and_concat,
replace_placeholders,
stop_sequences_criteria,
......@@ -215,7 +216,9 @@ class HFMultimodalLM(HFLM):
return context_enc, continuation_enc, image_enc
def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
def apply_chat_template(
self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
) -> str:
self.chat_applied = True
if not self.interleave:
for content in chat_history:
......@@ -265,7 +268,9 @@ class HFMultimodalLM(HFLM):
)
return self.processor.apply_chat_template(
chat_history, add_generation_prompt=True
chat_history,
add_generation_prompt=add_generation_prompt,
continue_final_message=not add_generation_prompt,
)
def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
......@@ -661,7 +666,7 @@ class HFMultimodalLM(HFLM):
chunks = re_ords.get_batched(n=self.batch_size, batch_fn=None)
### Up to here: was identical to non-multimodal HFLM generate_until ###
eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False)
for chunk in chunks:
contexts, all_gen_kwargs, aux_arguments = zip(*chunk)
......@@ -678,27 +683,14 @@ class HFMultimodalLM(HFLM):
# this is safe to assume because the `grouper` object ensures it.
gen_kwargs = all_gen_kwargs[0]
# unpack our keyword arguments.
until = None
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
if "until" in kwargs.keys():
until = kwargs.pop("until")
if isinstance(until, str):
until = [until]
elif not isinstance(until, list):
raise ValueError(
f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
)
# add EOS token to stop sequences
until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
)
# add EOS token to stop sequences
eos = self.tok_decode(self.eot_token_id, skip_special_tokens=False)
if not until:
until = [eos]
else:
until.append(eos)
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
......
import copy
import json
import os
from functools import lru_cache
from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Type, cast
from tqdm import tqdm
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.api_models import JsonChatStr
from lm_eval.utils import eval_logger, simple_parse_args_string
class LogLikelihoodResult(NamedTuple):
log_likelihood: float
is_greedy: bool
def _verify_credentials(creds: Any) -> None:
"""
Verifies that all required keys are present in the credentials dictionary.
Args:
creds (Any): A dictionary containing the credentials.
Raises:
ValueError: If any of the necessary credentials are missing, with guidance on which environment variables need to be set.
"""
required_keys = ["apikey", "url", "project_id"]
env_var_mapping = {
"apikey": "WATSONX_API_KEY",
"url": "WATSONX_URL",
"project_id": "WATSONX_PROJECT_ID",
}
missing_keys = [key for key in required_keys if key not in creds or not creds[key]]
if missing_keys:
missing_env_vars = [env_var_mapping[key] for key in missing_keys]
raise ValueError(
f"Missing required credentials: {', '.join(missing_keys)}. Please set the following environment variables: {', '.join(missing_env_vars)}"
)
@lru_cache(maxsize=None)
def get_watsonx_credentials() -> Dict[str, str]:
"""
Retrieves Watsonx API credentials from environment variables.
Returns:
Dict[str, str]: A dictionary containing the credentials necessary for authentication, including
keys such as `apikey`, `url`, and `project_id`.
Raises:
ValueError: If the credentials format is invalid or any of the necessary credentials are missing.
"""
credentials = {
"apikey": os.getenv("WATSONX_API_KEY", None),
"url": os.getenv("WATSONX_URL", None),
"project_id": os.getenv("WATSONX_PROJECT_ID", None),
}
_verify_credentials(credentials)
return credentials
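# Illustration (standalone, not part of the module): the three environment variables the
# helper above reads. The URL below is only an example endpoint.
import os

os.environ["WATSONX_API_KEY"] = "..."
os.environ["WATSONX_URL"] = "https://us-south.ml.cloud.ibm.com"
os.environ["WATSONX_PROJECT_ID"] = "my-project-id"

print(get_watsonx_credentials())
# {'apikey': '...', 'url': 'https://us-south.ml.cloud.ibm.com', 'project_id': 'my-project-id'}
# A missing variable raises ValueError naming the WATSONX_* variables that must be set.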
@register_model("watsonx_llm")
class WatsonxLLM(LM):
"""
Implementation of LM model interface for evaluating Watsonx model with the lm_eval framework.
See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/model_guide.md for reference.
"""
@classmethod
def create_from_arg_string(
cls: Type["WatsonxLLM"],
arg_string: str,
additional_config: Optional[Dict] = None,
) -> "WatsonxLLM":
"""
Allow the user to specify model parameters (TextGenerationParameters) in CLI arguments.
"""
try:
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
except ImportError:
raise ImportError(
"Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
)
args = simple_parse_args_string(arg_string)
args.update(additional_config)
model_id = args.pop("model_id", None)
if model_id is None:
raise ValueError("'model_id' is required, please pass it in 'model_args'")
if not args.get("do_sample", None):
args["temperature"] = None
args["top_p"] = None
args["top_k"] = None
args["seed"] = None
generate_params = {
GenParams.DECODING_METHOD: (
"greedy" if not args.get("do_sample", None) else "sample"
),
GenParams.LENGTH_PENALTY: args.get("length_penalty", None),
GenParams.TEMPERATURE: args.get("temperature", None),
GenParams.TOP_P: args.get("top_p", None),
GenParams.TOP_K: args.get("top_k", None),
GenParams.RANDOM_SEED: args.get("seed", None),
GenParams.REPETITION_PENALTY: args.get("repetition_penalty", None),
GenParams.MIN_NEW_TOKENS: args.get("min_new_tokens", None),
GenParams.MAX_NEW_TOKENS: args.get("max_new_tokens", 256),
GenParams.STOP_SEQUENCES: args.get("stop_sequences", None),
GenParams.TIME_LIMIT: args.get("time_limit", None),
GenParams.TRUNCATE_INPUT_TOKENS: args.get("truncate_input_tokens", None),
GenParams.RETURN_OPTIONS: {
"generated_tokens": True,
"input_tokens": True,
"token_logprobs": True,
"token_ranks": True,
},
}
generate_params = {k: v for k, v in generate_params.items() if v is not None}
return cls(
watsonx_credentials=get_watsonx_credentials(),
model_id=model_id,
generate_params=generate_params,
)
def __init__(
self,
watsonx_credentials: Dict,
model_id,
generate_params: Optional[Dict[Any, Any]] = None,
) -> None:
try:
from ibm_watsonx_ai import APIClient
from ibm_watsonx_ai.foundation_models import ModelInference
except ImportError:
raise ImportError(
"Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
)
super().__init__()
client = APIClient(watsonx_credentials)
project_id = watsonx_credentials.get("project_id", None)
deployment_id = watsonx_credentials.get("deployment_id", None)
client.set.default_project(project_id)
self.generate_params = generate_params
self.model = ModelInference(
model_id=model_id,
deployment_id=deployment_id,
api_client=client,
project_id=project_id,
)
self._model_id = model_id
@staticmethod
def _has_stop_token(response_tokens: List[str], context_tokens: List[str]) -> bool:
"""
Determines whether a stop token has been generated in the `response_tokens` compared to the `context_tokens`.
If the tokens do not match as expected, the function raises a RuntimeError, indicating a possible
misalignment between the tokens generated by the tokenizer and the model.
Args:
response_tokens (List[str]): The List of tokens generated as a response by the model.
context_tokens (List[str]): The List of tokens representing the input context.
Returns:
bool: True if the `response_tokens` likely contain a stop token that terminates the sequence,
otherwise raises an exception.
Raises:
RuntimeError: If there is an unexpected mismatch between the `response_tokens` and the `context_tokens`.
"""
context_length = len(context_tokens)
if response_tokens[: context_length - 1] == context_tokens[:-1]:
return (
response_tokens[-1] != context_tokens[-1]
) # only last token differs, probably stop sequence (</s>)
raise RuntimeError(
f"There is an unexpected difference between tokenizer and model tokens:\n"
f"context_tokens={context_tokens}\n"
f"response_tokens={response_tokens[:context_length]}"
)
def _check_model_logprobs_support(self):
"""
Verifies if the model supports returning log probabilities for input tokens.
This function sends a prompt to the model and checks whether the model's response
includes log probabilities for the input tokens. If log probabilities are not present,
it raises a `RuntimeError`, indicating that the model is not supported.
Raises:
RuntimeError: If the model does not return log probabilities for input tokens.
"""
tokens = self.model.generate_text(
prompt=["The best ice cream flavor is:"],
params=self.generate_params,
raw_response=True,
)[0]["results"][0]
if all(token.get("logprob", None) is None for token in tokens["input_tokens"]):
raise RuntimeError(
f"Model {self._model_id} is not supported: does not return logprobs for input tokens"
)
def _get_log_likelihood(
self,
input_tokens: List[Dict[str, float]],
context_tokens: List[Dict[str, float]],
) -> LogLikelihoodResult:
"""
Calculates the log likelihood of the generated tokens compared to the context tokens.
Args:
input_tokens (List[Dict[str, float]]): A List of token dictionaries, each containing
token information like `text` and `logprob`.
context_tokens (List[Dict[str, float]]): A List of token dictionaries representing
the input context.
Returns:
LogLikelihoodResult: An object containing the calculated log likelihood and a boolean
flag indicating if the tokens were generated greedily.
"""
response_tokens = [token["text"] for token in input_tokens]
context_length = len(context_tokens)
if self._has_stop_token(response_tokens, context_tokens):
context_length -= 1
return LogLikelihoodResult(
log_likelihood=sum(
token.get("logprob", 0) for token in input_tokens[context_length:]
),
is_greedy=all(
token["rank"] == 1 for token in input_tokens[context_length:]
),
)
def generate_until(self, requests: List[Instance]) -> List[str]:
"""
Generates text responses for a List of requests, with progress tracking and caching.
Args:
requests (List[Instance]): A List of instances, each containing a text input to be processed.
Returns:
List[str]: A List of generated responses.
"""
requests = [request.args for request in requests]
results = []
for request in tqdm(
requests,
desc="Running generate_until function ...",
):
context, continuation = request
try:
if isinstance(context, JsonChatStr):
context = json.loads(context.prompt)
response = self.model.chat(context, self.generate_params)
response = response["choices"][0]["message"]["content"]
else:
response = self.model.generate_text(context, self.generate_params)
except Exception as exp:
eval_logger.error("Error while generating text.")
raise exp
results.append(response)
self.cache_hook.add_partial(
"generate_until", (context, continuation), response
)
return results
def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
"""
Args:
requests: Each request contains Instance.args : Tuple[str, str] containing:
1. an input string to the LM and
2. a target string on which the loglikelihood of the LM producing this target,
conditioned on the input, will be returned.
Returns:
Tuple (loglikelihood, is_greedy) for each request according to the input order:
loglikelihood: probability of generating the target string conditioned on the input
is_greedy: True if and only if the target string would be generated by greedy sampling from the LM
"""
try:
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
except ImportError:
raise ImportError(
"Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
)
self._check_model_logprobs_support()
generate_params = copy.copy(self.generate_params)
generate_params[GenParams.MAX_NEW_TOKENS] = 1
requests = [request.args for request in requests]
results: List[LogLikelihoodResult] = []
# Note: We're not using batching due to (current) indeterminism of loglikelihood values when sending batch of requests
for request in tqdm(
requests,
desc="Running loglikelihood function ...",
):
context, continuation = request
try:
tokenized_context = self.model.tokenize(
prompt=context, return_tokens=True
)["result"]["tokens"]
except Exception as exp:
eval_logger.error("Error while model tokenize.")
raise exp
input_prompt = context + continuation
try:
response = self.model.generate_text(
prompt=input_prompt, params=generate_params, raw_response=True
)
except Exception as exp:
eval_logger.error("Error while model generate text.")
raise exp
log_likelihood_response = self._get_log_likelihood(
response["results"][0]["input_tokens"], tokenized_context
)
results.append(log_likelihood_response)
self.cache_hook.add_partial(
"loglikelihood",
(context, continuation),
(
log_likelihood_response.log_likelihood,
log_likelihood_response.is_greedy,
),
)
return cast(List[Tuple[float, bool]], results)
def loglikelihood_rolling(self, requests) -> List[Tuple[float, bool]]:
"""
Used to evaluate perplexity on a data distribution.
Args:
requests: Each request contains Instance.args : Tuple[str] containing an input string to the model whose
entire loglikelihood, conditioned on purely the EOT token, will be calculated.
Returns:
Tuple (loglikelihood,) for each request according to the input order:
loglikelihood: solely the probability of producing each piece of text given no starting input.
"""
try:
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
except ImportError:
raise ImportError(
"Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
)
self._check_model_logprobs_support()
generate_params = copy.deepcopy(self.generate_params)
generate_params[GenParams.MAX_NEW_TOKENS] = 1
requests = [request.args for request in requests]
results: List[LogLikelihoodResult] = []
# Note: We're not using batching due to (current) indeterminism of loglikelihood values when sending batch of requests
for request in tqdm(
requests,
desc="Running loglikelihood_rolling function ...",
):
context, continuation = request
try:
response = self.model.generate_text(
prompt=context, params=generate_params, raw_response=True
)
except Exception as exp:
eval_logger.error("Error while model generate text.")
raise exp
log_likelihood_response = self._get_log_likelihood(
response["results"][0]["input_tokens"], []
)
results.append(log_likelihood_response)
self.cache_hook.add_partial(
"loglikelihood_rolling",
(context, continuation),
log_likelihood_response.log_likelihood,
)
return cast(List[Tuple[float, bool]], results)
@property
def tokenizer_name(self) -> str:
return ""
def apply_chat_template(
self, chat_history: List[Dict[str, str]]
) -> JsonChatStr:
# A hack similar to the one in api_models, to allow encoding for the cache
return JsonChatStr(json.dumps(chat_history))
......@@ -12,6 +12,8 @@ class MambaLMWrapper(HFLM):
def __init__(
self,
pretrained="state-spaces/mamba-130m",
# To use the HF compatible variant
is_hf: bool = False,
**kwargs,
) -> None:
"""
......@@ -52,7 +54,7 @@ class MambaLMWrapper(HFLM):
if "backend" in kwargs:
# mamba currently only supports causal models
assert kwargs["backend"] == "causal"
self.is_hf = is_hf or (True if pretrained.endswith("hf") else False)
super().__init__(
pretrained=pretrained,
# set appropriate defaults for tokenizer, max length, etc
......@@ -67,15 +69,18 @@ class MambaLMWrapper(HFLM):
pretrained: str,
**kwargs,
) -> None:
try:
from mamba_ssm.utils.hf import load_config_hf # noqa: F811
except ModuleNotFoundError:
raise Exception(
"attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
)
self._config = load_config_hf(pretrained)
if self.is_hf:
super()._get_config(pretrained, **kwargs)
else:
try:
from mamba_ssm.utils.hf import load_config_hf # noqa: F811
except ModuleNotFoundError as exception:
raise type(exception)(
"attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
)
self._config = load_config_hf(pretrained)
def _create_model(
self,
......@@ -86,24 +91,32 @@ please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba
# Mamba does not support arbitrary HF from_pretrained() args
**kwargs,
) -> None:
try:
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel # noqa: F811
except ModuleNotFoundError:
raise Exception(
"attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
if self.is_hf:
super()._create_model(pretrained, dtype=dtype, **kwargs)
else:
try:
from mamba_ssm.models.mixer_seq_simple import (
MambaLMHeadModel, # noqa: F811
)
except ModuleNotFoundError as exception:
raise type(exception)(
"attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
)
self._model = MambaLMHeadModel.from_pretrained(
pretrained,
device=self._device,
dtype=torch.float16
if dtype == "auto"
else lm_eval.models.utils.get_dtype(dtype),
)
self._model = MambaLMHeadModel.from_pretrained(
pretrained,
device=self._device,
dtype=torch.float16
if dtype == "auto"
else lm_eval.models.utils.get_dtype(dtype),
)
def _model_generate(self, context, max_length, stop, **generation_kwargs):
for key in ("do_sample", "attention_mask"):
remove_arg = (
["attention_mask"] if self.is_hf else ["do_sample", "attention_mask"]
)
for key in remove_arg:
if key in generation_kwargs:
generation_kwargs.pop(key)
......@@ -116,11 +129,37 @@ please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba
# self.tokenizer, stop, 1, context.shape[0]
# )
return self.model.generate(
input_ids=context,
max_length=max_length,
# stopping_criteria=stopping_criteria,
# pad_token_id=self.tokenizer.pad_token_id,
# use_cache=True,
**generation_kwargs,
)
if not self.is_hf:
return self.model.generate(
input_ids=context,
max_length=max_length,
# stopping_criteria=stopping_criteria,
# pad_token_id=self.tokenizer.pad_token_id,
# use_cache=True,
**generation_kwargs,
)
else:
stopping_criteria = lm_eval.models.utils.stop_sequences_criteria(
self.tokenizer,
stop,
context.shape[1],
context.shape[0],
)
generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
do_sample = generation_kwargs.get("do_sample", None)
# The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding strategies
if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
generation_kwargs["do_sample"] = do_sample = False
if do_sample is False and generation_kwargs.get("temperature") == 0.0:
generation_kwargs.pop("temperature")
return self.model.generate(
input_ids=context,
max_length=max_length,
stopping_criteria=stopping_criteria,
pad_token_id=self.tokenizer.pad_token_id,
use_cache=True,
**generation_kwargs,
)
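A short usage sketch for the new flag; the import path and treating the `-hf` checkpoint as a drop-in are assumptions:

from lm_eval.models.mamba_lm import MambaLMWrapper

# Native mamba_ssm backend (the default):
lm_native = MambaLMWrapper(pretrained="state-spaces/mamba-130m")

# HF-compatible backend, either explicitly ...
lm_hf = MambaLMWrapper(pretrained="state-spaces/mamba-130m-hf", is_hf=True)
# ... or implicitly, since a `pretrained` name ending in "hf" also sets is_hf (see __init__ above).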
......@@ -39,8 +39,8 @@ def _patch_pretrained_cfg(
):
try:
import omegaconf
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
"Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
"or installing nemo following https://github.com/NVIDIA/NeMo.",
......@@ -79,8 +79,8 @@ def load_model(
MegatronGPTModel,
)
from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
"Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
"or installing nemo following https://github.com/NVIDIA/NeMo.",
......@@ -140,8 +140,8 @@ def load_model(
def setup_distributed_environment(trainer):
try:
from nemo.utils.app_state import AppState
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
"Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
"or installing nemo following https://github.com/NVIDIA/NeMo.",
......@@ -187,15 +187,15 @@ class NeMoLM(LM):
**kwargs,
):
try:
from lightning.pytorch.trainer.trainer import Trainer
from nemo.collections.nlp.modules.common.text_generation_utils import (
generate,
)
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy
from pytorch_lightning.trainer.trainer import Trainer
self.generate = generate
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"Attempted to use 'nemo_lm' model type, but package `nemo` is not installed"
"Please install nemo following the instructions in the README: either with a NVIDIA PyTorch or NeMo container, "
"or installing nemo following https://github.com/NVIDIA/NeMo.",
......
......@@ -38,8 +38,8 @@ class SparseMLLM(HFLM):
) -> None:
try:
from sparseml.transformers import SparseAutoModelForCausalLM
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"Package `sparseml` is not installed. "
"Please install it via `pip install sparseml[transformers]`"
)
......@@ -88,8 +88,8 @@ class SparseMLLM(HFLM):
def _get_config(self, pretrained: str, **kwargs) -> None:
try:
from sparseml.transformers import SparseAutoConfig
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"Package `sparseml` is not installed. "
"Please install it via `pip install sparseml[transformers]`"
)
......@@ -112,8 +112,8 @@ class SparseMLLM(HFLM):
) -> None:
try:
from sparseml.transformers import SparseAutoTokenizer
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"Package `sparseml` is not installed. "
"Please install it via `pip install sparseml[transformers]`"
)
......@@ -171,8 +171,8 @@ class DeepSparseLM(LM):
try:
import deepsparse
except ModuleNotFoundError:
raise Exception(
except ModuleNotFoundError as exception:
raise type(exception)(
"Package `deepsparse` is not installed. "
"Please install it via `pip install deepsparse[transformers]`"
)
......
......@@ -144,7 +144,7 @@ class NEURON_HF(TemplateLM):
add_bos_token: Optional[bool] = False,
) -> None:
if not NEURON_AVAILABLE:
raise Exception(
raise ImportError(
"Tried to load neuron model, but neuron is not installed ",
"please install neuron via pip install transformers-neuron ",
"also make sure you are running on an AWS inf2 instance",
......
......@@ -5,6 +5,7 @@ import itertools
import json
import os
from functools import cached_property
from operator import itemgetter
from io import BytesIO
from typing import Any, Dict, List, Optional, Tuple, Union
......@@ -14,6 +15,8 @@ from tqdm import tqdm
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_model
from lm_eval.models.api_models import TemplateAPI
from lm_eval.models.utils import handle_stop_sequences
from lm_eval.models.api_models import JsonChatStr, TemplateAPI
from lm_eval.models.utils import Collator
from lm_eval.utils import eval_logger
......@@ -37,6 +40,7 @@ class LocalCompletionsAPI(TemplateAPI):
generate=False,
gen_kwargs: Optional[dict] = None,
seed: int = 1234,
eos=None,
**kwargs,
) -> dict:
if generate:
......@@ -46,7 +50,7 @@ class LocalCompletionsAPI(TemplateAPI):
else:
max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
temperature = gen_kwargs.pop("temperature", 0)
stop = gen_kwargs.pop("until", ["<|endoftext|>"])
stop = handle_stop_sequences(gen_kwargs.pop("until", None), eos)
return {
"prompt": messages,
"model": self.model,
......@@ -78,7 +82,9 @@ class LocalCompletionsAPI(TemplateAPI):
if not isinstance(outputs, list):
outputs = [outputs]
for out in outputs:
for choice, ctxlen in zip(out["choices"], ctxlens):
for choice, ctxlen in zip(
sorted(out["choices"], key=itemgetter("index")), ctxlens
):
assert ctxlen > 0, "Context length must be greater than 0"
logprobs = sum(choice["logprobs"]["token_logprobs"][ctxlen:-1])
tokens_logprobs = choice["logprobs"]["token_logprobs"][ctxlen:-1]
......@@ -97,8 +103,10 @@ class LocalCompletionsAPI(TemplateAPI):
if not isinstance(outputs, list):
outputs = [outputs]
for out in outputs:
tmp = [None] * len(out["choices"])
for choices in out["choices"]:
res.append(choices["text"])
tmp[choices["index"]] = choices["text"]
res = res + tmp
return res
@property
......@@ -136,15 +144,19 @@ class LocalChatCompletion(LocalCompletionsAPI):
generate=False,
gen_kwargs: dict = None,
seed=1234,
eos=None,
**kwargs,
) -> dict:
assert (
type(messages) is not str
), "chat-completions require the --apply_chat_template flag."
gen_kwargs.pop("do_sample", False)
if "max_tokens" in gen_kwargs:
max_tokens = gen_kwargs.pop("max_tokens")
else:
max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
temperature = gen_kwargs.pop("temperature", 0)
stop = gen_kwargs.pop("until", ["<|endoftext|>"])
stop = handle_stop_sequences(gen_kwargs.pop("until", None), eos)
if not isinstance(stop, (list, tuple)):
stop = [stop]
return {
......@@ -163,8 +175,10 @@ class LocalChatCompletion(LocalCompletionsAPI):
if not isinstance(outputs, list):
outputs = [outputs]
for out in outputs:
tmp = [None] * len(out["choices"])
for choices in out["choices"]:
res.append(choices["message"]["content"])
tmp[choices["index"]] = choices["message"]["content"]
res = res + tmp
return res
def tok_encode(
......@@ -229,6 +243,10 @@ class OpenAIChatCompletion(LocalChatCompletion):
tokenized_requests=False,
**kwargs,
):
if "o1" in kwargs.get("model", ""):
eval_logger.warning(
"o1 models do not support `stop` and only support temperature=1"
)
super().__init__(
base_url=base_url,
tokenizer_backend=tokenizer_backend,
......@@ -251,6 +269,41 @@ class OpenAIChatCompletion(LocalChatCompletion):
"Loglikelihood (and therefore `multiple_choice`-type tasks) is not supported for chat completions as OpenAI does not provide prompt logprobs. See https://github.com/EleutherAI/lm-evaluation-harness/issues/942#issuecomment-1777836312 or https://github.com/EleutherAI/lm-evaluation-harness/issues/1196 for more background on this limitation."
)
def _create_payload(
self,
messages: List[Dict],
generate=False,
gen_kwargs: dict = None,
seed=1234,
eos="<|endoftext|>",
**kwargs,
) -> dict:
assert (
type(messages) is not str
), "chat-completions require the --apply_chat_template flag."
gen_kwargs.pop("do_sample", False)
if "max_tokens" in gen_kwargs:
max_tokens = gen_kwargs.pop("max_tokens")
else:
max_tokens = gen_kwargs.pop("max_gen_toks", self._max_gen_toks)
temperature = gen_kwargs.pop("temperature", 0)
stop = handle_stop_sequences(gen_kwargs.pop("until", ["<|endoftext|>"]), eos)
if not isinstance(stop, (list, tuple)):
stop = [stop]
output = {
"messages": messages,
"model": self.model,
"max_completion_tokens": max_tokens,
"temperature": temperature,
"stop": stop[:4],
"seed": seed,
**gen_kwargs,
}
if "o1" in self.model:
output.pop("stop")
output["temperature"] = 1
return output
@register_model("pixtral-api")
class PixtralAPI(LocalChatCompletion):
......
from importlib.util import find_spec
from lm_eval import utils
from lm_eval.api.registry import register_model
from lm_eval.models.huggingface import HFLM
from lm_eval.models.utils import get_dtype
eval_logger = utils.eval_logger
@register_model("ipex")
class IPEXLM(HFLM):
"""
Uses the HuggingFace transformers + optimum-intel IPEX backend; can run on Intel CPUs and Intel GPUs.
"""
def __init__(
self,
**kwargs,
) -> None:
if "backend" in kwargs:
# currently only supports causal models
assert (
kwargs["backend"] == "causal"
), "Currently, only IPEXModelForCausalLM is supported."
super().__init__(
backend=kwargs.pop("backend", "causal"),
**kwargs,
)
def _create_model(
self,
pretrained: str,
revision="main",
dtype="auto",
trust_remote_code=False,
# arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`.
# (accelerate naive PP (device_map) options)
parallelize=False,
gpus=None,
max_memory_per_gpu=None,
max_cpu_memory=None,
offload_folder="./offload",
# PEFT, delta weights and quantization options
peft=None,
delta=None,
autogptq=False,
gptqmodel=False,
**kwargs,
) -> None:
if not find_spec("optimum"):
raise ModuleNotFoundError(
"package `optimum` is not installed. Please install it via `pip install optimum[ipex]`"
)
else:
from optimum.intel import IPEXModelForCausalLM
model_kwargs = kwargs if kwargs else {}
model_kwargs.update(
self._get_accelerate_args(
parallelize=parallelize,
device_map=kwargs.get("device_map", None),
max_memory_per_gpu=max_memory_per_gpu,
max_cpu_memory=max_cpu_memory,
offload_folder=offload_folder,
gpus=gpus,
)
)
self._model = IPEXModelForCausalLM.from_pretrained(
pretrained,
revision=revision,
torch_dtype=get_dtype(dtype),
trust_remote_code=trust_remote_code,
**model_kwargs,
)
......@@ -50,7 +50,7 @@ class OptimumLM(HFLM):
**kwargs,
) -> None:
if not find_spec("optimum"):
raise Exception(
raise ModuleNotFoundError(
"package `optimum` is not installed. Please install it via `pip install optimum[openvino]`"
)
else:
......@@ -71,6 +71,11 @@ class OptimumLM(HFLM):
else:
model_kwargs["ov_config"] = {}
model_kwargs["ov_config"].setdefault("CACHE_DIR", "")
if "pipeline_parallel" in model_kwargs:
if model_kwargs["pipeline_parallel"]:
model_kwargs["ov_config"]["MODEL_DISTRIBUTION_POLICY"] = (
"PIPELINE_PARALLEL"
)
model_file = Path(pretrained) / "openvino_model.xml"
if model_file.exists():
export = False
......
......@@ -709,3 +709,21 @@ def flatten_image_list(images: List[List]):
:return: a list of PIL images, via concatenating all the sub-lists in order.
"""
return [image for image_list in images for image in image_list]
def handle_stop_sequences(
until: Union[str, List[str], None], eos: Optional[str]
) -> List[str]:
"""Ensures that the `until` parameter is a list of stop sequences and includes the EOS token."""
if isinstance(until, str):
until = [until]
elif until is None:
until = []
elif not isinstance(until, list):
raise ValueError(
f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
)
if eos is not None and eos not in until:
until.append(eos)
return until
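For reference, its behavior on the accepted input shapes (a standalone snippet that assumes the function above is in scope):

print(handle_stop_sequences("###", eos="<|endoftext|>"))
# ['###', '<|endoftext|>']
print(handle_stop_sequences(None, eos="<|endoftext|>"))
# ['<|endoftext|>']
print(handle_stop_sequences(["###", "<|endoftext|>"], eos="<|endoftext|>"))
# ['###', '<|endoftext|>']  (the EOS token is not appended twice)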
......@@ -10,7 +10,12 @@ from tqdm import tqdm
from lm_eval.api.instance import Instance
from lm_eval.api.model import TemplateLM
from lm_eval.api.registry import register_model
from lm_eval.models.utils import Collator, configure_pad_token, undistribute
from lm_eval.models.utils import (
Collator,
configure_pad_token,
handle_stop_sequences,
undistribute,
)
from lm_eval.utils import (
eval_logger,
get_rolling_token_windows,
......@@ -65,7 +70,7 @@ class VLLM(TemplateLM):
super().__init__()
if not find_spec("vllm"):
raise Exception(
raise ModuleNotFoundError(
"attempted to use 'vllm' LM type, but package `vllm` is not installed. "
"Please install vllm via `pip install lm-eval[vllm]` or `pip install -e .[vllm]`"
)
......@@ -97,7 +102,7 @@ class VLLM(TemplateLM):
self.batch_size = (
"auto"
if isinstance(batch_size, str) and "auto" in batch_size
else batch_size
else int(batch_size)
)
if self.data_parallel_size <= 1:
self.model = LLM(**self.model_args)
......@@ -118,7 +123,7 @@ class VLLM(TemplateLM):
tokenizer if tokenizer else pretrained,
tokenizer_mode=tokenizer_mode,
trust_remote_code=trust_remote_code,
tokenizer_revision=tokenizer_revision,
revision=tokenizer_revision,
)
self.tokenizer = configure_pad_token(self.tokenizer)
self.add_bos_token = add_bos_token
......@@ -179,14 +184,21 @@ class VLLM(TemplateLM):
def max_gen_toks(self):
return self._max_gen_toks
def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
def apply_chat_template(
self, chat_history: List[Dict[str, str]], add_generation_prompt: bool = True
) -> str:
"""
Method to apply a chat template to a list of chat history between user and model.
"""
return self.tokenizer.apply_chat_template(
chat_history, tokenize=False, add_generation_prompt=True
chat_templated = self.tokenizer.apply_chat_template(
chat_history,
tokenize=False,
add_generation_prompt=add_generation_prompt,
continue_final_message=not add_generation_prompt,
)
return chat_templated
@property
def tokenizer_name(self) -> str:
return self.tokenizer.name_or_path.replace("/", "__")
......@@ -239,17 +251,25 @@ class VLLM(TemplateLM):
# but then tensor_parallel breaks
@ray.remote
def run_inference_one_model(
model_args: dict, sampling_params, requests: List[List[int]]
model_args: dict,
sampling_params,
requests: List[List[int]],
lora_request: LoRARequest,
):
llm = LLM(**model_args)
return llm.generate(
prompt_token_ids=requests, sampling_params=sampling_params
prompt_token_ids=requests,
sampling_params=sampling_params,
lora_request=lora_request,
)
# dispatch requests to all self.data_parallel_size workers, in interleaved fashion
# interleaved important to balance context lengths across workers
requests = [list(x) for x in distribute(self.data_parallel_size, requests)]
inputs = ((self.model_args, sampling_params, req) for req in requests)
inputs = (
(self.model_args, sampling_params, req, self.lora_request)
for req in requests
)
object_refs = [run_inference_one_model.remote(*x) for x in inputs]
results = ray.get(object_refs)
# Invoke ray.shutdown() to prevent hang-ups if subsequent calls required.
......@@ -257,28 +277,32 @@ class VLLM(TemplateLM):
# flatten results
return undistribute(results)
if self.lora_request is not None:
outputs = self.model.generate(
prompt_token_ids=requests,
sampling_params=sampling_params,
use_tqdm=True if self.batch_size == "auto" else False,
lora_request=self.lora_request,
)
else:
outputs = self.model.generate(
prompt_token_ids=requests,
sampling_params=sampling_params,
use_tqdm=True if self.batch_size == "auto" else False,
)
outputs = self.model.generate(
prompt_token_ids=requests,
sampling_params=sampling_params,
use_tqdm=True if self.batch_size == "auto" else False,
lora_request=self.lora_request,
)
return outputs
def loglikelihood_rolling(
self, requests: List[Instance], disable_tqdm: bool = False
) -> List[float]:
loglikelihoods = []
for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
rolling_token_windows = list(
adaptive_batch_size = None
if self.batch_size == "auto":
adaptive_batch_size = len(requests)
# First, collect all windows from all requests
all_windows = [] # List of (request_idx, window) tuples
request_window_counts = [] # Track number of windows per request
for req_idx, (string,) in enumerate(
tqdm(
[req.args for req in requests],
disable=(disable_tqdm or (self.rank != 0)),
)
):
rolling_token_windows: List[Tuple[List[int], List[int]]] = list(
map(
make_disjoint_window,
get_rolling_token_windows(
......@@ -291,20 +315,42 @@ class VLLM(TemplateLM):
)
)
rolling_token_windows = [(None,) + x for x in rolling_token_windows]
# TODO: Right now, we pass single EOT token to the Encoder and the full context to the decoder, in seq2seq case
windows = [(None,) + x for x in rolling_token_windows]
string_nll = self._loglikelihood_tokens(
rolling_token_windows,
)
# Store windows with their request index
all_windows.extend((req_idx, window) for window in windows)
request_window_counts.append(len(windows))
# discard is_greedy
string_nll = [x[0] for x in string_nll]
all_nlls = []
batch_size = adaptive_batch_size or int(self.batch_size)
for i in range(0, len(all_windows), batch_size):
batch = all_windows[i : i + batch_size]
# Extract just the windows for processing, keeping track of request indices
batch_indices, batch_windows = zip(*batch)
string_nll = sum(string_nll)
loglikelihoods.append(string_nll)
batch_nlls = self._loglikelihood_tokens(
requests=batch_windows,
disable_tqdm=False,
)
# Store results with their request indices
all_nlls.extend(zip(batch_indices, batch_nlls))
# cache this loglikelihood_rolling request
self.cache_hook.add_partial("loglikelihood_rolling", (string,), string_nll)
# Reconstruct per-request loglikelihoods
loglikelihoods = []
current_idx = 0
for window_count in request_window_counts:
# Get all nlls for this request
request_nlls = all_nlls[current_idx : current_idx + window_count]
# Sum up the nlls for this request (discarding is_greedy)
request_total = sum(nll[0] for _, nll in request_nlls)
loglikelihoods.append(request_total)
current_idx += window_count
string = requests[len(loglikelihoods) - 1].args[0]
self.cache_hook.add_partial(
"loglikelihood_rolling", (string,), request_total
)
return loglikelihoods
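A toy illustration of the regrouping logic above, detached from vLLM itself; the numbers are made up:

# Two requests that produced 2 and 1 rolling windows respectively.
request_window_counts = [2, 1]
# (request_idx, (log_likelihood, is_greedy)) pairs, in window order.
all_nlls = [(0, (-1.5, True)), (0, (-0.75, False)), (1, (-2.0, True))]

loglikelihoods, current_idx = [], 0
for window_count in request_window_counts:
    request_nlls = all_nlls[current_idx : current_idx + window_count]
    loglikelihoods.append(sum(nll[0] for _, nll in request_nlls))  # discard is_greedy
    current_idx += window_count

print(loglikelihoods)  # [-2.25, -2.0]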
......@@ -345,6 +391,7 @@ class VLLM(TemplateLM):
desc="Running generate_until requests",
)
# for each different set of kwargs, we execute all requests, by batch.
eos = self.tokenizer.decode(self.eot_token_id)
for chunk in chunks:
context_and_encoding, all_gen_kwargs = zip(*chunk)
context, context_encoding = zip(*context_and_encoding)
......@@ -352,27 +399,14 @@ class VLLM(TemplateLM):
# this is safe to assume because the `grouper` object ensures it.
gen_kwargs = all_gen_kwargs[0]
# unpack our keyword arguments.
until = None
if isinstance(gen_kwargs, dict):
kwargs = copy.deepcopy(gen_kwargs) # edge case for repeats > 1
if "until" in kwargs.keys():
until = kwargs.pop("until")
if isinstance(until, str):
until = [until]
elif not isinstance(until, list):
raise ValueError(
f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"
)
# add EOS token to stop sequences
until = handle_stop_sequences(kwargs.pop("until", None), eos=eos)
else:
raise ValueError(
f"Expected `kwargs` to be of type `dict` but got {gen_kwargs}"
f"Expected `kwargs` to be of type `dict` but got {type(gen_kwargs)}"
)
# add EOS token to stop sequences
eos = self.tokenizer.decode(self.eot_token_id)
if not until:
until = [eos]
else:
until.append(eos)
if "max_gen_toks" in kwargs.keys():
max_gen_toks = kwargs.pop("max_gen_toks")
else:
......