Commit 173b2bc3 authored by Baber

Merge branch 'main' into humaneval

# Conflicts:
#	lm_eval/api/task.py
parents 74344829 bb098f13

import copy
import json
import os
from functools import lru_cache
from typing import Any, Dict, List, NamedTuple, Optional, Tuple, Type, cast

from tqdm import tqdm

from lm_eval.api.instance import Instance
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from lm_eval.models.api_models import JsonChatStr
from lm_eval.utils import eval_logger, simple_parse_args_string


class LogLikelihoodResult(NamedTuple):
    log_likelihood: float
    is_greedy: bool


def _verify_credentials(creds: Any) -> None:
    """
    Verifies that all required keys are present in the credentials dictionary.

    Args:
        creds (Any): A dictionary containing the credentials.

    Raises:
        ValueError: If any of the necessary credentials are missing, with guidance on
            which environment variables need to be set.
    """
    required_keys = ["apikey", "url", "project_id"]
    env_var_mapping = {
        "apikey": "WATSONX_API_KEY",
        "url": "WATSONX_URL",
        "project_id": "WATSONX_PROJECT_ID",
    }
    missing_keys = [key for key in required_keys if key not in creds or not creds[key]]
    if missing_keys:
        missing_env_vars = [env_var_mapping[key] for key in missing_keys]
        raise ValueError(
            f"Missing required credentials: {', '.join(missing_keys)}. "
            f"Please set the following environment variables: {', '.join(missing_env_vars)}"
        )


@lru_cache(maxsize=None)
def get_watsonx_credentials() -> Dict[str, str]:
    """
    Retrieves Watsonx API credentials from environment variables.

    Returns:
        Dict[str, str]: A dictionary containing the credentials necessary for
            authentication, including the keys `apikey`, `url`, and `project_id`.

    Raises:
        ValueError: If any of the necessary credentials are missing.
    """
    credentials = {
        "apikey": os.getenv("WATSONX_API_KEY", None),
        "url": os.getenv("WATSONX_URL", None),
        "project_id": os.getenv("WATSONX_PROJECT_ID", None),
    }
    _verify_credentials(credentials)
    return credentials
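

# Illustrative setup (all values are placeholders); the three variables read above
# must be exported before the harness starts, e.g.:
#
#   export WATSONX_API_KEY=<your API key>
#   export WATSONX_URL=https://us-south.ml.cloud.ibm.com
#   export WATSONX_PROJECT_ID=<your project id>
#
# get_watsonx_credentials() then returns them as a cached dict.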


@register_model("watsonx_llm")
class WatsonxLLM(LM):
    """
    Implementation of the LM model interface for evaluating Watsonx models with the
    lm_eval framework.

    See https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/model_guide.md for reference.
    """

    @classmethod
    def create_from_arg_string(
        cls: Type["WatsonxLLM"],
        arg_string: str,
        additional_config: Optional[Dict] = None,
    ) -> "WatsonxLLM":
        """
        Allow the user to specify model parameters (TextGenerationParameters) in CLI arguments.
        """
        try:
            from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
        except ImportError:
            raise ImportError(
                "Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
            )

        args = simple_parse_args_string(arg_string)
        args.update(additional_config or {})  # guard against additional_config=None
        model_id = args.pop("model_id", None)
        if model_id is None:
            raise ValueError("'model_id' is required, please pass it in 'model_args'")

        if not args.get("do_sample", None):
            # Greedy decoding: sampling-only parameters must not be sent.
            args["temperature"] = None
            args["top_p"] = None
            args["top_k"] = None
            args["seed"] = None

        generate_params = {
            GenParams.DECODING_METHOD: (
                "greedy" if not args.get("do_sample", None) else "sample"
            ),
            GenParams.LENGTH_PENALTY: args.get("length_penalty", None),
            GenParams.TEMPERATURE: args.get("temperature", None),
            GenParams.TOP_P: args.get("top_p", None),
            GenParams.TOP_K: args.get("top_k", None),
            GenParams.RANDOM_SEED: args.get("seed", None),
            GenParams.REPETITION_PENALTY: args.get("repetition_penalty", None),
            GenParams.MIN_NEW_TOKENS: args.get("min_new_tokens", None),
            GenParams.MAX_NEW_TOKENS: args.get("max_new_tokens", 256),
            GenParams.STOP_SEQUENCES: args.get("stop_sequences", None),
            GenParams.TIME_LIMIT: args.get("time_limit", None),
            GenParams.TRUNCATE_INPUT_TOKENS: args.get("truncate_input_tokens", None),
            GenParams.RETURN_OPTIONS: {
                "generated_tokens": True,
                "input_tokens": True,
                "token_logprobs": True,
                "token_ranks": True,
            },
        }
        # Drop unset parameters so the API only receives explicit values.
        generate_params = {k: v for k, v in generate_params.items() if v is not None}
        return cls(
            watsonx_credentials=get_watsonx_credentials(),
            model_id=model_id,
            generate_params=generate_params,
        )
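
    # Illustrative call (hypothetical model id), mirroring what the CLI's
    # --model_args string would produce:
    #
    #   lm = WatsonxLLM.create_from_arg_string(
    #       "model_id=ibm/granite-13b-instruct-v2,do_sample=True,temperature=0.7"
    #   )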

    def __init__(
        self,
        watsonx_credentials: Dict,
        model_id: str,
        generate_params: Optional[Dict[Any, Any]] = None,
    ) -> None:
        try:
            from ibm_watsonx_ai import APIClient
            from ibm_watsonx_ai.foundation_models import ModelInference
        except ImportError:
            raise ImportError(
                "Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
            )

        super().__init__()
        client = APIClient(watsonx_credentials)
        project_id = watsonx_credentials.get("project_id", None)
        deployment_id = watsonx_credentials.get("deployment_id", None)
        client.set.default_project(project_id)
        self.generate_params = generate_params
        self.model = ModelInference(
            model_id=model_id,
            deployment_id=deployment_id,
            api_client=client,
            project_id=project_id,
        )
        self._model_id = model_id

    @staticmethod
    def _has_stop_token(response_tokens: List[str], context_tokens: List[str]) -> bool:
        """
        Determines whether a stop token has been generated in the `response_tokens`
        compared to the `context_tokens`. If the tokens do not match as expected, the
        function raises a RuntimeError, indicating a possible misalignment between the
        tokens generated by the tokenizer and the model.

        Args:
            response_tokens (List[str]): The list of tokens generated as a response by the model.
            context_tokens (List[str]): The list of tokens representing the input context.

        Returns:
            bool: True if the `response_tokens` likely contain a stop token that
                terminates the sequence, otherwise raises an exception.

        Raises:
            RuntimeError: If there is an unexpected mismatch between the `response_tokens`
                and the `context_tokens`.
        """
        context_length = len(context_tokens)
        if response_tokens[: context_length - 1] == context_tokens[:-1]:
            # Only the token at the last context position differs; it is probably a
            # stop sequence (e.g. </s>). Note: compare at context_length - 1, not -1,
            # since response_tokens also contains the generated continuation.
            return response_tokens[context_length - 1] != context_tokens[-1]
        raise RuntimeError(
            f"There is an unexpected difference between tokenizer and model tokens:\n"
            f"context_tokens={context_tokens}\n"
            f"response_tokens={response_tokens[:context_length]}"
        )
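
    # Worked example: context_tokens == ["A", "B", "C"] and response_tokens starts
    # ["A", "B", "</s>", ...]: the shared prefix matches and only the last context
    # position differs, so the model replaced the final context token with a stop
    # token and the caller shortens the context by one.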

    def _check_model_logprobs_support(self) -> None:
        """
        Verifies that the model supports returning log probabilities for input tokens.

        This function sends a prompt to the model and checks whether the model's
        response includes log probabilities for the input tokens. If log probabilities
        are not present, it raises a `RuntimeError`, indicating that the model is not
        supported.

        Raises:
            RuntimeError: If the model does not return log probabilities for input tokens.
        """
        tokens = self.model.generate_text(
            prompt=["The best ice cream flavor is:"],
            params=self.generate_params,
            raw_response=True,
        )[0]["results"][0]

        if all(token.get("logprob", None) is None for token in tokens["input_tokens"]):
            raise RuntimeError(
                f"Model {self._model_id} is not supported: does not return logprobs for input tokens"
            )

    def _get_log_likelihood(
        self,
        input_tokens: List[Dict[str, Any]],
        context_tokens: List[str],
    ) -> LogLikelihoodResult:
        """
        Calculates the log likelihood of the generated tokens compared to the context tokens.

        Args:
            input_tokens (List[Dict[str, Any]]): A list of token dictionaries, each
                containing token information like `text`, `logprob`, and `rank`.
            context_tokens (List[str]): The tokenized input context.

        Returns:
            LogLikelihoodResult: An object containing the calculated log likelihood and
                a boolean flag indicating if the tokens were generated greedily.
        """
        response_tokens = [token["text"] for token in input_tokens]
        context_length = len(context_tokens)

        if self._has_stop_token(response_tokens, context_tokens):
            context_length -= 1

        return LogLikelihoodResult(
            log_likelihood=sum(
                token.get("logprob", 0) for token in input_tokens[context_length:]
            ),
            is_greedy=all(
                token["rank"] == 1 for token in input_tokens[context_length:]
            ),
        )
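
    # Worked example: with context_tokens == ["The", " capital"] and input_tokens
    # [{"text": "The"}, {"text": " capital"}, {"text": " is", "logprob": -1.2, "rank": 1}],
    # no stop token is detected, the continuation is the single token " is", so
    # log_likelihood == -1.2 and is_greedy is True.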

    def generate_until(self, requests: List[Instance]) -> List[str]:
        """
        Generates text responses for a list of requests, with progress tracking and caching.

        Args:
            requests (List[Instance]): A list of instances, each containing a text
                input to be processed.

        Returns:
            List[str]: A list of generated responses.
        """
        requests = [request.args for request in requests]
        results = []
        for request in tqdm(
            requests,
            desc="Running generate_until function ...",
        ):
            context, continuation = request
            try:
                if isinstance(context, JsonChatStr):
                    # Chat-formatted request: decode the serialized message list.
                    context = json.loads(context.prompt)
                    response = self.model.chat(context, self.generate_params)
                    response = response["choices"][0]["message"]["content"]
                else:
                    response = self.model.generate_text(context, self.generate_params)
            except Exception as exp:
                eval_logger.error("Error while generating text.")
                raise exp
            results.append(response)
            self.cache_hook.add_partial(
                "generate_until", (context, continuation), response
            )
        return results
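
    # Illustrative chat-format request: after apply_chat_template, `context` arrives
    # as a JsonChatStr wrapping e.g. '[{"role": "user", "content": "Say hi"}]' and is
    # routed to self.model.chat() above; plain string prompts go through
    # self.model.generate_text() instead.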

    def loglikelihood(self, requests: List[Instance]) -> List[Tuple[float, bool]]:
        """
        Args:
            requests: Each request contains Instance.args : Tuple[str, str] containing:
                1. an input string to the LM and
                2. a target string on which the loglikelihood of the LM producing this
                   target, conditioned on the input, will be returned.

        Returns:
            Tuple (loglikelihood, is_greedy) for each request, according to the input order:
                loglikelihood: probability of generating the target string conditioned on the input
                is_greedy: True if and only if the target string would be generated by greedy sampling from the LM
        """
        try:
            from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
        except ImportError:
            raise ImportError(
                "Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
            )

        self._check_model_logprobs_support()
        generate_params = copy.copy(self.generate_params)
        # Only the logprobs of the prompt tokens are needed, so generate one token.
        generate_params[GenParams.MAX_NEW_TOKENS] = 1

        requests = [request.args for request in requests]
        results: List[LogLikelihoodResult] = []
        # Note: we do not batch requests because loglikelihood values are currently
        # non-deterministic when a batch of requests is sent.
        for request in tqdm(
            requests,
            desc="Running loglikelihood function ...",
        ):
            context, continuation = request
            try:
                tokenized_context = self.model.tokenize(
                    prompt=context, return_tokens=True
                )["result"]["tokens"]
            except Exception as exp:
                eval_logger.error("Error while tokenizing the context.")
                raise exp
            input_prompt = context + continuation
            try:
                response = self.model.generate_text(
                    prompt=input_prompt, params=generate_params, raw_response=True
                )
            except Exception as exp:
                eval_logger.error("Error while generating text.")
                raise exp

            log_likelihood_response = self._get_log_likelihood(
                response["results"][0]["input_tokens"], tokenized_context
            )
            results.append(log_likelihood_response)
            self.cache_hook.add_partial(
                "loglikelihood",
                (context, continuation),
                (
                    log_likelihood_response.log_likelihood,
                    log_likelihood_response.is_greedy,
                ),
            )
        return cast(List[Tuple[float, bool]], results)
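
    # Illustrative request shape: Instance.args == ("Question: 2+2=", " 4"); the
    # returned pair is (sum of logprobs over the " 4" tokens, True iff every one of
    # those tokens has rank 1).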

    def loglikelihood_rolling(self, requests) -> List[Tuple[float, bool]]:
        """
        Used to evaluate perplexity on a data distribution.

        Args:
            requests: Each request contains Instance.args : Tuple[str] containing an
                input string to the model whose entire loglikelihood, conditioned on
                purely the EOT token, will be calculated.

        Returns:
            Tuple (loglikelihood,) for each request, according to the input order:
                loglikelihood: solely the probability of producing each piece of text
                    given no starting input.
        """
        try:
            from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams
        except ImportError:
            raise ImportError(
                "Could not import ibm_watsonx_ai: Please install lm_eval[ibm_watsonx_ai] package."
            )

        self._check_model_logprobs_support()
        generate_params = copy.deepcopy(self.generate_params)
        generate_params[GenParams.MAX_NEW_TOKENS] = 1

        requests = [request.args for request in requests]
        results: List[LogLikelihoodResult] = []
        # Note: we do not batch requests because loglikelihood values are currently
        # non-deterministic when a batch of requests is sent.
        for request in tqdm(
            requests,
            desc="Running loglikelihood_rolling function ...",
        ):
            context, continuation = request
            try:
                response = self.model.generate_text(
                    prompt=context, params=generate_params, raw_response=True
                )
            except Exception as exp:
                eval_logger.error("Error while generating text.")
                raise exp
            # An empty context means every input token counts toward the rolling
            # loglikelihood.
            log_likelihood_response = self._get_log_likelihood(
                response["results"][0]["input_tokens"], []
            )
            results.append(log_likelihood_response)
            self.cache_hook.add_partial(
                "loglikelihood_rolling",
                (context, continuation),
                log_likelihood_response.log_likelihood,
            )
        return cast(List[Tuple[float, bool]], results)

    @property
    def tokenizer_name(self) -> str:
        return ""

    def apply_chat_template(
        self, chat_history: List[Dict[str, str]]
    ) -> JsonChatStr:
        # A hack similar to the one in api_models, allowing chat prompts to be
        # encoded for the cache.
        return JsonChatStr(json.dumps(chat_history))
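

# Minimal smoke-test sketch (not part of the harness API); it assumes valid
# WATSONX_* environment variables and a hypothetical model id -- adapt both
# before running.
if __name__ == "__main__":
    lm = WatsonxLLM.create_from_arg_string("model_id=ibm/granite-13b-instruct-v2")
    [completion] = lm.generate_until(
        [Instance("generate_until", {}, ("The capital of France is", ""), 0)]
    )
    print(completion)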
@@ -12,6 +12,8 @@ class MambaLMWrapper(HFLM):
     def __init__(
         self,
         pretrained="state-spaces/mamba-130m",
+        # To use the HF compatible variant
+        is_hf: bool = False,
         **kwargs,
     ) -> None:
         """
@@ -52,7 +54,7 @@ class MambaLMWrapper(HFLM):
         if "backend" in kwargs:
             # mamba currently only supports causal models
             assert kwargs["backend"] == "causal"
-
+        self.is_hf = is_hf or (True if pretrained.endswith("hf") else False)
         super().__init__(
             pretrained=pretrained,
             # set appropriate defaults for tokenizer, max length, etc
@@ -67,12 +69,15 @@ class MambaLMWrapper(HFLM):
         pretrained: str,
         **kwargs,
     ) -> None:
+        if self.is_hf:
+            super()._get_config(pretrained, **kwargs)
+        else:
             try:
                 from mamba_ssm.utils.hf import load_config_hf  # noqa: F811
-            except ModuleNotFoundError:
-                raise Exception(
+            except ModuleNotFoundError as exception:
+                raise type(exception)(
                     "attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
 please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
                 )
             self._config = load_config_hf(pretrained)
@@ -86,12 +91,17 @@ please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba
         # Mamba does not support arbitrary HF from_pretrained() args
         **kwargs,
     ) -> None:
+        if self.is_hf:
+            super()._create_model(pretrained, dtype=dtype, **kwargs)
+        else:
             try:
-                from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel  # noqa: F811
-            except ModuleNotFoundError:
-                raise Exception(
+                from mamba_ssm.models.mixer_seq_simple import (
+                    MambaLMHeadModel,  # noqa: F811
+                )
+            except ModuleNotFoundError as exception:
+                raise type(exception)(
                     "attempted to use 'mamba_ssm' LM type, but package `mamba_ssm` is not installed. \
 please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba]`",
                 )
             self._model = MambaLMHeadModel.from_pretrained(
@@ -103,7 +113,10 @@ please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba
         )

     def _model_generate(self, context, max_length, stop, **generation_kwargs):
-        for key in ("do_sample", "attention_mask"):
+        remove_arg = (
+            ["attention_mask"] if self.is_hf else ["do_sample", "attention_mask"]
+        )
+        for key in remove_arg:
             if key in generation_kwargs:
                 generation_kwargs.pop(key)
@@ -116,6 +129,7 @@ please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba
         #     self.tokenizer, stop, 1, context.shape[0]
         # )
+        if not self.is_hf:
             return self.model.generate(
                 input_ids=context,
                 max_length=max_length,
@@ -124,3 +138,28 @@ please install mamba via `pip install lm-eval[mamba]` or `pip install -e .[mamba
                 # use_cache=True,
                 **generation_kwargs,
             )
+        else:
+            stopping_criteria = lm_eval.models.utils.stop_sequences_criteria(
+                self.tokenizer,
+                stop,
+                context.shape[1],
+                context.shape[0],
+            )
+
+            generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
+            do_sample = generation_kwargs.get("do_sample", None)
+
+            # The temperature has to be a strictly positive float -- if it is 0.0,
+            # use greedy decoding strategies
+            if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
+                generation_kwargs["do_sample"] = do_sample = False
+            if do_sample is False and generation_kwargs.get("temperature") == 0.0:
+                generation_kwargs.pop("temperature")
+
+            return self.model.generate(
+                input_ids=context,
+                max_length=max_length,
+                stopping_criteria=stopping_criteria,
+                pad_token_id=self.tokenizer.pad_token_id,
+                use_cache=True,
+                **generation_kwargs,
+            )
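
A quick sketch of how the new flag might be exercised; the wrapper class and
checkpoint names follow the diff above, but treat this as an untested example:

    from lm_eval.models.mamba_lm import MambaLMWrapper

    # HF-compatible checkpoint: is_hf is inferred from the "hf" suffix, so config,
    # weights, and generation all go through the HFLM parent class.
    lm_hf = MambaLMWrapper(pretrained="state-spaces/mamba-130m-hf")

    # Original checkpoint: still loaded through the `mamba_ssm` package.
    lm_ssm = MambaLMWrapper(pretrained="state-spaces/mamba-130m")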
...@@ -50,7 +50,7 @@ class OptimumLM(HFLM): ...@@ -50,7 +50,7 @@ class OptimumLM(HFLM):
**kwargs, **kwargs,
) -> None: ) -> None:
if not find_spec("optimum"): if not find_spec("optimum"):
raise Exception( raise ModuleNotFoundError(
"package `optimum` is not installed. Please install it via `pip install optimum[openvino]`" "package `optimum` is not installed. Please install it via `pip install optimum[openvino]`"
) )
else: else:
...@@ -71,6 +71,11 @@ class OptimumLM(HFLM): ...@@ -71,6 +71,11 @@ class OptimumLM(HFLM):
else: else:
model_kwargs["ov_config"] = {} model_kwargs["ov_config"] = {}
model_kwargs["ov_config"].setdefault("CACHE_DIR", "") model_kwargs["ov_config"].setdefault("CACHE_DIR", "")
if "pipeline_parallel" in model_kwargs:
if model_kwargs["pipeline_parallel"]:
model_kwargs["ov_config"]["MODEL_DISTRIBUTION_POLICY"] = (
"PIPELINE_PARALLEL"
)
model_file = Path(pretrained) / "openvino_model.xml" model_file = Path(pretrained) / "openvino_model.xml"
if model_file.exists(): if model_file.exists():
export = False export = False
......
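
A sketch of how the new OpenVINO option could be passed through; the import path
follows the harness layout, while the model directory and the forwarding of
`pipeline_parallel` into `model_kwargs` are assumptions based on this hunk:

    from lm_eval.models.optimum_lm import OptimumLM

    lm = OptimumLM(
        pretrained="path/to/openvino_model_dir",  # hypothetical local OV export
        pipeline_parallel=True,  # sets ov_config["MODEL_DISTRIBUTION_POLICY"]
    )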
-group: aclue
 dataset_path: tyouisen/aclue
 test_split: test
 fewshot_split: dev
@@ -16,4 +15,4 @@ metric_list:
     aggregation: mean
     higher_is_better: true
 metadata:
-  version: 0.0
+  version: 1.0