Commit 31019847 authored by h-albert-lee

Merge remote-tracking branch 'origin/main' into 1442-inverse-scaling-tasks

parents 10c9d5de dc90fecc
@@ -140,7 +140,7 @@ lm_eval --model vllm \
     --tasks lambada_openai \
     --batch_size auto
 ```
-For a full list of supported vLLM configurations, please reference our vLLM integration and the vLLM documentation. To use vllm, do `pip install lm_eval[vllm]`.
+For a full list of supported vLLM configurations, please reference our [vLLM integration](https://github.com/EleutherAI/lm-evaluation-harness/blob/e74ec966556253fbe3d8ecba9de675c77c075bce/lm_eval/models/vllm_causallms.py) and the vLLM documentation. To use vllm, do `pip install lm_eval[vllm]`.
 vLLM occasionally differs in output from Huggingface. We treat Huggingface as the reference implementation, and provide a [script](./scripts/model_comparator.py) for checking the validity of vllm results against HF.
...
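As a quick sanity check of the two backends against each other, the same task can also be run through both via the Python API. A minimal sketch, assuming both backends are installed; the model and `limit` are arbitrary examples, and `scripts/model_comparator.py` remains the supported route:

```python
import lm_eval

# Evaluate the same model/task under both backends and compare the metrics.
common = dict(
    model_args="pretrained=EleutherAI/pythia-160m",  # example model, not a recommendation
    tasks=["lambada_openai"],
    limit=32,  # small subset, just for a spot check
)
hf_results = lm_eval.simple_evaluate(model="hf", **common)      # reference implementation
vllm_results = lm_eval.simple_evaluate(model="vllm", **common)  # requires lm_eval[vllm]
print(hf_results["results"]["lambada_openai"])
print(vllm_results["results"]["lambada_openai"])
```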
@@ -30,9 +30,10 @@ Dataset configuration options:
 Prompting / in-context formatting options:
 - **use_prompt** (`str`, *optional*) — Name of prompt in promptsource to use. If defined, will overwrite doc_to_text, doc_to_target, and doc_to_choice.
+- **description** (`str`, *optional*) — An optional Jinja2 template or string which will be prepended to the few-shot examples passed into the model, often describing the task or providing instructions to the model, such as `"The following are questions (with answers) about {{subject}}.\n\n"`. No delimiters or spacing are inserted between the description and the first few-shot example.
-- **doc_to_text** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the appropriate input for the model
+- **doc_to_text** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into the appropriate input for the model
-- **doc_to_target** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the appropriate target output for the model. For multiple choice tasks, this should return an index into the answer choice list.
+- **doc_to_target** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into the appropriate target output for the model. For multiple choice tasks, this should return an index into the answer choice list.
-- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `generate_until` tasks.
+- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `generate_until` tasks.
 - **fewshot_delimiter** (`str`, *optional*, defaults to "\n\n") — String to insert between few-shot examples.
 - **target_delimiter** (`str`, *optional*, defaults to `" "`) — String to insert between input and target output for the datapoint being tested.
...
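How these pieces compose: the rendered `description` is prepended directly (no delimiter), few-shot examples are joined by `fewshot_delimiter`, and within each example `target_delimiter` separates input from target. A standalone sketch of the assembly in plain Python string operations (illustrative, not harness code):

```python
description = "The following are questions (with answers) about biology.\n\n"
fewshot_delimiter = "\n\n"
target_delimiter = " "

# Each example is doc_to_text + target_delimiter + doc_to_target.
examples = [
    "Q: What is a cell?" + target_delimiter + "A: The basic unit of life.",
    "Q: What is DNA?" + target_delimiter + "A: The molecule carrying genetic information.",
]
# Description attaches directly to the first example; examples are joined
# (and followed) by fewshot_delimiter before the test document's input.
prompt = description + fewshot_delimiter.join(examples) + fewshot_delimiter
print(prompt)
```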
@@ -330,6 +330,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
         log_samples=args.log_samples,
         gen_kwargs=args.gen_kwargs,
         task_manager=task_manager,
+        verbosity=args.verbosity,
         predict_only=args.predict_only,
         random_seed=args.seed[0],
         numpy_random_seed=args.seed[1],
...
@@ -304,7 +304,9 @@ class TemplateLM(LM):
         return context_enc, continuation_enc

-    def loglikelihood(self, requests) -> List[Tuple[float, bool]]:
+    def loglikelihood(
+        self, requests, disable_tqdm: bool = False
+    ) -> List[Tuple[float, bool]]:
         new_reqs = []
         for context, continuation in [req.args for req in requests]:
             if context == "":
@@ -318,12 +320,14 @@ class TemplateLM(LM):
             new_reqs.append(((context, continuation), context_enc, continuation_enc))

-        return self._loglikelihood_tokens(new_reqs)
+        return self._loglikelihood_tokens(new_reqs, disable_tqdm=disable_tqdm)

     @abc.abstractmethod
-    def loglikelihood_rolling(self, requests) -> List[Tuple[float, bool]]:
+    def loglikelihood_rolling(
+        self, requests, disable_tqdm: bool = False
+    ) -> List[Tuple[float, bool]]:
         pass

     @abc.abstractmethod
-    def generate_until(self, requests) -> List[str]:
+    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
         pass
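The same change is threaded through every backend below: each request-processing loop wraps its iterable in `tqdm` and forwards the new `disable_tqdm` flag, so callers can silence progress bars. A minimal sketch of the pattern (the `process` function and its body are illustrative, not harness code):

```python
from tqdm import tqdm

def process(requests, disable_tqdm: bool = False):
    # The progress bar is rendered only when disable_tqdm is False.
    results = []
    for req in tqdm(requests, disable=disable_tqdm):
        results.append(req.upper())  # placeholder for the real per-request work
    return results

print(process(["a", "b"], disable_tqdm=True))  # -> ['A', 'B'], no progress bar shown
```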
...
@@ -940,14 +940,14 @@ class ConfigurableTask(Task):
         :returns: str
             The fewshot context.
         """
+        if description := self.config.description:
+            description = utils.apply_template(self.config.description, doc)
+
         if num_fewshot == 0:
             # always prepend the (possibly empty) task description
-            labeled_examples = self.config.description
+            labeled_examples = description
         else:
-            labeled_examples = self.config.description + self.sampler.get_context(
-                doc, num_fewshot
-            )
+            labeled_examples = description + self.sampler.get_context(doc, num_fewshot)

         example = self.doc_to_text(doc)
         if self.multiple_input:
...
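With this change the task `description` is itself rendered as a Jinja2 template against each document, so per-document fields can appear in it. A quick standalone sketch of the behavior, using the harness's `utils.apply_template` that the diff calls:

```python
from lm_eval import utils

doc = {"subject": "physics"}
description = "The following are questions (with answers) about {{subject}}.\n\n"

# apply_template renders the Jinja2 template with the document's fields.
print(utils.apply_template(description, doc))
# -> "The following are questions (with answers) about physics.\n\n"
```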
@@ -147,7 +147,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
     def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
         raise NotImplementedError("No support for logits.")

-    def generate_until(self, requests) -> List[str]:
+    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
         try:
             import anthropic
         except ModuleNotFoundError:
@@ -162,7 +162,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
         _requests: List[Tuple[str, dict]] = [req.args for req in requests]

         res = []
-        for request in tqdm(_requests):
+        for request in tqdm(_requests, disable=disable_tqdm):
             try:
                 inp = request[0]
                 request_args = request[1]
@@ -199,8 +199,8 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
         # Isn't used because we override generate_until
         raise NotImplementedError()

-    def loglikelihood(self, requests):
+    def loglikelihood(self, requests, disable_tqdm: bool = False):
         raise NotImplementedError("No support for logits.")

-    def loglikelihood_rolling(self, requests):
+    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
         raise NotImplementedError("No support for logits.")
 import random
+
+from tqdm import tqdm

 from lm_eval.api.model import LM
 from lm_eval.api.registry import register_model
@@ -13,27 +15,27 @@ class DummyLM(LM):
     def create_from_arg_string(cls, arg_string, additional_config=None):
         return cls()

-    def loglikelihood(self, requests):
+    def loglikelihood(self, requests, disable_tqdm: bool = False):
         res = []
-        for _ in requests:
+        for _ in tqdm(requests, disable=disable_tqdm):
             res.append((-random.random(), False))
         return res

-    def generate_until(self, requests):
+    def generate_until(self, requests, disable_tqdm: bool = False):
         res = []
-        for ctx, _ in requests:
+        for ctx, _ in tqdm(requests, disable=disable_tqdm):
             res.append("lol")
             assert ctx.strip() != ""
         return res

-    def loglikelihood_rolling(self, requests):
+    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
         res = []
-        for _ in requests:
+        for _ in tqdm(requests, disable=disable_tqdm):
             res.append(-random.random())
         return res
@@ -70,11 +70,13 @@ class GGUFLM(LM):
         else:
             raise Exception(f"Failed to get a valid response after {retries} retries.")

-    def loglikelihood(self, requests):
+    def loglikelihood(self, requests, disable_tqdm: bool = False):
         if not requests:
             return []
         res = []
-        for context, continuation in tqdm([req.args for req in requests]):
+        for context, continuation in tqdm(
+            [req.args for req in requests], disable=disable_tqdm
+        ):
             response = self.gguf_completion(context=context, continuation=continuation)
             if response and "choices" in response and response["choices"]:
                 choice = response["choices"][0]
@@ -97,12 +99,12 @@ class GGUFLM(LM):
                 assert False
         return res

-    def generate_until(self, requests):
+    def generate_until(self, requests, disable_tqdm: bool = False):
         if not requests:
             return []
         res = []
-        for request in tqdm([req.args for req in requests]):
+        for request in tqdm([req.args for req in requests], disable=disable_tqdm):
             inp = request[0]
             request_args = request[1]
             until = request_args.get("until", ["</s>"])
@@ -122,7 +124,7 @@ class GGUFLM(LM):
             res.append(None)  # Add default value in case of error
         return res

-    def loglikelihood_rolling(self, requests):
+    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
         raise NotImplementedError(
             "loglikelihood_rolling not yet supported for GGUF models"
         )
@@ -790,7 +790,9 @@ class HFLM(TemplateLM):
         return logits

-    def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
+    def loglikelihood_rolling(
+        self, requests: List[Instance], disable_tqdm: bool = False
+    ) -> List[float]:
         loglikelihoods = []

         adaptive_batch_size = None
@@ -801,7 +803,9 @@ class HFLM(TemplateLM):
             print(f"Determined Largest batch size: {batch_size}")
             adaptive_batch_size = batch_size

-        for (string,) in tqdm([req.args for req in requests], disable=(self.rank != 0)):
+        for (string,) in tqdm(
+            [req.args for req in requests], disable=(disable_tqdm or (self.rank != 0))
+        ):
             rolling_token_windows = list(
                 map(
                     utils.make_disjoint_window,
@@ -1079,7 +1083,9 @@ class HFLM(TemplateLM):
         return re_ord.get_original(res)

-    def generate_until(self, requests: List[Instance]) -> List[str]:
+    def generate_until(
+        self, requests: List[Instance], disable_tqdm: bool = False
+    ) -> List[str]:
         res = []

         def _collate(req: Tuple[str, dict]):
@@ -1095,7 +1101,7 @@ class HFLM(TemplateLM):
         pbar = tqdm(
             total=len(requests),
-            disable=(self.rank != 0),
+            disable=(disable_tqdm or (self.rank != 0)),
             desc="Running generate_until requests",
         )
         adaptive_batch_size = None
...
@@ -447,12 +447,14 @@ class NEURON_HF(TemplateLM):
         return logits

-    def loglikelihood_rolling(self, requests):
+    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
         loglikelihoods = []

         adaptive_batch_size = None

-        for (string,) in tqdm([req.args for req in requests], disable=(self.rank != 0)):
+        for (string,) in tqdm(
+            [req.args for req in requests], disable=(disable_tqdm or (self.rank != 0))
+        ):
             rolling_token_windows = list(
                 map(
                     utils.make_disjoint_window,
@@ -616,7 +618,7 @@ class NEURON_HF(TemplateLM):
         return re_ord.get_original(res)

-    def generate_until(self, requests):
+    def generate_until(self, requests, disable_tqdm: bool = False):
         res = defaultdict(list)
         re_ords = {}
@@ -638,7 +640,7 @@ class NEURON_HF(TemplateLM):
             # within each set of reqs for given kwargs, we reorder by token length, descending.
             re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate)

-        pbar = tqdm(total=len(requests), disable=(self.rank != 0))
+        pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0)))
         # for each different set of kwargs, we execute all requests, by batch.
         for key, re_ord in re_ords.items():
...
@@ -231,7 +231,7 @@ class OpenaiCompletionsLM(TemplateLM):
                 self.cache_hook.add_partial("loglikelihood", cache_key, answer)
         return re_ord.get_original(res)

-    def generate_until(self, requests) -> List[str]:
+    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
         if not requests:
             return []
         res = []
@@ -258,7 +258,8 @@ class OpenaiCompletionsLM(TemplateLM):
         # todo: more intelligent batching for heterogeneous `until`
         for chunk, request_args in tqdm(
-            list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size))
+            list(sameuntil_chunks(re_ord.get_reordered(), self.batch_size)),
+            disable=disable_tqdm,
         ):
             inps = []
             self._max_gen_toks = request_args.get("max_gen_toks", self.max_gen_toks)
@@ -308,10 +309,12 @@ class OpenaiCompletionsLM(TemplateLM):
         # Isn't used because we override generate_until
         raise NotImplementedError()

-    def loglikelihood_rolling(self, requests) -> List[float]:
+    def loglikelihood_rolling(
+        self, requests, disable_tqdm: bool = False
+    ) -> List[float]:
         loglikelihoods = []

-        for (string,) in tqdm([req.args for req in requests]):
+        for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
             rolling_token_windows = list(
                 map(
                     utils.make_disjoint_window,
@@ -398,7 +401,7 @@ class OpenaiChatCompletionsLM(LM):
         # Isn't used because we override _loglikelihood_tokens
         raise NotImplementedError()

-    def generate_until(self, requests) -> List[str]:
+    def generate_until(self, requests, disable_tqdm: bool = False) -> List[str]:
         res = defaultdict(list)
         re_ords = {}
@@ -412,7 +415,7 @@ class OpenaiChatCompletionsLM(LM):
                 [req.args for req in reqs], lambda x: (-len(x[0]), x[0])
             )

-        pbar = tqdm(total=len(requests), disable=(self.rank != 0))
+        pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0)))
         for key, re_ord in re_ords.items():
             # n needs to be 1 because messages in
             # chat completion are not batch but
@@ -471,8 +474,8 @@ class OpenaiChatCompletionsLM(LM):
         return grouper.get_original(res)

-    def loglikelihood(self, requests):
+    def loglikelihood(self, requests, disable_tqdm: bool = False):
         raise NotImplementedError("No support for logits.")

-    def loglikelihood_rolling(self, requests):
+    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
         raise NotImplementedError("No support for logits.")
@@ -95,9 +95,9 @@ class TextSynthLM(LM):
         # Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
         raise NotImplementedError()

-    def loglikelihood(self, requests):
+    def loglikelihood(self, requests, disable_tqdm: bool = False):
         res = []
-        for context, continuation in tqdm(requests):
+        for context, continuation in tqdm(requests, disable=disable_tqdm):
             response = textsynth_completion(
                 url=self.api_url + "/v1/engines/" + self.engine + "/logprob",
                 headers={"Authorization": "Bearer " + self.api_key},
@@ -119,7 +119,7 @@ class TextSynthLM(LM):
                 assert False
         return res

-    def loglikelihood_rolling(self, requests):
+    def loglikelihood_rolling(self, requests, disable_tqdm: bool = False):
         # TODO: The TextSynth API does not support tokenized inputs so we cannot
         # manually partition long contexts into smaller rolling windows as
         # done for other models derived from `BaseLM`. Override this method
@@ -129,12 +129,12 @@ class TextSynthLM(LM):
             "input tokenization support from TextSynth."
         )

-    def generate_until(self, requests):
+    def generate_until(self, requests, disable_tqdm: bool = False):
         if not requests:
             return []
         res = []
-        for request in tqdm(requests):
+        for request in tqdm(requests, disable=disable_tqdm):
             inp = request[0]
             request_args = request[1]
             until = request_args["until"]
...
@@ -215,10 +215,12 @@ class VLLM(TemplateLM):
         )
         return outputs

-    def loglikelihood_rolling(self, requests: List[Instance]) -> List[float]:
+    def loglikelihood_rolling(
+        self, requests: List[Instance], disable_tqdm: bool = False
+    ) -> List[float]:
         loglikelihoods = []

-        for (string,) in tqdm([req.args for req in requests]):
+        for (string,) in tqdm([req.args for req in requests], disable=disable_tqdm):
             rolling_token_windows = list(
                 map(
                     make_disjoint_window,
@@ -244,7 +246,9 @@ class VLLM(TemplateLM):
             loglikelihoods.append(string_nll)
         return loglikelihoods

-    def generate_until(self, requests: List[Instance]) -> List[str]:
+    def generate_until(
+        self, requests: List[Instance], disable_tqdm: bool = False
+    ) -> List[str]:
         res = []

         # batch tokenize contexts
@@ -273,7 +277,7 @@ class VLLM(TemplateLM):
         pbar = tqdm(
             total=len(requests),
-            disable=(self.rank != 0),
+            disable=(disable_tqdm or (self.rank != 0)),
             desc="Running generate_until requests",
         )
         # for each different set of kwargs, we execute all requests, by batch.
@@ -411,6 +415,26 @@ class VLLM(TemplateLM):
         # The first entry of prompt_logprobs is None because the model has no previous tokens to condition on.
         continuation_logprobs_dicts = outputs.prompt_logprobs

+        def coerce_logprob_to_num(logprob):
+            # vLLM changed the return type of logprobs from float
+            # to a Logprob object storing the float value + extra data
+            # (https://github.com/vllm-project/vllm/pull/3065).
+            # If we are dealing with vLLM's Logprob object, return
+            # the logprob value stored as an attribute. Otherwise,
+            # return the object itself (which should be a float
+            # for older versions of vLLM).
+            return getattr(logprob, "logprob", logprob)
+
+        continuation_logprobs_dicts = [
+            {
+                token: coerce_logprob_to_num(logprob)
+                for token, logprob in logprob_dict.items()
+            }
+            if logprob_dict is not None
+            else None
+            for logprob_dict in continuation_logprobs_dicts
+        ]
+
         # Calculate continuation_logprobs
         # assume ctxlen always >= 1
         continuation_logprobs = sum(
...
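The `getattr` fallback is what makes the coercion version-agnostic. A standalone sketch of the behavior (the `Logprob` class below is a stand-in for vLLM's, not an import):

```python
class Logprob:
    """Stand-in for the Logprob object returned by newer vLLM versions."""
    def __init__(self, logprob: float):
        self.logprob = logprob

def coerce_logprob_to_num(logprob):
    # Newer vLLM: a Logprob object -> return its .logprob attribute.
    # Older vLLM: already a float -> getattr falls back to the object itself.
    return getattr(logprob, "logprob", logprob)

print(coerce_logprob_to_num(Logprob(-0.25)))  # -0.25
print(coerce_logprob_to_num(-0.25))           # -0.25
```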
# Arabic EXAMS
### Paper
EXAMS is a resource specialized in multilingual high school exam questions; the original paper is [EXAMS](https://aclanthology.org/2020.emnlp-main.438/), and the original dataset is available at [EXAMS-QA](https://github.com/mhardalov/exams-qa).

The Arabic EXAMS dataset includes five subjects:

- Islamic studies
- Biology
- Physics
- Science
- Social

EXAMS is a benchmark dataset for cross-lingual and multilingual question answering over high school examinations. With 24,000 high-quality high school exam questions in 16 languages, covering 8 language families and 24 school subjects from Natural Sciences and Social Sciences, among others, EXAMS offers a unique, fine-grained evaluation framework across multiple languages and subjects.

Homepage for Arabic EXAMS: [EXAMS Arabic Homepage](https://github.com/FreedomIntelligence/AceGPT/tree/main/eval/benchmark_eval/benchmarks/EXAMS_Arabic)
### Citation
### Groups and Tasks
#### Groups
- `aexams`: includes the IslamicStudies, Biology, Science, Physics, and Social subject tasks.
#### Tasks
The following tasks evaluate subjects in the Arabic EXAMS dataset using loglikelihood-based multiple-choice scoring (see the usage sketch after the list):
- `aexams_IslamicStudies`
- `aexams_Biology`
- `aexams_Science`
- `aexams_Physics`
- `aexams_Social`
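Any of these tasks can be run like other harness tasks. A minimal sketch via the Python API; the model choice and few-shot count here are arbitrary examples, not part of this PR:

```python
import lm_eval

# Hypothetical example run; any HF causal LM can stand in here.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["aexams_Biology"],
    num_fewshot=5,
)
print(results["results"]["aexams_Biology"])
```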
### Checklist
For adding novel benchmarks/datasets to the library:

* [x] Is the task an existing benchmark in the literature?
  * [x] Have you referenced the original paper that introduced the task?
  * [x] If yes, does the original paper provide a reference implementation?
    * [x] Yes, original implementation contributed by author of the benchmark

If other tasks on this dataset are already supported:

* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
group: aexams
dataset_path: Hennara/aexams
test_split: test
fewshot_split: dev
fewshot_config:
  sampler: first_n
output_type: multiple_choice
doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\nالجواب:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: "{{['A', 'B', 'C', 'D'].index(answer)}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
metadata:
  version: 0.0
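For a row where `answer` is, say, `"C"`, the `doc_to_target` template evaluates to the index of that letter within `doc_to_choice`, which is what `multiple_choice` scoring expects. A quick Python equivalent of the Jinja2 expression:

```python
# Python equivalent of "{{['A', 'B', 'C', 'D'].index(answer)}}" for one row.
doc = {"answer": "C"}
target = ["A", "B", "C", "D"].index(doc["answer"])
print(target)  # -> 2
```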
"dataset_name": "Biology"
"description": "قم بالإجابة على مايلي في مجال العلوم الحيوية\n\n"
"include": "_default_template_yaml"
"task": "aexams_Biology"
"dataset_name": "IslamicStudies"
"description": "قم بالإجابة على مايلي في مجال العلوم الإسلامية \n\n"
"include": "_default_template_yaml"
"task": "aexams_IslamicStudies"
"dataset_name": "Physics"
"description": "قم بالإجابة على مايلي في مجال الفيزياء \n\n"
"include": "_default_template_yaml"
"task": "aexams_Physics"
"dataset_name": "Science"
"description": "قم بالإجابة على مايلي في مجال العلوم \n\n"
"include": "_default_template_yaml"
"task": "aexams_Science"
"dataset_name": "Social"
"description": "قم بالإجابة على مايلي في مجال العلوم الإجتماعية \n\n"
"include": "_default_template_yaml"
"task": "aexams_Social"