Commit 470059f6 authored by lintangsutawika

merge conflict

parents b8d7d6c3 9d030712
import logging
logging.basicConfig(
format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s",
datefmt="%Y-%m-%d:%H:%M:%S",
level=logging.INFO,
)
eval_logger = logging.getLogger("lm-eval")
SPACING = " " * 47
......@@ -3,6 +3,7 @@ from . import openai_completions
from . import textsynth
from . import dummy
from . import anthropic_llms
from . import gguf
# TODO: implement __all__
......@@ -2,9 +2,11 @@ from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
from tqdm import tqdm
import time
from lm_eval.logger import eval_logger
from lm_eval import utils
from typing import List, Any, Tuple
eval_logger = utils.eval_logger
def anthropic_completion(
client, #: anthropic.Anthropic,
......@@ -138,7 +140,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
def _loglikelihood_tokens(self, requests, disable_tqdm: bool = False):
raise NotImplementedError("No support for logits.")
def greedy_until(self, requests) -> List[str]:
def generate_until(self, requests) -> List[str]:
if not requests:
return []
......@@ -164,7 +166,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
)
res.append(response)
self.cache_hook.add_partial("greedy_until", request, response)
self.cache_hook.add_partial("generate_until", request, response)
except anthropic.APIConnectionError as e: # type: ignore # noqa: F821
eval_logger.critical(f"Server unreachable: {e.__cause__}")
break
......@@ -179,7 +181,7 @@ please install anthropic via `pip install lm-eval[anthropic]` or `pip install -e
raise NotImplementedError()
def _model_generate(self, context, max_length, eos_token_id):
# Isn't used because we override greedy_until
# Isn't used because we override generate_until
raise NotImplementedError()
def loglikelihood(self, requests):
......
......@@ -20,7 +20,7 @@ class DummyLM(LM):
return res
def greedy_until(self, requests):
def generate_until(self, requests):
res = []
for ctx, _ in requests:
......
import requests
import logging
import time
from tqdm import tqdm
from requests.exceptions import RequestException
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
logger = logging.getLogger(__name__)
def get_result(logprobs, context_length):
is_greedy = True
offsets = logprobs["text_offset"]
tokens = logprobs["tokens"]
tokens_logprobs = logprobs["token_logprobs"]
idx = 0
while offsets[idx] < context_length:
idx += 1
continuation_logprobs = sum(tokens_logprobs[idx:-1])
for i in range(idx, len(tokens)):
token = tokens[i]
top_tokens = logprobs["top_logprobs"][i]
top_token = max(top_tokens.keys(), key=lambda x: top_tokens[x])
if top_token != token:
is_greedy = False
break
return continuation_logprobs, is_greedy
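For reference, here is a minimal sketch (with hypothetical numbers) of the `logprobs` payload that `get_result` expects from a completions-style server when the prompt is echoed back with `max_tokens=1`: character offsets locate where the continuation starts, and the final token is the single newly generated one, which is why it is excluded from the summed logprobs.

```python
# Hypothetical echoed logprobs for context "The capital of France is" and
# continuation " Paris"; the trailing "." is the one newly generated token.
fake_logprobs = {
    "text_offset":    [0, 3, 11, 14, 21, 24, 30],
    "tokens":         ["The", " capital", " of", " France", " is", " Paris", "."],
    "token_logprobs": [None, -2.1, -0.4, -1.3, -0.2, -0.05, -0.9],
    "top_logprobs":   [{}, {}, {}, {}, {},
                       {" Paris": -0.05, " Lyon": -3.2},
                       {".": -0.9, "\n": -1.4}],
}

# context_length is the character length of the context (24 here), so only
# " Paris" is summed; is_greedy is True because every scored token is also
# the top-1 token at its position.
logprob, is_greedy = get_result(fake_logprobs, context_length=24)
print(logprob, is_greedy)  # -> -0.05 True
```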
@register_model("gguf", "ggml")
class GGUFLM(LM):
def __init__(self, base_url=None, max_length=2048, **kwargs):
super().__init__()
self.base_url = base_url
assert self.base_url, "must pass `base_url` to use GGUF LM!"
self.logprobs = 10
self.temperature = 0.0
self.max_length = max_length
def gguf_completion(
self, context, continuation=None, stop=None, retries=3, delay=5, **kwargs
):
for _ in range(retries):
try:
prompt = context
request = {
"prompt": prompt,
"logprobs": self.logprobs,
"temperature": self.temperature,
}
if continuation:
prompt += continuation
request.update({"prompt": prompt, "max_tokens": 1, "echo": True})
if stop is not None:
request["stop"] = stop
response = requests.post(
f"{self.base_url}/v1/completions", json=request
)
response.raise_for_status()
return response.json()
except RequestException as e:
logger.error(f"RequestException: {e}")
time.sleep(delay) # wait before retrying
else:
raise Exception(f"Failed to get a valid response after {retries} retries.")
def loglikelihood(self, requests):
if not requests:
return []
res = []
for context, continuation in tqdm([req.args for req in requests]):
response = self.gguf_completion(context=context, continuation=continuation)
if response and "choices" in response and response["choices"]:
choice = response["choices"][0]
logprobs = choice.get("logprobs")
if (
logprobs
and "token_logprobs" in logprobs
and logprobs["token_logprobs"]
):
logprob, is_greedy = get_result(logprobs, len(context))
res.append((logprob, is_greedy))
else:
logger.warning(
"Invalid logprobs data. Expected 'logprobs' to contain 'token_logprobs' list."
)
else:
logger.error(
f"Invalid response for loglikelihood. Response: {response}"
)
assert False
return res
def generate_until(self, requests):
if not requests:
return []
res = []
for request in tqdm([req.args for req in requests]):
inp = request[0]
request_args = request[1]
until = request_args.get("until", ["</s>"])
response = self.gguf_completion(context=inp, stop=until)
if response and "choices" in response and response["choices"]:
choice = response["choices"][0]
if "text" in choice:
generated_text = choice["text"].strip()
res.append(generated_text)
else:
logger.error(
f"Invalid response for greedy_until. Response: {response}"
)
res.append(None) # Add default value in case of error
else:
logger.error(f"Invalid response for greedy_until. Response: {response}")
res.append(None) # Add default value in case of error
return res
def loglikelihood_rolling(self, requests):
raise NotImplementedError(
"loglikelihood_rolling not yet supported for GGUF models"
)
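A minimal usage sketch for the class above, assuming a llama.cpp-compatible server is already serving the OpenAI-style `/v1/completions` route; the URL and prompts below are placeholders.

```python
# The base_url is a placeholder for wherever the GGUF/llama.cpp server runs.
lm = GGUFLM(base_url="http://localhost:8000")

# Scoring path: echo prompt + continuation with logprobs so loglikelihood()
# can slice out the continuation tokens via get_result().
scored = lm.gguf_completion(context="The capital of France is", continuation=" Paris")

# Generation path: greedy completion that stops on the default "</s>" sequence,
# mirroring what generate_until() sends per request.
generated = lm.gguf_completion(context="Q: What is 2+2?\nA:", stop=["</s>"])
```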
......@@ -16,7 +16,6 @@ from pathlib import Path
import torch.nn.functional as F
from lm_eval import utils
from lm_eval.logger import eval_logger
from lm_eval.api.model import LM
from lm_eval.api.registry import register_model
......@@ -25,6 +24,8 @@ from lm_eval.utils import MultiTokenEOSCriteria, stop_sequences_criteria
from accelerate import Accelerator, find_executable_batch_size, DistributedType
from typing import List, Optional, Union
eval_logger = utils.eval_logger
def _get_accelerate_args(
device_map_option: Optional[str] = "auto",
......@@ -419,12 +420,13 @@ class HFLM(LM):
utils.clear_torch_cache()
return batch_size
def tok_encode(self, string: str, left_truncate_len=None):
def tok_encode(self, string: str, left_truncate_len=None, add_special_tokens=None):
""" """
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
add_special_tokens = False
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
add_special_tokens = True
if add_special_tokens is None:
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
add_special_tokens = False
elif self.AUTO_MODEL_CLASS == transformers.AutoModelForSeq2SeqLM:
add_special_tokens = True
encoding = self.tokenizer.encode(string, add_special_tokens=add_special_tokens)
......@@ -539,8 +541,12 @@ class HFLM(LM):
if n_spaces > 0:
continuation = context[-n_spaces:] + continuation
context = context[:-n_spaces]
whole_enc = self.tok_encode(context + continuation)
context_enc = self.tok_encode(context)
whole_enc = self.tok_encode(context + continuation, add_special_tokens=False)
context_enc = self.tok_encode(context, add_special_tokens=False)
# whole_enc = self.tok_encode(context + continuation)
# context_enc = self.tok_encode(context, add_special_tokens=False)
context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:]
return context_enc, continuation_enc
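As a standalone illustration of the split above (the `gpt2` checkpoint is only an example), the whole sequence is encoded first and the continuation tokens are taken as the suffix after the context prefix:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # example checkpoint only

context, continuation = "Question: 2+2=", " 4"

# Trailing whitespace is moved from the context onto the continuation so the
# leading space is tokenized together with the continuation text.
n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0:
    continuation = context[-n_spaces:] + continuation
    context = context[:-n_spaces]

whole_enc = tokenizer.encode(context + continuation, add_special_tokens=False)
context_enc = tokenizer.encode(context, add_special_tokens=False)

# Encoding the concatenation first keeps BPE merges at the boundary consistent;
# everything after the context prefix is treated as the continuation.
continuation_enc = whole_enc[len(context_enc):]
```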
......@@ -616,6 +622,23 @@ class HFLM(LM):
return loglikelihoods
def _batch_scheduler(self, pos, n_reordered_requests):
sched = pos // int(len(n_reordered_requests) / self.batch_schedule)
if sched in self.batch_sizes:
return self.batch_sizes[sched]
if (len(self.batch_sizes) > 1) and (
self.batch_sizes[sched - 1] == self.max_batch_size
):
# if previous batch size is already maximal, skip recomputation
self.batch_sizes[sched] = self.max_batch_size
return self.batch_sizes[sched]
print(
f"Passed argument batch_size = auto:{self.batch_schedule}. Detecting largest batch size"
)
self.batch_sizes[sched] = self._detect_batch_size(n_reordered_requests, pos)
print(f"Determined largest batch size: {self.batch_sizes[sched]}")
return self.batch_sizes[sched]
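A quick worked example of the scheduling arithmetic, with illustrative numbers: the reordered requests are split into `batch_schedule` buckets, and the batch size is detected once per bucket and cached in `self.batch_sizes`.

```python
# Illustrative only: how request positions map to schedule buckets.
batch_schedule = 4
n_requests = 1000
bucket_width = int(n_requests / batch_schedule)   # 250

for pos in (0, 249, 250, 999):
    print(pos, "->", pos // bucket_width)          # 0 -> 0, 249 -> 0, 250 -> 1, 999 -> 3
```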
def _loglikelihood_tokens(
self, requests, disable_tqdm: bool = False, override_bs=None
):
......@@ -639,38 +662,22 @@ class HFLM(LM):
# automatic (variable) batch size detection for vectorization
# pull longest context sample from request
def _batch_scheduler(pos):
sched = pos // int(n_reordered_requests / self.batch_schedule)
if sched in self.batch_sizes:
return self.batch_sizes[sched]
if (len(self.batch_sizes) > 1) and (
self.batch_sizes[sched - 1] == self.max_batch_size
):
# if previous batch size is already maximal, skip recomputation
self.batch_sizes[sched] = self.max_batch_size
return self.batch_sizes[sched]
print(
f"Passed argument batch_size = auto:{self.batch_schedule}. Detecting largest batch size"
)
self.batch_sizes[sched] = self._detect_batch_size(
re_ord.get_reordered(), pos
)
print(f"Determined largest batch size: {self.batch_sizes[sched]}")
return self.batch_sizes[sched]
for chunk in utils.chunks(
tqdm(re_ord.get_reordered(), disable=(disable_tqdm or (self.rank != 0))),
chunks = utils.chunks(
re_ord.get_reordered(),
n=self.batch_size
if self.batch_size != "auto"
else override_bs
if override_bs is not None
else 0,
fn=_batch_scheduler
fn=self._batch_scheduler
if self.batch_size == "auto"
and n_reordered_requests > 0
and not override_bs
else None,
):
)
pbar = tqdm(total=len(requests), disable=(disable_tqdm or (self.rank != 0)))
for chunk in chunks:
inps = []
cont_toks_list = []
inplens = []
......@@ -807,10 +814,13 @@ class HFLM(LM):
res.append(answer)
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
pbar.update(1)
pbar.close()
return re_ord.get_original(res)
def greedy_until(self, requests):
def generate_until(self, requests):
res = defaultdict(list)
re_ords = {}
......@@ -833,13 +843,26 @@ class HFLM(LM):
re_ords[key] = utils.Reorderer([req.args for req in reqs], _collate)
pbar = tqdm(total=len(requests), disable=(self.rank != 0))
if self.batch_size == "auto":
# using rolling window with maximum context
print("Passed argument batch_size = auto. Detecting largest batch size")
batch_size = self._detect_batch_size()
print(f"Determined Largest batch size: {batch_size}")
adaptive_batch_size = batch_size
# for each different set of kwargs, we execute all requests, by batch.
for key, re_ord in re_ords.items():
for chunk in utils.chunks(
chunks = utils.chunks(
re_ord.get_reordered(),
self.batch_size,
):
n=self.batch_size
if self.batch_size != "auto"
else adaptive_batch_size
if adaptive_batch_size is not None
else 0,
fn=self._batch_scheduler
if self.batch_size == "auto" and not adaptive_batch_size
else None,
)
for chunk in chunks:
contexts, all_gen_kwargs = zip(*chunk)
# we assume all gen kwargs in the batch are the same
# this is safe to assume because the `grouper` object ensures it.
......@@ -866,8 +889,6 @@ class HFLM(LM):
max_gen_toks = kwargs.pop("max_gen_toks")
else:
max_gen_toks = self.max_gen_toks
# first stop sequence is used to halt generation upon encountering
primary_until = [until[0]]
# set the max length in tokens of inputs ("context_enc")
if self.AUTO_MODEL_CLASS == transformers.AutoModelForCausalLM:
......@@ -893,7 +914,7 @@ class HFLM(LM):
cont = self._model_generate(
context=context_enc,
attention_mask=attn_masks,
stop=primary_until,
stop=until,
**kwargs,
)
......@@ -915,7 +936,7 @@ class HFLM(LM):
res[key].append(s)
self.cache_hook.add_partial(
"greedy_until", (context, gen_kwargs), s
"generate_until", (context, gen_kwargs), s
)
pbar.update(1)
# reorder this group of results back to original unsorted form
......
......@@ -203,7 +203,7 @@ class OpenaiCompletionsLM(LM):
self.cache_hook.add_partial("loglikelihood", cache_key, answer)
return re_ord.get_original(res)
def greedy_until(self, requests) -> List[str]:
def generate_until(self, requests) -> List[str]:
if not requests:
return []
res = []
......@@ -260,7 +260,7 @@ class OpenaiCompletionsLM(LM):
# partial caching
self.cache_hook.add_partial(
"greedy_until", (context, {"until": until_}), s
"generate_until", (context, {"until": until_}), s
)
res.append(s)
......@@ -271,7 +271,7 @@ class OpenaiCompletionsLM(LM):
raise NotImplementedError()
def _model_generate(self, context, max_length, eos_token_id):
# Isn't used because we override greedy_until
# Isn't used because we override generate_until
raise NotImplementedError()
def loglikelihood_rolling(self, requests) -> List[float]:
......
......@@ -58,7 +58,7 @@ class TextSynthLM(LM):
@property
def eot_token_id(self):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
# Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
raise NotImplementedError()
@property
......@@ -72,20 +72,20 @@ class TextSynthLM(LM):
@property
def batch_size(self):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
# Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
raise NotImplementedError()
@property
def device(self):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
# Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
raise NotImplementedError()
def tok_encode(self, string: str):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
# Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
raise NotImplementedError()
def tok_decode(self, tokens):
# Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until
# Isn't used because we override loglikelihood, loglikelihood_rolling and generate_until
raise NotImplementedError()
def loglikelihood(self, requests):
......@@ -122,7 +122,7 @@ class TextSynthLM(LM):
"input tokenization support from TextSynth."
)
def greedy_until(self, requests):
def generate_until(self, requests):
if not requests:
return []
......@@ -146,7 +146,7 @@ class TextSynthLM(LM):
s = resp["text"]
res.append(s)
self.cache_hook.add_partial("greedy_until", (inp, request_args), s)
self.cache_hook.add_partial("generate_until", (inp, request_args), s)
else:
logger.error(
f"The following response does not contain generated `text`. "
......@@ -160,5 +160,5 @@ class TextSynthLM(LM):
raise NotImplementedError()
def _model_generate(self, context, max_length, eos_token_id):
# Isn't used because we override greedy_until
# Isn't used because we override generate_until
raise NotImplementedError()
import os
import ast
from typing import Dict
from lm_eval import utils
from lm_eval.logger import eval_logger
from lm_eval.utils import eval_logger
# Prompt library.
# Stores prompts in a dictionary indexed by 2 levels:
......@@ -47,6 +48,14 @@ def get_prompt(prompt_id: str, dataset_name: str = None, subset_name: str = None
raise ValueError(
f"{prompt_name} not in prompt list {prompts.all_template_names}"
)
elif ".yaml" in category_name:
import yaml
with open(category_name, "rb") as file:
prompt_yaml_file = yaml.full_load(file)
prompt_string = prompt_yaml_file["prompts"][prompt_name]
return PromptString(prompt_string)
else:
try:
return PROMPT_REGISTRY[category_name][prompt_name]
......@@ -57,21 +66,62 @@ def get_prompt(prompt_id: str, dataset_name: str = None, subset_name: str = None
)
def load_prompt_list(use_prompt: str, dataset_name=None, subset_name=None, **kwargs):
def load_prompt_list(
use_prompt: str, dataset_name=None, subset_name=None, yaml_path=None, **kwargs
):
from promptsource.templates import DatasetTemplates
category_name, prompt_name = use_prompt.split(":")
if subset_name is None:
prompts = DatasetTemplates(dataset_name=dataset_name)
else:
prompts = DatasetTemplates(dataset_name=dataset_name, subset_name=subset_name)
if category_name == "promptsource":
from promptsource.templates import DatasetTemplates
if subset_name is None:
prompts = DatasetTemplates(dataset_name=dataset_name)
else:
prompts = DatasetTemplates(
dataset_name=dataset_name, subset_name=subset_name
)
prompt_list = utils.pattern_match(prompt_name, prompts.all_template_names)
elif ".yaml" in category_name:
import yaml
category_name, *prompt_name = use_prompt.split(":")
if yaml_path is not None:
category_name = os.path.realpath(os.path.join(yaml_path, category_name))
with open(category_name, "rb") as file:
prompt_yaml_file = yaml.full_load(file)
prompt_list = utils.pattern_match(
prompt_name, prompt_yaml_file["prompts"].keys()
)
# category_name, *prompt_name = use_prompt.split(":")
# TODO allow to multiple prompt naming
# if len(prompt_name) > 1:
# prompt_list = []
# for prompt in prompt_name:
# prompt_list.append(utils.pattern_match(prompt_name, prompts.all_template_names))
# else:
prompt_list = utils.pattern_match(prompt_name, prompts.all_template_names)
# prompt_list = utils.pattern_match(prompt_name, prompts.all_template_names)
return [":".join([category_name, prompt]) for prompt in prompt_list]
class PromptString:
def __init__(self, prompt_string):
self.prompt_string = prompt_string
def apply(self, doc):
doc_to_text = self.prompt_string["doc_to_text"]
doc_to_target = self.prompt_string["doc_to_target"]
# TODO need a way to process doc_to_choice
if "doc_to_choice" in self.prompt_string:
raise "Not yet implemented to accept doc_to_choice"
text_string = utils.apply_template(doc_to_text, doc)
target_string = utils.apply_template(doc_to_target, doc)
return [text_string, target_string]
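A minimal sketch of how a YAML-defined prompt flows through `PromptString` (the prompt entry and document are hypothetical, and `utils.apply_template` is assumed to render `{{field}}` placeholders the same way it does for regular task configs):

```python
from lm_eval.prompts import PromptString  # assuming it is importable from lm_eval.prompts

# Hypothetical entry as it would appear under the `prompts:` key of a prompt
# YAML file referenced as "my_prompts.yaml:qa_prompt".
prompt_string = {
    "doc_to_text": "Question: {{question}}\nAnswer:",
    "doc_to_target": "{{answer}}",
}

doc = {"question": "What is the capital of France?", "answer": "Paris"}

text, target = PromptString(prompt_string).apply(doc)
# text   -> "Question: What is the capital of France?\nAnswer:"
# target -> "Paris"
```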
......@@ -59,6 +59,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [x] MGSM
- [ ] SCROLLS
- [x] Babi
- [x] Belebele
# Novel Tasks
Tasks added in the revamped harness that were not previously available. Again, a strikethrough denotes checking performed *against the original task's implementation or published results introducing the task*.
......
......@@ -4,7 +4,6 @@ from typing import List, Union, Dict
from lm_eval import utils
from lm_eval import prompts
from lm_eval.logger import eval_logger
from lm_eval.api.task import TaskConfig, Task, ConfigurableTask
from lm_eval.api.registry import (
register_task,
......@@ -14,6 +13,21 @@ from lm_eval.api.registry import (
ALL_TASKS,
)
import logging
# import python tasks
from .squadv2.task import SQuAD2
from .scrolls.task import (
QuALITY,
NarrativeQA,
ContractNLI,
GovReport,
SummScreenFD,
QMSum,
)
eval_logger = utils.eval_logger
def register_configurable_task(config: Dict[str, str]) -> int:
SubClass = type(
......@@ -27,7 +41,9 @@ def register_configurable_task(config: Dict[str, str]) -> int:
register_task(task_name)(SubClass)
if "group" in config:
if type(config["group"]) == str:
if config["group"] == config["task"]:
raise ValueError("task and group name cannot be the same")
elif type(config["group"]) == str:
group_name = [config["group"]]
else:
group_name = config["group"]
......@@ -38,18 +54,20 @@ def register_configurable_task(config: Dict[str, str]) -> int:
return 0
def register_configurable_group(config: Dict[str, str]) -> int:
def register_configurable_group(config: Dict[str, str], yaml_path: str = None) -> int:
group = config["group"]
all_task_list = config["task"]
config_list = [task for task in all_task_list if type(task) != str]
task_list = [task for task in all_task_list if type(task) == str]
for task_config in config_list:
task_config = utils.load_yaml_config(yaml_path, task_config)
var_configs = check_prompt_config(
{
**task_config,
**{"group": group},
}
},
yaml_path=os.path.dirname(yaml_path),
)
for config in var_configs:
register_configurable_task(config)
......@@ -66,13 +84,16 @@ def register_configurable_group(config: Dict[str, str]) -> int:
return 0
def check_prompt_config(config: Dict[str, str]) -> List[Dict[str, str]]:
def check_prompt_config(
config: Dict[str, str], yaml_path: str = None
) -> List[Dict[str, str]]:
all_configs = []
if "use_prompt" in config:
prompt_list = prompts.load_prompt_list(
use_prompt=config["use_prompt"],
dataset_name=config["dataset_path"],
subset_name=config["dataset_name"] if "dataset_name" in config else None,
yaml_path=yaml_path,
)
for idx, prompt_variation in enumerate(prompt_list):
all_configs.append(
......@@ -85,11 +106,13 @@ def check_prompt_config(config: Dict[str, str]) -> List[Dict[str, str]]:
config["task"]
if "task" in config
else get_task_name_from_config(config),
prompt_variation,
prompt_variation.split("/")[-1]
if ".yaml" in prompt_variation
else prompt_variation,
]
)
},
**{"output_type": "greedy_until"},
**{"output_type": "generate_until"},
}
)
else:
......@@ -104,41 +127,64 @@ def get_task_name_from_config(task_config: Dict[str, str]) -> str:
return "{dataset_path}".format(**task_config)
def include_task_folder(task_dir: str, register_task=True) -> None:
def include_task_folder(task_dir: str, register_task: bool = True) -> None:
"""
Walk `task_dir` recursively and register every YAML config found there: task configs when `register_task` is True, benchmark groups otherwise.
"""
for root, subdirs, file_list in os.walk(task_dir):
if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
for f in file_list:
if f.endswith(".yaml"):
yaml_path = os.path.join(root, f)
try:
config = utils.load_yaml_config(yaml_path)
# if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
for f in file_list:
if f.endswith(".yaml"):
yaml_path = os.path.join(root, f)
try:
config = utils.load_yaml_config(yaml_path)
if "task" not in config:
continue
all_configs = check_prompt_config(
config, yaml_path=os.path.dirname(yaml_path)
)
for config in all_configs:
if register_task:
all_configs = check_prompt_config(config)
for config in all_configs:
if type(config["task"]) == str:
register_configurable_task(config)
else:
# If a `task` in config is a list,
# that means it's a benchmark
if type(config["task"]) == list:
register_configurable_group(config)
except Exception as error:
eval_logger.warning(
"Failed to load config in\n"
f" {yaml_path}\n"
" Config will not be added to registry\n"
f" Error: {error}"
)
register_configurable_group(config, yaml_path)
# Log this silently and show it only when
# the user defines the appropriate verbosity.
except ModuleNotFoundError as e:
eval_logger.debug(
f"{yaml_path}: {e}. Config will not be added to registry."
)
except Exception as error:
import traceback
eval_logger.debug(
"Failed to load config in\n"
f" {yaml_path}\n"
" Config will not be added to registry\n"
f" Error: {error}\n"
f" Traceback: {traceback.format_exc()}"
)
return 0
task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_task_folder(task_dir)
# Register Benchmarks after all tasks have been added
include_task_folder(task_dir, register_task=False)
def include_path(task_dir):
include_task_folder(task_dir)
# Register Benchmarks after all tasks have been added
include_task_folder(task_dir, register_task=False)
return 0
def initialize_tasks(verbosity="INFO"):
eval_logger.setLevel(getattr(logging, f"{verbosity}"))
task_dir = os.path.dirname(os.path.abspath(__file__)) + "/"
include_path(task_dir)
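A minimal usage sketch for the registration entry points defined above; the custom directory path is a placeholder.

```python
from lm_eval import tasks

# Register the built-in YAML tasks first (benchmark groups are registered in
# a second pass), then pull in additional configs from a user directory.
tasks.initialize_tasks(verbosity="INFO")
tasks.include_path("/path/to/my/custom/tasks")  # placeholder path
```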
def get_task(task_name, config):
......@@ -166,7 +212,6 @@ def get_task_name_from_object(task_object):
# TODO: pass num_fewshot and other cmdline overrides in a better way
def get_task_dict(task_name_list: List[Union[str, Dict, Task]], **kwargs):
config = {**kwargs}
task_name_from_registry_dict = {}
......@@ -178,7 +223,6 @@ def get_task_dict(task_name_list: List[Union[str, Dict, Task]], **kwargs):
for task_element in task_name_list:
if isinstance(task_element, str):
if task_element in GROUP_REGISTRY:
group_name = task_element
for task_name in GROUP_REGISTRY[task_element]:
......@@ -216,7 +260,6 @@ def get_task_dict(task_name_list: List[Union[str, Dict, Task]], **kwargs):
}
elif isinstance(task_element, Task):
task_name_from_object_dict = {
**task_name_from_object_dict,
get_task_name_from_object(task_element): task_element,
......
task: babi
dataset_path: Muennighoff/babi
dataset_name: null
output_type: greedy_until
output_type: generate_until
training_split: train
validation_split: valid
test_split: test
......
# BigBenchHard
## Paper
Title: `Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them`
Abstract: https://arxiv.org/abs/2210.09261
A suite of 23 challenging BIG-Bench tasks which we call BIG-Bench Hard (BBH).
These are the tasks for which prior language model evaluations did not outperform
the average human-rater.
Homepage: https://github.com/suzgunmirac/BIG-Bench-Hard
## Citation
```
@article{suzgun2022challenging,
title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them},
author={Suzgun, Mirac and Scales, Nathan and Sch{\"a}rli, Nathanael and Gehrmann, Sebastian and Tay, Yi and Chung, Hyung Won and Chowdhery, Aakanksha and Le, Quoc V and Chi, Ed H and Zhou, Denny and Wei, Jason},
journal={arXiv preprint arXiv:2210.09261},
year={2022}
}
```
### Groups and Tasks
#### Groups
- `bbh_flan_zeroshot`
- `bbh_flan_fewshot`
- `bbh_flan_cot_fewshot`
- `bbh_flan_cot_zeroshot`
#### Tasks
- ...
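Once the groups are registered they can be run through the Python API; a minimal sketch, assuming the refactored `simple_evaluate` entry point and a registered Hugging Face model wrapper (the model alias, checkpoint, and group name below are placeholders):

```python
from lm_eval import evaluator, tasks

tasks.initialize_tasks()  # register YAML tasks and groups first

results = evaluator.simple_evaluate(
    model="hf",                      # assumed alias for the Hugging Face wrapper
    model_args="pretrained=gpt2",    # placeholder checkpoint
    tasks=["bbh_flan_cot_fewshot"],  # any registered group or task name
)
print(results["results"])
```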
### Checklist
- [x] Is in Eval-harness v1.0?
- [ ] Has been checked for regression from v1.0?
- [ ] Has been checked for equivalence with original paper methodology?
- [ ] "Main" checked variant clearly denoted?
### Variant Wishlist
- [ ] Variant with Calculator (see https://github.com/openai/grade-school-math/blob/master/grade_school_math/calculator.py for example implementation)
- [ ] Using Verifiers
- [ ] Majority voting "without CoT"
"""
Take in a base YAML and output a YAML config for every BBH subtask derived from it
"""
import os
import re
import yaml
import requests
import argparse
import datasets
from tqdm import tqdm
from lm_eval import utils
from lm_eval.logger import eval_logger
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_yaml_path", required=True)
parser.add_argument("--save_prefix_path", default="flan_zeroshot")
parser.add_argument("--cot", default=False)
parser.add_argument("--fewshot", default=False)
parser.add_argument("--task_prefix", default="")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our other YAMLs.
base_yaml_name = os.path.split(args.base_yaml_path)[-1]
with open(args.base_yaml_path) as f:
base_yaml = yaml.full_load(f)
base_doc_to_text = "Q: {{input}}\nA:"
answer_regex = re.compile("(?<=answer is )(.*)(?=.)")
dataset_path = "lukaemon/bbh"
for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()):
resp = requests.get(
f"https://raw.githubusercontent.com/suzgunmirac/BIG-Bench-Hard/main/cot-prompts/{task}.txt"
).content.decode("utf-8")
prompt = resp.split("\n-----\n")[-1]
description, *few_shot = prompt.split("\n\nQ:")
prefix_doc_to_text = ""
if args.fewshot:
if args.cot:
prefix_doc_to_text = " ".join(few_shot)
else:
for shot in few_shot:
shot = "Q:" + shot
try:
answer = answer_regex.search(shot)[0]
except Exception:
print("task", task)
print(shot)
example = shot.split("Let's think step by step.")[0]
prefix_doc_to_text += f"{example}{answer}\n\n"
doc_to_text = prefix_doc_to_text + base_doc_to_text
if args.cot:
doc_to_text = doc_to_text + " Let's think step by step.\n"
yaml_dict = {
"include": base_yaml_name,
"task": f"bbh_{args.task_prefix}_{task}",
"dataset_name": task,
"description": description + "\n\n",
"doc_to_text": doc_to_text,
}
file_save_path = args.save_prefix_path + f"/{task}.yaml"
eval_logger.info(f"Saving yaml for subset {task} to {file_save_path}")
with open(file_save_path, "w") as yaml_file:
yaml.dump(
yaml_dict,
yaml_file,
width=float("inf"),
allow_unicode=True,
default_style='"',
)
group: bbh_flan_cot_fewshot
dataset_path: lukaemon/bbh
output_type: generate_until
test_split: test
doc_to_target: "{{target}}"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
# ignore_case: true
# ignore_punctuation: true
generation_kwargs:
until:
- "</s>"
do_sample: false
temperature: 0.0
filter_list:
- name: "get-answer"
filter:
- function: "regex"
regex_pattern: "(?<=the answer is )(.*)(?=.)"
- function: "take_first"
"dataset_name": "boolean_expressions"
"description": "Evaluate the result of a random Boolean expression.\n\n"
"doc_to_text": " not ( ( not not True ) ) is\nA: Let's think step by step.\nRemember that (i) expressions inside brackets are always evaluated first and that (ii) the order of operations from highest priority to lowest priority is \"not\", \"and\", \"or\", respectively.\nWe first simplify this expression \"Z\" as follows: \"Z = not ( ( not not True ) ) = not ( ( A ) )\" where \"A = not not True\".\nLet's evaluate A: A = not not True = not (not True) = not False = True.\nPlugging in A, we get: Z = not ( ( A ) ) = not ( ( True ) ) = not True = False. So the answer is False. True and False and not True and True is\nA: Let's think step by step.\nRemember that (i) expressions inside brackets are always evaluated first and that (ii) the order of operations from highest priority to lowest priority is \"not\", \"and\", \"or\", respectively.\nWe first simplify this expression \"Z\" as follows: \"Z = True and False and not True and True = A and B\" where \"A = True and False\" and \"B = not True and True\".\nLet's evaluate A: A = True and False = False.\nLet's evaluate B: B = not True and True = not (True and True) = not (True) = False.\nPlugging in A and B, we get: Z = A and B = False and False = False. So the answer is False. not not ( not ( False ) ) is\nA: Let's think step by step.\nRemember that (i) expressions inside brackets are always evaluated first and that (ii) the order of operations from highest priority to lowest priority is \"not\", \"and\", \"or\", respectively.\nWe first simplify this expression \"Z\" as follows: \"Z = not not ( not ( False ) ) = not not ( A )\" where \"A = not ( False )\".\nLet's evaluate A: A = not ( False ) = not False = True.\nPlugging in A, we get: Z = not not ( A ) = not not (True) = not not False = True. So the answer is True.Q: {{input}}\nA: Let's think step by step.\n"
"include": "_flan_cot_fewshot_template_yaml"
"task": "bbh_flan_cot_fewshot_boolean_expressions"
"dataset_name": "causal_judgement"
"description": "Answer questions about causal attribution.\n\n"
"doc_to_text": " How would a typical person answer each of the following questions about causation?\nFrank T., had an ongoing dispute with his neighbor over a stretch of land and one day decided to shoot his neighbor in the body. Frank T. had no experience with guns, his hand slipped on the barrel of the gun, and the shot went wild. Nonetheless, the bullet bounced off a large boulder several feet away and hit the neighbor's body, causing significant injury. Did Frank T. intentionally shoot his neighbor in the body?\nOptions:\n- Yes\n- No\nA: Let's think step by step.\nHere in this question, we are told that \"Frank T. had no experience with guns, his hand slipped on the barrel of the gun, and the shot went wild.\" A typical person would assume that this passage suggests that Frank T. had no intention of shooting and injuring someone and that the bullet accidentally hit the neighbor's body; therefore, we conclude that Frank T. did not intentionally hit his neighbor. So the answer is No. How would a typical person answer each of the following questions about causation?\nSuzy and Billy are working on a project that is very important for our nation's security. The boss tells them both: \"Be sure that you are here at exactly 9 am. It is absolutely essential that you arrive at that time.\" Both Billy and Suzy arrive at 9 am. As it happens, there was a motion detector installed in the room where they arrived. The motion detector was set up to be triggered if at least one person appeared in the room at the same time. So the motion detector went off. Did Billy cause the motion detector to go off?\nOptions:\n- Yes\n- No\nA: Let's think step by step.\nHere in this question, we are told that the boss ordered them both to arrive at the meeting room at the same time and that the motion detector was set up to be triggered if at least one person appeared in the room at the same time.\" A typical person would assume that the person probably meant to say the detector was set up to be triggered if \"both persons\" appeared in the room at the same time, not at least one person, since otherwise the phrase \"at the same time\" would not make much sense in that sentence. Because the motion detector went off, a typical person would therefore come to the conclusion that both Suzy and Billy triggered the motion detector to go off; hence, Billy did indeed cause the motion detector to go off. So the answer is Yes. How would a typical person answer each of the following questions about causation?\nGeorge and his sister Lena reunite at their parents' house for Thanksgiving. Whereas George just got into medical school, Lena is unhappy in her marriage and recently lost her job. Over the course of the day, George and Lena get into a number of heated arguments. Later in the afternoon they play a game of darts. They split the first two games, and the third game is close until the end. Who will win comes down to George's last shot. If he hits a high point region, he wins; if he hits a low point region, Lena wins. George thinks of the difficult time Lena is having, and he really wants to let her win. He aims the dart at the low point region. He sets up his shot and the dart lands in the low point region. After his shot, Lena wins the game and is very happy. 
Did George hit the low point region intentionally?\nOptions:\n- Yes\n- No\nA: Let's think step by step.\nHere in this question, we are told that \"He aims the dart at the low point region.\" A typical person might therefore think George did intentionally hit the low point region, because he wanted to lift up the spirit of his sister Lena. So the answer is Yes.Q: {{input}}\nA: Let's think step by step.\n"
"include": "_flan_cot_fewshot_template_yaml"
"task": "bbh_flan_cot_fewshot_causal_judgement"
"dataset_name": "date_understanding"
"description": "Infer the date from context.\n\n"
"doc_to_text": " Today is Christmas Eve of 1937. What is the date 10 days ago in MM/DD/YYYY?\nOptions:\n(A) 12/14/2026\n(B) 12/14/1950\n(C) 12/14/2007\n(D) 12/14/1937\n(E) 07/14/1938\n(F) 12/14/1988\nA: Let's think step by step.\nIf today is Christmas Eve of 1937, then today's date is December 24, 1937. 10 days before today is December 14, 1937, that is 12/14/1937. So the answer is (D). Tomorrow is 11/12/2019. What is the date one year ago from today in MM/DD/YYYY?\nOptions:\n(A) 09/04/2018\n(B) 11/11/2018\n(C) 08/25/2018\n(D) 11/02/2018\n(E) 11/04/2018\nA: Let's think step by step.\nIf tomorrow is 11/12/2019, then today is 11/11/2019. The date one year ago from today is 11/11/2018. So the answer is (B). Jane and John married on Jan 2, 1958. It is their 5-year anniversary today. What is the date tomorrow in MM/DD/YYYY?\nOptions:\n(A) 01/11/1961\n(B) 01/03/1963\n(C) 01/18/1961\n(D) 10/14/1960\n(E) 01/03/1982\n(F) 12/03/1960\nA: Let's think step by step.\nIf Jane and John married on Jan 2, 1958, then and if it is their 5-year anniversary today, then today's date is Jan 2, 1963. The date tomorrow is Jan 3, 1963, that is 01/03/1963. So the answer is (B).Q: {{input}}\nA: Let's think step by step.\n"
"include": "_flan_cot_fewshot_template_yaml"
"task": "bbh_flan_cot_fewshot_date_understanding"
"dataset_name": "disambiguation_qa"
"description": "Clarify the meaning of sentences with ambiguous pronouns.\n\n"
"doc_to_text": " In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.\nSentence: The chief told the counselor that they took the day off.\nOptions:\n(A) The chief took the day off\n(B) The counselor took the day off\n(C) Ambiguous\nA: Let's think step by step.\nHere we need to determine who the pronoun \"they\" might be referring to. There are two possible referents for \"they\", namely the chief and the counselor. The verb \"told\" might be able to help us determine which one is more likely (if either). Let X be the chief and Y the counselor. The sentence is then of the form \"X told Y that (X or Y) did something.\"\nLet's consider Y first: \"X told Y that Y did something.\" This case does not make much sense, as Y would already have the information that Y did something, because it is information about themself.\nNow, consider X: \"X told Y that X did something.\" This makes sense, because X would be sharing some information about themself that Y might not have known before.\nBecause in this context, X is the chief and Y is the counselor, the answer should be the chief. So the answer is (A). In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.\nSentence: The manager sent a message to the secretary, but he didn't reply yet.\nOptions:\n(A) The secretary didn't reply yet\n(B) The manager didn't reply yet\n(C) Ambiguous\nA: Let's think step by step.\nHere we need to determine who the pronoun \"he\" might be referring to. There are two possible referents for \"he\", namely the manager and the secretary. The verbs \"sent\" and \"reply\" might be able to help us determine which one is more likely (if either). Let X be the manager and Y the secretary. The sentence is then of the form \"X sent a message to Y, but (X or Y) didn't reply yet.\"\nLet's consider Y first: \"X sent a message to Y, but Y didn't reply yet.\" This case makes sense, because of the implicit causality of the sentence. Y was the receiver of the message, but Y didn't get back to X yet.\nNow, consider X: \"X sent a message to Y, but X didn't reply yet.\" This case doesn't make sense, because X was the initial sender of the message, so it is now Y's turn to write back to X.\nBecause in this context, X is the manager and Y is the secretary, the answer should be the secretary. So the answer is (A). In the following sentences, explain the antecedent of the pronoun (which thing the pronoun refers to), or state that it is ambiguous.\nSentence: Bailey will plan to meet the director at his office\nOptions:\n(A) It will be Bailey's office\n(B) It will be the director's office\n(C) Ambiguous\nA: Let's think step by step.\nHere we need to determine who the pronoun \"his\" might be referring to. There are two possible referents for \"his\", namely Bailey's and the director's. The verb phrase \"plan to meet\" might be able to help us determine which one is more likely (if either). Let X be Bailey and Y the director. 
The sentence is then of the form \"X will plan to meet Y at (X or Y)'s office.\"\nLet's consider Y first: \"X will plan to meet Y at Y's office.\" This case makes sense, because X might want to meet up with Y at Y's office.\nNow, consider X: \"X will plan to meet Y at X's office.\" This case also makes sense, because X might want to meet up with Y at X's own office.\nBecause both X and Y are possible at the same time, we conclude that the antecedent of the pronoun is ambiguous. So the answer is (C).Q: {{input}}\nA: Let's think step by step.\n"
"include": "_flan_cot_fewshot_template_yaml"
"task": "bbh_flan_cot_fewshot_disambiguation_qa"
"dataset_name": "dyck_languages"
"description": "Correctly close a Dyck-n word.\n\n"
"doc_to_text": " Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: [ { [\nA: Let's think step by step.\nWe should process each input one by one and keep track of the stack configuration.\n0: empty stack\n1: [ ; stack: [\n2: { ; stack: [ {\n3: [ ; stack: [ { [\nNow, we have reached the end. The final stack is \"[ { [\".\nWe will need to pop out \"[\", \"{\", \"[\" one by one in that order.\nSo, we need \"]\", \"}\", \"]\". So the answer is ] } ]. Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: < > ( ( [ [ ( { } ) [ < > ] ]\nA: Let's think step by step.\nWe should process each input one by one and keep track of the stack configuration.\n0: empty stack\n1: < ; stack: <\n2: > ; stack: empty\n3: ( ; stack: (\n4: ( ; stack: ( (\n5: [ ; stack: ( ( [\n6: [ ; stack: ( ( [ [\n7: ( ; stack: ( ( [ [ (\n8: { ; stack: ( ( [ [ ( {\n9: } ; stack: ( ( [ [ (\n10: ) ; stack: ( ( [ [\n11: [ ; stack: ( ( [ [ [\n12: < ; stack: ( ( [ [ [ <\n13: > ; stack: ( ( [ [ [\n14: ] ; stack: ( ( [ [\n15: ] ; stack: ( ( [\nNow, we have reached the end. The final stack is \"( ( [\".\nWe will need to pop out \"[\", \"(\", \"(\" one by one in that order.\nSo, we need \"]\", \")\", \")\". So the answer is ] ) ). Complete the rest of the sequence, making sure that the parentheses are closed properly. Input: < [ < [ { < [ ] < { } > > } ] > { { ( ) } { < [ < > ] > }\nA: Let's think step by step.\nWe should process each input one by one and keep track of the stack configuration.\n0: empty stack\n1: < ; stack: <\n2: [ ; stack: < [\n3: < ; stack: < [ <\n4: [ ; stack: < [ < [\n5: { ; stack: < [ < [ {\n6: < ; stack: < [ < [ { <\n7: [ ; stack: < [ < [ { < [\n8: ] ; stack: < [ < [ { <\n9: < ; stack: < [ < [ { < <\n10: { ; stack: < [ < [ { < < {\n11: } ; stack: < [ < [ { < <\n12: > ; stack: < [ < [ { <\n13: > ; stack: < [ < [ {\n14: } ; stack: < [ < [\n15: ] ; stack: < [ <\n16: > ; stack: < [\n17: { ; stack: < [ {\n18: { ; stack: < [ { {\n19: ( ; stack: < [ { { (\n20: ) ; stack: < [ { {\n21: } ; stack: < [ {\n22: { ; stack: < [ { {\n23: < ; stack: < [ { { <\n24: [ ; stack: < [ { { < [\n25: < ; stack: < [ { { < [ <\n26: > ; stack: < [ { { < [\n27: ] ; stack: < [ { { <\n28: > ; stack: < [ { {\n29: } ; stack: < [ {\nNow, we have reached the end. The final stack is \"< [ {\".\nWe will need to pop out \"{\", \"[\", \"<\" one by one in that order.\nSo, we need \"}\", \"]\", \">\". So the answer is } ] >.Q: {{input}}\nA: Let's think step by step.\n"
"include": "_flan_cot_fewshot_template_yaml"
"task": "bbh_flan_cot_fewshot_dyck_languages"