Commit 6e3ef5ff authored by Benjamin Fattori

Merge remote-tracking branch 'upstream/big-refactor' into refactor-more-tasks

parents 026d2c21 070b6b9c
......@@ -44,10 +44,10 @@ To install additional multilingual tokenization and text segmentation packages,
pip install -e ".[multilingual]"
```
To support loading GPTQ quantized models, install the package with the `auto-gptq` extra:
To support loading GPTQ quantized models, install the package with the `gptq` extra:
```bash
pip install -e ".[auto-gptq]"
pip install -e ".[gptq]"
```
## Basic Usage
......@@ -94,6 +94,25 @@ accelerate launch main.py \
This will perform *data-parallel evaluation*: that is, placing a **single full copy** of your model onto each available GPU and *splitting batches across GPUs* to evaluate on K GPUs K times faster than on one.
However, if your model *is too large to be run on a single one of your GPUs*, then we provide an alternative method to run these large models: use of the `parallelize` argument.
```bash
python main.py \
--model hf \
    --model_args pretrained=EleutherAI/pythia-12b,parallelize=True \
--tasks lambada_openai,arc_easy \
--batch_size 16
```
To pass even more advanced keyword arguments to `accelerate`, the following arguments are also supported (a combined example is sketched after the note below):
- `device_map_option`: How to split model weights across available GPUs. Defaults to `"auto"`.
- `max_memory_per_gpu`: The maximum GPU memory to use per GPU when loading the model.
- `max_cpu_memory`: The maximum amount of CPU memory to use when offloading model weights to RAM.
- `offload_folder`: A folder where model weights will be offloaded to disk if needed.
Using this setting helps with massive models such as BLOOM that do not fit on a single GPU, and with avoiding spikes in total system RAM usage (by default, `accelerate launch` initializes one copy of the model per GPU in RAM before moving it to the GPU, producing RAM usage spikes around the start of the script that may cause errors such as `Killed`). However, `parallelize=True` naively splits the model across GPUs, so only a single GPU performs work at any point in time; it is therefore much slower than data-parallel evaluation with `accelerate launch`, potentially by a factor of the total number of GPUs.
**Note that this option requires launching evaluation via `python main.py` rather than `accelerate launch main.py`.**
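For example, these options can be combined in a single `--model_args` string. This is a sketch only: the `max_memory_per_gpu` value of `40GiB` is an arbitrary placeholder and should be adjusted to your hardware.
```bash
python main.py \
    --model hf \
    --model_args pretrained=EleutherAI/pythia-12b,parallelize=True,device_map_option=auto,max_memory_per_gpu=40GiB,offload_folder=./offload \
    --tasks lambada_openai,arc_easy \
    --batch_size 16
```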
### Commercial APIs
......@@ -141,17 +160,17 @@ For models loaded with the HuggingFace `transformers` library, any arguments pr
```bash
python main.py \
--model hf \
--model_args pretrained=EleutherAI/gpt-j-6b,peft=nomic-ai/gpt4all-j-lora \
--model_args pretrained=EleutherAI/gpt-j-6b,parallelize=True,load_in_4bit=True,peft=nomic-ai/gpt4all-j-lora \
--tasks openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq \
--device cuda:0
```
GPTQ quantized models can be loaded by specifying their file names in `,quantized=NAME` (or `,quantized=True` for default names) in the `model_args` argument:
[GPTQ](https://github.com/PanQiWei/AutoGPTQ) quantized models can be loaded by specifying their file names in `,gptq=NAME` (or `,gptq=True` for default names) in the `model_args` argument:
```bash
python main.py \
--model hf \
--model_args pretrained=model-name-or-path,quantized=model.safetensors,gptq_use_triton=True \
--model_args pretrained=model-name-or-path,gptq=model.safetensors,gptq_use_triton=True \
--tasks hellaswag
```
......
......@@ -19,6 +19,9 @@ class LM(abc.ABC):
(inputs/outputs should be tokenization-agnostic.)
"""
# set rank and world size to a single process, by default.
self._rank = 0
self._world_size = 1
self.cache_hook = CacheHook(None)
@abc.abstractmethod
......@@ -118,14 +121,14 @@ class LM(abc.ABC):
# used in the case of parallelism. Hardcoded to
# ensure no errors arise using API models which do
# not support multi-device parallelism nor expect it.
return 0
return self._rank
@property
def world_size(self):
# used in the case of parallelism. Hardcoded to
# ensure no errors arise using API models which do
# not support multi-device parallelism nor expect it.
return 1
return self._world_size
def set_cache_hook(self, cache_hook):
self.cache_hook = cache_hook
......
......@@ -283,7 +283,7 @@ class Task(abc.ABC):
else:
eval_logger.warning(
"has_training_docs and has_validation_docs are False"
", using test_docs but this is not recommended."
", using test_docs as fewshot_docs but this is not recommended."
)
return self.test_docs()
......@@ -354,7 +354,8 @@ class Task(abc.ABC):
fewshot_ctx = self.fewshot_context(
doc, self._config.num_fewshot, rnd=random.Random()
)
# TODO: we should override this if doing greedy gen so users don't waste time+compute
# TODO: we should override self._config.repeats if doing greedy gen so users don't waste time+compute
inst = self.construct_requests(
doc=doc,
ctx=fewshot_ctx,
......
......@@ -195,11 +195,6 @@ def evaluate(
versions[task_name] = task.VERSION
configs[task_name] = dict(task.dump_config())
# deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order
# task_docs = list(task_doc_func())
# rnd = random.Random()
# rnd.seed(42)
# rnd.shuffle(task_docs)
if limit is not None:
if task.has_test_docs():
task_docs = task.test_docs()
......@@ -257,13 +252,12 @@ def evaluate(
task.apply_filters()
### Collect values of metrics on all datapoints ###
# TODO: make metric configurable, add metric registry
vals = collections.defaultdict(list)
# unpack results and sort back in order and return control to Task
for task_name, task in task_dict.items():
# calculate values for each filter setup (TODO: make getting list of keys cleaner)
# TODO: make it possible to use a different metric per key
# TODO: make it possible to use a different metric per filter
# iterate over different filters used
for key in task.instances[0].filtered_resps.keys():
doc_iterator = (
itertools.islice(
......@@ -286,6 +280,7 @@ def evaluate(
"doc_id": doc_id,
"doc": doc,
"target": target,
"arguments": requests[0].args,
"resps": [req.resps for req in requests],
"filtered_resps": [req.filtered_resps[key] for req in requests],
}
......@@ -296,6 +291,15 @@ def evaluate(
if lm.world_size > 1:
# if multigpu, then gather data across all ranks
# first gather logged samples across all ranks
for task_name, task_samples in list(samples.items()):
full_samples = [None] * lm.world_size
torch.distributed.all_gather_object(full_samples, task_samples)
samples[task_name] = list(itertools.chain.from_iterable(full_samples))
# then collect metrics across all ranks
vals_torch = collections.defaultdict(list)
for (task_name, key, metric), items in vals.items():
......
import torch
import transformers
from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from peft import __version__ as PEFT_VERSION, PeftModel
import copy
from collections import defaultdict
from tqdm import tqdm
from pathlib import Path
import torch.nn.functional as F
......@@ -16,7 +18,32 @@ from lm_eval.api.registry import register_model
from lm_eval.utils import MultiTokenEOSCriteria, stop_sequences_criteria
from accelerate import Accelerator
from typing import List, Union
from typing import List, Optional, Union
def _get_accelerate_args(
device_map_option: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload",
) -> dict:
"""Returns the kwargs needed to apply `accelerate` in `AutoModel.from_pretrained`."""
max_memory = {}
if max_memory_per_gpu is not None:
max_memory_per_gpu_map = {
device_idx: max_memory_per_gpu
for device_idx in range(torch.cuda.device_count())
}
max_memory.update(max_memory_per_gpu_map)
if max_cpu_memory is not None:
max_memory["cpu"] = max_cpu_memory
args = {}
if max_memory:
args["max_memory"] = max_memory
args["device_map"] = device_map_option
args["offload_folder"] = offload_folder
return args
@register_model("hf-auto", "hf", "huggingface")
......@@ -33,14 +60,31 @@ class HFLM(LM):
def __init__(
self,
device="cuda",
pretrained="gpt2",
revision="main",
low_cpu_mem_usage=None,
max_length=None,
subfolder=None,
tokenizer=None,
batch_size=1,
pretrained: Optional[str] = "gpt2",
revision: Optional[str] = "main",
subfolder: Optional[str] = None,
tokenizer: Optional[str] = None,
max_length: Optional[int] = None,
device: Optional[str] = "cuda",
dtype: Optional[Union[str, torch.dtype]] = "auto",
batch_size: Optional[int] = 1,
low_cpu_mem_usage: Optional[bool] = True,
trust_remote_code: Optional[bool] = False,
# arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`.
parallelize: Optional[bool] = False,
device_map_option: Optional[str] = "auto",
max_memory_per_gpu: Optional[Union[int, str]] = None,
max_cpu_memory: Optional[Union[int, str]] = None,
offload_folder: Optional[str] = "./offload",
# PEFT and quantization options
peft: Optional[str] = None,
load_in_8bit: Optional[bool] = False,
load_in_4bit: Optional[bool] = False,
bnb_4bit_quant_type: Optional[str] = None,
bnb_4bit_compute_dtype: Optional[Union[str, torch.dtype]] = None,
gptq: Optional[Union[bool, str]] = False,
gptq_use_triton: Optional[bool] = False,
):
super().__init__()
......@@ -49,10 +93,16 @@ class HFLM(LM):
assert isinstance(batch_size, int)
gpus = torch.cuda.device_count()
accelerator = Accelerator()
if gpus <= 1:
if not (parallelize or accelerator.num_processes > 1):
# use user-passed device
device_list = set(
["cuda", "cpu"]
+ [f"cuda:{i}" for i in range(torch.cuda.device_count())]
)
if device:
if device not in ["cuda", "cpu"]:
if device not in device_list:
device = int(device)
self._device = torch.device(device)
eval_logger.info(f"Using device '{device}'")
......@@ -64,19 +114,29 @@ class HFLM(LM):
if torch.cuda.is_available()
else torch.device("cpu")
)
self._rank = 0
self._world_size = 1
else:
self._device = "cpu"
eval_logger.info(
f"Using `accelerate launch` or `parallelize=True`, device '{device}' will be overridden when placing model."
)
# TODO: include in warning that `load_in_8bit` etc. affect this too
self._device = device
model_kwargs = {}
if parallelize:
model_kwargs = _get_accelerate_args(
device_map_option,
max_memory_per_gpu,
max_cpu_memory,
offload_folder,
)
# TODO: update this to be less of a hack once subfolder is fixed in HF
revision = revision + ("/" + subfolder if subfolder is not None else "")
# get config
self._config = transformers.AutoConfig.from_pretrained(
pretrained,
revision=revision,
trust_remote_code=trust_remote_code,
)
if getattr(self._config, "model_type") in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
......@@ -89,15 +149,67 @@ class HFLM(LM):
transformers.AutoModelForSeq2SeqLM,
]
self._model = self.AUTO_MODEL_CLASS.from_pretrained(
pretrained, revision=revision, low_cpu_mem_usage=low_cpu_mem_usage
).to(self.device)
if not gptq:
if load_in_4bit:
assert (
transformers.__version__ >= "4.30.0"
), "load_in_4bit requires transformers >= 4.30.0"
if transformers.__version__ >= "4.30.0":
model_kwargs["load_in_4bit"] = load_in_4bit
if load_in_4bit:
if bnb_4bit_quant_type:
model_kwargs["bnb_4bit_quant_type"] = bnb_4bit_quant_type
if bnb_4bit_compute_dtype:
model_kwargs["bnb_4bit_compute_dtype"] = utils.get_dtype(
bnb_4bit_compute_dtype
)
self._model = self.AUTO_MODEL_CLASS.from_pretrained(
pretrained,
revision=revision,
torch_dtype=utils.get_dtype(dtype),
low_cpu_mem_usage=low_cpu_mem_usage,
trust_remote_code=trust_remote_code,
load_in_8bit=load_in_8bit,
**model_kwargs,
)
else:
try:
from auto_gptq import AutoGPTQForCausalLM
except ModuleNotFoundError:
raise Exception(
"Tried to load auto_gptq, but auto-gptq is not installed ",
"please install auto-gptq via pip install lm-eval[gptq] or pip install -e .[gptq]",
)
self._model = AutoGPTQForCausalLM.from_quantized(
pretrained,
model_basename=None if gptq is True else Path(gptq).stem,
low_cpu_mem_usage=low_cpu_mem_usage,
trust_remote_code=trust_remote_code,
use_safetensors=True if gptq is True else gptq.endswith(".safetensors"),
use_triton=gptq_use_triton,
warmup_triton=gptq_use_triton,
**model_kwargs,
)
if peft:
if load_in_4bit:
assert PEFT_VERSION >= "0.4.0", "load_in_4bit requires peft >= 0.4.0"
self._model = PeftModel.from_pretrained(
self._model, peft, revision=revision
)
# forever after, access self._model through self.model property
self.model.eval()
self.model.tie_weights()
if gpus <= 1 and not parallelize:
# place model onto device, if not using HF Accelerate in any form
self.model.to(self.device)
self.tokenizer = transformers.AutoTokenizer.from_pretrained(
pretrained if tokenizer is None else tokenizer,
revision=revision,
trust_remote_code=trust_remote_code,
)
self.vocab_size = self.tokenizer.vocab_size
......@@ -106,12 +218,18 @@ class HFLM(LM):
self._max_length = max_length
# multithreading and batching
self.batch_size_per_gpu = batch_size # todo: adaptive batch size
self.batch_size_per_gpu = batch_size
# multigpu support with accelerate
# multigpu data-parallel support when launched with accelerate
if gpus > 1:
accelerator = Accelerator()
if gpus > accelerator.num_processes:
if parallelize:
if accelerator.num_processes > 1:
raise RuntimeError(
"Attempted to use both a HF Accelerate `device_map` and to launch via `accelerate launch`. If this is the case, please either remove `parallelize=True` from --model_args or launch outside of the Accelerate launcher."
)
else:
pass
elif gpus > accelerator.num_processes:
# TODO: make sure there's still never an edge case where we unintentionally default to CPU
eval_logger.warning(
"WARNING: The number of total system GPUs does not match the number of spawned processes. "
......@@ -302,16 +420,27 @@ class HFLM(LM):
return logits
def _encode_pair(self, context, continuation):
# move any trailing whitespace at the end of the context onto the continuation,
# so that the whitespace token(s) are scored as part of the continuation
n_spaces = len(context) - len(context.rstrip())
if n_spaces > 0:
continuation = context[-n_spaces:] + continuation
context = context[:-n_spaces]
whole_enc = self.tok_encode(context + continuation)
context_enc = self.tok_encode(context)
context_enc_len = len(context_enc)
continuation_enc = whole_enc[context_enc_len:]
return context_enc, continuation_enc
def loglikelihood(self, requests):
new_reqs = []
for context, continuation in [req.args for req in requests]:
if context == "":
# end of text as context
context_enc = [self.eot_token_id]
context_enc, continuation_enc = [self.eot_token_id], self.tok_encode(
continuation
)
else:
context_enc = self.tok_encode(context)
continuation_enc = self.tok_encode(continuation)
context_enc, continuation_enc = self._encode_pair(context, continuation)
new_reqs.append(((context, continuation), context_enc, continuation_enc))
......@@ -383,7 +512,6 @@ class HFLM(LM):
tqdm(re_ord.get_reordered(), disable=(disable_tqdm or (self.rank != 0))),
self.batch_size,
):
inps = []
cont_toks_list = []
inplens = []
......@@ -480,12 +608,11 @@ class HFLM(LM):
multi_logits = F.log_softmax(
self._model_call(batched_inps, **call_kwargs), dim=-1
).cpu() # [batch, padding_length (inp or cont), vocab]
) # [batch, padding_length (inp or cont), vocab]
for (cache_key, _, _), logits, inplen, cont_toks in zip(
chunk, multi_logits, inplens, cont_toks_list
):
# Slice to original seq length
contlen = len(cont_toks)
# take only logits in the continuation
......@@ -500,7 +627,9 @@ class HFLM(LM):
# Check if per-token argmax is exactly equal to continuation
greedy_tokens = logits.argmax(dim=-1)
cont_toks = torch.tensor(cont_toks, dtype=torch.long).unsqueeze(
cont_toks = torch.tensor(
cont_toks, dtype=torch.long, device=self.device
).unsqueeze(
0
) # [1, seq]
max_equal = (greedy_tokens == cont_toks).all()
......
# v1.0 Tasks
This list keeps track of which tasks' implementations have been ported to YAML / v2.0 of the Eval Harness.
Boxes should be checked iff tasks are implemented in the refactor and tested for regression. Tasks should be struck through if checked *against original introducing paper* implementation or popularizing implementation.
Boxes should be checked iff tasks are implemented in the refactor and tested for regression. Tasks should be struck through if checked *against original introducing paper* implementation or popularizing implementation. (WIP) denotes that a PR or person is already working on this task.
- [ ] Glue (WIP)
- [x] SuperGlue
......@@ -12,7 +12,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [ ] Lambada (Multilingual)
- [x] Wikitext
- [x] PiQA
- [ ] PROST
- [ ] PROST (WIP)
- [ ] MCTACO
- [x] Pubmed QA
- [x] SciQ
......@@ -21,11 +21,16 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [ ] TriviaQA
- [x] AI2 ARC
- [ ] LogiQA
- [ ] HellaSwag
- [x] HellaSwag
- [x] SWAG
- [x] OpenBookQA
- [ ] SQuADv2
- [x] RACE
- [ ] LogiQA (WIP)
- [x] HellaSwag
- [ ] SWAG (WIP)
- [x] OpenBookQA
- [ ] SQuADv2 (WIP)
- [ ] HeadQA
- [ ] MathQA
- [ ] WebQs
......@@ -35,7 +40,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [ ] Hendrycks Ethics
- [ ] TruthfulQA
- [ ] MuTual
- [ ] Hendrycks Math
- [ ] Hendrycks Math (WIP)
- [ ] Asdiv
- [ ] GSM8k
- [x] Arithmetic
......@@ -45,6 +50,8 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [x] ~~Pile (perplexity)~~
- [ ] BLiMP
- [ ] ToxiGen
- [ ] StoryCloze
- [ ] NaturalQs
- [ ] CrowS-Pairs
- [ ] XCopa
- [ ] BIG-Bench
......@@ -53,6 +60,9 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [ ] PAWS-X
- [ ] XNLI
- [ ] MGSM
- [ ] SCROLLS
- [ ] JSON Task (reference: https://github.com/EleutherAI/lm-evaluation-harness/pull/481)
- [ ] Babi
# Novel Tasks
Tasks added in the revamped harness that were not previously available. Again, a strikethrough denotes checking performed *against the original task's implementation or published results introducing the task*.
......
......@@ -124,28 +124,6 @@ def get_task_dict(task_name_list: List[Union[str, dict, Task]], **kwargs):
get_task_name_from_object(task_element): task_element,
}
# task_name_from_registry_dict = {
# task_name: get_task(
# task_name=task_name,
# task_config=config
# )
# for group_name in task_name_list for task_name in GROUP_REGISTRY[group_name]
# if (isinstance(group_name, str)) and (group_name in GROUP_REGISTRY)
# }
# task_name_from_config_dict = {
# get_task_name_from_config(task_config): ConfigurableTask(
# config=task_config
# )
# for task_config in task_name_list
# if isinstance(task_config, dict)
# }
# # TODO: Do we still need this?
# task_name_from_object_dict = {
# get_task_name_from_object(task_object): task_object
# for task_object in task_name_list
# if isinstance(task_object, Task)
# }
assert set(task_name_from_registry_dict.keys()).isdisjoint(
set(task_name_from_object_dict.keys())
)
......
......@@ -25,7 +25,7 @@ metric_list:
regexes_to_ignore:
- ","
- "\\$"
delimiter: "\n\n"
fewshot_delimiter: "\n\n"
generation_kwargs:
until:
- "Q:"
......
......@@ -7,7 +7,7 @@ output_type: multiple_choice
training_split: train
validation_split: validation
test_split: null
template_aliases: "{% set gold = label %}{% set answer_choices = endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', ' ', ' ')|list %}"
template_aliases: "{% set gold = label | int %}{% set answer_choices = endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', ' ', ' ')|list %}"
doc_to_text: "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace(' ', ' ')}}"
doc_to_target: "{{answer_choices[gold]}}"
gold_alias: "{{gold}}"
......
group:
- super-glue-promptsource
task: "GPT-3 Style"
dataset_path: super_glue
dataset_name: boolq
training_split: train
validation_split: validation
use_prompt: "promptsource:GPT-3 Style"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "based on the previous passage"
use_prompt: "promptsource:based on the previous passage"
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "based on the following passage"
use_prompt: "promptsource:based on the following passage"
group:
- super-glue-lm-eval-v1
- super-glue-lm-eval-v1-seq2seq
task: "boolq-seq2seq"
dataset_path: super_glue
dataset_name: boolq
......
group:
- super-glue-lm-eval-v1
task: "default"
task: "cb"
dataset_path: super_glue
dataset_name: cb
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: "{{premise}}\nQuestion: {{hypothesis}}. True, False, or Neither?\nAnswer:"
doc_to_target: "{{answer_choices[labe]}}"
doc_to_target: "{{answer_choices[label]}}"
gold_alias: "{{label}}" # this will be cast to an int.
template_aliases: "{% set answer_choices = ['True', 'False', 'Neither'] %}"
metric_list:
......
group:
- super-glue-promptsource
task: "GPT-3 style"
dataset_path: super_glue
dataset_name: cb
training_split: train
validation_split: validation
use_prompt: "promptsource:GPT-3 style"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "MNLI crowdsource"
use_prompt: "promptsource:MNLI crowdsource"
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "based on the previous passage"
use_prompt: "promptsource:based on the previous passage"
group:
- super-glue-t5-prompt
task: t5-prompt
reference: "From Raffel et al. 2019"
task: super_glue-cb-t5-prompt
dataset_path: super_glue
dataset_name: cb
training_split: train
......
group:
- super-glue-lm-eval-v1
task: "copa"
dataset_path: super_glue
dataset_name: copa
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: !function utils.doc_to_text
doc_to_target: !function utils.doc_to_target
gold_alias: "{{label}}" # this will be cast to an int.
template_aliases: "{% set answer_choices = [{{doc.choice1}}, 'b'] %} {{answer_choices}}"
metric_list:
- metric: acc
group:
- super-glue-promptsource
task: "C1 or C2? premise, so/because…"
dataset_path: super_glue
dataset_name: copa
training_split: train
validation_split: validation
use_prompt: "promptsource:C1 or C2? premise, so/because…"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true