Unverified commit 24e3e3fa authored by Lintang Sutawika, committed by GitHub

Merge pull request #590 from gakada/big-refactor-merge

Merge master into big-refactor
parents 3fc5bedc b7c3580a
@@ -9,5 +9,5 @@ jobs:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: 3.9
       - uses: pre-commit/action@v2.0.3
@@ -12,7 +12,7 @@ repos:
       - id: check-merge-conflict
       - id: check-symlinks
       - id: check-yaml
-        args: ['--unsafe']
+        args: ["--unsafe"]
       - id: destroyed-symlinks
       - id: detect-private-key
       - id: end-of-file-fixer
@@ -33,7 +33,7 @@ repos:
     rev: 22.3.0
     hooks:
       - id: black
-        language_version: python3.8
+        language_version: python3.9
   - repo: https://github.com/codespell-project/codespell
     rev: v2.1.0
     hooks:
...
-* @jon-tow @StellaAthena
+* @haileyschoelkopf @lintangsutawika
@@ -8,7 +8,7 @@ We’d like your help to test it out! you can help by:
 1. Trying out your current workloads on the big-refactor branch, and seeing if anything breaks or is counterintuitive,
 2. Porting tasks supported in the previous version of the harness to the new YAML configuration format. Please check out our [task implementation guide](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/docs/new_task_guide.md) for more information.

 If you choose to port a task not yet completed according to [our checklist](https://github.com/EleutherAI/lm-evaluation-harness/blob/big-refactor/lm_eval/tasks/README.md), then you can contribute it by opening a PR containing [Refactor] in the name with:
 - A command of the form `python main.py --model hf-causal --model_args ..... --tasks <task name> ...` which will run the task in the `master` branch, and what the score is
 - A command of the form `python main.py --model hf-causal --model_args ..... --tasks <task name> ...` to run the task in your PR branch to `big-refactor`, and what the resulting score is, to show that we achieve equality between the two implementations (see the example pair below).
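For example, a PR for a hypothetical ported task `mytask` (task name and model arguments here are placeholders, not from this diff) might include:

```
# score on the master branch
python main.py --model hf-causal --model_args pretrained=gpt2 --tasks mytask
# score on the PR branch targeting big-refactor, same settings
python main.py --model hf-causal --model_args pretrained=gpt2 --tasks mytask
```

together with both scores, which should agree.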
...
 ROUGE
 rouge
 nin
+maka
+mor
+te
@@ -37,14 +37,16 @@ def simple_evaluate(
     tasks=[],
     num_fewshot=0,
     batch_size=None,
+    max_batch_size=None,
     device=None,
     no_cache=False,
     limit=None,
     bootstrap_iters=100000,
     check_integrity=False,
     decontamination_ngrams_path=None,
+    write_out=False,
+    output_base_path=None,
 ):
"""Instantiate and evaluate a model on a list of tasks. """Instantiate and evaluate a model on a list of tasks.
:param model: Union[str, LM] :param model: Union[str, LM]
...@@ -56,18 +58,24 @@ def simple_evaluate( ...@@ -56,18 +58,24 @@ def simple_evaluate(
List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise. List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
:param num_fewshot: int :param num_fewshot: int
Number of examples in few-shot context Number of examples in few-shot context
:param batch_size: int, optional :param batch_size: int or str, optional
Batch size for model Batch size for model
:param max_batch_size: int, optional
Maximal batch size to try with automatic batch size detection
:param device: str, optional :param device: str, optional
PyTorch device (e.g. "cpu" or "cuda:0") for running models PyTorch device (e.g. "cpu" or "cuda:0") for running models
:param no_cache: bool :param no_cache: bool
Whether or not to cache Whether or not to cache
:param limit: int, optional :param limit: int or float, optional
Limit the number of examples per task (only use this for testing) Limit the number of examples per task (only use this for testing), If <1, limit is a percentage of the total number of examples.
:param bootstrap_iters: :param bootstrap_iters:
Number of iterations for bootstrap statistics Number of iterations for bootstrap statistics
:param check_integrity: bool :param check_integrity: bool
Whether to run the relevant part of the test suite for the tasks Whether to run the relevant part of the test suite for the tasks
:param write_out: bool
If True, write details about prompts and logits to json for all tasks
:param output_base_path: str, optional
Directory to which detailed eval info will be written. Defaults to present working dir.
:return :return
Dictionary of results Dictionary of results
""" """
@@ -80,7 +88,12 @@ def simple_evaluate(
         if model_args is None:
             model_args = ""

         lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
-            model_args, {"batch_size": batch_size, "device": device}
+            model_args,
+            {
+                "batch_size": batch_size,
+                "max_batch_size": max_batch_size,
+                "device": device,
+            },
         )
     else:
         assert isinstance(model, lm_eval.api.model.LM)
@@ -97,15 +110,22 @@ def simple_evaluate(
         limit=limit,
         bootstrap_iters=bootstrap_iters,
         decontamination_ngrams_path=decontamination_ngrams_path,
+        write_out=write_out,
+        output_base_path=output_base_path,
     )

     if lm.rank == 0:
         # add info about the model and few shot config
         results["config"] = {
-            "model": model,
+            "model": model
+            if isinstance(model, str)
+            else model.model.config._name_or_path,
             "model_args": model_args,
             "num_fewshot": num_fewshot,
             "batch_size": batch_size,
+            "batch_sizes": list(lm.batch_sizes.values())
+            if hasattr(lm, "batch_sizes")
+            else [],
             "device": device,
             "no_cache": no_cache,
             "limit": limit,
@@ -127,6 +147,8 @@ def evaluate(
     limit=None,
     bootstrap_iters=100000,
     decontamination_ngrams_path=None,
+    write_out=False,
+    output_base_path=None,
 ):
"""Instantiate and evaluate a model on a list of tasks. """Instantiate and evaluate a model on a list of tasks.
...@@ -140,6 +162,10 @@ def evaluate( ...@@ -140,6 +162,10 @@ def evaluate(
Limit the number of examples per task (only use this for testing) Limit the number of examples per task (only use this for testing)
:param bootstrap_iters: :param bootstrap_iters:
Number of iterations for bootstrap statistics Number of iterations for bootstrap statistics
:param write_out: bool
If True, write all prompts, logits and metrics to json for offline analysis
:param output_base_path: str, optional
Directory to which detailed eval info will be written. Defaults to present working dir
:return :return
Dictionary of results Dictionary of results
""" """
@@ -166,6 +192,14 @@ def evaluate(
         # rnd = random.Random()
         # rnd.seed(42)
         # rnd.shuffle(task_docs)

+        if limit is not None:
+            if task.has_test_docs():
+                task_docs = task.test_docs()
+            elif task.has_validation_docs():
+                task_docs = task.validation_docs()
+            else:
+                raise RuntimeError("Task has neither test_docs nor validation_docs")
+            limit = int(len(task_docs) * limit) if limit < 1.0 else int(limit)
         task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size)
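The fractional `limit` added above reduces to one line of arithmetic; a standalone sketch (function name and doc counts are illustrative, not from the diff):

```python
def resolve_limit(limit, n_docs):
    # A float below 1.0 acts as a fraction of the available docs;
    # anything else acts as an absolute example count.
    return int(n_docs * limit) if limit < 1.0 else int(limit)

assert resolve_limit(0.05, 2000) == 100  # 5% of 2000 docs
assert resolve_limit(100, 2000) == 100   # absolute cap
```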
...
import os
from lm_eval.base import BaseLM
from tqdm import tqdm
import time


def anthropic_completion(
    client, model, prompt, max_tokens_to_sample, temperature, stop
):
    """Query Anthropic API for completion.

    Retry with back-off until they respond.
    """
    import anthropic

    backoff_time = 3
    while True:
        try:
            response = client.completion(
                prompt=f"{anthropic.HUMAN_PROMPT} {prompt}{anthropic.AI_PROMPT}",
                model=model,
                # NOTE: Claude really likes to do CoT, and overly aggressive stop sequences
                # (e.g. gsm8k's ":") may truncate a lot of the input.
                stop_sequences=[anthropic.HUMAN_PROMPT] + stop,
                max_tokens_to_sample=max_tokens_to_sample,
                temperature=temperature,
            )
            print(response)
            return response["completion"]
        except RuntimeError:
            # TODO: we don't yet know which exception the Anthropic client
            # raises on a timeout, so update this handler once we find out.
            import traceback

            traceback.print_exc()
            time.sleep(backoff_time)
            backoff_time *= 1.5
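# Note: with backoff_time starting at 3 seconds and a 1.5x multiplier, successive
# retries wait 3.0, 4.5, 6.75, ~10.1 seconds, and so on.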
class AnthropicLM(BaseLM):
    REQ_CHUNK_SIZE = 20

    def __init__(self, model):
        """
        :param model: str
            Anthropic model e.g. claude-instant-v1
        """
        super().__init__()
        import anthropic

        self.model = model
        self.client = anthropic.Client(os.environ["ANTHROPIC_API_KEY"])

    @property
    def eot_token_id(self):
        raise NotImplementedError("No idea about anthropic tokenization.")

    @property
    def max_length(self):
        return 2048

    @property
    def max_gen_toks(self):
        return 256

    @property
    def batch_size(self):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()

    @property
    def device(self):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()

    def tok_encode(self, string: str):
        raise NotImplementedError("No idea about anthropic tokenization.")

    def tok_decode(self, tokens):
        raise NotImplementedError("No idea about anthropic tokenization.")

    def _loglikelihood_tokens(self, requests, disable_tqdm=False):
        raise NotImplementedError("No support for logits.")

    def greedy_until(self, requests):
        if not requests:
            return []

        res = []
        for request in tqdm(requests):
            inp = request[0]
            request_args = request[1]
            until = request_args["until"]
            response = anthropic_completion(
                client=self.client,
                model=self.model,
                prompt=inp,
                max_tokens_to_sample=self.max_gen_toks,
                temperature=0.0,
                stop=until,
            )
            res.append(response)
        return res

    def _model_call(self, inps):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()

    def _model_generate(self, context, max_length, eos_token_id):
        # Isn't used because we override greedy_until
        raise NotImplementedError()
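Both `greedy_until` above and the TextSynth fix below consume requests as `(context, request_args)` tuples whose args dict carries the stop sequences under `"until"`; a minimal illustration (the prompt and stop values are invented):

```python
# Hypothetical request in the shape the harness passes to greedy_until:
requests = [
    ("Question: What is the capital of France?\nAnswer:", {"until": ["\n"]}),
]
for context, request_args in requests:
    stop_sequences = request_args["until"]  # e.g. ["\n"]
```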
@@ -125,7 +125,8 @@ class TextSynthLM(LM):
         res = []
         for request in tqdm(requests):
             inp = request[0]
-            until = request[1]
+            request_args = request[1]
+            until = request_args["until"]
             response = textsynth_completion(
                 url=self.api_url + "/v1/engines/" + self.engine + "/completions",
                 headers={"Authorization": "Bearer " + self.api_key},
...
@@ -8,13 +8,19 @@ import functools
 import subprocess
 import collections
 import importlib.util
+import fnmatch
-from typing import List
+from typing import List, Union
+import gc
+
+import torch
 from omegaconf import OmegaConf
 from jinja2 import BaseLoader, Environment, StrictUndefined
 from itertools import islice

+from lm_eval.logger import eval_logger


 class ExitCodeError(Exception):
     pass
@@ -25,6 +31,29 @@ def sh(x):
         raise ExitCodeError()


+def escaped_split(text, sep_char, maxsplit=-1):
+    """Split text into a list on occurrences of the given separation
+    character `sep_char`. The separation character may be escaped by a
+    backslash to avoid splitting at that location.
+
+    The separation character must be a string of size 1.
+
+    If `maxsplit` is given, at most `maxsplit` splits are done (thus,
+    the list will have at most `maxsplit + 1` elements). If `maxsplit`
+    is not specified or less than 0, then there is no limit on the
+    number of splits (all possible splits are made).
+    """
+    assert (
+        len(sep_char) == 1
+    ), "separation string must be a single character for escaped splitting"
+
+    if maxsplit == 0:
+        return text
+
+    maxsplit = max(0, maxsplit)
+
+    return re.split(r"(?<!\\)" + sep_char, text, maxsplit)
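A quick sanity check of `escaped_split` (toy values):

```python
escaped_split(r"a,b\,c,d", ",")     # -> ['a', 'b\\,c', 'd']; the escaped comma survives
escaped_split(r"a,b\,c,d", ",", 1)  # -> ['a', 'b\\,c,d']; only the first split is made
```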
 def simple_parse_args_string(args_string):
     """
     Parses something like
@@ -44,11 +73,11 @@ def join_iters(iters):
         yield from iter


-def chunks(iter, n):
+def chunks(iter, n=0, fn=None):
     arr = []
-    for x in iter:
+    for i, x in enumerate(iter):
         arr.append(x)
-        if len(arr) == n:
+        if len(arr) == (fn(i) if fn else n):
             yield arr
             arr = []
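The reworked `chunks` sizes chunks statically via `n` or dynamically via `fn`, which maps the running element index to a target size so callers can vary chunk sizes per position; toy values:

```python
list(chunks(range(4), n=2))             # -> [[0, 1], [2, 3]]
list(chunks(range(6), fn=lambda i: 3))  # -> [[0, 1, 2], [3, 4, 5]]
```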
@@ -65,6 +94,35 @@ def group(arr, fn):
     return list(res.values())


+class MultiChoice:
+    def __init__(self, choices):
+        self.choices = choices
+
+    # Simple wildcard support (linux filename patterns)
+    def __contains__(self, values):
+        for value in values.split(","):
+            if len(fnmatch.filter(self.choices, value)) == 0:
+                eval_logger.warning("{} is not in task list.".format(value))
+                eval_logger.info(f"Available tasks to choose:")
+                for choice in self.choices:
+                    eval_logger.info(f"  - {choice}")
+        return True
+
+    def __iter__(self):
+        for choice in self.choices:
+            yield choice
+
+
+# Returns a list containing all values of the source_list that
+# match at least one of the patterns
+def pattern_match(patterns, source_list):
+    task_names = set()
+    for pattern in patterns:
+        for matching in fnmatch.filter(source_list, pattern):
+            task_names.add(matching)
+    return sorted(list(task_names))
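Wildcard task selection with the relocated helpers, assuming both are imported from `lm_eval.utils` as merged here (task names invented for illustration):

```python
tasks = ["xnli_en", "xnli_fr", "hellaswag"]
pattern_match(["xnli_*"], tasks)  # -> ['xnli_en', 'xnli_fr']
"xnli_*" in MultiChoice(tasks)    # -> True; unknown names only log a warning
```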
 def general_detokenize(string):
     string = string.replace(" n't", "n't")
     string = string.replace(" )", ")")
@@ -122,6 +180,26 @@ def make_disjoint_window(pair):
     return a[: len(a) - (len(b) - 1)], b


+def select_continuation_from_batch_left_padding(
+    generations: Union[List[List[int]], torch.Tensor], max_context_size: int
+):
+    """Select the continuation from the batch, removing prompts of different lengths.
+
+    Args:
+        generations (Union[List[List[int]], torch.Tensor]):
+            A tensor or list-of-lists of shape [batch_size, sequence length].
+        max_context_size (int):
+            The size of the biggest context; generations will proceed from that
+            index.
+
+    Example:
+        PAD PAD Continue : The dog chased the cat [every day of the week]
+        Riddle me this   : The dog chased the cat [yesterday] PAD PAD PAD PAD
+
+    Output:
+        [every day of the week]
+        [yesterday] PAD PAD PAD PAD
+    """
+    return generations[:, max_context_size:]
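Concretely, with a left-padded batch as a tensor (toy token ids):

```python
import torch

gens = torch.tensor([
    [0, 4, 7, 8, 9],  # pad, short prompt; continuation starts at index 2
    [5, 6, 7, 1, 1],  # max-length prompt [5, 6]; continuation plus right pads
])
select_continuation_from_batch_left_padding(gens, max_context_size=2)
# -> tensor([[7, 8, 9],
#            [7, 1, 1]])
```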
 class Reorderer:
     def __init__(self, arr, fn):
         self.size = len(arr)
@@ -336,3 +414,8 @@ def create_iterator(raw_iterator, rank, world_size, limit=None):
     among ranks in multigpu setting or only pulling a sample of documents
     """
     return islice(raw_iterator, rank, limit, world_size)
+
+
+def clear_torch_cache():
+    gc.collect()
+    torch.cuda.empty_cache()
@@ -13,54 +13,41 @@ from lm_eval.logger import eval_logger

 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-class MultiChoice:
-    def __init__(self, choices):
-        self.choices = choices
-
-    # Simple wildcard support (linux filename patterns)
-    def __contains__(self, values):
-        for value in values.split(","):
-            if len(fnmatch.filter(self.choices, value)) == 0:
-                eval_logger.warning("{} is not in task list.".format(value))
-                eval_logger.info(f"Available tasks to choose:")
-                for choice in self.choices:
-                    eval_logger.info(f"  - {choice}")
-        return True
-
-    def __iter__(self):
-        for choice in self.choices:
-            yield choice
-
-
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model", required=True)
     parser.add_argument("--model_args", default="")
-    parser.add_argument("--tasks", default=None, choices=MultiChoice(sorted(ALL_TASKS)))
+    parser.add_argument(
+        "--tasks", default=None, choices=utils.MultiChoice(sorted(ALL_TASKS))
+    )
     parser.add_argument("--config", default=None)
-    parser.add_argument("--provide_description", action="store_true")
     parser.add_argument("--num_fewshot", type=int, default=0)
     parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument(
+        "--max_batch_size",
+        type=int,
+        default=None,
+        help="Maximal batch size to try with --batch_size auto",
+    )
     parser.add_argument("--device", type=str, default=None)
     parser.add_argument("--output_path", default=None)
-    parser.add_argument("--limit", type=int, default=None)
+    parser.add_argument(
+        "--limit",
+        type=float,
+        default=None,
+        help="Limit the number of examples per task. "
+        "If <1, limit is a percentage of the total number of examples.",
+    )
+    parser.add_argument("--data_sampling", type=float, default=None)
     parser.add_argument("--no_cache", action="store_true")
     parser.add_argument("--decontamination_ngrams_path", default=None)
     parser.add_argument("--description_dict_path", default=None)
     parser.add_argument("--check_integrity", action="store_true")
+    parser.add_argument("--write_out", action="store_true", default=False)
+    parser.add_argument("--output_base_path", type=str, default=None)

     return parser.parse_args()
-
-
-# Returns a list containing all values of the source_list that
-# match at least one of the patterns
-def pattern_match(patterns, source_list):
-    task_names = set()
-    for pattern in patterns:
-        for matching in fnmatch.filter(source_list, pattern):
-            task_names.add(matching)
-    return sorted(list(task_names))
 def main():
     args = parse_args()
@@ -70,7 +57,9 @@ def main():
             "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
         )

-    if args.tasks is not None:
+    if args.tasks is None:
+        task_names = ALL_TASKS
+    else:
         if os.path.isdir(args.tasks):
             import glob
@@ -81,7 +70,7 @@ def main():
             task_names.append(config)
         else:
             tasks_list = args.tasks.split(",")
-            task_names = pattern_match(tasks_list, ALL_TASKS)
+            task_names = utils.pattern_match(tasks_list, ALL_TASKS)
             for task in [task for task in tasks_list if task not in task_names]:
                 if os.path.isfile(task):
                     config = utils.load_yaml_config(task)
@@ -89,23 +78,37 @@ def main():
     eval_logger.info(f"Selected Tasks: {task_names}")

+    # TODO: description_dict?
+    # description_dict = {}
+    # if args.description_dict_path:
+    #     with open(args.description_dict_path, "r") as f:
+    #         description_dict = json.load(f)
+
     results = evaluator.simple_evaluate(
         model=args.model,
         model_args=args.model_args,
         tasks=task_names,
         num_fewshot=args.num_fewshot,
         batch_size=args.batch_size,
+        max_batch_size=args.max_batch_size,
         device=args.device,
+        no_cache=args.no_cache,
         limit=args.limit,
+        # description_dict=description_dict,
         decontamination_ngrams_path=args.decontamination_ngrams_path,
         check_integrity=args.check_integrity,
+        write_out=args.write_out,
+        output_base_path=args.output_base_path,
     )

     if results is not None:
         samples = results.pop("samples")
         dumped = json.dumps(results, indent=2)
         print(dumped)
+        batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))

         if args.output_path:
             os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
@@ -127,8 +130,8 @@ def main():
             f.write_all(samples[task_name])

     print(
-        f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
-        f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
+        f"{args.model} ({args.model_args}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
+        f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
     )
     print(evaluator.make_table(results))
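Putting the new flags together, a hypothetical invocation (model, task, and output path are placeholders) could look like:

```
python main.py \
    --model hf-causal \
    --model_args pretrained=gpt2 \
    --tasks lambada_openai \
    --limit 0.1 \
    --write_out \
    --output_base_path ./eval_out
```

Here `--limit 0.1` evaluates 10% of each task's examples and `--write_out` dumps per-prompt details under `./eval_out`.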
...
# bloom-1b1
## bloom-1b1_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |23.63|± | 1.24|
| | |acc_norm|25.68|± | 1.28|
|arc_easy | 0|acc |51.47|± | 1.03|
| | |acc_norm|45.45|± | 1.02|
|boolq | 1|acc |59.08|± | 0.86|
|copa | 0|acc |68.00|± | 4.69|
|hellaswag | 0|acc |34.63|± | 0.47|
| | |acc_norm|41.77|± | 0.49|
|mc_taco | 0|em |14.49| | |
| | |f1 |32.43| | |
|openbookqa | 0|acc |19.60|± | 1.78|
| | |acc_norm|29.40|± | 2.04|
|piqa | 0|acc |67.14|± | 1.10|
| | |acc_norm|67.14|± | 1.10|
|prost | 0|acc |23.41|± | 0.31|
| | |acc_norm|30.50|± | 0.34|
|swag | 0|acc |43.43|± | 0.35|
| | |acc_norm|58.28|± | 0.35|
|winogrande | 0|acc |54.93|± | 1.40|
|wsc273 | 0|acc |68.50|± | 2.82|
## bloom-1b1_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 0.83|± | 0.25|
## bloom-1b1_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 1.38|± | 0.12|
| | |f1 | 4.01|± | 0.15|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.21|± | 0.21|
|math_geometry | 1|acc | 0.21|± | 0.21|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.19|± | 0.19|
|math_prealgebra | 1|acc | 0.11|± | 0.11|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |23.55|± | 0.78|
| | |acc_norm|23.62|± | 0.78|
## bloom-1b1_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |46.95|± | 1.12|
|pawsx_en| 0|acc |52.45|± | 1.12|
|pawsx_es| 0|acc |51.50|± | 1.12|
|pawsx_fr| 0|acc |46.15|± | 1.11|
|pawsx_ja| 0|acc |48.40|± | 1.12|
|pawsx_ko| 0|acc |49.90|± | 1.12|
|pawsx_zh| 0|acc |48.95|± | 1.12|
## bloom-1b1_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |26.44|± | 0.84|
| | |acc_norm |30.49|± | 0.88|
|headqa_es | 0|acc |24.43|± | 0.82|
| | |acc_norm |28.30|± | 0.86|
|logiqa | 0|acc |18.89|± | 1.54|
| | |acc_norm |25.65|± | 1.71|
|squad2 | 1|exact | 4.17| | |
| | |f1 | 6.60| | |
| | |HasAns_exact| 2.19| | |
| | |HasAns_f1 | 7.05| | |
| | |NoAns_exact | 6.14| | |
| | |NoAns_f1 | 6.14| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 2.68|± | 0.15|
|truthfulqa_mc| 1|mc1 |25.34|± | 1.52|
| | |mc2 |41.80|± | 1.46|
|webqs | 0|acc | 1.38|± | 0.26|
## bloom-1b1_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |45.57|± | 1.88|
| | |em |32.98|± | 1.95|
|drop| 1|em | 3.31|± | 0.18|
| | |f1 | 8.63|± | 0.22|
|race| 1|acc |32.63|± | 1.45|
## bloom-1b1_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 50.6|± | 2.24|
|xcopa_ht| 0|acc | 53.0|± | 2.23|
|xcopa_id| 0|acc | 64.8|± | 2.14|
|xcopa_it| 0|acc | 50.8|± | 2.24|
|xcopa_qu| 0|acc | 51.2|± | 2.24|
|xcopa_sw| 0|acc | 54.4|± | 2.23|
|xcopa_ta| 0|acc | 57.0|± | 2.22|
|xcopa_th| 0|acc | 53.2|± | 2.23|
|xcopa_tr| 0|acc | 53.0|± | 2.23|
|xcopa_vi| 0|acc | 62.4|± | 2.17|
|xcopa_zh| 0|acc | 59.4|± | 2.20|
## bloom-1b1_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.93|± | 0.67|
|xnli_bg| 0|acc |34.13|± | 0.67|
|xnli_de| 0|acc |39.64|± | 0.69|
|xnli_el| 0|acc |34.03|± | 0.67|
|xnli_en| 0|acc |51.48|± | 0.71|
|xnli_es| 0|acc |47.98|± | 0.71|
|xnli_fr| 0|acc |47.15|± | 0.71|
|xnli_hi| 0|acc |42.32|± | 0.70|
|xnli_ru| 0|acc |40.46|± | 0.69|
|xnli_sw| 0|acc |35.29|± | 0.68|
|xnli_th| 0|acc |33.75|± | 0.67|
|xnli_tr| 0|acc |34.79|± | 0.67|
|xnli_ur| 0|acc |37.33|± | 0.68|
|xnli_vi| 0|acc |44.45|± | 0.70|
|xnli_zh| 0|acc |36.23|± | 0.68|
## bloom-1b1_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |52.88|± | 1.28|
|xstory_cloze_en| 0|acc |62.54|± | 1.25|
|xstory_cloze_es| 0|acc |58.31|± | 1.27|
|xstory_cloze_eu| 0|acc |54.33|± | 1.28|
|xstory_cloze_hi| 0|acc |55.53|± | 1.28|
|xstory_cloze_id| 0|acc |57.91|± | 1.27|
|xstory_cloze_my| 0|acc |46.19|± | 1.28|
|xstory_cloze_ru| 0|acc |48.25|± | 1.29|
|xstory_cloze_sw| 0|acc |50.56|± | 1.29|
|xstory_cloze_te| 0|acc |56.39|± | 1.28|
|xstory_cloze_zh| 0|acc |58.04|± | 1.27|
## bloom-1b1_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |69.98|± | 0.95|
|xwinograd_fr| 0|acc |66.27|± | 5.22|
|xwinograd_jp| 0|acc |52.87|± | 1.61|
|xwinograd_pt| 0|acc |63.12|± | 2.98|
|xwinograd_ru| 0|acc |54.29|± | 2.81|
|xwinograd_zh| 0|acc |69.25|± | 2.06|
{
"results": {
"boolq": {
"acc": 0.5908256880733945,
"acc_stderr": 0.008599563442397352
},
"arc_easy": {
"acc": 0.5147306397306397,
"acc_stderr": 0.010255329977562096,
"acc_norm": 0.45454545454545453,
"acc_norm_stderr": 0.010217299762709435
},
"openbookqa": {
"acc": 0.196,
"acc_stderr": 0.017770751227744862,
"acc_norm": 0.294,
"acc_norm_stderr": 0.020395095484936614
},
"hellaswag": {
"acc": 0.3463453495319657,
"acc_stderr": 0.004748324319714264,
"acc_norm": 0.4177454690300737,
"acc_norm_stderr": 0.004921798492608764
},
"swag": {
"acc": 0.43431970408877335,
"acc_stderr": 0.0035044592489844794,
"acc_norm": 0.5828251524542637,
"acc_norm_stderr": 0.0034862531772295617
},
"arc_challenge": {
"acc": 0.2363481228668942,
"acc_stderr": 0.012414960524301834,
"acc_norm": 0.2568259385665529,
"acc_norm_stderr": 0.0127669237941168
},
"mc_taco": {
"em": 0.1448948948948949,
"f1": 0.32425976796237205
},
"wsc273": {
"acc": 0.684981684981685,
"acc_stderr": 0.028165854394193602
},
"winogrande": {
"acc": 0.5493291239147593,
"acc_stderr": 0.013983928869040239
},
"prost": {
"acc": 0.23409479077711356,
"acc_stderr": 0.003093545711826552,
"acc_norm": 0.3049743808710504,
"acc_norm_stderr": 0.003363606918420179
},
"copa": {
"acc": 0.68,
"acc_stderr": 0.04688261722621504
},
"piqa": {
"acc": 0.6713819368879217,
"acc_stderr": 0.010959127105167048,
"acc_norm": 0.6713819368879217,
"acc_norm_stderr": 0.010959127105167044
}
},
"versions": {
"boolq": 1,
"arc_easy": 0,
"openbookqa": 0,
"hellaswag": 0,
"swag": 0,
"arc_challenge": 0,
"mc_taco": 0,
"wsc273": 0,
"winogrande": 0,
"prost": 0,
"copa": 0,
"piqa": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"gsm8k": {
"acc": 0.008339651250947688,
"acc_stderr": 0.002504942226860508
}
},
"versions": {
"gsm8k": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
"num_fewshot": 8,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"mathqa": {
"acc": 0.2355108877721943,
"acc_stderr": 0.007767687364650971,
"acc_norm": 0.23618090452261306,
"acc_norm_stderr": 0.0077753193787470495
},
"gsm8k": {
"acc": 0.0,
"acc_stderr": 0.0
},
"drop": {
"em": 0.013842281879194632,
"em_stderr": 0.001196510970060749,
"f1": 0.040085989932885986,
"f1_stderr": 0.0014841664758736023
},
"math_geometry": {
"acc": 0.0020876826722338203,
"acc_stderr": 0.0020876826722338315
},
"math_counting_and_prob": {
"acc": 0.002109704641350211,
"acc_stderr": 0.002109704641350211
},
"math_prealgebra": {
"acc": 0.001148105625717566,
"acc_stderr": 0.0011481056257175708
},
"math_num_theory": {
"acc": 0.001851851851851852,
"acc_stderr": 0.0018518518518518448
},
"math_precalc": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_algebra": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_intermediate_algebra": {
"acc": 0.0,
"acc_stderr": 0.0
}
},
"versions": {
"mathqa": 0,
"gsm8k": 0,
"drop": 1,
"math_geometry": 1,
"math_counting_and_prob": 1,
"math_prealgebra": 1,
"math_num_theory": 1,
"math_precalc": 1,
"math_algebra": 1,
"math_intermediate_algebra": 1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
"num_fewshot": 5,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"pawsx_es": {
"acc": 0.515,
"acc_stderr": 0.011178102477052804
},
"pawsx_zh": {
"acc": 0.4895,
"acc_stderr": 0.011180669867648657
},
"pawsx_fr": {
"acc": 0.4615,
"acc_stderr": 0.011149934327957058
},
"pawsx_ko": {
"acc": 0.499,
"acc_stderr": 0.01118311365477017
},
"pawsx_de": {
"acc": 0.4695,
"acc_stderr": 0.011162310405413175
},
"pawsx_ja": {
"acc": 0.484,
"acc_stderr": 0.011177408788874897
},
"pawsx_en": {
"acc": 0.5245,
"acc_stderr": 0.011169702598013186
}
},
"versions": {
"pawsx_es": 0,
"pawsx_zh": 0,
"pawsx_fr": 0,
"pawsx_ko": 0,
"pawsx_de": 0,
"pawsx_ja": 0,
"pawsx_en": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"truthfulqa_mc": {
"mc1": 0.2533659730722154,
"mc1_stderr": 0.01522589934082683,
"mc2": 0.4179977378869182,
"mc2_stderr": 0.014601549068840484
},
"webqs": {
"acc": 0.013779527559055118,
"acc_stderr": 0.002586718737195641
},
"logiqa": {
"acc": 0.1889400921658986,
"acc_stderr": 0.01535436463822078,
"acc_norm": 0.2565284178187404,
"acc_norm_stderr": 0.017129443327887562
},
"squad2": {
"exact": 4.169123220752969,
"f1": 6.5956997780058355,
"HasAns_exact": 2.192982456140351,
"HasAns_f1": 7.05309437656277,
"NoAns_exact": 6.139613120269134,
"NoAns_f1": 6.139613120269134,
"best_exact": 50.07159100480081,
"best_f1": 50.07159100480081
},
"headqa_es": {
"acc": 0.24434719183078046,
"acc_stderr": 0.008207488987159709,
"acc_norm": 0.2830051057622174,
"acc_norm_stderr": 0.008604004902114394
},
"headqa_en": {
"acc": 0.26440554339897887,
"acc_stderr": 0.008423643607316284,
"acc_norm": 0.30488694383661563,
"acc_norm_stderr": 0.008793112278191295
},
"triviaqa": {
"acc": 0.026783346592415803,
"acc_stderr": 0.001517985028991893
}
},
"versions": {
"truthfulqa_mc": 1,
"webqs": 0,
"logiqa": 0,
"squad2": 1,
"headqa_es": 0,
"headqa_en": 0,
"triviaqa": 1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"drop": {
"em": 0.03313758389261745,
"em_stderr": 0.0018330841858875643,
"f1": 0.08634542785234882,
"f1_stderr": 0.0022136353860709133
},
"coqa": {
"f1": 0.4557083534540516,
"f1_stderr": 0.01876948425119881,
"em": 0.3298333333333334,
"em_stderr": 0.019473215823053027
},
"race": {
"acc": 0.3263157894736842,
"acc_stderr": 0.014510987877134932
}
},
"versions": {
"drop": 1,
"coqa": 1,
"race": 1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xcopa_id": {
"acc": 0.648,
"acc_stderr": 0.02138004238594605
},
"xcopa_ht": {
"acc": 0.53,
"acc_stderr": 0.022342748192502843
},
"xcopa_it": {
"acc": 0.508,
"acc_stderr": 0.02238020883492804
},
"xcopa_et": {
"acc": 0.506,
"acc_stderr": 0.022381462412439324
},
"xcopa_ta": {
"acc": 0.57,
"acc_stderr": 0.02216263442665284
},
"xcopa_th": {
"acc": 0.532,
"acc_stderr": 0.022337186479044296
},
"xcopa_sw": {
"acc": 0.544,
"acc_stderr": 0.022296238348407056
},
"xcopa_zh": {
"acc": 0.594,
"acc_stderr": 0.02198396209008634
},
"xcopa_qu": {
"acc": 0.512,
"acc_stderr": 0.02237662679792717
},
"xcopa_tr": {
"acc": 0.53,
"acc_stderr": 0.02234274819250285
},
"xcopa_vi": {
"acc": 0.624,
"acc_stderr": 0.021683827539286115
}
},
"versions": {
"xcopa_id": 0,
"xcopa_ht": 0,
"xcopa_it": 0,
"xcopa_et": 0,
"xcopa_ta": 0,
"xcopa_th": 0,
"xcopa_sw": 0,
"xcopa_zh": 0,
"xcopa_qu": 0,
"xcopa_tr": 0,
"xcopa_vi": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xnli_sw": {
"acc": 0.3528942115768463,
"acc_stderr": 0.0067520304764183674
},
"xnli_bg": {
"acc": 0.3413173652694611,
"acc_stderr": 0.006699490620395283
},
"xnli_el": {
"acc": 0.3403193612774451,
"acc_stderr": 0.006694754901092155
},
"xnli_hi": {
"acc": 0.4231536926147705,
"acc_stderr": 0.006980774514705842
},
"xnli_th": {
"acc": 0.3375249500998004,
"acc_stderr": 0.00668131870192652
},
"xnli_ar": {
"acc": 0.3393213572854291,
"acc_stderr": 0.006689986106838006
},
"xnli_de": {
"acc": 0.3964071856287425,
"acc_stderr": 0.0069114198150005334
},
"xnli_ru": {
"acc": 0.40459081836327343,
"acc_stderr": 0.006934900899149144
},
"xnli_vi": {
"acc": 0.44451097804391215,
"acc_stderr": 0.00702107269988888
},
"xnli_tr": {
"acc": 0.34790419161676644,
"acc_stderr": 0.006729921818907745
},
"xnli_ur": {
"acc": 0.37325349301397204,
"acc_stderr": 0.0068339592620100505
},
"xnli_fr": {
"acc": 0.47145708582834334,
"acc_stderr": 0.007053191822382807
},
"xnli_en": {
"acc": 0.5147704590818363,
"acc_stderr": 0.007061629189884944
},
"xnli_es": {
"acc": 0.47984031936127747,
"acc_stderr": 0.00705896771560341
},
"xnli_zh": {
"acc": 0.36227544910179643,
"acc_stderr": 0.006791418670232308
}
},
"versions": {
"xnli_sw": 0,
"xnli_bg": 0,
"xnli_el": 0,
"xnli_hi": 0,
"xnli_th": 0,
"xnli_ar": 0,
"xnli_de": 0,
"xnli_ru": 0,
"xnli_vi": 0,
"xnli_tr": 0,
"xnli_ur": 0,
"xnli_fr": 0,
"xnli_en": 0,
"xnli_es": 0,
"xnli_zh": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xstory_cloze_te": {
"acc": 0.5638649900727994,
"acc_stderr": 0.012761730431435764
},
"xstory_cloze_ar": {
"acc": 0.528788881535407,
"acc_stderr": 0.012845779070719484
},
"xstory_cloze_zh": {
"acc": 0.5804103242885507,
"acc_stderr": 0.01269964226820075
},
"xstory_cloze_ru": {
"acc": 0.4824619457313038,
"acc_stderr": 0.012859207453266304
},
"xstory_cloze_en": {
"acc": 0.6254136333553938,
"acc_stderr": 0.012455787254852474
},
"xstory_cloze_id": {
"acc": 0.5790866975512905,
"acc_stderr": 0.012705145598630695
},
"xstory_cloze_my": {
"acc": 0.4619457313037723,
"acc_stderr": 0.012829804720321695
},
"xstory_cloze_sw": {
"acc": 0.5056254136333554,
"acc_stderr": 0.012866310923072511
},
"xstory_cloze_es": {
"acc": 0.5830575777630708,
"acc_stderr": 0.01268835412160781
},
"xstory_cloze_hi": {
"acc": 0.5552614162806089,
"acc_stderr": 0.012788295970207786
},
"xstory_cloze_eu": {
"acc": 0.5433487756452681,
"acc_stderr": 0.012818676452481956
}
},
"versions": {
"xstory_cloze_te": 0,
"xstory_cloze_ar": 0,
"xstory_cloze_zh": 0,
"xstory_cloze_ru": 0,
"xstory_cloze_en": 0,
"xstory_cloze_id": 0,
"xstory_cloze_my": 0,
"xstory_cloze_sw": 0,
"xstory_cloze_es": 0,
"xstory_cloze_hi": 0,
"xstory_cloze_eu": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}