"git@developer.sourcefind.cn:sugon_wxj/megatron-lm.git" did not exist on "8e22824eec2e35d1a926268ed694cac2d0e9f413"
Unverified Commit a6c640d3 authored by Lintang Sutawika's avatar Lintang Sutawika Committed by GitHub
Browse files

Merge branch 'big-refactor' into seq2seq-refactor

parents 55eccc29 24e3e3fa
"""
TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension
https://arxiv.org/pdf/1705.03551.pdf
TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence
triples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts
and independently gathered evidence documents, six per question on average, that provide
high quality distant supervision for answering the questions.
Homepage: https://nlp.cs.washington.edu/triviaqa/
"""
import inspect
# import lm_eval.datasets.triviaqa.triviaqa
import string
from lm_eval.api.task import Task
from lm_eval.api.instance import Instance
from lm_eval.api.registry import register_task
from lm_eval.api.metrics import mean
_CITATION = """
@InProceedings{JoshiTriviaQA2017,
author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},
title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},
month = {July},
year = {2017},
address = {Vancouver, Canada},
publisher = {Association for Computational Linguistics},
}
"""
@register_task("triviaqa")
class TriviaQA(Task):
    """TriviaQA (unfiltered, no context): open-domain question answering.

    The model is prompted with "Q: <question>\nA:" and generates greedily;
    the prediction scores an exact match ("em") when the normalized
    continuation equals one of the normalized answer aliases.
    """

    VERSION = 1
    DATASET_PATH = "trivia_qa"  # inspect.getfile(lm_eval.datasets.triviaqa.triviaqa)
    DATASET_NAME = "unfiltered.nocontext"
    OUTPUT_TYPE = "greedy_until"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return False

    def training_docs(self):
        return self.dataset["train"]

    def validation_docs(self):
        return self.dataset["validation"]

    def test_docs(self):
        # No labeled test split is exposed for this configuration.
        raise NotImplementedError()

    def doc_to_text(self, doc):
        return f"Q: {doc['question']}\nA:"

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        return doc["question"]

    def doc_to_target(self, doc):
        # Leading space so the target continues naturally after "A:".
        return " " + doc["answer"]["value"]

    def _remove_prefixes(self, aliases):
        # Optimization: remove any alias that has a strict prefix elsewhere in
        # the list; if the prefix is acceptable by greedy matching, we can stop
        # looking.
        # Fix: sort a *copy* so the caller's list is not mutated, and return
        # early for an empty alias list (previously raised IndexError).
        ordered = sorted(aliases)
        if not ordered:
            return []
        ret = [ordered[0]]
        for alias in ordered[1:]:
            if not alias.startswith(ret[-1]):
                ret.append(alias)
        return ret

    def construct_requests(self, doc, ctx, **kwargs):
        """Uses RequestFactory to construct Requests and returns an iterable of
        Requests which will be sent to the LM.

        :param doc:
            The document as returned from training_docs, validation_docs, or test_docs.
        :param ctx: str
            The context string, generated by fewshot_context. This includes the natural
            language description, as well as the few shot examples, and the question
            part of the document for `doc`.
        """
        return Instance(
            request_type=self.OUTPUT_TYPE,
            doc=doc,
            arguments=(
                ctx,
                {
                    "until": ["\n", ".", ","],
                    "do_sample": False,
                },
            ),
            idx=0,
            **kwargs,
        )

    def process_results(self, doc, results):
        # Normalize prediction and gold aliases identically: strip, lowercase,
        # and delete all ASCII punctuation. Build the translation table once
        # instead of once per alias.
        table = str.maketrans("", "", string.punctuation)
        continuation = results[0].strip().lower().translate(table)
        list_of_candidates = [
            alias.lower().translate(table)
            for alias in self._remove_prefixes(doc["answer"]["aliases"])
        ]
        return {"em": float(continuation in list_of_candidates)}

    def aggregation(self):
        # Exact match is averaged over documents.
        return {
            "em": mean,
        }

    def higher_is_better(self):
        return {"em": True}
"""
Pointer Sentinel Mixture Models
https://arxiv.org/pdf/1609.07843.pdf
The WikiText language modeling dataset is a collection of over 100 million tokens
extracted from the set of verified Good and Featured articles on Wikipedia.
NOTE: This `Task` is based on WikiText-2.
Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/
"""
import re
from lm_eval.api.task import PerplexityTask
from lm_eval.api.registry import register_task, register_group
_CITATION = """
@misc{merity2016pointer,
title={Pointer Sentinel Mixture Models},
author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher},
year={2016},
eprint={1609.07843},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
def wikitext_detokenizer(string):
    """Undo WikiText's Moses-style tokenization artifacts.

    Rejoins contractions, number separators (`@-@`, `@,@`, `@.@`), spaced
    punctuation, paired delimiters, and section-heading markers. Replacement
    order matters (e.g. `= = = =` must collapse before `= =`).
    """
    text = string
    # contractions
    text = text.replace("s '", "s'")
    text = re.sub(r"/' [0-9]/", r"/'[0-9]/", text)
    # number separators
    for sep in ("-", ",", "."):
        text = text.replace(f" @{sep}@ ", sep)
    # punctuation: drop the space before, keep the space after
    for punct in (":", ";", ".", "!", "?", ","):
        text = text.replace(f" {punct} ", f"{punct} ")
    # paired delimiters: trim padding just inside the pair
    for pattern, repl in (
        (r"\(\s*([^\)]*?)\s*\)", r"(\1)"),
        (r"\[\s*([^\]]*?)\s*\]", r"[\1]"),
        (r"{\s*([^}]*?)\s*}", r"{\1}"),
        (r"\"\s*([^\"]*?)\s*\"", r'"\1"'),
        (r"'\s*([^']*?)\s*'", r"'\1'"),
    ):
        text = re.sub(pattern, repl, text)
    # miscellaneous (longest '=' runs first so shorter runs don't clobber them)
    for old, new in (
        ("= = = =", "===="),
        ("= = =", "==="),
        ("= =", "=="),
        (" " + chr(176) + " ", chr(176)),
        (" \n", "\n"),
        ("\n ", "\n"),
        (" N ", " 1 "),
        (" 's", "'s"),
    ):
        text = text.replace(old, new)
    return text
@register_task("wikitext")
class WikiText(PerplexityTask):
    """Word-level perplexity on WikiText-2 (document-level articles)."""

    VERSION = "2.0"
    DATASET_PATH = "EleutherAI/wikitext_document_level"
    DATASET_NAME = "wikitext-2-raw-v1"

    def has_training_docs(self):
        return True

    def has_validation_docs(self):
        return True

    def has_test_docs(self):
        return True

    def training_docs(self):
        return (self._process_doc(doc) for doc in self.dataset["train"])

    def validation_docs(self):
        return (self._process_doc(doc) for doc in self.dataset["validation"])

    def test_docs(self):
        return (self._process_doc(doc) for doc in self.dataset["test"])

    def _process_doc(self, doc):
        # Each example stores the full article text under the "page" key.
        return doc["page"]

    def doc_to_target(self, doc):
        return wikitext_detokenizer(doc)

    def should_decontaminate(self):
        return True

    def count_words(self, doc):
        # count number of words in *original doc before detokenization*
        return len(re.split(r"\s+", doc))
...@@ -28,9 +28,16 @@ Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext- ...@@ -28,9 +28,16 @@ Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-
### Subtasks ### Subtasks
* `wikitext`: measure perplexity on the Wikitext dataset, via rolling loglikelihoods.
### Checklist ### Checklist
- [x] Is in Eval-harness v1.0 ? * [x] Is the task an existing benchmark in the literature?
- [x] Has been checked for regression from v1.0? * [x] Have you referenced the original paper that introduced the task?
- [ ] Has been checked for equivalence with original paper methodology? * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
- [ ] "Main" checked variant clearly denoted?
If other tasks on this dataset are already supported:
* [x] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group: group:
- wikitext_group - perplexity
task: default - loglikelihood_rolling
task: wikitext
dataset_path: EleutherAI/wikitext_document_level dataset_path: EleutherAI/wikitext_document_level
dataset_name: wikitext-2-raw-v1 dataset_name: wikitext-2-raw-v1
output_type: loglikelihood_rolling output_type: loglikelihood_rolling
......
...@@ -8,14 +8,20 @@ import functools ...@@ -8,14 +8,20 @@ import functools
import subprocess import subprocess
import collections import collections
import importlib.util import importlib.util
import fnmatch
from typing import List from typing import List, Union
import gc
import torch
from omegaconf import OmegaConf from omegaconf import OmegaConf
from jinja2 import BaseLoader, Environment, StrictUndefined from jinja2 import BaseLoader, Environment, StrictUndefined
from itertools import islice from itertools import islice
import torch import torch
from lm_eval.logger import eval_logger
class ExitCodeError(Exception): class ExitCodeError(Exception):
pass pass
...@@ -25,6 +31,29 @@ def sh(x): ...@@ -25,6 +31,29 @@ def sh(x):
raise ExitCodeError() raise ExitCodeError()
def escaped_split(text, sep_char, maxsplit=-1):
    """Split text into a list on occurrences of the given separation
    character `sep_char`. The separation character may be escaped by a
    backslash to avoid splitting at that location.

    The separation character must be a string of size 1.

    If `maxsplit` is given, at most `maxsplit` splits are done (thus,
    the list will have at most `maxsplit + 1` elements). If `maxsplit`
    is not specified or less than 0, then there is no limit on the
    number of splits (all possible splits are made).
    """
    assert (
        len(sep_char) == 1
    ), "separation string must be a single character for escaped splitting"

    if maxsplit == 0:
        # NOTE: preserved quirk — returns the bare string, not a one-element
        # list, mirroring the original behavior callers may rely on.
        return text
    maxsplit = max(0, maxsplit)  # re.split treats 0 as "no limit"

    # Fix: re.escape the separator so regex metacharacters (".", "|", "*", …)
    # are matched literally; the raw concatenation used previously made e.g.
    # sep_char="." split on *every* character.
    return re.split(r"(?<!\\)" + re.escape(sep_char), text, maxsplit=maxsplit)
def simple_parse_args_string(args_string): def simple_parse_args_string(args_string):
""" """
Parses something like Parses something like
...@@ -44,11 +73,11 @@ def join_iters(iters): ...@@ -44,11 +73,11 @@ def join_iters(iters):
yield from iter yield from iter
def chunks(iter, n): def chunks(iter, n=0, fn=None):
arr = [] arr = []
for x in iter: for i, x in enumerate(iter):
arr.append(x) arr.append(x)
if len(arr) == n: if len(arr) == (fn(i) if fn else n):
yield arr yield arr
arr = [] arr = []
...@@ -65,6 +94,35 @@ def group(arr, fn): ...@@ -65,6 +94,35 @@ def group(arr, fn):
return list(res.values()) return list(res.values())
class MultiChoice:
    """Wraps the set of valid task names, with shell-style wildcard matching.

    Membership testing never rejects: unknown names are only logged so the
    user can see which requested tasks were not found.
    """

    def __init__(self, choices):
        self.choices = choices

    # Simple wildcard support (linux filename patterns)
    def __contains__(self, values):
        requested = values.split(",")
        for value in requested:
            matches = fnmatch.filter(self.choices, value)
            if not matches:
                eval_logger.warning("{} is not in task list.".format(value))
                eval_logger.info("Available tasks to choose:")
                for choice in self.choices:
                    eval_logger.info(f" - {choice}")
        return True

    def __iter__(self):
        yield from self.choices
def pattern_match(patterns, source_list):
    """Return the sorted names from source_list matching at least one
    shell-style wildcard pattern."""
    matched = {
        name
        for pattern in patterns
        for name in fnmatch.filter(source_list, pattern)
    }
    return sorted(matched)
def general_detokenize(string): def general_detokenize(string):
string = string.replace(" n't", "n't") string = string.replace(" n't", "n't")
string = string.replace(" )", ")") string = string.replace(" )", ")")
...@@ -122,6 +180,26 @@ def make_disjoint_window(pair): ...@@ -122,6 +180,26 @@ def make_disjoint_window(pair):
return a[: len(a) - (len(b) - 1)], b return a[: len(a) - (len(b) - 1)], b
def select_continuation_from_batch_left_padding(
    generations: Union[List[List[int]], torch.Tensor], max_context_size: int
):
    """Select the continuation from the batch, removing prompts of different lengths.

    Args:
        generations (Union[List[List[int]], torch.Tensor]):
            A tensor or list-of-lists of shape [batch_size, sequence length].
        max_context_size (int):
            The size of the biggest context; generations will proceed from that
            index.

    Example:
        PAD PAD Continue : The dog chased the cat [every day of the week]
        Riddle me this : The dog chased the cat [yesterday] PAD PAD PAD PAD

    Output:
        [every day of the week]
        [yesterday] PAD PAD PAD PAD
    """
    # Fix: plain lists-of-lists (advertised by the type hint) do not support
    # 2-D slicing, so slice row by row; tensors keep the fast vectorized path.
    if isinstance(generations, list):
        return [row[max_context_size:] for row in generations]
    return generations[:, max_context_size:]
class Reorderer: class Reorderer:
def __init__(self, arr, fn): def __init__(self, arr, fn):
self.size = len(arr) self.size = len(arr)
...@@ -174,7 +252,6 @@ def make_table(result_dict): ...@@ -174,7 +252,6 @@ def make_table(result_dict):
version = result_dict["versions"][k] version = result_dict["versions"][k]
for (mf), v in dic.items(): for (mf), v in dic.items():
m, _, f = mf.partition(",") m, _, f = mf.partition(",")
print(m, f)
if m.endswith("_stderr"): if m.endswith("_stderr"):
continue continue
...@@ -360,3 +437,8 @@ def pad_and_concat(max_length:int, tensors: List[torch.Tensor]): ...@@ -360,3 +437,8 @@ def pad_and_concat(max_length:int, tensors: List[torch.Tensor]):
tensors[i] = tensor.unsqueeze(0) tensors[i] = tensor.unsqueeze(0)
return torch.cat(tensors, dim = 0) return torch.cat(tensors, dim = 0)
def clear_torch_cache():
    """Free unreachable Python objects, then release cached CUDA memory.

    The gc pass runs first so tensors held only by collectable Python objects
    are dropped before the CUDA caching allocator returns its unused blocks.
    """
    gc.collect()
    torch.cuda.empty_cache()
import os import os
import re
import json import json
import fnmatch import fnmatch
import jsonlines
import argparse import argparse
import logging
from lm_eval import evaluator, utils from lm_eval import evaluator, utils
from lm_eval.tasks import ALL_TASKS from lm_eval.api.registry import ALL_TASKS
from lm_eval.logger import eval_logger from lm_eval.logger import eval_logger
os.environ["TOKENIZERS_PARALLELISM"] = "false" os.environ["TOKENIZERS_PARALLELISM"] = "false"
class MultiChoice:
    """Wraps the set of valid task names, with shell-style wildcard matching.

    Membership testing never rejects: unknown names are only logged so the
    user can see which requested tasks were not found.
    """

    def __init__(self, choices):
        self.choices = choices

    # Simple wildcard support (linux filename patterns)
    def __contains__(self, values):
        requested = values.split(",")
        for value in requested:
            matches = fnmatch.filter(self.choices, value)
            if not matches:
                eval_logger.warning("{} is not in task list.".format(value))
                eval_logger.info("Available tasks to choose:")
                for choice in self.choices:
                    eval_logger.info(f" {choice}")
        return True

    def __iter__(self):
        yield from self.choices
def parse_args(): def parse_args():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--model", required=True) parser.add_argument("--model", required=True)
parser.add_argument("--model_args", default="") parser.add_argument("--model_args", default="")
parser.add_argument("--tasks", default=None, choices=MultiChoice(ALL_TASKS)) parser.add_argument(
"--tasks", default=None, choices=utils.MultiChoice(sorted(ALL_TASKS))
)
parser.add_argument("--config", default=None) parser.add_argument("--config", default=None)
parser.add_argument("--provide_description", action="store_true")
parser.add_argument("--num_fewshot", type=int, default=0) parser.add_argument("--num_fewshot", type=int, default=0)
parser.add_argument("--batch_size", type=int, default=1) parser.add_argument("--batch_size", type=int, default=1)
parser.add_argument(
"--max_batch_size",
type=int,
default=None,
help="Maximal batch size to try with --batch_size auto",
)
parser.add_argument("--device", type=str, default=None) parser.add_argument("--device", type=str, default=None)
parser.add_argument("--output_path", default=None) parser.add_argument("--output_path", default=None)
parser.add_argument("--limit", type=int, default=None) parser.add_argument(
"--limit",
type=float,
default=None,
help="Limit the number of examples per task. "
"If <1, limit is a percentage of the total number of examples.",
)
parser.add_argument("--data_sampling", type=float, default=None)
parser.add_argument("--no_cache", action="store_true") parser.add_argument("--no_cache", action="store_true")
parser.add_argument("--decontamination_ngrams_path", default=None) parser.add_argument("--decontamination_ngrams_path", default=None)
parser.add_argument("--description_dict_path", default=None) parser.add_argument("--description_dict_path", default=None)
parser.add_argument("--check_integrity", action="store_true") parser.add_argument("--check_integrity", action="store_true")
parser.add_argument("--write_out", action="store_true", default=False)
parser.add_argument("--output_base_path", type=str, default=None)
return parser.parse_args() return parser.parse_args()
def pattern_match(patterns, source_list):
    """Return the sorted names from source_list matching at least one
    shell-style wildcard pattern."""
    matched = set()
    for pattern in patterns:
        matched.update(fnmatch.filter(source_list, pattern))
    return sorted(matched)
def main(): def main():
args = parse_args() args = parse_args()
...@@ -68,7 +57,9 @@ def main(): ...@@ -68,7 +57,9 @@ def main():
"REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
) )
if args.tasks is not None: if args.tasks is None:
task_names = ALL_TASKS
else:
if os.path.isdir(args.tasks): if os.path.isdir(args.tasks):
import glob import glob
...@@ -79,7 +70,7 @@ def main(): ...@@ -79,7 +70,7 @@ def main():
task_names.append(config) task_names.append(config)
else: else:
tasks_list = args.tasks.split(",") tasks_list = args.tasks.split(",")
task_names = pattern_match(tasks_list, ALL_TASKS) task_names = utils.pattern_match(tasks_list, ALL_TASKS)
for task in [task for task in tasks_list if task not in task_names]: for task in [task for task in tasks_list if task not in task_names]:
if os.path.isfile(task): if os.path.isfile(task):
config = utils.load_yaml_config(task) config = utils.load_yaml_config(task)
...@@ -87,28 +78,60 @@ def main(): ...@@ -87,28 +78,60 @@ def main():
eval_logger.info(f"Selected Tasks: {task_names}") eval_logger.info(f"Selected Tasks: {task_names}")
# TODO: description_dict?
# description_dict = {}
# if args.description_dict_path:
# with open(args.description_dict_path, "r") as f:
# description_dict = json.load(f)
results = evaluator.simple_evaluate( results = evaluator.simple_evaluate(
model=args.model, model=args.model,
model_args=args.model_args, model_args=args.model_args,
tasks=task_names, tasks=task_names,
num_fewshot=args.num_fewshot, num_fewshot=args.num_fewshot,
batch_size=args.batch_size, batch_size=args.batch_size,
max_batch_size=args.max_batch_size,
device=args.device, device=args.device,
no_cache=args.no_cache,
limit=args.limit, limit=args.limit,
# description_dict=description_dict,
decontamination_ngrams_path=args.decontamination_ngrams_path, decontamination_ngrams_path=args.decontamination_ngrams_path,
check_integrity=args.check_integrity, check_integrity=args.check_integrity,
write_out=args.write_out,
output_base_path=args.output_base_path,
) )
if results is not None: if results is not None:
samples = results.pop("samples")
dumped = json.dumps(results, indent=2) dumped = json.dumps(results, indent=2)
print(dumped) print(dumped)
batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))
if args.output_path: if args.output_path:
os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
with open(args.output_path, "w") as f: with open(args.output_path, "w") as f:
f.write(dumped) f.write(dumped)
for task_name, config in results["configs"].items():
output_name = "{}_{}".format(
re.sub("/", "__", args.model_args), task_name
)
if os.path.isdir(args.output_path):
filename = f"./{args.output_path}/{output_name}.jsonl"
elif os.path.isfile(args.output_path):
filename = (
f"./{os.path.dirname(args.output_path)}/{output_name}.jsonl"
)
with jsonlines.open(filename, "w") as f:
f.write_all(samples[task_name])
print( print(
f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, " f"{args.model} ({args.model_args}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}" f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
) )
print(evaluator.make_table(results)) print(evaluator.make_table(results))
......
# bloom-1b1
## bloom-1b1_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |23.63|± | 1.24|
| | |acc_norm|25.68|± | 1.28|
|arc_easy | 0|acc |51.47|± | 1.03|
| | |acc_norm|45.45|± | 1.02|
|boolq | 1|acc |59.08|± | 0.86|
|copa | 0|acc |68.00|± | 4.69|
|hellaswag | 0|acc |34.63|± | 0.47|
| | |acc_norm|41.77|± | 0.49|
|mc_taco | 0|em |14.49| | |
| | |f1 |32.43| | |
|openbookqa | 0|acc |19.60|± | 1.78|
| | |acc_norm|29.40|± | 2.04|
|piqa | 0|acc |67.14|± | 1.10|
| | |acc_norm|67.14|± | 1.10|
|prost | 0|acc |23.41|± | 0.31|
| | |acc_norm|30.50|± | 0.34|
|swag | 0|acc |43.43|± | 0.35|
| | |acc_norm|58.28|± | 0.35|
|winogrande | 0|acc |54.93|± | 1.40|
|wsc273 | 0|acc |68.50|± | 2.82|
## bloom-1b1_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 0.83|± | 0.25|
## bloom-1b1_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 1.38|± | 0.12|
| | |f1 | 4.01|± | 0.15|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.21|± | 0.21|
|math_geometry | 1|acc | 0.21|± | 0.21|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.19|± | 0.19|
|math_prealgebra | 1|acc | 0.11|± | 0.11|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |23.55|± | 0.78|
| | |acc_norm|23.62|± | 0.78|
## bloom-1b1_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |46.95|± | 1.12|
|pawsx_en| 0|acc |52.45|± | 1.12|
|pawsx_es| 0|acc |51.50|± | 1.12|
|pawsx_fr| 0|acc |46.15|± | 1.11|
|pawsx_ja| 0|acc |48.40|± | 1.12|
|pawsx_ko| 0|acc |49.90|± | 1.12|
|pawsx_zh| 0|acc |48.95|± | 1.12|
## bloom-1b1_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |26.44|± | 0.84|
| | |acc_norm |30.49|± | 0.88|
|headqa_es | 0|acc |24.43|± | 0.82|
| | |acc_norm |28.30|± | 0.86|
|logiqa | 0|acc |18.89|± | 1.54|
| | |acc_norm |25.65|± | 1.71|
|squad2 | 1|exact | 4.17| | |
| | |f1 | 6.60| | |
| | |HasAns_exact| 2.19| | |
| | |HasAns_f1 | 7.05| | |
| | |NoAns_exact | 6.14| | |
| | |NoAns_f1 | 6.14| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 2.68|± | 0.15|
|truthfulqa_mc| 1|mc1 |25.34|± | 1.52|
| | |mc2 |41.80|± | 1.46|
|webqs | 0|acc | 1.38|± | 0.26|
## bloom-1b1_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |45.57|± | 1.88|
| | |em |32.98|± | 1.95|
|drop| 1|em | 3.31|± | 0.18|
| | |f1 | 8.63|± | 0.22|
|race| 1|acc |32.63|± | 1.45|
## bloom-1b1_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 50.6|± | 2.24|
|xcopa_ht| 0|acc | 53.0|± | 2.23|
|xcopa_id| 0|acc | 64.8|± | 2.14|
|xcopa_it| 0|acc | 50.8|± | 2.24|
|xcopa_qu| 0|acc | 51.2|± | 2.24|
|xcopa_sw| 0|acc | 54.4|± | 2.23|
|xcopa_ta| 0|acc | 57.0|± | 2.22|
|xcopa_th| 0|acc | 53.2|± | 2.23|
|xcopa_tr| 0|acc | 53.0|± | 2.23|
|xcopa_vi| 0|acc | 62.4|± | 2.17|
|xcopa_zh| 0|acc | 59.4|± | 2.20|
## bloom-1b1_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.93|± | 0.67|
|xnli_bg| 0|acc |34.13|± | 0.67|
|xnli_de| 0|acc |39.64|± | 0.69|
|xnli_el| 0|acc |34.03|± | 0.67|
|xnli_en| 0|acc |51.48|± | 0.71|
|xnli_es| 0|acc |47.98|± | 0.71|
|xnli_fr| 0|acc |47.15|± | 0.71|
|xnli_hi| 0|acc |42.32|± | 0.70|
|xnli_ru| 0|acc |40.46|± | 0.69|
|xnli_sw| 0|acc |35.29|± | 0.68|
|xnli_th| 0|acc |33.75|± | 0.67|
|xnli_tr| 0|acc |34.79|± | 0.67|
|xnli_ur| 0|acc |37.33|± | 0.68|
|xnli_vi| 0|acc |44.45|± | 0.70|
|xnli_zh| 0|acc |36.23|± | 0.68|
## bloom-1b1_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |52.88|± | 1.28|
|xstory_cloze_en| 0|acc |62.54|± | 1.25|
|xstory_cloze_es| 0|acc |58.31|± | 1.27|
|xstory_cloze_eu| 0|acc |54.33|± | 1.28|
|xstory_cloze_hi| 0|acc |55.53|± | 1.28|
|xstory_cloze_id| 0|acc |57.91|± | 1.27|
|xstory_cloze_my| 0|acc |46.19|± | 1.28|
|xstory_cloze_ru| 0|acc |48.25|± | 1.29|
|xstory_cloze_sw| 0|acc |50.56|± | 1.29|
|xstory_cloze_te| 0|acc |56.39|± | 1.28|
|xstory_cloze_zh| 0|acc |58.04|± | 1.27|
## bloom-1b1_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |69.98|± | 0.95|
|xwinograd_fr| 0|acc |66.27|± | 5.22|
|xwinograd_jp| 0|acc |52.87|± | 1.61|
|xwinograd_pt| 0|acc |63.12|± | 2.98|
|xwinograd_ru| 0|acc |54.29|± | 2.81|
|xwinograd_zh| 0|acc |69.25|± | 2.06|
{
"results": {
"boolq": {
"acc": 0.5908256880733945,
"acc_stderr": 0.008599563442397352
},
"arc_easy": {
"acc": 0.5147306397306397,
"acc_stderr": 0.010255329977562096,
"acc_norm": 0.45454545454545453,
"acc_norm_stderr": 0.010217299762709435
},
"openbookqa": {
"acc": 0.196,
"acc_stderr": 0.017770751227744862,
"acc_norm": 0.294,
"acc_norm_stderr": 0.020395095484936614
},
"hellaswag": {
"acc": 0.3463453495319657,
"acc_stderr": 0.004748324319714264,
"acc_norm": 0.4177454690300737,
"acc_norm_stderr": 0.004921798492608764
},
"swag": {
"acc": 0.43431970408877335,
"acc_stderr": 0.0035044592489844794,
"acc_norm": 0.5828251524542637,
"acc_norm_stderr": 0.0034862531772295617
},
"arc_challenge": {
"acc": 0.2363481228668942,
"acc_stderr": 0.012414960524301834,
"acc_norm": 0.2568259385665529,
"acc_norm_stderr": 0.0127669237941168
},
"mc_taco": {
"em": 0.1448948948948949,
"f1": 0.32425976796237205
},
"wsc273": {
"acc": 0.684981684981685,
"acc_stderr": 0.028165854394193602
},
"winogrande": {
"acc": 0.5493291239147593,
"acc_stderr": 0.013983928869040239
},
"prost": {
"acc": 0.23409479077711356,
"acc_stderr": 0.003093545711826552,
"acc_norm": 0.3049743808710504,
"acc_norm_stderr": 0.003363606918420179
},
"copa": {
"acc": 0.68,
"acc_stderr": 0.04688261722621504
},
"piqa": {
"acc": 0.6713819368879217,
"acc_stderr": 0.010959127105167048,
"acc_norm": 0.6713819368879217,
"acc_norm_stderr": 0.010959127105167044
}
},
"versions": {
"boolq": 1,
"arc_easy": 0,
"openbookqa": 0,
"hellaswag": 0,
"swag": 0,
"arc_challenge": 0,
"mc_taco": 0,
"wsc273": 0,
"winogrande": 0,
"prost": 0,
"copa": 0,
"piqa": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"gsm8k": {
"acc": 0.008339651250947688,
"acc_stderr": 0.002504942226860508
}
},
"versions": {
"gsm8k": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
"num_fewshot": 8,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"mathqa": {
"acc": 0.2355108877721943,
"acc_stderr": 0.007767687364650971,
"acc_norm": 0.23618090452261306,
"acc_norm_stderr": 0.0077753193787470495
},
"gsm8k": {
"acc": 0.0,
"acc_stderr": 0.0
},
"drop": {
"em": 0.013842281879194632,
"em_stderr": 0.001196510970060749,
"f1": 0.040085989932885986,
"f1_stderr": 0.0014841664758736023
},
"math_geometry": {
"acc": 0.0020876826722338203,
"acc_stderr": 0.0020876826722338315
},
"math_counting_and_prob": {
"acc": 0.002109704641350211,
"acc_stderr": 0.002109704641350211
},
"math_prealgebra": {
"acc": 0.001148105625717566,
"acc_stderr": 0.0011481056257175708
},
"math_num_theory": {
"acc": 0.001851851851851852,
"acc_stderr": 0.0018518518518518448
},
"math_precalc": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_algebra": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_intermediate_algebra": {
"acc": 0.0,
"acc_stderr": 0.0
}
},
"versions": {
"mathqa": 0,
"gsm8k": 0,
"drop": 1,
"math_geometry": 1,
"math_counting_and_prob": 1,
"math_prealgebra": 1,
"math_num_theory": 1,
"math_precalc": 1,
"math_algebra": 1,
"math_intermediate_algebra": 1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
"num_fewshot": 5,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"pawsx_es": {
"acc": 0.515,
"acc_stderr": 0.011178102477052804
},
"pawsx_zh": {
"acc": 0.4895,
"acc_stderr": 0.011180669867648657
},
"pawsx_fr": {
"acc": 0.4615,
"acc_stderr": 0.011149934327957058
},
"pawsx_ko": {
"acc": 0.499,
"acc_stderr": 0.01118311365477017
},
"pawsx_de": {
"acc": 0.4695,
"acc_stderr": 0.011162310405413175
},
"pawsx_ja": {
"acc": 0.484,
"acc_stderr": 0.011177408788874897
},
"pawsx_en": {
"acc": 0.5245,
"acc_stderr": 0.011169702598013186
}
},
"versions": {
"pawsx_es": 0,
"pawsx_zh": 0,
"pawsx_fr": 0,
"pawsx_ko": 0,
"pawsx_de": 0,
"pawsx_ja": 0,
"pawsx_en": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"truthfulqa_mc": {
"mc1": 0.2533659730722154,
"mc1_stderr": 0.01522589934082683,
"mc2": 0.4179977378869182,
"mc2_stderr": 0.014601549068840484
},
"webqs": {
"acc": 0.013779527559055118,
"acc_stderr": 0.002586718737195641
},
"logiqa": {
"acc": 0.1889400921658986,
"acc_stderr": 0.01535436463822078,
"acc_norm": 0.2565284178187404,
"acc_norm_stderr": 0.017129443327887562
},
"squad2": {
"exact": 4.169123220752969,
"f1": 6.5956997780058355,
"HasAns_exact": 2.192982456140351,
"HasAns_f1": 7.05309437656277,
"NoAns_exact": 6.139613120269134,
"NoAns_f1": 6.139613120269134,
"best_exact": 50.07159100480081,
"best_f1": 50.07159100480081
},
"headqa_es": {
"acc": 0.24434719183078046,
"acc_stderr": 0.008207488987159709,
"acc_norm": 0.2830051057622174,
"acc_norm_stderr": 0.008604004902114394
},
"headqa_en": {
"acc": 0.26440554339897887,
"acc_stderr": 0.008423643607316284,
"acc_norm": 0.30488694383661563,
"acc_norm_stderr": 0.008793112278191295
},
"triviaqa": {
"acc": 0.026783346592415803,
"acc_stderr": 0.001517985028991893
}
},
"versions": {
"truthfulqa_mc": 1,
"webqs": 0,
"logiqa": 0,
"squad2": 1,
"headqa_es": 0,
"headqa_en": 0,
"triviaqa": 1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"drop": {
"em": 0.03313758389261745,
"em_stderr": 0.0018330841858875643,
"f1": 0.08634542785234882,
"f1_stderr": 0.0022136353860709133
},
"coqa": {
"f1": 0.4557083534540516,
"f1_stderr": 0.01876948425119881,
"em": 0.3298333333333334,
"em_stderr": 0.019473215823053027
},
"race": {
"acc": 0.3263157894736842,
"acc_stderr": 0.014510987877134932
}
},
"versions": {
"drop": 1,
"coqa": 1,
"race": 1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xcopa_id": {
"acc": 0.648,
"acc_stderr": 0.02138004238594605
},
"xcopa_ht": {
"acc": 0.53,
"acc_stderr": 0.022342748192502843
},
"xcopa_it": {
"acc": 0.508,
"acc_stderr": 0.02238020883492804
},
"xcopa_et": {
"acc": 0.506,
"acc_stderr": 0.022381462412439324
},
"xcopa_ta": {
"acc": 0.57,
"acc_stderr": 0.02216263442665284
},
"xcopa_th": {
"acc": 0.532,
"acc_stderr": 0.022337186479044296
},
"xcopa_sw": {
"acc": 0.544,
"acc_stderr": 0.022296238348407056
},
"xcopa_zh": {
"acc": 0.594,
"acc_stderr": 0.02198396209008634
},
"xcopa_qu": {
"acc": 0.512,
"acc_stderr": 0.02237662679792717
},
"xcopa_tr": {
"acc": 0.53,
"acc_stderr": 0.02234274819250285
},
"xcopa_vi": {
"acc": 0.624,
"acc_stderr": 0.021683827539286115
}
},
"versions": {
"xcopa_id": 0,
"xcopa_ht": 0,
"xcopa_it": 0,
"xcopa_et": 0,
"xcopa_ta": 0,
"xcopa_th": 0,
"xcopa_sw": 0,
"xcopa_zh": 0,
"xcopa_qu": 0,
"xcopa_tr": 0,
"xcopa_vi": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xnli_sw": {
"acc": 0.3528942115768463,
"acc_stderr": 0.0067520304764183674
},
"xnli_bg": {
"acc": 0.3413173652694611,
"acc_stderr": 0.006699490620395283
},
"xnli_el": {
"acc": 0.3403193612774451,
"acc_stderr": 0.006694754901092155
},
"xnli_hi": {
"acc": 0.4231536926147705,
"acc_stderr": 0.006980774514705842
},
"xnli_th": {
"acc": 0.3375249500998004,
"acc_stderr": 0.00668131870192652
},
"xnli_ar": {
"acc": 0.3393213572854291,
"acc_stderr": 0.006689986106838006
},
"xnli_de": {
"acc": 0.3964071856287425,
"acc_stderr": 0.0069114198150005334
},
"xnli_ru": {
"acc": 0.40459081836327343,
"acc_stderr": 0.006934900899149144
},
"xnli_vi": {
"acc": 0.44451097804391215,
"acc_stderr": 0.00702107269988888
},
"xnli_tr": {
"acc": 0.34790419161676644,
"acc_stderr": 0.006729921818907745
},
"xnli_ur": {
"acc": 0.37325349301397204,
"acc_stderr": 0.0068339592620100505
},
"xnli_fr": {
"acc": 0.47145708582834334,
"acc_stderr": 0.007053191822382807
},
"xnli_en": {
"acc": 0.5147704590818363,
"acc_stderr": 0.007061629189884944
},
"xnli_es": {
"acc": 0.47984031936127747,
"acc_stderr": 0.00705896771560341
},
"xnli_zh": {
"acc": 0.36227544910179643,
"acc_stderr": 0.006791418670232308
}
},
"versions": {
"xnli_sw": 0,
"xnli_bg": 0,
"xnli_el": 0,
"xnli_hi": 0,
"xnli_th": 0,
"xnli_ar": 0,
"xnli_de": 0,
"xnli_ru": 0,
"xnli_vi": 0,
"xnli_tr": 0,
"xnli_ur": 0,
"xnli_fr": 0,
"xnli_en": 0,
"xnli_es": 0,
"xnli_zh": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xstory_cloze_te": {
"acc": 0.5638649900727994,
"acc_stderr": 0.012761730431435764
},
"xstory_cloze_ar": {
"acc": 0.528788881535407,
"acc_stderr": 0.012845779070719484
},
"xstory_cloze_zh": {
"acc": 0.5804103242885507,
"acc_stderr": 0.01269964226820075
},
"xstory_cloze_ru": {
"acc": 0.4824619457313038,
"acc_stderr": 0.012859207453266304
},
"xstory_cloze_en": {
"acc": 0.6254136333553938,
"acc_stderr": 0.012455787254852474
},
"xstory_cloze_id": {
"acc": 0.5790866975512905,
"acc_stderr": 0.012705145598630695
},
"xstory_cloze_my": {
"acc": 0.4619457313037723,
"acc_stderr": 0.012829804720321695
},
"xstory_cloze_sw": {
"acc": 0.5056254136333554,
"acc_stderr": 0.012866310923072511
},
"xstory_cloze_es": {
"acc": 0.5830575777630708,
"acc_stderr": 0.01268835412160781
},
"xstory_cloze_hi": {
"acc": 0.5552614162806089,
"acc_stderr": 0.012788295970207786
},
"xstory_cloze_eu": {
"acc": 0.5433487756452681,
"acc_stderr": 0.012818676452481956
}
},
"versions": {
"xstory_cloze_te": 0,
"xstory_cloze_ar": 0,
"xstory_cloze_zh": 0,
"xstory_cloze_ru": 0,
"xstory_cloze_en": 0,
"xstory_cloze_id": 0,
"xstory_cloze_my": 0,
"xstory_cloze_sw": 0,
"xstory_cloze_es": 0,
"xstory_cloze_hi": 0,
"xstory_cloze_eu": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xwinograd_ru": {
"acc": 0.5428571428571428,
"acc_stderr": 0.028112788378274862
},
"xwinograd_en": {
"acc": 0.6997849462365592,
"acc_stderr": 0.009507809437511165
},
"xwinograd_jp": {
"acc": 0.5286757038581856,
"acc_stderr": 0.016127677684108978
},
"xwinograd_fr": {
"acc": 0.6626506024096386,
"acc_stderr": 0.05221260262032129
},
"xwinograd_zh": {
"acc": 0.6924603174603174,
"acc_stderr": 0.02057614603593188
},
"xwinograd_pt": {
"acc": 0.6311787072243346,
"acc_stderr": 0.02980804663449022
}
},
"versions": {
"xwinograd_ru": 0,
"xwinograd_en": 0,
"xwinograd_jp": 0,
"xwinograd_fr": 0,
"xwinograd_zh": 0,
"xwinograd_pt": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
# bloom-1b7
## bloom-1b7_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |23.55|± | 1.24|
| | |acc_norm|26.79|± | 1.29|
|arc_easy | 0|acc |56.31|± | 1.02|
| | |acc_norm|48.11|± | 1.03|
|boolq | 1|acc |61.77|± | 0.85|
|copa | 0|acc |70.00|± | 4.61|
|hellaswag | 0|acc |37.62|± | 0.48|
| | |acc_norm|46.56|± | 0.50|
|mc_taco | 0|em |12.54| | |
| | |f1 |47.46| | |
|openbookqa | 0|acc |21.40|± | 1.84|
| | |acc_norm|30.00|± | 2.05|
|piqa | 0|acc |68.77|± | 1.08|
| | |acc_norm|70.08|± | 1.07|
|prost | 0|acc |23.52|± | 0.31|
| | |acc_norm|26.70|± | 0.32|
|swag | 0|acc |45.32|± | 0.35|
| | |acc_norm|61.15|± | 0.34|
|winogrande | 0|acc |57.14|± | 1.39|
|wsc273 | 0|acc |72.89|± | 2.70|
## bloom-1b7_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 1.29|± | 0.31|
## bloom-1b7_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 1.49|± | 0.12|
| | |f1 | 4.31|± | 0.15|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.00|± | 0.00|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.74|± | 0.37|
|math_prealgebra | 1|acc | 0.23|± | 0.16|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |24.29|± | 0.79|
| | |acc_norm|24.62|± | 0.79|
## bloom-1b7_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |48.75|± | 1.12|
|pawsx_en| 0|acc |48.90|± | 1.12|
|pawsx_es| 0|acc |51.30|± | 1.12|
|pawsx_fr| 0|acc |46.20|± | 1.12|
|pawsx_ja| 0|acc |44.70|± | 1.11|
|pawsx_ko| 0|acc |45.80|± | 1.11|
|pawsx_zh| 0|acc |45.40|± | 1.11|
## bloom-1b7_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |27.75|± | 0.86|
| | |acc_norm |32.57|± | 0.90|
|headqa_es | 0|acc |25.42|± | 0.83|
| | |acc_norm |29.58|± | 0.87|
|logiqa | 0|acc |21.66|± | 1.62|
| | |acc_norm |28.11|± | 1.76|
|squad2 | 1|exact | 1.80| | |
| | |f1 | 4.38| | |
| | |HasAns_exact| 2.40| | |
| | |HasAns_f1 | 7.56| | |
| | |NoAns_exact | 1.21| | |
| | |NoAns_f1 | 1.21| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 3.14|± | 0.16|
|truthfulqa_mc| 1|mc1 |24.48|± | 1.51|
| | |mc2 |41.32|± | 1.44|
|webqs | 0|acc | 1.28|± | 0.25|
## bloom-1b7_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |53.55|± | 1.89|
| | |em |40.90|± | 2.03|
|drop| 1|em | 0.69|± | 0.08|
| | |f1 | 6.89|± | 0.16|
|race| 1|acc |33.21|± | 1.46|
## bloom-1b7_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 47.4|± | 2.24|
|xcopa_ht| 0|acc | 50.4|± | 2.24|
|xcopa_id| 0|acc | 63.2|± | 2.16|
|xcopa_it| 0|acc | 52.6|± | 2.24|
|xcopa_qu| 0|acc | 50.6|± | 2.24|
|xcopa_sw| 0|acc | 51.8|± | 2.24|
|xcopa_ta| 0|acc | 56.6|± | 2.22|
|xcopa_th| 0|acc | 53.2|± | 2.23|
|xcopa_tr| 0|acc | 52.8|± | 2.23|
|xcopa_vi| 0|acc | 65.8|± | 2.12|
|xcopa_zh| 0|acc | 61.4|± | 2.18|
## bloom-1b7_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.57|± | 0.67|
|xnli_bg| 0|acc |35.43|± | 0.68|
|xnli_de| 0|acc |40.58|± | 0.69|
|xnli_el| 0|acc |33.99|± | 0.67|
|xnli_en| 0|acc |50.14|± | 0.71|
|xnli_es| 0|acc |47.82|± | 0.71|
|xnli_fr| 0|acc |48.18|± | 0.71|
|xnli_hi| 0|acc |43.95|± | 0.70|
|xnli_ru| 0|acc |39.32|± | 0.69|
|xnli_sw| 0|acc |34.51|± | 0.67|
|xnli_th| 0|acc |33.37|± | 0.67|
|xnli_tr| 0|acc |34.93|± | 0.67|
|xnli_ur| 0|acc |40.50|± | 0.69|
|xnli_vi| 0|acc |46.23|± | 0.70|
|xnli_zh| 0|acc |36.21|± | 0.68|
## bloom-1b7_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |55.00|± | 1.28|
|xstory_cloze_en| 0|acc |64.66|± | 1.23|
|xstory_cloze_es| 0|acc |60.82|± | 1.26|
|xstory_cloze_eu| 0|acc |54.93|± | 1.28|
|xstory_cloze_hi| 0|acc |56.78|± | 1.27|
|xstory_cloze_id| 0|acc |59.76|± | 1.26|
|xstory_cloze_my| 0|acc |47.25|± | 1.28|
|xstory_cloze_ru| 0|acc |50.36|± | 1.29|
|xstory_cloze_sw| 0|acc |52.28|± | 1.29|
|xstory_cloze_te| 0|acc |56.52|± | 1.28|
|xstory_cloze_zh| 0|acc |58.24|± | 1.27|
## bloom-1b7_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |74.71|± | 0.90|
|xwinograd_fr| 0|acc |68.67|± | 5.12|
|xwinograd_jp| 0|acc |54.12|± | 1.61|
|xwinograd_pt| 0|acc |63.50|± | 2.97|
|xwinograd_ru| 0|acc |52.38|± | 2.82|
|xwinograd_zh| 0|acc |69.64|± | 2.05|
{
"results": {
"mc_taco": {
"em": 0.12537537537537538,
"f1": 0.47458014393437276
},
"arc_easy": {
"acc": 0.5631313131313131,
"acc_stderr": 0.010177672928157678,
"acc_norm": 0.4810606060606061,
"acc_norm_stderr": 0.010252420496894487
},
"boolq": {
"acc": 0.617737003058104,
"acc_stderr": 0.008499149690449272
},
"piqa": {
"acc": 0.6877040261153428,
"acc_stderr": 0.010812581599154424,
"acc_norm": 0.7007616974972797,
"acc_norm_stderr": 0.010684130673134581
},
"copa": {
"acc": 0.7,
"acc_stderr": 0.046056618647183814
},
"prost": {
"acc": 0.23521562766865928,
"acc_stderr": 0.003098672944164254,
"acc_norm": 0.2669726729291204,
"acc_norm_stderr": 0.00323196492387981
},
"hellaswag": {
"acc": 0.37621987651862177,
"acc_stderr": 0.004834461997944872,
"acc_norm": 0.46564429396534557,
"acc_norm_stderr": 0.004977988452502641
},
"swag": {
"acc": 0.4532140357892632,
"acc_stderr": 0.0035195819088979174,
"acc_norm": 0.6114665600319904,
"acc_norm_stderr": 0.003446127007510879
},
"openbookqa": {
"acc": 0.214,
"acc_stderr": 0.01835979750238702,
"acc_norm": 0.3,
"acc_norm_stderr": 0.020514426225628046
},
"wsc273": {
"acc": 0.7289377289377289,
"acc_stderr": 0.02695226692070332
},
"arc_challenge": {
"acc": 0.2354948805460751,
"acc_stderr": 0.012399451855004752,
"acc_norm": 0.26791808873720135,
"acc_norm_stderr": 0.012942030195136423
},
"winogrande": {
"acc": 0.5714285714285714,
"acc_stderr": 0.013908353814606709
}
},
"versions": {
"mc_taco": 0,
"arc_easy": 0,
"boolq": 1,
"piqa": 0,
"copa": 0,
"prost": 0,
"hellaswag": 0,
"swag": 0,
"openbookqa": 0,
"wsc273": 0,
"arc_challenge": 0,
"winogrande": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b7,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"gsm8k": {
"acc": 0.01288855193328279,
"acc_stderr": 0.00310690126649963
}
},
"versions": {
"gsm8k": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b7,use_accelerate=True",
"num_fewshot": 8,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment