merge with upstream

a702689d · Alexander · 8d66cfef · 008fc2a2 · a702689d · a702689d
Commit a702689d authored Nov 16, 2023 by Alexander
20 changed files
--- a/lm_eval/tasks/triviaqa.py
+++ b/lm_eval/tasks/triviaqa.py
@@ -10,11 +10,10 @@ high quality distant supervision for answering the questions.
 Homepage: https://nlp.cs.washington.edu/triviaqa/
 """
 import inspect
-import lm_eval.datasets.triviaqa.triviaqa
+import string
 from lm_eval.base import Task, rf
 from lm_eval.metrics import mean
 _CITATION = """
 @InProceedings{JoshiTriviaQA2017,
    author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},
@@ -29,9 +28,9 @@ _CITATION = """
 class TriviaQA(Task):
-    VERSION = 1
+    VERSION = 3
-    DATASET_PATH = inspect.getfile(lm_eval.datasets.triviaqa.triviaqa)
+    DATASET_PATH = "trivia_qa"
-    DATASET_NAME = None
+    DATASET_NAME = "rc.nocontext"
    def has_training_docs(self):
        return True
@@ -63,30 +62,36 @@ class TriviaQA(Task):
    def doc_to_target(self, doc):
        return " " + doc["answer"]["value"]
-    def _remove_prefixes(self, aliases):
-        # Optimization: Remove any alias that has a strict prefix elsewhere in the list
-        # we can do this because if the prefix is acceptable by isgreedy, we can stop looking
-        aliases.sort()
-        ret = [aliases[0]]
-        for alias in aliases[1:]:
-            if not alias.startswith(ret[-1]):
-                ret.append(alias)
-        return ret
    def construct_requests(self, doc, ctx):
-        ret = []
+        """Uses RequestFactory to construct Requests and returns an iterable of
-        for alias in self._remove_prefixes(doc["answer"]["aliases"]):
+        Requests which will be sent to the LM.
-            _, is_prediction = rf.loglikelihood(ctx, " " + alias)
+        :param doc:
-            ret.append(is_prediction)
+                The document as returned from training_docs, validation_docs, or test_docs.
-        return ret
+        :param ctx: str
+                The context string, generated by fewshot_context. This includes the natural
+                language description, as well as the few shot examples, and the question
+                part of the document for `doc`.
+        """
+        continuation = rf.greedy_until(ctx, {"until": ["\n", ".", ","]})
+        return continuation
    def process_results(self, doc, results):
-        return {"acc": float(any(results))}
+        continuation = (
+            results[0]
+            .strip()
+            .lower()
+            .translate(str.maketrans("", "", string.punctuation))
+        )
+        list_of_candidates = [
+            alias.lower().translate(str.maketrans("", "", string.punctuation))
+            for alias in doc["answer"]["aliases"]
+        ]
+        return {"em": float(continuation in list_of_candidates)}
    def aggregation(self):
        return {
-            "acc": mean,
+            "em": mean,
        }
    def higher_is_better(self):
-        return {"acc": True}
+        return {"em": True}
--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -8,6 +8,7 @@ import sys
 import fnmatch
 from typing import List, Union
+import gc
 import torch
 from omegaconf import OmegaConf
@@ -64,11 +65,11 @@ def join_iters(iters):
        yield from iter
-def chunks(iter, n):
+def chunks(iter, n=0, fn=None):
    arr = []
-    for x in iter:
+    for i, x in enumerate(iter):
        arr.append(x)
-        if len(arr) == n:
+        if len(arr) == (fn(i) if fn else n):
            yield arr
            arr = []
@@ -283,3 +284,8 @@ def run_task_tests(task_list: List[str]):
        raise ValueError(
            f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}"
        )
+def clear_torch_cache():
+    gc.collect()
+    torch.cuda.empty_cache()
--- a/main.py
+++ b/main.py
@@ -12,16 +12,27 @@ def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True)
    parser.add_argument("--model_args", default="")
-    parser.add_argument("--tokenizer", default=None)
+    parser.add_argument(
-    parser.add_argument("--tasks", default=None, choices=utils.MultiChoice(tasks.ALL_TASKS))
+        "--tasks", default=None, choices=utils.MultiChoice(tasks.ALL_TASKS)
+    )
    parser.add_argument("--provide_description", action="store_true")
    parser.add_argument("--num_fewshot", type=int, default=0)
    parser.add_argument("--batch_size", type=str, default=None)
+    parser.add_argument(
+        "--max_batch_size",
+        type=int,
+        default=None,
+        help="Maximal batch size to try with --batch_size auto",
+    )
    parser.add_argument("--device", type=str, default=None)
    parser.add_argument("--output_path", default=None)
-    parser.add_argument("--limit", type=float, default=None,
+    parser.add_argument(
-                        help="Limit the number of examples per task. "
+        "--limit",
-                             "If <1, limit is a percentage of the total number of examples.")
+        type=float,
+        default=None,
+        help="Limit the number of examples per task. "
+        "If <1, limit is a percentage of the total number of examples.",
+    )
    parser.add_argument("--data_sampling", type=float, default=None)
    parser.add_argument("--no_cache", action="store_true")
    parser.add_argument("--decontamination_ngrams_path", default=None)
@@ -61,6 +72,7 @@ def main():
        tasks=task_names,
        num_fewshot=args.num_fewshot,
        batch_size=args.batch_size,
+        max_batch_size=args.max_batch_size,
        device=args.device,
        no_cache=args.no_cache,
        limit=args.limit,
@@ -76,13 +88,16 @@ def main():
    print(dumped)
    if args.output_path:
-        os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
+        dirname = os.path.dirname(args.output_path)
+        if dirname:
+            os.makedirs(dirname, exist_ok=True)
        with open(args.output_path, "w") as f:
            f.write(dumped)
+    batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))
    print(
        f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
-        f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
+        f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
    )
    print(evaluator.make_table(results))

--- a/results/bloom/bloom-1b1/README.md
+++ b/results/bloom/bloom-1b1/README.md
-# bloom-1b1
-## bloom-1b1_common_sense_reasoning_0-shot.json
-|    Task     |Version| Metric |Value|   |Stderr|
-|-------------|------:|--------|----:|---|-----:|
-|arc_challenge|      0|acc     |23.63|±  |  1.24|
-|             |       |acc_norm|25.68|±  |  1.28|
-|arc_easy     |      0|acc     |51.47|±  |  1.03|
-|             |       |acc_norm|45.45|±  |  1.02|
-|boolq        |      1|acc     |59.08|±  |  0.86|
-|copa         |      0|acc     |68.00|±  |  4.69|
-|hellaswag    |      0|acc     |34.63|±  |  0.47|
-|             |       |acc_norm|41.77|±  |  0.49|
-|mc_taco      |      0|em      |14.49|   |      |
-|             |       |f1      |32.43|   |      |
-|openbookqa   |      0|acc     |19.60|±  |  1.78|
-|             |       |acc_norm|29.40|±  |  2.04|
-|piqa         |      0|acc     |67.14|±  |  1.10|
-|             |       |acc_norm|67.14|±  |  1.10|
-|prost        |      0|acc     |23.41|±  |  0.31|
-|             |       |acc_norm|30.50|±  |  0.34|
-|swag         |      0|acc     |43.43|±  |  0.35|
-|             |       |acc_norm|58.28|±  |  0.35|
-|winogrande   |      0|acc     |54.93|±  |  1.40|
-|wsc273       |      0|acc     |68.50|±  |  2.82|
-## bloom-1b1_gsm8k_8-shot.json
-|Task |Version|Metric|Value|   |Stderr|
-|-----|------:|------|----:|---|-----:|
-|gsm8k|      0|acc   | 0.83|±  |  0.25|
-## bloom-1b1_mathematical_reasoning_few_shot_5-shot.json
-|          Task           |Version| Metric |Value|   |Stderr|
-|-------------------------|------:|--------|----:|---|-----:|
-|drop                     |      1|em      | 1.38|±  |  0.12|
-|                         |       |f1      | 4.01|±  |  0.15|
-|gsm8k                    |      0|acc     | 0.00|±  |  0.00|
-|math_algebra             |      1|acc     | 0.00|±  |  0.00|
-|math_counting_and_prob   |      1|acc     | 0.21|±  |  0.21|
-|math_geometry            |      1|acc     | 0.21|±  |  0.21|
-|math_intermediate_algebra|      1|acc     | 0.00|±  |  0.00|
-|math_num_theory          |      1|acc     | 0.19|±  |  0.19|
-|math_prealgebra          |      1|acc     | 0.11|±  |  0.11|
-|math_precalc             |      1|acc     | 0.00|±  |  0.00|
-|mathqa                   |      0|acc     |23.55|±  |  0.78|
-|                         |       |acc_norm|23.62|±  |  0.78|
-## bloom-1b1_pawsx_0-shot.json
-|  Task  |Version|Metric|Value|   |Stderr|
-|--------|------:|------|----:|---|-----:|
-|pawsx_de|      0|acc   |46.95|±  |  1.12|
-|pawsx_en|      0|acc   |52.45|±  |  1.12|
-|pawsx_es|      0|acc   |51.50|±  |  1.12|
-|pawsx_fr|      0|acc   |46.15|±  |  1.11|
-|pawsx_ja|      0|acc   |48.40|±  |  1.12|
-|pawsx_ko|      0|acc   |49.90|±  |  1.12|
-|pawsx_zh|      0|acc   |48.95|±  |  1.12|
-## bloom-1b1_question_answering_0-shot.json
-|    Task     |Version|   Metric   |Value|   |Stderr|
-|-------------|------:|------------|----:|---|-----:|
-|headqa_en    |      0|acc         |26.44|±  |  0.84|
-|             |       |acc_norm    |30.49|±  |  0.88|
-|headqa_es    |      0|acc         |24.43|±  |  0.82|
-|             |       |acc_norm    |28.30|±  |  0.86|
-|logiqa       |      0|acc         |18.89|±  |  1.54|
-|             |       |acc_norm    |25.65|±  |  1.71|
-|squad2       |      1|exact       | 4.17|   |      |
-|             |       |f1          | 6.60|   |      |
-|             |       |HasAns_exact| 2.19|   |      |
-|             |       |HasAns_f1   | 7.05|   |      |
-|             |       |NoAns_exact | 6.14|   |      |
-|             |       |NoAns_f1    | 6.14|   |      |
-|             |       |best_exact  |50.07|   |      |
-|             |       |best_f1     |50.07|   |      |
-|triviaqa     |      1|acc         | 2.68|±  |  0.15|
-|truthfulqa_mc|      1|mc1         |25.34|±  |  1.52|
-|             |       |mc2         |41.80|±  |  1.46|
-|webqs        |      0|acc         | 1.38|±  |  0.26|
-## bloom-1b1_reading_comprehension_0-shot.json
-|Task|Version|Metric|Value|   |Stderr|
-|----|------:|------|----:|---|-----:|
-|coqa|      1|f1    |45.57|±  |  1.88|
-|    |       |em    |32.98|±  |  1.95|
-|drop|      1|em    | 3.31|±  |  0.18|
-|    |       |f1    | 8.63|±  |  0.22|
-|race|      1|acc   |32.63|±  |  1.45|
-## bloom-1b1_xcopa_0-shot.json
-|  Task  |Version|Metric|Value|   |Stderr|
-|--------|------:|------|----:|---|-----:|
-|xcopa_et|      0|acc   | 50.6|±  |  2.24|
-|xcopa_ht|      0|acc   | 53.0|±  |  2.23|
-|xcopa_id|      0|acc   | 64.8|±  |  2.14|
-|xcopa_it|      0|acc   | 50.8|±  |  2.24|
-|xcopa_qu|      0|acc   | 51.2|±  |  2.24|
-|xcopa_sw|      0|acc   | 54.4|±  |  2.23|
-|xcopa_ta|      0|acc   | 57.0|±  |  2.22|
-|xcopa_th|      0|acc   | 53.2|±  |  2.23|
-|xcopa_tr|      0|acc   | 53.0|±  |  2.23|
-|xcopa_vi|      0|acc   | 62.4|±  |  2.17|
-|xcopa_zh|      0|acc   | 59.4|±  |  2.20|
-## bloom-1b1_xnli_0-shot.json
-| Task  |Version|Metric|Value|   |Stderr|
-|-------|------:|------|----:|---|-----:|
-|xnli_ar|      0|acc   |33.93|±  |  0.67|
-|xnli_bg|      0|acc   |34.13|±  |  0.67|
-|xnli_de|      0|acc   |39.64|±  |  0.69|
-|xnli_el|      0|acc   |34.03|±  |  0.67|
-|xnli_en|      0|acc   |51.48|±  |  0.71|
-|xnli_es|      0|acc   |47.98|±  |  0.71|
-|xnli_fr|      0|acc   |47.15|±  |  0.71|
-|xnli_hi|      0|acc   |42.32|±  |  0.70|
-|xnli_ru|      0|acc   |40.46|±  |  0.69|
-|xnli_sw|      0|acc   |35.29|±  |  0.68|
-|xnli_th|      0|acc   |33.75|±  |  0.67|
-|xnli_tr|      0|acc   |34.79|±  |  0.67|
-|xnli_ur|      0|acc   |37.33|±  |  0.68|
-|xnli_vi|      0|acc   |44.45|±  |  0.70|
-|xnli_zh|      0|acc   |36.23|±  |  0.68|
-## bloom-1b1_xstory_cloze_0-shot.json
-|     Task      |Version|Metric|Value|   |Stderr|
-|---------------|------:|------|----:|---|-----:|
-|xstory_cloze_ar|      0|acc   |52.88|±  |  1.28|
-|xstory_cloze_en|      0|acc   |62.54|±  |  1.25|
-|xstory_cloze_es|      0|acc   |58.31|±  |  1.27|
-|xstory_cloze_eu|      0|acc   |54.33|±  |  1.28|
-|xstory_cloze_hi|      0|acc   |55.53|±  |  1.28|
-|xstory_cloze_id|      0|acc   |57.91|±  |  1.27|
-|xstory_cloze_my|      0|acc   |46.19|±  |  1.28|
-|xstory_cloze_ru|      0|acc   |48.25|±  |  1.29|
-|xstory_cloze_sw|      0|acc   |50.56|±  |  1.29|
-|xstory_cloze_te|      0|acc   |56.39|±  |  1.28|
-|xstory_cloze_zh|      0|acc   |58.04|±  |  1.27|
-## bloom-1b1_xwinograd_0-shot.json
-|    Task    |Version|Metric|Value|   |Stderr|
-|------------|------:|------|----:|---|-----:|
-|xwinograd_en|      0|acc   |69.98|±  |  0.95|
-|xwinograd_fr|      0|acc   |66.27|±  |  5.22|
-|xwinograd_jp|      0|acc   |52.87|±  |  1.61|
-|xwinograd_pt|      0|acc   |63.12|±  |  2.98|
-|xwinograd_ru|      0|acc   |54.29|±  |  2.81|
-|xwinograd_zh|      0|acc   |69.25|±  |  2.06|
--- a/results/bloom/bloom-1b1/bloom-1b1_common_sense_reasoning_0-shot.json
+++ b/results/bloom/bloom-1b1/bloom-1b1_common_sense_reasoning_0-shot.json
-{
-  "results": {
-    "boolq": {
-      "acc": 0.5908256880733945,
-      "acc_stderr": 0.008599563442397352
-    },
-    "arc_easy": {
-      "acc": 0.5147306397306397,
-      "acc_stderr": 0.010255329977562096,
-      "acc_norm": 0.45454545454545453,
-      "acc_norm_stderr": 0.010217299762709435
-    },
-    "openbookqa": {
-      "acc": 0.196,
-      "acc_stderr": 0.017770751227744862,
-      "acc_norm": 0.294,
-      "acc_norm_stderr": 0.020395095484936614
-    },
-    "hellaswag": {
-      "acc": 0.3463453495319657,
-      "acc_stderr": 0.004748324319714264,
-      "acc_norm": 0.4177454690300737,
-      "acc_norm_stderr": 0.004921798492608764
-    },
-    "swag": {
-      "acc": 0.43431970408877335,
-      "acc_stderr": 0.0035044592489844794,
-      "acc_norm": 0.5828251524542637,
-      "acc_norm_stderr": 0.0034862531772295617
-    },
-    "arc_challenge": {
-      "acc": 0.2363481228668942,
-      "acc_stderr": 0.012414960524301834,
-      "acc_norm": 0.2568259385665529,
-      "acc_norm_stderr": 0.0127669237941168
-    },
-    "mc_taco": {
-      "em": 0.1448948948948949,
-      "f1": 0.32425976796237205
-    },
-    "wsc273": {
-      "acc": 0.684981684981685,
-      "acc_stderr": 0.028165854394193602
-    },
-    "winogrande": {
-      "acc": 0.5493291239147593,
-      "acc_stderr": 0.013983928869040239
-    },
-    "prost": {
-      "acc": 0.23409479077711356,
-      "acc_stderr": 0.003093545711826552,
-      "acc_norm": 0.3049743808710504,
-      "acc_norm_stderr": 0.003363606918420179
-    },
-    "copa": {
-      "acc": 0.68,
-      "acc_stderr": 0.04688261722621504
-    },
-    "piqa": {
-      "acc": 0.6713819368879217,
-      "acc_stderr": 0.010959127105167048,
-      "acc_norm": 0.6713819368879217,
-      "acc_norm_stderr": 0.010959127105167044
-    }
-  },
-  "versions": {
-    "boolq": 1,
-    "arc_easy": 0,
-    "openbookqa": 0,
-    "hellaswag": 0,
-    "swag": 0,
-    "arc_challenge": 0,
-    "mc_taco": 0,
-    "wsc273": 0,
-    "winogrande": 0,
-    "prost": 0,
-    "copa": 0,
-    "piqa": 0
-  },
-  "config": {
-    "model": "hf-causal-experimental",
-    "model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
-    "num_fewshot": 0,
-    "batch_size": "auto",
-    "device": "cuda:0",
-    "no_cache": true,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
--- a/results/bloom/bloom-1b1/bloom-1b1_gsm8k_8-shot.json
+++ b/results/bloom/bloom-1b1/bloom-1b1_gsm8k_8-shot.json
-{
-  "results": {
-    "gsm8k": {
-      "acc": 0.008339651250947688,
-      "acc_stderr": 0.002504942226860508
-    }
-  },
-  "versions": {
-    "gsm8k": 0
-  },
-  "config": {
-    "model": "hf-causal-experimental",
-    "model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
-    "num_fewshot": 8,
-    "batch_size": "auto",
-    "device": "cuda",
-    "no_cache": true,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
--- a/results/bloom/bloom-1b1/bloom-1b1_mathematical_reasoning_few_shot_5-shot.json
+++ b/results/bloom/bloom-1b1/bloom-1b1_mathematical_reasoning_few_shot_5-shot.json
-{
-  "results": {
-    "mathqa": {
-      "acc": 0.2355108877721943,
-      "acc_stderr": 0.007767687364650971,
-      "acc_norm": 0.23618090452261306,
-      "acc_norm_stderr": 0.0077753193787470495
-    },
-    "gsm8k": {
-      "acc": 0.0,
-      "acc_stderr": 0.0
-    },
-    "drop": {
-      "em": 0.013842281879194632,
-      "em_stderr": 0.001196510970060749,
-      "f1": 0.040085989932885986,
-      "f1_stderr": 0.0014841664758736023
-    },
-    "math_geometry": {
-      "acc": 0.0020876826722338203,
-      "acc_stderr": 0.0020876826722338315
-    },
-    "math_counting_and_prob": {
-      "acc": 0.002109704641350211,
-      "acc_stderr": 0.002109704641350211
-    },
-    "math_prealgebra": {
-      "acc": 0.001148105625717566,
-      "acc_stderr": 0.0011481056257175708
-    },
-    "math_num_theory": {
-      "acc": 0.001851851851851852,
-      "acc_stderr": 0.0018518518518518448
-    },
-    "math_precalc": {
-      "acc": 0.0,
-      "acc_stderr": 0.0
-    },
-    "math_algebra": {
-      "acc": 0.0,
-      "acc_stderr": 0.0
-    },
-    "math_intermediate_algebra": {
-      "acc": 0.0,
-      "acc_stderr": 0.0
-    }
-  },
-  "versions": {
-    "mathqa": 0,
-    "gsm8k": 0,
-    "drop": 1,
-    "math_geometry": 1,
-    "math_counting_and_prob": 1,
-    "math_prealgebra": 1,
-    "math_num_theory": 1,
-    "math_precalc": 1,
-    "math_algebra": 1,
-    "math_intermediate_algebra": 1
-  },
-  "config": {
-    "model": "hf-causal-experimental",
-    "model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
-    "num_fewshot": 5,
-    "batch_size": "auto",
-    "device": "cuda:0",
-    "no_cache": true,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
--- a/results/bloom/bloom-1b1/bloom-1b1_pawsx_0-shot.json
+++ b/results/bloom/bloom-1b1/bloom-1b1_pawsx_0-shot.json
-{
-  "results": {
-    "pawsx_es": {
-      "acc": 0.515,
-      "acc_stderr": 0.011178102477052804
-    },
-    "pawsx_zh": {
-      "acc": 0.4895,
-      "acc_stderr": 0.011180669867648657
-    },
-    "pawsx_fr": {
-      "acc": 0.4615,
-      "acc_stderr": 0.011149934327957058
-    },
-    "pawsx_ko": {
-      "acc": 0.499,
-      "acc_stderr": 0.01118311365477017
-    },
-    "pawsx_de": {
-      "acc": 0.4695,
-      "acc_stderr": 0.011162310405413175
-    },
-    "pawsx_ja": {
-      "acc": 0.484,
-      "acc_stderr": 0.011177408788874897
-    },
-    "pawsx_en": {
-      "acc": 0.5245,
-      "acc_stderr": 0.011169702598013186
-    }
-  },
-  "versions": {
-    "pawsx_es": 0,
-    "pawsx_zh": 0,
-    "pawsx_fr": 0,
-    "pawsx_ko": 0,
-    "pawsx_de": 0,
-    "pawsx_ja": 0,
-    "pawsx_en": 0
-  },
-  "config": {
-    "model": "hf-causal-experimental",
-    "model_args": "pretrained=bigscience/bloom-1b1",
-    "num_fewshot": 0,
-    "batch_size": "auto",
-    "device": "cuda",
-    "no_cache": true,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
--- a/results/bloom/bloom-1b1/bloom-1b1_question_answering_0-shot.json
+++ b/results/bloom/bloom-1b1/bloom-1b1_question_answering_0-shot.json
-{
-  "results": {
-    "truthfulqa_mc": {
-      "mc1": 0.2533659730722154,
-      "mc1_stderr": 0.01522589934082683,
-      "mc2": 0.4179977378869182,
-      "mc2_stderr": 0.014601549068840484
-    },
-    "webqs": {
-      "acc": 0.013779527559055118,
-      "acc_stderr": 0.002586718737195641
-    },
-    "logiqa": {
-      "acc": 0.1889400921658986,
-      "acc_stderr": 0.01535436463822078,
-      "acc_norm": 0.2565284178187404,
-      "acc_norm_stderr": 0.017129443327887562
-    },
-    "squad2": {
-      "exact": 4.169123220752969,
-      "f1": 6.5956997780058355,
-      "HasAns_exact": 2.192982456140351,
-      "HasAns_f1": 7.05309437656277,
-      "NoAns_exact": 6.139613120269134,
-      "NoAns_f1": 6.139613120269134,
-      "best_exact": 50.07159100480081,
-      "best_f1": 50.07159100480081
-    },
-    "headqa_es": {
-      "acc": 0.24434719183078046,
-      "acc_stderr": 0.008207488987159709,
-      "acc_norm": 0.2830051057622174,
-      "acc_norm_stderr": 0.008604004902114394
-    },
-    "headqa_en": {
-      "acc": 0.26440554339897887,
-      "acc_stderr": 0.008423643607316284,
-      "acc_norm": 0.30488694383661563,
-      "acc_norm_stderr": 0.008793112278191295
-    },
-    "triviaqa": {
-      "acc": 0.026783346592415803,
-      "acc_stderr": 0.001517985028991893
-    }
-  },
-  "versions": {
-    "truthfulqa_mc": 1,
-    "webqs": 0,
-    "logiqa": 0,
-    "squad2": 1,
-    "headqa_es": 0,
-    "headqa_en": 0,
-    "triviaqa": 1
-  },
-  "config": {
-    "model": "hf-causal-experimental",
-    "model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
-    "num_fewshot": 0,
-    "batch_size": "auto",
-    "device": "cuda:0",
-    "no_cache": true,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
--- a/results/bloom/bloom-1b1/bloom-1b1_reading_comprehension_0-shot.json
+++ b/results/bloom/bloom-1b1/bloom-1b1_reading_comprehension_0-shot.json
-{
-  "results": {
-    "drop": {
-      "em": 0.03313758389261745,
-      "em_stderr": 0.0018330841858875643,
-      "f1": 0.08634542785234882,
-      "f1_stderr": 0.0022136353860709133
-    },
-    "coqa": {
-      "f1": 0.4557083534540516,
-      "f1_stderr": 0.01876948425119881,
-      "em": 0.3298333333333334,
-      "em_stderr": 0.019473215823053027
-    },
-    "race": {
-      "acc": 0.3263157894736842,
-      "acc_stderr": 0.014510987877134932
-    }
-  },
-  "versions": {
-    "drop": 1,
-    "coqa": 1,
-    "race": 1
-  },
-  "config": {
-    "model": "hf-causal-experimental",
-    "model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
-    "num_fewshot": 0,
-    "batch_size": "auto",
-    "device": "cuda:0",
-    "no_cache": true,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
--- a/results/bloom/bloom-1b1/bloom-1b1_xcopa_0-shot.json
+++ b/results/bloom/bloom-1b1/bloom-1b1_xcopa_0-shot.json
-{
-  "results": {
-    "xcopa_id": {
-      "acc": 0.648,
-      "acc_stderr": 0.02138004238594605
-    },
-    "xcopa_ht": {
-      "acc": 0.53,
-      "acc_stderr": 0.022342748192502843
-    },
-    "xcopa_it": {
-      "acc": 0.508,
-      "acc_stderr": 0.02238020883492804
-    },
-    "xcopa_et": {
-      "acc": 0.506,
-      "acc_stderr": 0.022381462412439324
-    },
-    "xcopa_ta": {
-      "acc": 0.57,
-      "acc_stderr": 0.02216263442665284
-    },
-    "xcopa_th": {
-      "acc": 0.532,
-      "acc_stderr": 0.022337186479044296
-    },
-    "xcopa_sw": {
-      "acc": 0.544,
-      "acc_stderr": 0.022296238348407056
-    },
-    "xcopa_zh": {
-      "acc": 0.594,
-      "acc_stderr": 0.02198396209008634
-    },
-    "xcopa_qu": {
-      "acc": 0.512,
-      "acc_stderr": 0.02237662679792717
-    },
-    "xcopa_tr": {
-      "acc": 0.53,
-      "acc_stderr": 0.02234274819250285
-    },
-    "xcopa_vi": {
-      "acc": 0.624,
-      "acc_stderr": 0.021683827539286115
-    }
-  },
-  "versions": {
-    "xcopa_id": 0,
-    "xcopa_ht": 0,
-    "xcopa_it": 0,
-    "xcopa_et": 0,
-    "xcopa_ta": 0,
-    "xcopa_th": 0,
-    "xcopa_sw": 0,
-    "xcopa_zh": 0,
-    "xcopa_qu": 0,
-    "xcopa_tr": 0,
-    "xcopa_vi": 0
-  },
-  "config": {
-    "model": "hf-causal-experimental",
-    "model_args": "pretrained=bigscience/bloom-1b1",
-    "num_fewshot": 0,
-    "batch_size": "auto",
-    "device": "cuda",
-    "no_cache": true,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
--- a/results/bloom/bloom-1b1/bloom-1b1_xnli_0-shot.json
+++ b/results/bloom/bloom-1b1/bloom-1b1_xnli_0-shot.json
-{
-  "results": {
-    "xnli_sw": {
-      "acc": 0.3528942115768463,
-      "acc_stderr": 0.0067520304764183674
-    },
-    "xnli_bg": {
-      "acc": 0.3413173652694611,
-      "acc_stderr": 0.006699490620395283
-    },
-    "xnli_el": {
-      "acc": 0.3403193612774451,
-      "acc_stderr": 0.006694754901092155
-    },
-    "xnli_hi": {
-      "acc": 0.4231536926147705,
-      "acc_stderr": 0.006980774514705842
-    },
-    "xnli_th": {
-      "acc": 0.3375249500998004,
-      "acc_stderr": 0.00668131870192652
-    },
-    "xnli_ar": {
-      "acc": 0.3393213572854291,
-      "acc_stderr": 0.006689986106838006
-    },
-    "xnli_de": {
-      "acc": 0.3964071856287425,
-      "acc_stderr": 0.0069114198150005334
-    },
-    "xnli_ru": {
-      "acc": 0.40459081836327343,
-      "acc_stderr": 0.006934900899149144
-    },
-    "xnli_vi": {
-      "acc": 0.44451097804391215,
-      "acc_stderr": 0.00702107269988888
-    },
-    "xnli_tr": {
-      "acc": 0.34790419161676644,
-      "acc_stderr": 0.006729921818907745
-    },
-    "xnli_ur": {
-      "acc": 0.37325349301397204,
-      "acc_stderr": 0.0068339592620100505
-    },
-    "xnli_fr": {
-      "acc": 0.47145708582834334,
-      "acc_stderr": 0.007053191822382807
-    },
-    "xnli_en": {
-      "acc": 0.5147704590818363,
-      "acc_stderr": 0.007061629189884944
-    },
-    "xnli_es": {
-      "acc": 0.47984031936127747,
-      "acc_stderr": 0.00705896771560341
-    },
-    "xnli_zh": {
-      "acc": 0.36227544910179643,
-      "acc_stderr": 0.006791418670232308
-    }
-  },
-  "versions": {
-    "xnli_sw": 0,
-    "xnli_bg": 0,
-    "xnli_el": 0,
-    "xnli_hi": 0,
-    "xnli_th": 0,
-    "xnli_ar": 0,
-    "xnli_de": 0,
-    "xnli_ru": 0,
-    "xnli_vi": 0,
-    "xnli_tr": 0,
-    "xnli_ur": 0,
-    "xnli_fr": 0,
-    "xnli_en": 0,
-    "xnli_es": 0,
-    "xnli_zh": 0
-  },
-  "config": {
-    "model": "hf-causal-experimental",
-    "model_args": "pretrained=bigscience/bloom-1b1",
-    "num_fewshot": 0,
-    "batch_size": "auto",
-    "device": "cuda",
-    "no_cache": true,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
--- a/results/bloom/bloom-1b1/bloom-1b1_xstory_cloze_0-shot.json
+++ b/results/bloom/bloom-1b1/bloom-1b1_xstory_cloze_0-shot.json
-{
-  "results": {
-    "xstory_cloze_te": {
-      "acc": 0.5638649900727994,
-      "acc_stderr": 0.012761730431435764
-    },
-    "xstory_cloze_ar": {
-      "acc": 0.528788881535407,
-      "acc_stderr": 0.012845779070719484
-    },
-    "xstory_cloze_zh": {
-      "acc": 0.5804103242885507,
-      "acc_stderr": 0.01269964226820075
-    },
-    "xstory_cloze_ru": {
-      "acc": 0.4824619457313038,
-      "acc_stderr": 0.012859207453266304
-    },
-    "xstory_cloze_en": {
-      "acc": 0.6254136333553938,
-      "acc_stderr": 0.012455787254852474
-    },
-    "xstory_cloze_id": {
-      "acc": 0.5790866975512905,
-      "acc_stderr": 0.012705145598630695
-    },
-    "xstory_cloze_my": {
-      "acc": 0.4619457313037723,
-      "acc_stderr": 0.012829804720321695
-    },
-    "xstory_cloze_sw": {
-      "acc": 0.5056254136333554,
-      "acc_stderr": 0.012866310923072511
-    },
-    "xstory_cloze_es": {
-      "acc": 0.5830575777630708,
-      "acc_stderr": 0.01268835412160781
-    },
-    "xstory_cloze_hi": {
-      "acc": 0.5552614162806089,
-      "acc_stderr": 0.012788295970207786
-    },
-    "xstory_cloze_eu": {
-      "acc": 0.5433487756452681,
-      "acc_stderr": 0.012818676452481956
-    }
-  },
-  "versions": {
-    "xstory_cloze_te": 0,
-    "xstory_cloze_ar": 0,
-    "xstory_cloze_zh": 0,
-    "xstory_cloze_ru": 0,
-    "xstory_cloze_en": 0,
-    "xstory_cloze_id": 0,
-    "xstory_cloze_my": 0,
-    "xstory_cloze_sw": 0,
-    "xstory_cloze_es": 0,
-    "xstory_cloze_hi": 0,
-    "xstory_cloze_eu": 0
-  },
-  "config": {
-    "model": "hf-causal-experimental",
-    "model_args": "pretrained=bigscience/bloom-1b1",
-    "num_fewshot": 0,
-    "batch_size": "auto",
-    "device": "cuda",
-    "no_cache": true,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
--- a/results/bloom/bloom-1b1/bloom-1b1_xwinograd_0-shot.json
+++ b/results/bloom/bloom-1b1/bloom-1b1_xwinograd_0-shot.json
-{
-  "results": {
-    "xwinograd_ru": {
-      "acc": 0.5428571428571428,
-      "acc_stderr": 0.028112788378274862
-    },
-    "xwinograd_en": {
-      "acc": 0.6997849462365592,
-      "acc_stderr": 0.009507809437511165
-    },
-    "xwinograd_jp": {
-      "acc": 0.5286757038581856,
-      "acc_stderr": 0.016127677684108978
-    },
-    "xwinograd_fr": {
-      "acc": 0.6626506024096386,
-      "acc_stderr": 0.05221260262032129
-    },
-    "xwinograd_zh": {
-      "acc": 0.6924603174603174,
-      "acc_stderr": 0.02057614603593188
-    },
-    "xwinograd_pt": {
-      "acc": 0.6311787072243346,
-      "acc_stderr": 0.02980804663449022
-    }
-  },
-  "versions": {
-    "xwinograd_ru": 0,
-    "xwinograd_en": 0,
-    "xwinograd_jp": 0,
-    "xwinograd_fr": 0,
-    "xwinograd_zh": 0,
-    "xwinograd_pt": 0
-  },
-  "config": {
-    "model": "hf-causal-experimental",
-    "model_args": "pretrained=bigscience/bloom-1b1",
-    "num_fewshot": 0,
-    "batch_size": "auto",
-    "device": "cuda",
-    "no_cache": true,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
--- a/results/bloom/bloom-1b7/README.md
+++ b/results/bloom/bloom-1b7/README.md
-# bloom-1b7
-## bloom-1b7_common_sense_reasoning_0-shot.json
-|    Task     |Version| Metric |Value|   |Stderr|
-|-------------|------:|--------|----:|---|-----:|
-|arc_challenge|      0|acc     |23.55|±  |  1.24|
-|             |       |acc_norm|26.79|±  |  1.29|
-|arc_easy     |      0|acc     |56.31|±  |  1.02|
-|             |       |acc_norm|48.11|±  |  1.03|
-|boolq        |      1|acc     |61.77|±  |  0.85|
-|copa         |      0|acc     |70.00|±  |  4.61|
-|hellaswag    |      0|acc     |37.62|±  |  0.48|
-|             |       |acc_norm|46.56|±  |  0.50|
-|mc_taco      |      0|em      |12.54|   |      |
-|             |       |f1      |47.46|   |      |
-|openbookqa   |      0|acc     |21.40|±  |  1.84|
-|             |       |acc_norm|30.00|±  |  2.05|
-|piqa         |      0|acc     |68.77|±  |  1.08|
-|             |       |acc_norm|70.08|±  |  1.07|
-|prost        |      0|acc     |23.52|±  |  0.31|
-|             |       |acc_norm|26.70|±  |  0.32|
-|swag         |      0|acc     |45.32|±  |  0.35|
-|             |       |acc_norm|61.15|±  |  0.34|
-|winogrande   |      0|acc     |57.14|±  |  1.39|
-|wsc273       |      0|acc     |72.89|±  |  2.70|
-## bloom-1b7_gsm8k_8-shot.json
-|Task |Version|Metric|Value|   |Stderr|
-|-----|------:|------|----:|---|-----:|
-|gsm8k|      0|acc   | 1.29|±  |  0.31|
-## bloom-1b7_mathematical_reasoning_few_shot_5-shot.json
-|          Task           |Version| Metric |Value|   |Stderr|
-|-------------------------|------:|--------|----:|---|-----:|
-|drop                     |      1|em      | 1.49|±  |  0.12|
-|                         |       |f1      | 4.31|±  |  0.15|
-|gsm8k                    |      0|acc     | 0.00|±  |  0.00|
-|math_algebra             |      1|acc     | 0.00|±  |  0.00|
-|math_counting_and_prob   |      1|acc     | 0.00|±  |  0.00|
-|math_geometry            |      1|acc     | 0.00|±  |  0.00|
-|math_intermediate_algebra|      1|acc     | 0.00|±  |  0.00|
-|math_num_theory          |      1|acc     | 0.74|±  |  0.37|
-|math_prealgebra          |      1|acc     | 0.23|±  |  0.16|
-|math_precalc             |      1|acc     | 0.00|±  |  0.00|
-|mathqa                   |      0|acc     |24.29|±  |  0.79|
-|                         |       |acc_norm|24.62|±  |  0.79|
-## bloom-1b7_pawsx_0-shot.json
-|  Task  |Version|Metric|Value|   |Stderr|
-|--------|------:|------|----:|---|-----:|
-|pawsx_de|      0|acc   |48.75|±  |  1.12|
-|pawsx_en|      0|acc   |48.90|±  |  1.12|
-|pawsx_es|      0|acc   |51.30|±  |  1.12|
-|pawsx_fr|      0|acc   |46.20|±  |  1.12|
-|pawsx_ja|      0|acc   |44.70|±  |  1.11|
-|pawsx_ko|      0|acc   |45.80|±  |  1.11|
-|pawsx_zh|      0|acc   |45.40|±  |  1.11|
-## bloom-1b7_question_answering_0-shot.json
-|    Task     |Version|   Metric   |Value|   |Stderr|
-|-------------|------:|------------|----:|---|-----:|
-|headqa_en    |      0|acc         |27.75|±  |  0.86|
-|             |       |acc_norm    |32.57|±  |  0.90|
-|headqa_es    |      0|acc         |25.42|±  |  0.83|
-|             |       |acc_norm    |29.58|±  |  0.87|
-|logiqa       |      0|acc         |21.66|±  |  1.62|
-|             |       |acc_norm    |28.11|±  |  1.76|
-|squad2       |      1|exact       | 1.80|   |      |
-|             |       |f1          | 4.38|   |      |
-|             |       |HasAns_exact| 2.40|   |      |
-|             |       |HasAns_f1   | 7.56|   |      |
-|             |       |NoAns_exact | 1.21|   |      |
-|             |       |NoAns_f1    | 1.21|   |      |
-|             |       |best_exact  |50.07|   |      |
-|             |       |best_f1     |50.07|   |      |
-|triviaqa     |      1|acc         | 3.14|±  |  0.16|
-|truthfulqa_mc|      1|mc1         |24.48|±  |  1.51|
-|             |       |mc2         |41.32|±  |  1.44|
-|webqs        |      0|acc         | 1.28|±  |  0.25|
-## bloom-1b7_reading_comprehension_0-shot.json
-|Task|Version|Metric|Value|   |Stderr|
-|----|------:|------|----:|---|-----:|
-|coqa|      1|f1    |53.55|±  |  1.89|
-|    |       |em    |40.90|±  |  2.03|
-|drop|      1|em    | 0.69|±  |  0.08|
-|    |       |f1    | 6.89|±  |  0.16|
-|race|      1|acc   |33.21|±  |  1.46|
-## bloom-1b7_xcopa_0-shot.json
-|  Task  |Version|Metric|Value|   |Stderr|
-|--------|------:|------|----:|---|-----:|
-|xcopa_et|      0|acc   | 47.4|±  |  2.24|
-|xcopa_ht|      0|acc   | 50.4|±  |  2.24|
-|xcopa_id|      0|acc   | 63.2|±  |  2.16|
-|xcopa_it|      0|acc   | 52.6|±  |  2.24|
-|xcopa_qu|      0|acc   | 50.6|±  |  2.24|
-|xcopa_sw|      0|acc   | 51.8|±  |  2.24|
-|xcopa_ta|      0|acc   | 56.6|±  |  2.22|
-|xcopa_th|      0|acc   | 53.2|±  |  2.23|
-|xcopa_tr|      0|acc   | 52.8|±  |  2.23|
-|xcopa_vi|      0|acc   | 65.8|±  |  2.12|
-|xcopa_zh|      0|acc   | 61.4|±  |  2.18|
-## bloom-1b7_xnli_0-shot.json
-| Task  |Version|Metric|Value|   |Stderr|
-|-------|------:|------|----:|---|-----:|
-|xnli_ar|      0|acc   |33.57|±  |  0.67|
-|xnli_bg|      0|acc   |35.43|±  |  0.68|
-|xnli_de|      0|acc   |40.58|±  |  0.69|
-|xnli_el|      0|acc   |33.99|±  |  0.67|
-|xnli_en|      0|acc   |50.14|±  |  0.71|
-|xnli_es|      0|acc   |47.82|±  |  0.71|
-|xnli_fr|      0|acc   |48.18|±  |  0.71|
-|xnli_hi|      0|acc   |43.95|±  |  0.70|
-|xnli_ru|      0|acc   |39.32|±  |  0.69|
-|xnli_sw|      0|acc   |34.51|±  |  0.67|
-|xnli_th|      0|acc   |33.37|±  |  0.67|
-|xnli_tr|      0|acc   |34.93|±  |  0.67|
-|xnli_ur|      0|acc   |40.50|±  |  0.69|
-|xnli_vi|      0|acc   |46.23|±  |  0.70|
-|xnli_zh|      0|acc   |36.21|±  |  0.68|
-## bloom-1b7_xstory_cloze_0-shot.json
-|     Task      |Version|Metric|Value|   |Stderr|
-|---------------|------:|------|----:|---|-----:|
-|xstory_cloze_ar|      0|acc   |55.00|±  |  1.28|
-|xstory_cloze_en|      0|acc   |64.66|±  |  1.23|
-|xstory_cloze_es|      0|acc   |60.82|±  |  1.26|
-|xstory_cloze_eu|      0|acc   |54.93|±  |  1.28|
-|xstory_cloze_hi|      0|acc   |56.78|±  |  1.27|
-|xstory_cloze_id|      0|acc   |59.76|±  |  1.26|
-|xstory_cloze_my|      0|acc   |47.25|±  |  1.28|
-|xstory_cloze_ru|      0|acc   |50.36|±  |  1.29|
-|xstory_cloze_sw|      0|acc   |52.28|±  |  1.29|
-|xstory_cloze_te|      0|acc   |56.52|±  |  1.28|
-|xstory_cloze_zh|      0|acc   |58.24|±  |  1.27|
-## bloom-1b7_xwinograd_0-shot.json
-|    Task    |Version|Metric|Value|   |Stderr|
-|------------|------:|------|----:|---|-----:|
-|xwinograd_en|      0|acc   |74.71|±  |  0.90|
-|xwinograd_fr|      0|acc   |68.67|±  |  5.12|
-|xwinograd_jp|      0|acc   |54.12|±  |  1.61|
-|xwinograd_pt|      0|acc   |63.50|±  |  2.97|
-|xwinograd_ru|      0|acc   |52.38|±  |  2.82|
-|xwinograd_zh|      0|acc   |69.64|±  |  2.05|
--- a/results/bloom/bloom-1b7/bloom-1b7_common_sense_reasoning_0-shot.json
+++ b/results/bloom/bloom-1b7/bloom-1b7_common_sense_reasoning_0-shot.json
-{
-  "results": {
-    "mc_taco": {
-      "em": 0.12537537537537538,
-      "f1": 0.47458014393437276
-    },
-    "arc_easy": {
-      "acc": 0.5631313131313131,
-      "acc_stderr": 0.010177672928157678,
-      "acc_norm": 0.4810606060606061,
-      "acc_norm_stderr": 0.010252420496894487
-    },
-    "boolq": {
-      "acc": 0.617737003058104,
-      "acc_stderr": 0.008499149690449272
-    },
-    "piqa": {
-      "acc": 0.6877040261153428,
-      "acc_stderr": 0.010812581599154424,
-      "acc_norm": 0.7007616974972797,
-      "acc_norm_stderr": 0.010684130673134581
-    },
-    "copa": {
-      "acc": 0.7,
-      "acc_stderr": 0.046056618647183814
-    },
-    "prost": {
-      "acc": 0.23521562766865928,
-      "acc_stderr": 0.003098672944164254,
-      "acc_norm": 0.2669726729291204,
-      "acc_norm_stderr": 0.00323196492387981
-    },
-    "hellaswag": {
-      "acc": 0.37621987651862177,
-      "acc_stderr": 0.004834461997944872,
-      "acc_norm": 0.46564429396534557,
-      "acc_norm_stderr": 0.004977988452502641
-    },
-    "swag": {
-      "acc": 0.4532140357892632,
-      "acc_stderr": 0.0035195819088979174,
-      "acc_norm": 0.6114665600319904,
-      "acc_norm_stderr": 0.003446127007510879
-    },
-    "openbookqa": {
-      "acc": 0.214,
-      "acc_stderr": 0.01835979750238702,
-      "acc_norm": 0.3,
-      "acc_norm_stderr": 0.020514426225628046
-    },
-    "wsc273": {
-      "acc": 0.7289377289377289,
-      "acc_stderr": 0.02695226692070332
-    },
-    "arc_challenge": {
-      "acc": 0.2354948805460751,
-      "acc_stderr": 0.012399451855004752,
-      "acc_norm": 0.26791808873720135,
-      "acc_norm_stderr": 0.012942030195136423
-    },
-    "winogrande": {
-      "acc": 0.5714285714285714,
-      "acc_stderr": 0.013908353814606709
-    }
-  },
-  "versions": {
-    "mc_taco": 0,
-    "arc_easy": 0,
-    "boolq": 1,
-    "piqa": 0,
-    "copa": 0,
-    "prost": 0,
-    "hellaswag": 0,
-    "swag": 0,
-    "openbookqa": 0,
-    "wsc273": 0,
-    "arc_challenge": 0,
-    "winogrande": 0
-  },
-  "config": {
-    "model": "hf-causal-experimental",
-    "model_args": "pretrained=bigscience/bloom-1b7,use_accelerate=True",
-    "num_fewshot": 0,
-    "batch_size": "auto",
-    "device": "cuda:0",
-    "no_cache": true,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
--- a/results/bloom/bloom-1b7/bloom-1b7_gsm8k_8-shot.json
+++ b/results/bloom/bloom-1b7/bloom-1b7_gsm8k_8-shot.json
-{
-  "results": {
-    "gsm8k": {
-      "acc": 0.01288855193328279,
-      "acc_stderr": 0.00310690126649963
-    }
-  },
-  "versions": {
-    "gsm8k": 0
-  },
-  "config": {
-    "model": "hf-causal-experimental",
-    "model_args": "pretrained=bigscience/bloom-1b7,use_accelerate=True",
-    "num_fewshot": 8,
-    "batch_size": "auto",
-    "device": "cuda",
-    "no_cache": true,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
--- a/results/bloom/bloom-1b7/bloom-1b7_mathematical_reasoning_few_shot_5-shot.json
+++ b/results/bloom/bloom-1b7/bloom-1b7_mathematical_reasoning_few_shot_5-shot.json
-{
-  "results": {
-    "math_algebra": {
-      "acc": 0.0,
-      "acc_stderr": 0.0
-    },
-    "math_geometry": {
-      "acc": 0.0,
-      "acc_stderr": 0.0
-    },
-    "math_prealgebra": {
-      "acc": 0.002296211251435132,
-      "acc_stderr": 0.001622733136934626
-    },
-    "math_precalc": {
-      "acc": 0.0,
-      "acc_stderr": 0.0
-    },
-    "math_num_theory": {
-      "acc": 0.007407407407407408,
-      "acc_stderr": 0.003693382168437264
-    },
-    "gsm8k": {
-      "acc": 0.0,
-      "acc_stderr": 0.0
-    },
-    "math_counting_and_prob": {
-      "acc": 0.0,
-      "acc_stderr": 0.0
-    },
-    "math_intermediate_algebra": {
-      "acc": 0.0,
-      "acc_stderr": 0.0
-    },
-    "drop": {
-      "em": 0.014890939597315436,
-      "em_stderr": 0.0012403460245602655,
-      "f1": 0.043118708053691356,
-      "f1_stderr": 0.0015424950823374804
-    },
-    "mathqa": {
-      "acc": 0.24288107202680068,
-      "acc_stderr": 0.00785017752394654,
-      "acc_norm": 0.24623115577889448,
-      "acc_norm_stderr": 0.007886624866001843
-    }
-  },
-  "versions": {
-    "math_algebra": 1,
-    "math_geometry": 1,
-    "math_prealgebra": 1,
-    "math_precalc": 1,
-    "math_num_theory": 1,
-    "gsm8k": 0,
-    "math_counting_and_prob": 1,
-    "mathqa": 0,
-    "math_intermediate_algebra": 1,
-    "drop": 1
-  },
-  "config": {
-    "model": "hf-causal-experimental",
-    "model_args": "pretrained=bigscience/bloom-1b7,use_accelerate=True",
-    "num_fewshot": 5,
-    "batch_size": "auto",
-    "device": "cuda:0",
-    "no_cache": true,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
--- a/results/bloom/bloom-1b7/bloom-1b7_pawsx_0-shot.json
+++ b/results/bloom/bloom-1b7/bloom-1b7_pawsx_0-shot.json
-{
-  "results": {
-    "pawsx_ja": {
-      "acc": 0.447,
-      "acc_stderr": 0.011120131683767737
-    },
-    "pawsx_es": {
-      "acc": 0.513,
-      "acc_stderr": 0.011179355482070377
-    },
-    "pawsx_fr": {
-      "acc": 0.462,
-      "acc_stderr": 0.01115079235234166
-    },
-    "pawsx_de": {
-      "acc": 0.4875,
-      "acc_stderr": 0.011179640744835734
-    },
-    "pawsx_ko": {
-      "acc": 0.458,
-      "acc_stderr": 0.011143612073516636
-    },
-    "pawsx_zh": {
-      "acc": 0.454,
-      "acc_stderr": 0.0111357084193598
-    },
-    "pawsx_en": {
-      "acc": 0.489,
-      "acc_stderr": 0.011180429374603775
-    }
-  },
-  "versions": {
-    "pawsx_ja": 0,
-    "pawsx_es": 0,
-    "pawsx_fr": 0,
-    "pawsx_de": 0,
-    "pawsx_ko": 0,
-    "pawsx_zh": 0,
-    "pawsx_en": 0
-  },
-  "config": {
-    "model": "hf-causal-experimental",
-    "model_args": "pretrained=bigscience/bloom-1b7",
-    "num_fewshot": 0,
-    "batch_size": "auto",
-    "device": "cuda",
-    "no_cache": true,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
--- a/results/bloom/bloom-1b7/bloom-1b7_question_answering_0-shot.json
+++ b/results/bloom/bloom-1b7/bloom-1b7_question_answering_0-shot.json
-{
-  "results": {
-    "webqs": {
-      "acc": 0.012795275590551181,
-      "acc_stderr": 0.0024938680596856277
-    },
-    "headqa_en": {
-      "acc": 0.2775346462436178,
-      "acc_stderr": 0.008552884316239918,
-      "acc_norm": 0.32567469000729393,
-      "acc_norm_stderr": 0.008951013596145294
-    },
-    "logiqa": {
-      "acc": 0.21658986175115208,
-      "acc_stderr": 0.016156860583178303,
-      "acc_norm": 0.28110599078341014,
-      "acc_norm_stderr": 0.017632374626460005
-    },
-    "squad2": {
-      "exact": 1.8024088267497684,
-      "f1": 4.382884035952938,
-      "HasAns_exact": 2.395411605937922,
-      "HasAns_f1": 7.563762172548798,
-      "NoAns_exact": 1.2111017661900756,
-      "NoAns_f1": 1.2111017661900756,
-      "best_exact": 50.07159100480081,
-      "best_f1": 50.07207926399809
-    },
-    "headqa_es": {
-      "acc": 0.25419401896425964,
-      "acc_stderr": 0.008316509290190668,
-      "acc_norm": 0.29576951130561635,
-      "acc_norm_stderr": 0.008717251898361422
-    },
-    "triviaqa": {
-      "acc": 0.0313798285158667,
-      "acc_stderr": 0.0016392014864795154
-    },
-    "truthfulqa_mc": {
-      "mc1": 0.24479804161566707,
-      "mc1_stderr": 0.015051869486715006,
-      "mc2": 0.41318090310186134,
-      "mc2_stderr": 0.014436426641105374
-    }
-  },
-  "versions": {
-    "webqs": 0,
-    "headqa_en": 0,
-    "logiqa": 0,
-    "squad2": 1,
-    "headqa_es": 0,
-    "triviaqa": 1,
-    "truthfulqa_mc": 1
-  },
-  "config": {
-    "model": "hf-causal-experimental",
-    "model_args": "pretrained=bigscience/bloom-1b7,use_accelerate=True",
-    "num_fewshot": 0,
-    "batch_size": "auto",
-    "device": "cuda:0",
-    "no_cache": true,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}