Merge branch 'gakada-big-refactor-merge' into big-refactor

4a0b0d6e · lintangsutawika · 6ae376e3 · c490f165 · 4a0b0d6e · 4a0b0d6e
Commit 4a0b0d6e authored Jun 16, 2023 by lintangsutawika
20 changed files
--- a/lm_eval/tasks/README.md
+++ b/lm_eval/tasks/README.md
 # v1.0 Tasks
 This list keeps track of which tasks' implementations have been ported to YAML / v2.0 of the Eval Harness.

-Boxes should be checked iff tasks are implemented in v2.0 and tested for regression. Tasks should be struck through if checked *against original introducing paper* implementation or popularizing implementation.
+Boxes should be checked iff tasks are implemented in the refactor and tested for regression. Tasks should be struck through if they have been checked *against the original introducing paper's* implementation or against a popularizing implementation.

 - [ ] Glue
- [ ] SuperGlue
+- [x] SuperGlue
 - [ ] CoQA
 - [ ] DROP
 - [x] ~~Lambada~~
@@ -31,7 +31,7 @@ Boxes should be checked iff tasks are implemented in v2.0 and tested for regress
 - [ ] WebQs
 - [ ] WSC273
 - [ ] Winogrande
- [ ] ANLI
+- [x] ANLI
 - [ ] Hendrycks Ethics
 - [ ] TruthfulQA
 - [ ] MuTual

--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -3,6 +3,7 @@ from typing import List, Union

 from .gsm8k import *
 from .triviaqa import *
+from .glue import *

 from lm_eval import utils
 from lm_eval.logger import eval_logger
@@ -12,6 +13,7 @@ from lm_eval.api.registry import (
    register_group,
    TASK_REGISTRY,
    GROUP_REGISTRY,
+    ALL_TASKS,
 )


@@ -38,6 +40,9 @@ def include_task_folder(task_dir):
                        )

                        if "task" in config:
+                            # task_name = "{}:{}".format(
+                            #     get_task_name_from_config(config), config["task"]
+                            # )
                            task_name = "{}".format(config["task"])
                            register_task(task_name)(SubClass)

@@ -62,7 +67,7 @@ def get_task(task_name, config):
        return TASK_REGISTRY[task_name](config=config)
    except KeyError:
        eval_logger.info("Available tasks:")
-        eval_logger.info(ALL_TASKS)
+        eval_logger.info(list(TASK_REGISTRY) + list(GROUP_REGISTRY))
        raise KeyError(f"Missing task {task_name}")



--- a/lm_eval/tasks/gsm8k/README.md
+++ b/lm_eval/tasks/gsm8k/README.md
@@ -43,4 +43,4 @@ Homepage: https://github.com/openai/grade-school-math

 - [ ] Variant with Calculator (see https://github.com/openai/grade-school-math/blob/master/grade_school_math/calculator.py for example implementation)
 - [ ] Using Verifiers
- [ ] Majority voting "without CoT"
\ No newline at end of file
+- [ ] Majority voting "without CoT"
--- a/lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml
+++ b/lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml
@@ -9,7 +9,7 @@ generation_kwargs:
    - "\n\n"
  do_sample: true
  temperature: 0.2
-repeats: 8
+repeats: 64
 filter_list:
  - name: "score-first" # pick only the first response, and report metrics on that
    filter:

--- a/lm_eval/tasks/pile/pile_europarl.yaml
+++ b/lm_eval/tasks/pile/pile_europarl.yaml
 include: pile_arxiv.yaml
 task: pile_europarl
-dataset_name: pile_europarl
\ No newline at end of file
+dataset_name: pile_europarl
--- a/lm_eval/tasks/pile/pile_gutenberg.yaml
+++ b/lm_eval/tasks/pile/pile_gutenberg.yaml
 include: pile_arxiv.yaml
 task: pile_gutenberg
-dataset_name: pile_gutenberg
\ No newline at end of file
+dataset_name: pile_gutenberg
--- a/lm_eval/tasks/pile/pile_pubmed-abstracts.yaml
+++ b/lm_eval/tasks/pile/pile_pubmed-abstracts.yaml
 include: pile_arxiv.yaml
 task: pile_pubmed-abstracts
 dataset_name: pile_pubmed-abstracts
-
--- a/lm_eval/tasks/pile/pile_pubmed-central.yaml
+++ b/lm_eval/tasks/pile/pile_pubmed-central.yaml
 include: pile_arxiv.yaml
 task: pile_pubmed-central
 dataset_name: pile_pubmed-central
-
--- a/lm_eval/tasks/pile/pile_stackexchange.yaml
+++ b/lm_eval/tasks/pile/pile_stackexchange.yaml
 include: pile_arxiv.yaml
 task: pile_stackexchange
 dataset_name: pile_stackexchange
-
--- a/lm_eval/tasks/pile/pile_ubuntu-irc.yaml
+++ b/lm_eval/tasks/pile/pile_ubuntu-irc.yaml
 include: pile_arxiv.yaml
 task: pile_ubuntu-irc
 dataset_name: pile_ubuntu-irc
-
--- a/lm_eval/tasks/pile/pile_uspto.yaml
+++ b/lm_eval/tasks/pile/pile_uspto.yaml
 include: pile_arxiv.yaml
 task: pile_uspto
 dataset_name: pile_uspto
-
--- a/lm_eval/tasks/pile/pile_wikipedia.yaml
+++ b/lm_eval/tasks/pile/pile_wikipedia.yaml
 include: pile_arxiv.yaml
 task: pile_wikipedia
 dataset_name: pile_wikipedia
-
--- a/lm_eval/tasks/wikitext/README.md
+++ b/lm_eval/tasks/wikitext/README.md
@@ -28,9 +28,16 @@ Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-

 ### Subtasks

+* `wikitext`: measure perplexity on the Wikitext dataset, via rolling loglikelihoods.
+
 ### Checklist

- [x] Is in Eval-harness v1.0 ?
- [x] Has been checked for regression from v1.0?
- [ ] Has been checked for equivalence with original paper methodology?
- [ ] "Main" checked variant clearly denoted?
+* [x] Is the task an existing benchmark in the literature?
+  * [x] Have you referenced the original paper that introduced the task?
+  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+
+If other tasks on this dataset are already supported:
+* [x] Is the "Main" variant of this task clearly denoted?
+* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -8,13 +8,19 @@ import functools
 import subprocess
 import collections
 import importlib.util
+import fnmatch

-from typing import List
+from typing import List, Union
+
+import gc
+import torch

 from omegaconf import OmegaConf
 from jinja2 import BaseLoader, Environment, StrictUndefined
 from itertools import islice

+from lm_eval.logger import eval_logger
+

 class ExitCodeError(Exception):
    pass
@@ -25,6 +31,29 @@ def sh(x):
        raise ExitCodeError()


+def escaped_split(text, sep_char, maxsplit=-1):
+    """Split `text` into a list on unescaped occurrences of `sep_char`.
+
+    An occurrence of the separation character that is immediately preceded
+    by a backslash is treated as escaped and is not split on. The backslash
+    itself is left in the returned pieces.
+
+    Args:
+        text: The string to split.
+        sep_char: The separation character; must be a string of size 1.
+        maxsplit: Maximum number of splits to perform (thus, the list will
+            have at most `maxsplit + 1` elements). If not specified or less
+            than 0, there is no limit on the number of splits.
+
+    Returns:
+        A list with the pieces of `text`. For `maxsplit == 0` this is the
+        single-element list `[text]`.
+
+    Raises:
+        AssertionError: If `sep_char` is not exactly one character long.
+    """
+    assert (
+        len(sep_char) == 1
+    ), "separation string must be a single character for escaped splitting"
+
+    if maxsplit == 0:
+        # No split requested: still return a list so the return type does
+        # not depend on the value of `maxsplit`.
+        return [text]
+    # re.split interprets maxsplit=0 as "no limit", so negatives map to 0.
+    maxsplit = max(0, maxsplit)
+
+    # re.escape keeps regex metacharacters such as "." or "|" literal.
+    return re.split(r"(?<!\\)" + re.escape(sep_char), text, maxsplit=maxsplit)
+
+
 def simple_parse_args_string(args_string):
    """
    Parses something like
@@ -44,11 +73,11 @@ def join_iters(iters):
        yield from iter


-def chunks(iter, n):
+def chunks(iter, n=0, fn=None):
    arr = []
-    for x in iter:
+    for i, x in enumerate(iter):
        arr.append(x)
-        if len(arr) == n:
+        if len(arr) == (fn(i) if fn else n):
            yield arr
            arr = []

@@ -65,6 +94,35 @@ def group(arr, fn):
    return list(res.values())


+class MultiChoice:
+    """Collection of choices that accepts comma-separated wildcard lookups.
+
+    The `in` operator never rejects a value: entries that match no choice
+    are only reported through `eval_logger`.
+    """
+
+    def __init__(self, choices):
+        self.choices = choices
+
+    def __contains__(self, values):
+        """Log a warning for every comma-separated pattern without a match.
+
+        Patterns follow `fnmatch` (Unix filename wildcard) syntax. Always
+        returns True, whether or not any pattern matched.
+        """
+        for pattern in values.split(","):
+            if fnmatch.filter(self.choices, pattern):
+                continue
+            eval_logger.warning("{} is not in task list.".format(pattern))
+            eval_logger.info(f"Available tasks to choose:")
+            for candidate in self.choices:
+                eval_logger.info(f"  - {candidate}")
+        return True
+
+    def __iter__(self):
+        """Yield the choices in their stored order."""
+        yield from self.choices
+
+
+# Collects every entry of source_list that is matched by at least one
+# of the patterns; the result is de-duplicated and sorted.
+def pattern_match(patterns, source_list):
+    """Return the sorted, unique entries of `source_list` matching any pattern.
+
+    Patterns use `fnmatch` (Unix filename wildcard) syntax.
+    """
+    matched = {
+        name
+        for pattern in patterns
+        for name in fnmatch.filter(source_list, pattern)
+    }
+    return sorted(matched)
+
+
 def general_detokenize(string):
    string = string.replace(" n't", "n't")
    string = string.replace(" )", ")")
@@ -110,8 +168,8 @@ def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len
        window_end = predicted + window_pred_len

        yield (
-            token_list[window_end - max_seq_len - 1 : window_end - 1],
-            token_list[window_end - window_pred_len : window_end],
+            token_list[window_end - max_seq_len - 1: window_end - 1],
+            token_list[window_end - window_pred_len: window_end],
        )
        predicted += window_pred_len

@@ -122,6 +180,26 @@ def make_disjoint_window(pair):
    return a[: len(a) - (len(b) - 1)], b


+def select_continuation_from_batch_left_padding(
+    generations: Union[List[List[int]], torch.Tensor], max_context_size: int
+):
+    """Select the continuation from the batch, removing prompts of different lengths.
+
+    Every row is cut at the same index, `max_context_size`, so everything
+    before that index is dropped.
+
+    Args:
+        generations (Union[List[List[int]], torch.Tensor]):
+            A tensor or list-of-lists of shape [batch_size, sequence length].
+        max_context_size (int):
+            The size of the biggest context; generations will proceed from that
+            index.
+    Returns:
+        The part of each row from index `max_context_size` onwards: a list of
+        lists when a list (or tuple) of rows was given, otherwise the result
+        of the two-dimensional slice `generations[:, max_context_size:]`.
+    Example:
+        PAD     PAD Continue : The dog chased the cat  [every       day of the week]
+        Riddle  me    this   : The  dog chased the  cat [yesterday] PAD PAD PAD PAD
+    Output:
+        [every day of the week]
+        [yesterday]  PAD PAD PAD PAD
+    """
+    if isinstance(generations, (list, tuple)):
+        # Plain Python sequences do not support `[:, n:]` indexing, so the
+        # slice has to be applied row by row.
+        return [row[max_context_size:] for row in generations]
+    return generations[:, max_context_size:]
+
+
 class Reorderer:
    def __init__(self, arr, fn):
        self.size = len(arr)
@@ -336,3 +414,8 @@ def create_iterator(raw_iterator, rank, world_size, limit=None):
    among ranks in multigpu setting or only pulling a sample of documents
    """
    return islice(raw_iterator, rank, limit, world_size)
+
+
+def clear_torch_cache():
+    """Run a garbage-collection pass, then call `torch.cuda.empty_cache()`."""
+    gc.collect()
+    torch.cuda.empty_cache()
--- a/main.py
+++ b/main.py
 import os
 import json
-import fnmatch
 import argparse

 from lm_eval import evaluator, utils
-from lm_eval.api.registry import GROUP_REGISTRY, TASK_REGISTRY
+from lm_eval.api.registry import ALL_TASKS
 from lm_eval.logger import eval_logger

 os.environ["TOKENIZERS_PARALLELISM"] = "false"
-ALL_TASKS = sorted(list(TASK_REGISTRY.keys()) + list(GROUP_REGISTRY.keys()))
-
-class MultiChoice:
-    def __init__(self, choices):
-        self.choices = choices
-
-    # Simple wildcard support (linux filename patterns)
-    def __contains__(self, values):
-        for value in values.split(","):
-            if len(fnmatch.filter(self.choices, value)) == 0:
-                eval_logger.warning("{} is not in task list.".format(value))
-                eval_logger.info(f"Available tasks to choose:")
-                # for choice in self.choices:
-                    # eval_logger.info(f"    {choice}")
-                eval_logger.info(ALL_TASKS)
-        return True
-
-    def __iter__(self):
-        for choice in self.choices:
-            yield choice


 def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True)
    parser.add_argument("--model_args", default="")
-    parser.add_argument("--tasks", default=None, choices=MultiChoice(ALL_TASKS))
+    parser.add_argument("--tasks", default=None, choices=utils.MultiChoice(sorted(ALL_TASKS)))
    parser.add_argument("--config", default=None)
-    parser.add_argument("--provide_description", action="store_true")
    parser.add_argument("--num_fewshot", type=int, default=0)
    parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument("--max_batch_size", type=int, default=None,
+                        help="Maximal batch size to try with --batch_size auto")
    parser.add_argument("--device", type=str, default=None)
    parser.add_argument("--output_path", default=None)
-    parser.add_argument("--limit", type=int, default=None)
+    parser.add_argument("--limit", type=float, default=None,
+                        help="Limit the number of examples per task. "
+                             "If <1, limit is a percentage of the total number of examples.")
+    parser.add_argument("--data_sampling", type=float, default=None)
    parser.add_argument("--no_cache", action="store_true")
    parser.add_argument("--decontamination_ngrams_path", default=None)
    parser.add_argument("--description_dict_path", default=None)
    parser.add_argument("--check_integrity", action="store_true")
+    parser.add_argument("--write_out", action="store_true", default=False)
+    parser.add_argument("--output_base_path", type=str, default=None)
    return parser.parse_args()


-# Returns a list containing all values of the source_list that
-# match at least one of the patterns
-def pattern_match(patterns, source_list):
-    task_names = set()
-    for pattern in patterns:
-        for matching in fnmatch.filter(source_list, pattern):
-            task_names.add(matching)
-    return sorted(list(task_names))
-
-
 def main():
    args = parse_args()

@@ -68,7 +43,9 @@ def main():
            "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

-    if args.tasks is not None:
+    if args.tasks is None:
+        task_names = ALL_TASKS
+    else:
        if os.path.isdir(args.tasks):
            import glob

@@ -79,7 +56,7 @@ def main():
                task_names.append(config)
        else:
            tasks_list = args.tasks.split(",")
-            task_names = pattern_match(tasks_list, ALL_TASKS)
+            task_names = utils.pattern_match(tasks_list, ALL_TASKS)
            for task in [task for task in tasks_list if task not in task_names]:
                if os.path.isfile(task):
                    config = utils.load_yaml_config(task)
@@ -87,28 +64,42 @@ def main():

    eval_logger.info(f"Selected Tasks: {task_names}")

+    # TODO: description_dict?
+    # description_dict = {}
+    # if args.description_dict_path:
+    #     with open(args.description_dict_path, "r") as f:
+    #         description_dict = json.load(f)
+
    results = evaluator.simple_evaluate(
        model=args.model,
        model_args=args.model_args,
        tasks=task_names,
        num_fewshot=args.num_fewshot,
        batch_size=args.batch_size,
+        max_batch_size=args.max_batch_size,
        device=args.device,
+        no_cache=args.no_cache,
        limit=args.limit,
+        # description_dict=description_dict,
        decontamination_ngrams_path=args.decontamination_ngrams_path,
        check_integrity=args.check_integrity,
+        write_out=args.write_out,
+        output_base_path=args.output_base_path,
    )
+
    if results is not None:
        dumped = json.dumps(results, indent=2)
        print(dumped)

        if args.output_path:
+            os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
            with open(args.output_path, "w") as f:
                f.write(dumped)

+        batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))
        print(
-            f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
-            f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
+            f"{args.model} ({args.model_args}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
+            f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
        )
        print(evaluator.make_table(results))


--- a/results/bloom/bloom-1b1/README.md
+++ b/results/bloom/bloom-1b1/README.md
+# bloom-1b1
+
+## bloom-1b1_common_sense_reasoning_0-shot.json
+|    Task     |Version| Metric |Value|   |Stderr|
+|-------------|------:|--------|----:|---|-----:|
+|arc_challenge|      0|acc     |23.63|±  |  1.24|
+|             |       |acc_norm|25.68|±  |  1.28|
+|arc_easy     |      0|acc     |51.47|±  |  1.03|
+|             |       |acc_norm|45.45|±  |  1.02|
+|boolq        |      1|acc     |59.08|±  |  0.86|
+|copa         |      0|acc     |68.00|±  |  4.69|
+|hellaswag    |      0|acc     |34.63|±  |  0.47|
+|             |       |acc_norm|41.77|±  |  0.49|
+|mc_taco      |      0|em      |14.49|   |      |
+|             |       |f1      |32.43|   |      |
+|openbookqa   |      0|acc     |19.60|±  |  1.78|
+|             |       |acc_norm|29.40|±  |  2.04|
+|piqa         |      0|acc     |67.14|±  |  1.10|
+|             |       |acc_norm|67.14|±  |  1.10|
+|prost        |      0|acc     |23.41|±  |  0.31|
+|             |       |acc_norm|30.50|±  |  0.34|
+|swag         |      0|acc     |43.43|±  |  0.35|
+|             |       |acc_norm|58.28|±  |  0.35|
+|winogrande   |      0|acc     |54.93|±  |  1.40|
+|wsc273       |      0|acc     |68.50|±  |  2.82|
+
+## bloom-1b1_gsm8k_8-shot.json
+|Task |Version|Metric|Value|   |Stderr|
+|-----|------:|------|----:|---|-----:|
+|gsm8k|      0|acc   | 0.83|±  |  0.25|
+
+## bloom-1b1_mathematical_reasoning_few_shot_5-shot.json
+|          Task           |Version| Metric |Value|   |Stderr|
+|-------------------------|------:|--------|----:|---|-----:|
+|drop                     |      1|em      | 1.38|±  |  0.12|
+|                         |       |f1      | 4.01|±  |  0.15|
+|gsm8k                    |      0|acc     | 0.00|±  |  0.00|
+|math_algebra             |      1|acc     | 0.00|±  |  0.00|
+|math_counting_and_prob   |      1|acc     | 0.21|±  |  0.21|
+|math_geometry            |      1|acc     | 0.21|±  |  0.21|
+|math_intermediate_algebra|      1|acc     | 0.00|±  |  0.00|
+|math_num_theory          |      1|acc     | 0.19|±  |  0.19|
+|math_prealgebra          |      1|acc     | 0.11|±  |  0.11|
+|math_precalc             |      1|acc     | 0.00|±  |  0.00|
+|mathqa                   |      0|acc     |23.55|±  |  0.78|
+|                         |       |acc_norm|23.62|±  |  0.78|
+
+## bloom-1b1_pawsx_0-shot.json
+|  Task  |Version|Metric|Value|   |Stderr|
+|--------|------:|------|----:|---|-----:|
+|pawsx_de|      0|acc   |46.95|±  |  1.12|
+|pawsx_en|      0|acc   |52.45|±  |  1.12|
+|pawsx_es|      0|acc   |51.50|±  |  1.12|
+|pawsx_fr|      0|acc   |46.15|±  |  1.11|
+|pawsx_ja|      0|acc   |48.40|±  |  1.12|
+|pawsx_ko|      0|acc   |49.90|±  |  1.12|
+|pawsx_zh|      0|acc   |48.95|±  |  1.12|
+
+## bloom-1b1_question_answering_0-shot.json
+|    Task     |Version|   Metric   |Value|   |Stderr|
+|-------------|------:|------------|----:|---|-----:|
+|headqa_en    |      0|acc         |26.44|±  |  0.84|
+|             |       |acc_norm    |30.49|±  |  0.88|
+|headqa_es    |      0|acc         |24.43|±  |  0.82|
+|             |       |acc_norm    |28.30|±  |  0.86|
+|logiqa       |      0|acc         |18.89|±  |  1.54|
+|             |       |acc_norm    |25.65|±  |  1.71|
+|squad2       |      1|exact       | 4.17|   |      |
+|             |       |f1          | 6.60|   |      |
+|             |       |HasAns_exact| 2.19|   |      |
+|             |       |HasAns_f1   | 7.05|   |      |
+|             |       |NoAns_exact | 6.14|   |      |
+|             |       |NoAns_f1    | 6.14|   |      |
+|             |       |best_exact  |50.07|   |      |
+|             |       |best_f1     |50.07|   |      |
+|triviaqa     |      1|acc         | 2.68|±  |  0.15|
+|truthfulqa_mc|      1|mc1         |25.34|±  |  1.52|
+|             |       |mc2         |41.80|±  |  1.46|
+|webqs        |      0|acc         | 1.38|±  |  0.26|
+
+## bloom-1b1_reading_comprehension_0-shot.json
+|Task|Version|Metric|Value|   |Stderr|
+|----|------:|------|----:|---|-----:|
+|coqa|      1|f1    |45.57|±  |  1.88|
+|    |       |em    |32.98|±  |  1.95|
+|drop|      1|em    | 3.31|±  |  0.18|
+|    |       |f1    | 8.63|±  |  0.22|
+|race|      1|acc   |32.63|±  |  1.45|
+
+## bloom-1b1_xcopa_0-shot.json
+|  Task  |Version|Metric|Value|   |Stderr|
+|--------|------:|------|----:|---|-----:|
+|xcopa_et|      0|acc   | 50.6|±  |  2.24|
+|xcopa_ht|      0|acc   | 53.0|±  |  2.23|
+|xcopa_id|      0|acc   | 64.8|±  |  2.14|
+|xcopa_it|      0|acc   | 50.8|±  |  2.24|
+|xcopa_qu|      0|acc   | 51.2|±  |  2.24|
+|xcopa_sw|      0|acc   | 54.4|±  |  2.23|
+|xcopa_ta|      0|acc   | 57.0|±  |  2.22|
+|xcopa_th|      0|acc   | 53.2|±  |  2.23|
+|xcopa_tr|      0|acc   | 53.0|±  |  2.23|
+|xcopa_vi|      0|acc   | 62.4|±  |  2.17|
+|xcopa_zh|      0|acc   | 59.4|±  |  2.20|
+
+## bloom-1b1_xnli_0-shot.json
+| Task  |Version|Metric|Value|   |Stderr|
+|-------|------:|------|----:|---|-----:|
+|xnli_ar|      0|acc   |33.93|±  |  0.67|
+|xnli_bg|      0|acc   |34.13|±  |  0.67|
+|xnli_de|      0|acc   |39.64|±  |  0.69|
+|xnli_el|      0|acc   |34.03|±  |  0.67|
+|xnli_en|      0|acc   |51.48|±  |  0.71|
+|xnli_es|      0|acc   |47.98|±  |  0.71|
+|xnli_fr|      0|acc   |47.15|±  |  0.71|
+|xnli_hi|      0|acc   |42.32|±  |  0.70|
+|xnli_ru|      0|acc   |40.46|±  |  0.69|
+|xnli_sw|      0|acc   |35.29|±  |  0.68|
+|xnli_th|      0|acc   |33.75|±  |  0.67|
+|xnli_tr|      0|acc   |34.79|±  |  0.67|
+|xnli_ur|      0|acc   |37.33|±  |  0.68|
+|xnli_vi|      0|acc   |44.45|±  |  0.70|
+|xnli_zh|      0|acc   |36.23|±  |  0.68|
+
+## bloom-1b1_xstory_cloze_0-shot.json
+|     Task      |Version|Metric|Value|   |Stderr|
+|---------------|------:|------|----:|---|-----:|
+|xstory_cloze_ar|      0|acc   |52.88|±  |  1.28|
+|xstory_cloze_en|      0|acc   |62.54|±  |  1.25|
+|xstory_cloze_es|      0|acc   |58.31|±  |  1.27|
+|xstory_cloze_eu|      0|acc   |54.33|±  |  1.28|
+|xstory_cloze_hi|      0|acc   |55.53|±  |  1.28|
+|xstory_cloze_id|      0|acc   |57.91|±  |  1.27|
+|xstory_cloze_my|      0|acc   |46.19|±  |  1.28|
+|xstory_cloze_ru|      0|acc   |48.25|±  |  1.29|
+|xstory_cloze_sw|      0|acc   |50.56|±  |  1.29|
+|xstory_cloze_te|      0|acc   |56.39|±  |  1.28|
+|xstory_cloze_zh|      0|acc   |58.04|±  |  1.27|
+
+## bloom-1b1_xwinograd_0-shot.json
+|    Task    |Version|Metric|Value|   |Stderr|
+|------------|------:|------|----:|---|-----:|
+|xwinograd_en|      0|acc   |69.98|±  |  0.95|
+|xwinograd_fr|      0|acc   |66.27|±  |  5.22|
+|xwinograd_jp|      0|acc   |52.87|±  |  1.61|
+|xwinograd_pt|      0|acc   |63.12|±  |  2.98|
+|xwinograd_ru|      0|acc   |54.29|±  |  2.81|
+|xwinograd_zh|      0|acc   |69.25|±  |  2.06|
--- a/results/bloom/bloom-1b1/bloom-1b1_common_sense_reasoning_0-shot.json
+++ b/results/bloom/bloom-1b1/bloom-1b1_common_sense_reasoning_0-shot.json
+{
+  "results": {
+    "boolq": {
+      "acc": 0.5908256880733945,
+      "acc_stderr": 0.008599563442397352
+    },
+    "arc_easy": {
+      "acc": 0.5147306397306397,
+      "acc_stderr": 0.010255329977562096,
+      "acc_norm": 0.45454545454545453,
+      "acc_norm_stderr": 0.010217299762709435
+    },
+    "openbookqa": {
+      "acc": 0.196,
+      "acc_stderr": 0.017770751227744862,
+      "acc_norm": 0.294,
+      "acc_norm_stderr": 0.020395095484936614
+    },
+    "hellaswag": {
+      "acc": 0.3463453495319657,
+      "acc_stderr": 0.004748324319714264,
+      "acc_norm": 0.4177454690300737,
+      "acc_norm_stderr": 0.004921798492608764
+    },
+    "swag": {
+      "acc": 0.43431970408877335,
+      "acc_stderr": 0.0035044592489844794,
+      "acc_norm": 0.5828251524542637,
+      "acc_norm_stderr": 0.0034862531772295617
+    },
+    "arc_challenge": {
+      "acc": 0.2363481228668942,
+      "acc_stderr": 0.012414960524301834,
+      "acc_norm": 0.2568259385665529,
+      "acc_norm_stderr": 0.0127669237941168
+    },
+    "mc_taco": {
+      "em": 0.1448948948948949,
+      "f1": 0.32425976796237205
+    },
+    "wsc273": {
+      "acc": 0.684981684981685,
+      "acc_stderr": 0.028165854394193602
+    },
+    "winogrande": {
+      "acc": 0.5493291239147593,
+      "acc_stderr": 0.013983928869040239
+    },
+    "prost": {
+      "acc": 0.23409479077711356,
+      "acc_stderr": 0.003093545711826552,
+      "acc_norm": 0.3049743808710504,
+      "acc_norm_stderr": 0.003363606918420179
+    },
+    "copa": {
+      "acc": 0.68,
+      "acc_stderr": 0.04688261722621504
+    },
+    "piqa": {
+      "acc": 0.6713819368879217,
+      "acc_stderr": 0.010959127105167048,
+      "acc_norm": 0.6713819368879217,
+      "acc_norm_stderr": 0.010959127105167044
+    }
+  },
+  "versions": {
+    "boolq": 1,
+    "arc_easy": 0,
+    "openbookqa": 0,
+    "hellaswag": 0,
+    "swag": 0,
+    "arc_challenge": 0,
+    "mc_taco": 0,
+    "wsc273": 0,
+    "winogrande": 0,
+    "prost": 0,
+    "copa": 0,
+    "piqa": 0
+  },
+  "config": {
+    "model": "hf-causal-experimental",
+    "model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
+    "num_fewshot": 0,
+    "batch_size": "auto",
+    "device": "cuda:0",
+    "no_cache": true,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
--- a/results/bloom/bloom-1b1/bloom-1b1_gsm8k_8-shot.json
+++ b/results/bloom/bloom-1b1/bloom-1b1_gsm8k_8-shot.json
+{
+  "results": {
+    "gsm8k": {
+      "acc": 0.008339651250947688,
+      "acc_stderr": 0.002504942226860508
+    }
+  },
+  "versions": {
+    "gsm8k": 0
+  },
+  "config": {
+    "model": "hf-causal-experimental",
+    "model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
+    "num_fewshot": 8,
+    "batch_size": "auto",
+    "device": "cuda",
+    "no_cache": true,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
--- a/results/bloom/bloom-1b1/bloom-1b1_mathematical_reasoning_few_shot_5-shot.json
+++ b/results/bloom/bloom-1b1/bloom-1b1_mathematical_reasoning_few_shot_5-shot.json
+{
+  "results": {
+    "mathqa": {
+      "acc": 0.2355108877721943,
+      "acc_stderr": 0.007767687364650971,
+      "acc_norm": 0.23618090452261306,
+      "acc_norm_stderr": 0.0077753193787470495
+    },
+    "gsm8k": {
+      "acc": 0.0,
+      "acc_stderr": 0.0
+    },
+    "drop": {
+      "em": 0.013842281879194632,
+      "em_stderr": 0.001196510970060749,
+      "f1": 0.040085989932885986,
+      "f1_stderr": 0.0014841664758736023
+    },
+    "math_geometry": {
+      "acc": 0.0020876826722338203,
+      "acc_stderr": 0.0020876826722338315
+    },
+    "math_counting_and_prob": {
+      "acc": 0.002109704641350211,
+      "acc_stderr": 0.002109704641350211
+    },
+    "math_prealgebra": {
+      "acc": 0.001148105625717566,
+      "acc_stderr": 0.0011481056257175708
+    },
+    "math_num_theory": {
+      "acc": 0.001851851851851852,
+      "acc_stderr": 0.0018518518518518448
+    },
+    "math_precalc": {
+      "acc": 0.0,
+      "acc_stderr": 0.0
+    },
+    "math_algebra": {
+      "acc": 0.0,
+      "acc_stderr": 0.0
+    },
+    "math_intermediate_algebra": {
+      "acc": 0.0,
+      "acc_stderr": 0.0
+    }
+  },
+  "versions": {
+    "mathqa": 0,
+    "gsm8k": 0,
+    "drop": 1,
+    "math_geometry": 1,
+    "math_counting_and_prob": 1,
+    "math_prealgebra": 1,
+    "math_num_theory": 1,
+    "math_precalc": 1,
+    "math_algebra": 1,
+    "math_intermediate_algebra": 1
+  },
+  "config": {
+    "model": "hf-causal-experimental",
+    "model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
+    "num_fewshot": 5,
+    "batch_size": "auto",
+    "device": "cuda:0",
+    "no_cache": true,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}
--- a/results/bloom/bloom-1b1/bloom-1b1_pawsx_0-shot.json
+++ b/results/bloom/bloom-1b1/bloom-1b1_pawsx_0-shot.json
+{
+  "results": {
+    "pawsx_es": {
+      "acc": 0.515,
+      "acc_stderr": 0.011178102477052804
+    },
+    "pawsx_zh": {
+      "acc": 0.4895,
+      "acc_stderr": 0.011180669867648657
+    },
+    "pawsx_fr": {
+      "acc": 0.4615,
+      "acc_stderr": 0.011149934327957058
+    },
+    "pawsx_ko": {
+      "acc": 0.499,
+      "acc_stderr": 0.01118311365477017
+    },
+    "pawsx_de": {
+      "acc": 0.4695,
+      "acc_stderr": 0.011162310405413175
+    },
+    "pawsx_ja": {
+      "acc": 0.484,
+      "acc_stderr": 0.011177408788874897
+    },
+    "pawsx_en": {
+      "acc": 0.5245,
+      "acc_stderr": 0.011169702598013186
+    }
+  },
+  "versions": {
+    "pawsx_es": 0,
+    "pawsx_zh": 0,
+    "pawsx_fr": 0,
+    "pawsx_ko": 0,
+    "pawsx_de": 0,
+    "pawsx_ja": 0,
+    "pawsx_en": 0
+  },
+  "config": {
+    "model": "hf-causal-experimental",
+    "model_args": "pretrained=bigscience/bloom-1b1",
+    "num_fewshot": 0,
+    "batch_size": "auto",
+    "device": "cuda",
+    "no_cache": true,
+    "limit": null,
+    "bootstrap_iters": 100000,
+    "description_dict": {}
+  }
+}