Commit 16c4afc6 authored by lintangsutawika

Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into toxicity

parents 7b376ae1 176d5a26
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "confirm"
use_prompt: "promptsource:confirm"
# group:
# - super-glue-lm-eval-v1
group:
- super-glue-lm-eval-v1
task: record
dataset_path: super_glue
dataset_name: record
@@ -9,6 +9,10 @@ validation_split: validation
doc_to_text: !function util.doc_to_text
doc_to_target: "{{answers}}"
doc_to_choice: "{{entities}}"
process_results: !function util.process_results
metric_list:
- metric: f1
aggregation: mean
- metric: em
higher_is_better: True
aggregation: mean
group:
- super-glue-promptsource
task: "Add sentence after (continuation choices)"
dataset_path: super_glue
dataset_name: record
training_split: train
validation_split: validation
use_prompt: "promptsource:Add sentence after (continuation choices)"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
@@ -5,6 +5,7 @@ dataset_path: super_glue
dataset_name: record
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: "record query: {{query}} entities: {{entities}} passage: {{passage}}"
doc_to_target: "{{answers}}"
metric_list:
......
import numpy as np
import transformers.data.metrics.squad_metrics as squad_metrics
from lm_eval.api.metrics import metric_max_over_ground_truths
def doc_to_text(doc):
initial_text, *highlights = doc["passage"].strip().split("\n@highlight\n")
text = initial_text + "\n\n"
@@ -13,3 +19,25 @@ def format_answer(query, entity):
def doc_to_target(doc):
# We only output the first correct entity in a doc
return format_answer(query=doc["query"], entity=doc["answers"][0])
def process_results(doc, results):
# ReCoRD's evaluation is actually deceptively simple:
# - Pick the maximum likelihood prediction entity
# - Evaluate the accuracy and token F1 PER EXAMPLE
# - Average over all examples
max_idx = np.argmax(np.array([result[0] for result in results]))
prediction = doc["entities"][max_idx]
gold_label_set = doc["answers"]
f1 = metric_max_over_ground_truths(
squad_metrics.compute_f1, prediction, gold_label_set
)
em = metric_max_over_ground_truths(
squad_metrics.compute_exact, prediction, gold_label_set
)
return {
"f1": f1,
"em": em,
}
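`process_results` receives one document plus the per-entity loglikelihood results and returns the per-example metrics declared in the task's `metric_list`. A toy invocation, with invented loglikelihood values:

```python
# Toy example: three candidate entities, gold answer "Paris".
doc = {
    "entities": ["London", "Paris", "Berlin"],
    "answers": ["Paris"],
}
# One (loglikelihood, is_greedy) pair per entity; values are made up.
results = [(-4.2, False), (-1.3, True), (-5.0, False)]

# argmax picks index 1 ("Paris"), so both metrics come out as 1.
print(process_results(doc, results))  # -> {"f1": 1.0, "em": 1}
```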
group:
- super-glue-promptsource
task: "rte"
- super-glue-lm-eval-v1
task: rte
dataset_path: super_glue
dataset_name: rte
output_type: multiple_choice
training_split: train
validation_split: validation
use_prompt: "promptsource:GPT-3 style"
generation_kwargs:
until:
- "\n"
- "\n\n"
doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True or False?\nAnswer:"
doc_to_target: label
doc_to_choice: ['True', 'False']
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
- metric: acc
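Fields like `doc_to_text` and `doc_to_choice` are Jinja2 templates rendered against each document. A quick sketch of what the RTE prompt above yields; the doc contents are illustrative:

```python
from jinja2 import Template

doc = {  # example document in SuperGLUE RTE's schema
    "premise": "No Weapons of Mass Destruction Found in Iraq Yet.",
    "hypothesis": "Weapons of Mass Destruction Found in Iraq.",
    "label": 1,  # 0 = entailment ("True"), 1 = not entailment ("False")
}
template = Template(
    "{{premise}}\nQuestion: {{hypothesis}} True or False?\nAnswer:"
)
print(template.render(**doc))
# No Weapons of Mass Destruction Found in Iraq Yet.
# Question: Weapons of Mass Destruction Found in Iraq. True or False?
# Answer:
```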
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "MNLI crowdsource"
use_prompt: "promptsource:MNLI crowdsource"
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "based on the previous passage"
use_prompt: "promptsource:based on the previous passage"
group:
- super-glue-promptsource
task: "GPT-3 Style"
dataset_path: super_glue
dataset_name: wsc.fixed
training_split: train
validation_split: validation
use_prompt: "promptsource:GPT-3 Style"
metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "I think they mean"
use_prompt: "promptsource:I think they mean"
include: promptsource-00.yaml
group:
- super-glue-promptsource
task: "Who or what is/are"
use_prompt: "promptsource:Who or what is/are"
@@ -18,14 +18,14 @@ def t5_prompt_doc_to_text(x):
return text
def default_doc_to_text(doc):
raw_passage = doc["text"]
def default_doc_to_text(x):
raw_passage = x["text"]
# NOTE: HuggingFace span indices are word-based not character-based.
pre = " ".join(raw_passage.split()[: doc["span2_index"]])
post = raw_passage[len(pre) + len(doc["span2_text"]) + 1 :]
passage = general_detokenize(pre + " *{}*".format(doc["span2_text"]) + post)
noun = doc["span1_text"]
pronoun = doc["span2_text"]
pre = " ".join(raw_passage.split()[: x["span2_index"]])
post = raw_passage[len(pre) + len(x["span2_text"]) + 1 :]
passage = general_detokenize(pre + " *{}*".format(x["span2_text"]) + post)
noun = x["span1_text"]
pronoun = x["span2_text"]
text = (
f"Passage: {passage}\n"
+ f'Question: In the passage above, does the pronoun "*{pronoun}*" refer to "*{noun}*"?\n'
......
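To make the word-based span arithmetic above concrete, here is a self-contained sketch of the highlighting step on an invented WSC-style doc (the `general_detokenize` call is omitted):

```python
doc = {  # invented example in the wsc.fixed schema
    "text": "Mark told Pete many lies about himself, which Pete "
            "included in his book. He should have been more skeptical.",
    "span1_index": 0, "span1_text": "Mark",
    "span2_index": 13, "span2_text": "He",
}
raw_passage = doc["text"]
# Span indices are word-based, so re-join the first span2_index words.
pre = " ".join(raw_passage.split()[: doc["span2_index"]])
post = raw_passage[len(pre) + len(doc["span2_text"]) + 1 :]
print(pre + " *{}*".format(doc["span2_text"]) + post)
# ... included in his book. *He* should have been more skeptical.
```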
@@ -5,6 +5,7 @@ dataset_path: super_glue
dataset_name: wsc
training_split: train
validation_split: validation
output_type: greedy_until
doc_to_text: !function "preprocess_wsc.t5_prompt_doc_to_text"
doc_to_target: label
doc_to_choice: ['False', 'True']
......
@@ -8,7 +8,6 @@ training_split: null
validation_split: validation
test_split: null
num_fewshot: 0
template_aliases: "{% set gold = 0 %}{% set answer_choices = mc1_targets['choices'] %}" # The first answer is always the correct one
doc_to_text: "\
{% set prompt_qa = '\
Q: What is human life expectancy in the United States?\n\
@@ -25,8 +24,8 @@ doc_to_text: "\
A: The 1992 Olympics were held in Barcelona, Spain.\
'%}\
{{prompt_qa + '\n\nQ: ' + question + '\nA:'}}"
doc_to_target: "{{answer_choices[gold]}}"
gold_alias: "{{gold}}"
doc_to_target: 0
doc_to_choice: "{{mc1_targets.choices}}"
should_decontaminate: True
doc_to_decontamination_query: question
metric_list:
......
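Since the first entry of `mc1_targets.choices` is always the correct answer (the reason for the old `{% set gold = 0 %}` alias), the refactor can express the target as the literal index `0`. A toy illustration with an invented doc:

```python
# Invented doc in TruthfulQA's mc1 schema; by construction the
# first entry of mc1_targets["choices"] is the correct answer.
doc = {"mc1_targets": {"choices": [
    "The seeds pass through your digestive system.",  # index 0: correct
    "You grow watermelons in your stomach.",
]}}
choices = doc["mc1_targets"]["choices"]  # what doc_to_choice renders
print(choices[0])                        # doc_to_target: 0 selects it
```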
@@ -8,7 +8,6 @@ output_type: loglikelihood_rolling
training_split: train
validation_split: validation
test_split: test
template_aliases: ""
doc_to_text: ""
doc_to_target: !function preprocess_wikitext.wikitext_detokenizer
should_decontaminate: true
......
@@ -7,6 +7,8 @@ validation_split: validation
doc_to_text: !function preprocess_winogrande.doc_to_text
doc_to_target: !function preprocess_winogrande.doc_to_target
doc_to_choice: !function preprocess_winogrande.doc_to_choice
should_decontaminate: true
doc_to_decontamination_query: sentence
metric_list:
- metric: acc
aggregation: mean
......
@@ -108,6 +108,10 @@ class MultiChoice:
# Returns a list containing all values of the source_list that
# match at least one of the patterns
def pattern_match(patterns, source_list):
if type(patterns) == str:
patterns = [patterns]
task_names = set()
for pattern in patterns:
for matching in fnmatch.filter(source_list, pattern):
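The new guard lets callers pass a single glob string rather than a list. A standalone sketch of the matching behaviour; the lines below the visible hunk are assumed to collect and return the matches, and `isinstance` is used here in place of the `type(...) == str` check:

```python
import fnmatch

def pattern_match(patterns, source_list):
    if isinstance(patterns, str):  # accept a bare glob string too
        patterns = [patterns]
    task_names = set()
    for pattern in patterns:
        for matching in fnmatch.filter(source_list, pattern):
            task_names.add(matching)
    return sorted(task_names)

tasks = ["rte", "record", "xnli_en", "xnli_de"]
print(pattern_match("xnli_*", tasks))  # ['xnli_de', 'xnli_en']
```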
@@ -259,16 +263,20 @@ class Grouper:
return res
def make_table(result_dict):
def make_table(result_dict, column="results"):
"""Generate table of results."""
from pytablewriter import MarkdownTableWriter, LatexTableWriter
if column == "results":
column_name = "Task"
elif column == "aggregate":
column_name = "Benchmark"
md_writer = MarkdownTableWriter()
latex_writer = LatexTableWriter()
md_writer.headers = [
"Task",
column_name,
"Version",
"Fewshot",
"Filter",
"Metric",
"Value",
@@ -276,7 +284,7 @@ def make_table(result_dict):
"Stderr",
]
latex_writer.headers = [
"Task",
column_name,
"Version",
"Fewshot",
"Filter",
@@ -288,7 +296,7 @@
values = []
for k, dic in result_dict["results"].items():
for k, dic in result_dict[column].items():
version = result_dict["versions"][k]
n = str(result_dict["configs"][k]["num_fewshot"])
for (mf), v in dic.items():
......
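With the new `column` parameter the same writers can render either the per-task results or, when task groups were evaluated, the benchmark-level aggregates (assuming `results` is the dict returned by the evaluator):

```python
# Usage sketch: per-task table (default) vs. benchmark-level table.
print(make_table(results))               # first column reads "Task"
print(make_table(results, "aggregate"))  # first column reads "Benchmark"
```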
@@ -10,6 +10,7 @@ from pathlib import Path
from lm_eval import evaluator, utils
from lm_eval.api.registry import ALL_TASKS
from lm_eval.logger import eval_logger
from lm_eval.tasks import include_task_folder
os.environ["TOKENIZERS_PARALLELISM"] = "false"
@@ -23,7 +24,7 @@ def parse_args():
help="String arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
)
parser.add_argument(
"--tasks", default=None, choices=utils.MultiChoice(sorted(ALL_TASKS))
"--tasks", default=None # , choices=utils.MultiChoice(sorted(ALL_TASKS))
)
parser.add_argument(
"--num_fewshot",
@@ -82,6 +83,18 @@ def parse_args():
default=False,
help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis",
)
parser.add_argument(
"--show_config",
action="store_true",
default=False,
help="If True, shows the the full config of all tasks at the end of the evaluation.",
)
parser.add_argument(
"--include_path",
type=str,
default=None,
help="Additional path to include if there are external tasks to include.",
)
return parser.parse_args()
@@ -94,6 +107,10 @@ def main():
"REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
)
if args.include_path is not None:
eval_logger.info(f"Including path: {args.include_path}")
include_task_folder(args.include_path)
if args.tasks is None:
task_names = ALL_TASKS
else:
@@ -120,6 +137,7 @@ def main():
eval_logger.warning(
f"File already exists at {path}. Results will be overwritten."
)
output_path_file = path.joinpath("results.json")
assert not path.is_file(), "File already exists"
# if path json then get parent dir
elif path.suffix in (".json", ".jsonl"):
@@ -154,6 +172,7 @@ def main():
if args.log_samples:
samples = results.pop("samples")
dumped = json.dumps(results, indent=2, default=lambda o: str(o))
if args.show_config:
print(dumped)
batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))
@@ -164,7 +183,7 @@ def main():
if args.log_samples:
for task_name, config in results["configs"].items():
output_name = "{}_{}".format(
re.sub("/", "__", args.model_args), task_name
re.sub("/|=", "__", args.model_args), task_name
)
filename = path.joinpath(f"{output_name}.jsonl")
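Adding `=` to the substitution keeps `key=value` model args from leaking into file names; for example:

```python
import re

model_args = "pretrained=EleutherAI/pythia-160m,dtype=float32"
task_name = "rte"
# Both "/" and "=" are mapped to "__" before building the filename.
output_name = "{}_{}".format(re.sub("/|=", "__", model_args), task_name)
print(output_name)
# pretrained__EleutherAI__pythia-160m,dtype__float32_rte
```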
@@ -176,6 +195,8 @@ def main():
f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
)
print(evaluator.make_table(results))
if "aggregate" in results:
print(evaluator.make_table(results, "aggregate"))
if __name__ == "__main__":
......
# bloom-1b1
## bloom-1b1_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |23.63|± | 1.24|
| | |acc_norm|25.68|± | 1.28|
|arc_easy | 0|acc |51.47|± | 1.03|
| | |acc_norm|45.45|± | 1.02|
|boolq | 1|acc |59.08|± | 0.86|
|copa | 0|acc |68.00|± | 4.69|
|hellaswag | 0|acc |34.63|± | 0.47|
| | |acc_norm|41.77|± | 0.49|
|mc_taco | 0|em |14.49| | |
| | |f1 |32.43| | |
|openbookqa | 0|acc |19.60|± | 1.78|
| | |acc_norm|29.40|± | 2.04|
|piqa | 0|acc |67.14|± | 1.10|
| | |acc_norm|67.14|± | 1.10|
|prost | 0|acc |23.41|± | 0.31|
| | |acc_norm|30.50|± | 0.34|
|swag | 0|acc |43.43|± | 0.35|
| | |acc_norm|58.28|± | 0.35|
|winogrande | 0|acc |54.93|± | 1.40|
|wsc273 | 0|acc |68.50|± | 2.82|
## bloom-1b1_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 0.83|± | 0.25|
## bloom-1b1_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 1.38|± | 0.12|
| | |f1 | 4.01|± | 0.15|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.21|± | 0.21|
|math_geometry | 1|acc | 0.21|± | 0.21|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.19|± | 0.19|
|math_prealgebra | 1|acc | 0.11|± | 0.11|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |23.55|± | 0.78|
| | |acc_norm|23.62|± | 0.78|
## bloom-1b1_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |46.95|± | 1.12|
|pawsx_en| 0|acc |52.45|± | 1.12|
|pawsx_es| 0|acc |51.50|± | 1.12|
|pawsx_fr| 0|acc |46.15|± | 1.11|
|pawsx_ja| 0|acc |48.40|± | 1.12|
|pawsx_ko| 0|acc |49.90|± | 1.12|
|pawsx_zh| 0|acc |48.95|± | 1.12|
## bloom-1b1_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |26.44|± | 0.84|
| | |acc_norm |30.49|± | 0.88|
|headqa_es | 0|acc |24.43|± | 0.82|
| | |acc_norm |28.30|± | 0.86|
|logiqa | 0|acc |18.89|± | 1.54|
| | |acc_norm |25.65|± | 1.71|
|squad2 | 1|exact | 4.17| | |
| | |f1 | 6.60| | |
| | |HasAns_exact| 2.19| | |
| | |HasAns_f1 | 7.05| | |
| | |NoAns_exact | 6.14| | |
| | |NoAns_f1 | 6.14| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 2.68|± | 0.15|
|truthfulqa_mc| 1|mc1 |25.34|± | 1.52|
| | |mc2 |41.80|± | 1.46|
|webqs | 0|acc | 1.38|± | 0.26|
## bloom-1b1_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |45.57|± | 1.88|
| | |em |32.98|± | 1.95|
|drop| 1|em | 3.31|± | 0.18|
| | |f1 | 8.63|± | 0.22|
|race| 1|acc |32.63|± | 1.45|
## bloom-1b1_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 50.6|± | 2.24|
|xcopa_ht| 0|acc | 53.0|± | 2.23|
|xcopa_id| 0|acc | 64.8|± | 2.14|
|xcopa_it| 0|acc | 50.8|± | 2.24|
|xcopa_qu| 0|acc | 51.2|± | 2.24|
|xcopa_sw| 0|acc | 54.4|± | 2.23|
|xcopa_ta| 0|acc | 57.0|± | 2.22|
|xcopa_th| 0|acc | 53.2|± | 2.23|
|xcopa_tr| 0|acc | 53.0|± | 2.23|
|xcopa_vi| 0|acc | 62.4|± | 2.17|
|xcopa_zh| 0|acc | 59.4|± | 2.20|
## bloom-1b1_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.93|± | 0.67|
|xnli_bg| 0|acc |34.13|± | 0.67|
|xnli_de| 0|acc |39.64|± | 0.69|
|xnli_el| 0|acc |34.03|± | 0.67|
|xnli_en| 0|acc |51.48|± | 0.71|
|xnli_es| 0|acc |47.98|± | 0.71|
|xnli_fr| 0|acc |47.15|± | 0.71|
|xnli_hi| 0|acc |42.32|± | 0.70|
|xnli_ru| 0|acc |40.46|± | 0.69|
|xnli_sw| 0|acc |35.29|± | 0.68|
|xnli_th| 0|acc |33.75|± | 0.67|
|xnli_tr| 0|acc |34.79|± | 0.67|
|xnli_ur| 0|acc |37.33|± | 0.68|
|xnli_vi| 0|acc |44.45|± | 0.70|
|xnli_zh| 0|acc |36.23|± | 0.68|
## bloom-1b1_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |52.88|± | 1.28|
|xstory_cloze_en| 0|acc |62.54|± | 1.25|
|xstory_cloze_es| 0|acc |58.31|± | 1.27|
|xstory_cloze_eu| 0|acc |54.33|± | 1.28|
|xstory_cloze_hi| 0|acc |55.53|± | 1.28|
|xstory_cloze_id| 0|acc |57.91|± | 1.27|
|xstory_cloze_my| 0|acc |46.19|± | 1.28|
|xstory_cloze_ru| 0|acc |48.25|± | 1.29|
|xstory_cloze_sw| 0|acc |50.56|± | 1.29|
|xstory_cloze_te| 0|acc |56.39|± | 1.28|
|xstory_cloze_zh| 0|acc |58.04|± | 1.27|
## bloom-1b1_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |69.98|± | 0.95|
|xwinograd_fr| 0|acc |66.27|± | 5.22|
|xwinograd_jp| 0|acc |52.87|± | 1.61|
|xwinograd_pt| 0|acc |63.12|± | 2.98|
|xwinograd_ru| 0|acc |54.29|± | 2.81|
|xwinograd_zh| 0|acc |69.25|± | 2.06|
{
"results": {
"boolq": {
"acc": 0.5908256880733945,
"acc_stderr": 0.008599563442397352
},
"arc_easy": {
"acc": 0.5147306397306397,
"acc_stderr": 0.010255329977562096,
"acc_norm": 0.45454545454545453,
"acc_norm_stderr": 0.010217299762709435
},
"openbookqa": {
"acc": 0.196,
"acc_stderr": 0.017770751227744862,
"acc_norm": 0.294,
"acc_norm_stderr": 0.020395095484936614
},
"hellaswag": {
"acc": 0.3463453495319657,
"acc_stderr": 0.004748324319714264,
"acc_norm": 0.4177454690300737,
"acc_norm_stderr": 0.004921798492608764
},
"swag": {
"acc": 0.43431970408877335,
"acc_stderr": 0.0035044592489844794,
"acc_norm": 0.5828251524542637,
"acc_norm_stderr": 0.0034862531772295617
},
"arc_challenge": {
"acc": 0.2363481228668942,
"acc_stderr": 0.012414960524301834,
"acc_norm": 0.2568259385665529,
"acc_norm_stderr": 0.0127669237941168
},
"mc_taco": {
"em": 0.1448948948948949,
"f1": 0.32425976796237205
},
"wsc273": {
"acc": 0.684981684981685,
"acc_stderr": 0.028165854394193602
},
"winogrande": {
"acc": 0.5493291239147593,
"acc_stderr": 0.013983928869040239
},
"prost": {
"acc": 0.23409479077711356,
"acc_stderr": 0.003093545711826552,
"acc_norm": 0.3049743808710504,
"acc_norm_stderr": 0.003363606918420179
},
"copa": {
"acc": 0.68,
"acc_stderr": 0.04688261722621504
},
"piqa": {
"acc": 0.6713819368879217,
"acc_stderr": 0.010959127105167048,
"acc_norm": 0.6713819368879217,
"acc_norm_stderr": 0.010959127105167044
}
},
"versions": {
"boolq": 1,
"arc_easy": 0,
"openbookqa": 0,
"hellaswag": 0,
"swag": 0,
"arc_challenge": 0,
"mc_taco": 0,
"wsc273": 0,
"winogrande": 0,
"prost": 0,
"copa": 0,
"piqa": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}