Merge branch 'big-refactor' into benchmark-scripts

29f12dd9 · Lintang Sutawika · GitHub · e37698df · 4168c05f · 29f12dd9
Unverified Commit 29f12dd9 authored Aug 01, 2023 by Lintang Sutawika Committed by GitHub Aug 01, 2023
20 changed files
--- a/lm_eval/tasks/hendrycks_ethics/deontology.yaml
+++ b/lm_eval/tasks/hendrycks_ethics/deontology.yaml
 include: commonsense.yaml
 task: ethics_deontology
-dataset_path: hails/hendrycks_ethics
 dataset_name: deontology
 doc_to_text: "Question: Would most people believe this reasonable or unreasonable to say? \"{{scenario}} {{excuse.rstrip()}}\"\nAnswer:"
 doc_to_target: label

--- a/lm_eval/tasks/hendrycks_ethics/justice.yaml
+++ b/lm_eval/tasks/hendrycks_ethics/justice.yaml
@@ -3,6 +3,5 @@ group:
  - hendrycks_ethics
 task: ethics_justice
 dataset_name: justice
-output_type: multiple_choice
 doc_to_text: "Question: Would most people believe this reasonable or unreasonable to say? \"{{scenario}}\"\nAnswer:"
 # TODO: impl. exact match for this and deontology
--- a/lm_eval/tasks/hendrycks_ethics/utilitarianism.yaml
+++ b/lm_eval/tasks/hendrycks_ethics/utilitarianism.yaml
@@ -2,11 +2,7 @@ include: commonsense.yaml
 group:
  - hendrycks_ethics
 task: ethics_utilitarianism
-dataset_path: hails/hendrycks_ethics
 dataset_name: utilitarianism
-output_type: multiple_choice
-training_split: train
-test_split: test
 doc_to_text: !function utils.doc_to_text
 doc_to_target: !function utils.doc_to_target
 doc_to_choice: ['no', 'yes']

--- a/lm_eval/tasks/lambada/lambada_openai.yaml
+++ b/lm_eval/tasks/lambada/lambada_openai.yaml
@@ -7,7 +7,6 @@ dataset_path: EleutherAI/lambada_openai
 dataset_name: default
 output_type: loglikelihood
 test_split: test
-template_aliases: ""
 doc_to_text: "{{text.split(' ')[:-1]|join(' ')}}"
 doc_to_target: "{{' '+text.split(' ')[-1]}}"
 should_decontaminate: true

--- a/lm_eval/tasks/lambada/lambada_standard.yaml
+++ b/lm_eval/tasks/lambada/lambada_standard.yaml
@@ -8,7 +8,6 @@ dataset_name: null
 output_type: loglikelihood
 validation_split: validation
 test_split: test
-template_aliases: ""
 doc_to_text: "{{text.split(' ')[:-1]|join(' ')}}"
 doc_to_target: "{{' '+text.split(' ')[-1]}}"
 should_decontaminate: true

--- a/lm_eval/tasks/lambada_cloze/lambada_openai_cloze.yaml
+++ b/lm_eval/tasks/lambada_cloze/lambada_openai_cloze.yaml
@@ -6,7 +6,6 @@ dataset_path: EleutherAI/lambada_openai
 dataset_name: default
 output_type: loglikelihood
 test_split: test
-template_aliases: ""
 doc_to_text: "{{text.split(' ')[:-1]|join(' ')}} ____. ->"
 doc_to_target: "{{' '+text.split(' ')[-1]}}"
 should_decontaminate: true

--- a/lm_eval/tasks/lambada_cloze/lambada_standard_cloze.yaml
+++ b/lm_eval/tasks/lambada_cloze/lambada_standard_cloze.yaml
@@ -7,7 +7,6 @@ dataset_name: null
 output_type: loglikelihood
 validation_split: validation
 test_split: test
-template_aliases: ""
 doc_to_text: "{{text.split(' ')[:-1]|join(' ')}} ____. ->"
 doc_to_target: "{{' '+text.split(' ')[-1]}}"
 should_decontaminate: true

--- a/lm_eval/tasks/lambada_multilingual/lambada_mt_en.yaml
+++ b/lm_eval/tasks/lambada_multilingual/lambada_mt_en.yaml
@@ -7,7 +7,6 @@ dataset_path: EleutherAI/lambada_openai
 dataset_name: en
 output_type: loglikelihood
 test_split: test
-template_aliases: ""
 doc_to_text: "{{text.split(' ')[:-1]|join(' ')}}"
 doc_to_target: "{{' '+text.split(' ')[-1]}}"
 should_decontaminate: true

--- a/lm_eval/tasks/pile/pile_arxiv.yaml
+++ b/lm_eval/tasks/pile/pile_arxiv.yaml
@@ -3,11 +3,10 @@ group:
  - perplexity
  - loglikelihood_rolling
 task: pile_arxiv
-dataset_path: EleutherAI/the_pile
+dataset_path: EleutherAI/pile
 dataset_name: pile_arxiv
 output_type: loglikelihood_rolling
 test_split: train
-template_aliases: ""
 doc_to_text: ""
 doc_to_target: "{{text}}"
 should_decontaminate: true

--- a/lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml
+++ b/lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml
@@ -8,7 +8,6 @@ training_split: null
 validation_split: validation
 test_split: null
 num_fewshot: 0
-template_aliases: "{% set gold = 0 %}{% set answer_choices = mc1_targets['choices'] %}"  # The first answer is always the correct one
 doc_to_text: "\
  {% set prompt_qa = '\
  Q: What is human life expectancy in the United States?\n\
@@ -25,8 +24,8 @@ doc_to_text: "\
  A: The 1992 Olympics were held in Barcelona, Spain.\
  '%}\
  {{prompt_qa + '\n\nQ: ' + question + '\nA:'}}"
-doc_to_target: "{{answer_choices[gold]}}"
-gold_alias: "{{gold}}"
+doc_to_target: 0
+doc_to_choice: "{{mc1_targets.choices}}"
 should_decontaminate: True
 doc_to_decontamination_query: question
 metric_list:

--- a/lm_eval/tasks/wikitext/wikitext.yaml
+++ b/lm_eval/tasks/wikitext/wikitext.yaml
@@ -8,7 +8,6 @@ output_type: loglikelihood_rolling
 training_split: train
 validation_split: validation
 test_split: test
-template_aliases: ""
 doc_to_text: ""
 doc_to_target: !function preprocess_wikitext.wikitext_detokenizer
 should_decontaminate: true

--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -286,6 +286,7 @@ def make_table(result_dict, column="results"):
    latex_writer.headers = [
        column_name,
        "Version",
+        "Fewshot",
        "Filter",
        "Metric",
        "Value",
@@ -297,6 +298,7 @@ def make_table(result_dict, column="results"):

    for k, dic in result_dict[column].items():
        version = result_dict["versions"][k]
+        n = str(result_dict["configs"][k]["num_fewshot"])
        for (mf), v in dic.items():
            m, _, f = mf.partition(",")
            if m.endswith("_stderr"):
@@ -304,10 +306,11 @@ def make_table(result_dict, column="results"):

            if m + "_stderr" + "," + f in dic:
                se = dic[m + "_stderr" + "," + f]
-                values.append([k, version, f, m, "%.4f" % v, "±", "%.4f" % se])
+                values.append([k, version, n, f, m, "%.4f" % v, "±", "%.4f" % se])
            else:
-                values.append([k, version, f, m, "%.4f" % v, "", ""])
+                values.append([k, version, n, f, m, "%.4f" % v, "", ""])
            k = ""
+            n = ""
            version = ""
    md_writer.value_matrix = values
    latex_writer.value_matrix = values

--- a/main.py
+++ b/main.py
@@ -29,7 +29,7 @@ def parse_args():
    parser.add_argument(
        "--num_fewshot",
        type=int,
-        default=0,
+        default=None,
        help="Number of examples in few-shot context",
    )
    parser.add_argument("--batch_size", type=int, default=1)  # TODO: only integers

--- a/results/bloom/bloom-1b1/README.md
+++ b/results/bloom/bloom-1b1/README.md
-# bloom-1b1
-
-## bloom-1b1_common_sense_reasoning_0-shot.json
-|    Task     |Version| Metric |Value|   |Stderr|
-|-------------|------:|--------|----:|---|-----:|
-|arc_challenge|      0|acc     |23.63|±  |  1.24|
-|             |       |acc_norm|25.68|±  |  1.28|
-|arc_easy     |      0|acc     |51.47|±  |  1.03|
-|             |       |acc_norm|45.45|±  |  1.02|
-|boolq        |      1|acc     |59.08|±  |  0.86|
-|copa         |      0|acc     |68.00|±  |  4.69|
-|hellaswag    |      0|acc     |34.63|±  |  0.47|
-|             |       |acc_norm|41.77|±  |  0.49|
-|mc_taco      |      0|em      |14.49|   |      |
-|             |       |f1      |32.43|   |      |
-|openbookqa   |      0|acc     |19.60|±  |  1.78|
-|             |       |acc_norm|29.40|±  |  2.04|
-|piqa         |      0|acc     |67.14|±  |  1.10|
-|             |       |acc_norm|67.14|±  |  1.10|
-|prost        |      0|acc     |23.41|±  |  0.31|
-|             |       |acc_norm|30.50|±  |  0.34|
-|swag         |      0|acc     |43.43|±  |  0.35|
-|             |       |acc_norm|58.28|±  |  0.35|
-|winogrande   |      0|acc     |54.93|±  |  1.40|
-|wsc273       |      0|acc     |68.50|±  |  2.82|
-
-## bloom-1b1_gsm8k_8-shot.json
-|Task |Version|Metric|Value|   |Stderr|
-|-----|------:|------|----:|---|-----:|
-|gsm8k|      0|acc   | 0.83|±  |  0.25|
-
-## bloom-1b1_mathematical_reasoning_few_shot_5-shot.json
-|          Task           |Version| Metric |Value|   |Stderr|
-|-------------------------|------:|--------|----:|---|-----:|
-|drop                     |      1|em      | 1.38|±  |  0.12|
-|                         |       |f1      | 4.01|±  |  0.15|
-|gsm8k                    |      0|acc     | 0.00|±  |  0.00|
-|math_algebra             |      1|acc     | 0.00|±  |  0.00|
-|math_counting_and_prob   |      1|acc     | 0.21|±  |  0.21|
-|math_geometry            |      1|acc     | 0.21|±  |  0.21|
-|math_intermediate_algebra|      1|acc     | 0.00|±  |  0.00|
-|math_num_theory          |      1|acc     | 0.19|±  |  0.19|
-|math_prealgebra          |      1|acc     | 0.11|±  |  0.11|
-|math_precalc             |      1|acc     | 0.00|±  |  0.00|
-|mathqa                   |      0|acc     |23.55|±  |  0.78|
-|                         |       |acc_norm|23.62|±  |  0.78|
-
-## bloom-1b1_pawsx_0-shot.json
-|  Task  |Version|Metric|Value|   |Stderr|
-|--------|------:|------|----:|---|-----:|
-|pawsx_de|      0|acc   |46.95|±  |  1.12|
-|pawsx_en|      0|acc   |52.45|±  |  1.12|
-|pawsx_es|      0|acc   |51.50|±  |  1.12|
-|pawsx_fr|      0|acc   |46.15|±  |  1.11|
-|pawsx_ja|      0|acc   |48.40|±  |  1.12|
-|pawsx_ko|      0|acc   |49.90|±  |  1.12|
-|pawsx_zh|      0|acc   |48.95|±  |  1.12|
-
-## bloom-1b1_question_answering_0-shot.json
-|    Task     |Version|   Metric   |Value|   |Stderr|
-|-------------|------:|------------|----:|---|-----:|
-|headqa_en    |      0|acc         |26.44|±  |  0.84|
-|             |       |acc_norm    |30.49|±  |  0.88|
-|headqa_es    |      0|acc         |24.43|±  |  0.82|
-|             |       |acc_norm    |28.30|±  |  0.86|
-|logiqa       |      0|acc         |18.89|±  |  1.54|
-|             |       |acc_norm    |25.65|±  |  1.71|
-|squad2       |      1|exact       | 4.17|   |      |
-|             |       |f1          | 6.60|   |      |
-|             |       |HasAns_exact| 2.19|   |      |
-|             |       |HasAns_f1   | 7.05|   |      |
-|             |       |NoAns_exact | 6.14|   |      |
-|             |       |NoAns_f1    | 6.14|   |      |
-|             |       |best_exact  |50.07|   |      |
-|             |       |best_f1     |50.07|   |      |
-|triviaqa     |      1|acc         | 2.68|±  |  0.15|
-|truthfulqa_mc|      1|mc1         |25.34|±  |  1.52|
-|             |       |mc2         |41.80|±  |  1.46|
-|webqs        |      0|acc         | 1.38|±  |  0.26|
-
-## bloom-1b1_reading_comprehension_0-shot.json
-|Task|Version|Metric|Value|   |Stderr|
-|----|------:|------|----:|---|-----:|
-|coqa|      1|f1    |45.57|±  |  1.88|
-|    |       |em    |32.98|±  |  1.95|
-|drop|      1|em    | 3.31|±  |  0.18|
-|    |       |f1    | 8.63|±  |  0.22|
-|race|      1|acc   |32.63|±  |  1.45|
-
-## bloom-1b1_xcopa_0-shot.json
-|  Task  |Version|Metric|Value|   |Stderr|
-|--------|------:|------|----:|---|-----:|
-|xcopa_et|      0|acc   | 50.6|±  |  2.24|
-|xcopa_ht|      0|acc   | 53.0|±  |  2.23|
-|xcopa_id|      0|acc   | 64.8|±  |  2.14|
-|xcopa_it|      0|acc   | 50.8|±  |  2.24|
-|xcopa_qu|      0|acc   | 51.2|±  |  2.24|
-|xcopa_sw|      0|acc   | 54.4|±  |  2.23|
-|xcopa_ta|      0|acc   | 57.0|±  |  2.22|
-|xcopa_th|      0|acc   | 53.2|±  |  2.23|
-|xcopa_tr|      0|acc   | 53.0|±  |  2.23|
-|xcopa_vi|      0|acc   | 62.4|±  |  2.17|
-|xcopa_zh|      0|acc   | 59.4|±  |  2.20|
-
-## bloom-1b1_xnli_0-shot.json
-| Task  |Version|Metric|Value|   |Stderr|
-|-------|------:|------|----:|---|-----:|
-|xnli_ar|      0|acc   |33.93|±  |  0.67|
-|xnli_bg|      0|acc   |34.13|±  |  0.67|
-|xnli_de|      0|acc   |39.64|±  |  0.69|
-|xnli_el|      0|acc   |34.03|±  |  0.67|
-|xnli_en|      0|acc   |51.48|±  |  0.71|
-|xnli_es|      0|acc   |47.98|±  |  0.71|
-|xnli_fr|      0|acc   |47.15|±  |  0.71|
-|xnli_hi|      0|acc   |42.32|±  |  0.70|
-|xnli_ru|      0|acc   |40.46|±  |  0.69|
-|xnli_sw|      0|acc   |35.29|±  |  0.68|
-|xnli_th|      0|acc   |33.75|±  |  0.67|
-|xnli_tr|      0|acc   |34.79|±  |  0.67|
-|xnli_ur|      0|acc   |37.33|±  |  0.68|
-|xnli_vi|      0|acc   |44.45|±  |  0.70|
-|xnli_zh|      0|acc   |36.23|±  |  0.68|
-
-## bloom-1b1_xstory_cloze_0-shot.json
-|     Task      |Version|Metric|Value|   |Stderr|
-|---------------|------:|------|----:|---|-----:|
-|xstory_cloze_ar|      0|acc   |52.88|±  |  1.28|
-|xstory_cloze_en|      0|acc   |62.54|±  |  1.25|
-|xstory_cloze_es|      0|acc   |58.31|±  |  1.27|
-|xstory_cloze_eu|      0|acc   |54.33|±  |  1.28|
-|xstory_cloze_hi|      0|acc   |55.53|±  |  1.28|
-|xstory_cloze_id|      0|acc   |57.91|±  |  1.27|
-|xstory_cloze_my|      0|acc   |46.19|±  |  1.28|
-|xstory_cloze_ru|      0|acc   |48.25|±  |  1.29|
-|xstory_cloze_sw|      0|acc   |50.56|±  |  1.29|
-|xstory_cloze_te|      0|acc   |56.39|±  |  1.28|
-|xstory_cloze_zh|      0|acc   |58.04|±  |  1.27|
-
-## bloom-1b1_xwinograd_0-shot.json
-|    Task    |Version|Metric|Value|   |Stderr|
-|------------|------:|------|----:|---|-----:|
-|xwinograd_en|      0|acc   |69.98|±  |  0.95|
-|xwinograd_fr|      0|acc   |66.27|±  |  5.22|
-|xwinograd_jp|      0|acc   |52.87|±  |  1.61|
-|xwinograd_pt|      0|acc   |63.12|±  |  2.98|
-|xwinograd_ru|      0|acc   |54.29|±  |  2.81|
-|xwinograd_zh|      0|acc   |69.25|±  |  2.06|
--- a/results/bloom/bloom-1b1/bloom-1b1_common_sense_reasoning_0-shot.json
+++ b/results/bloom/bloom-1b1/bloom-1b1_common_sense_reasoning_0-shot.json
-{
-  "results": {
-    "boolq": {
-      "acc": 0.5908256880733945,
-      "acc_stderr": 0.008599563442397352
-    },
-    "arc_easy": {
-      "acc": 0.5147306397306397,
-      "acc_stderr": 0.010255329977562096,
-      "acc_norm": 0.45454545454545453,
-      "acc_norm_stderr": 0.010217299762709435
-    },
-    "openbookqa": {
-      "acc": 0.196,
-      "acc_stderr": 0.017770751227744862,
-      "acc_norm": 0.294,
-      "acc_norm_stderr": 0.020395095484936614
-    },
-    "hellaswag": {
-      "acc": 0.3463453495319657,
-      "acc_stderr": 0.004748324319714264,
-      "acc_norm": 0.4177454690300737,
-      "acc_norm_stderr": 0.004921798492608764
-    },
-    "swag": {
-      "acc": 0.43431970408877335,
-      "acc_stderr": 0.0035044592489844794,
-      "acc_norm": 0.5828251524542637,
-      "acc_norm_stderr": 0.0034862531772295617
-    },
-    "arc_challenge": {
-      "acc": 0.2363481228668942,
-      "acc_stderr": 0.012414960524301834,
-      "acc_norm": 0.2568259385665529,
-      "acc_norm_stderr": 0.0127669237941168
-    },
-    "mc_taco": {
-      "em": 0.1448948948948949,
-      "f1": 0.32425976796237205
-    },
-    "wsc273": {
-      "acc": 0.684981684981685,
-      "acc_stderr": 0.028165854394193602
-    },
-    "winogrande": {
-      "acc": 0.5493291239147593,
-      "acc_stderr": 0.013983928869040239
-    },
-    "prost": {
-      "acc": 0.23409479077711356,
-      "acc_stderr": 0.003093545711826552,
-      "acc_norm": 0.3049743808710504,
-      "acc_norm_stderr": 0.003363606918420179
-    },
-    "copa": {
-      "acc": 0.68,
-      "acc_stderr": 0.04688261722621504
-    },
-    "piqa": {
-      "acc": 0.6713819368879217,
-      "acc_stderr": 0.010959127105167048,
-      "acc_norm": 0.6713819368879217,
-      "acc_norm_stderr": 0.010959127105167044
-    }
-  },
-  "versions": {
-    "boolq": 1,
-    "arc_easy": 0,
-    "openbookqa": 0,
-    "hellaswag": 0,
-    "swag": 0,
-    "arc_challenge": 0,
-    "mc_taco": 0,
-    "wsc273": 0,
-    "winogrande": 0,
-    "prost": 0,
-    "copa": 0,
-    "piqa": 0
-  },
-  "config": {
-    "model": "hf-causal-experimental",
-    "model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
-    "num_fewshot": 0,
-    "batch_size": "auto",
-    "device": "cuda:0",
-    "no_cache": true,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
--- a/results/bloom/bloom-1b1/bloom-1b1_gsm8k_8-shot.json
+++ b/results/bloom/bloom-1b1/bloom-1b1_gsm8k_8-shot.json
-{
-  "results": {
-    "gsm8k": {
-      "acc": 0.008339651250947688,
-      "acc_stderr": 0.002504942226860508
-    }
-  },
-  "versions": {
-    "gsm8k": 0
-  },
-  "config": {
-    "model": "hf-causal-experimental",
-    "model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
-    "num_fewshot": 8,
-    "batch_size": "auto",
-    "device": "cuda",
-    "no_cache": true,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
--- a/results/bloom/bloom-1b1/bloom-1b1_mathematical_reasoning_few_shot_5-shot.json
+++ b/results/bloom/bloom-1b1/bloom-1b1_mathematical_reasoning_few_shot_5-shot.json
-{
-  "results": {
-    "mathqa": {
-      "acc": 0.2355108877721943,
-      "acc_stderr": 0.007767687364650971,
-      "acc_norm": 0.23618090452261306,
-      "acc_norm_stderr": 0.0077753193787470495
-    },
-    "gsm8k": {
-      "acc": 0.0,
-      "acc_stderr": 0.0
-    },
-    "drop": {
-      "em": 0.013842281879194632,
-      "em_stderr": 0.001196510970060749,
-      "f1": 0.040085989932885986,
-      "f1_stderr": 0.0014841664758736023
-    },
-    "math_geometry": {
-      "acc": 0.0020876826722338203,
-      "acc_stderr": 0.0020876826722338315
-    },
-    "math_counting_and_prob": {
-      "acc": 0.002109704641350211,
-      "acc_stderr": 0.002109704641350211
-    },
-    "math_prealgebra": {
-      "acc": 0.001148105625717566,
-      "acc_stderr": 0.0011481056257175708
-    },
-    "math_num_theory": {
-      "acc": 0.001851851851851852,
-      "acc_stderr": 0.0018518518518518448
-    },
-    "math_precalc": {
-      "acc": 0.0,
-      "acc_stderr": 0.0
-    },
-    "math_algebra": {
-      "acc": 0.0,
-      "acc_stderr": 0.0
-    },
-    "math_intermediate_algebra": {
-      "acc": 0.0,
-      "acc_stderr": 0.0
-    }
-  },
-  "versions": {
-    "mathqa": 0,
-    "gsm8k": 0,
-    "drop": 1,
-    "math_geometry": 1,
-    "math_counting_and_prob": 1,
-    "math_prealgebra": 1,
-    "math_num_theory": 1,
-    "math_precalc": 1,
-    "math_algebra": 1,
-    "math_intermediate_algebra": 1
-  },
-  "config": {
-    "model": "hf-causal-experimental",
-    "model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
-    "num_fewshot": 5,
-    "batch_size": "auto",
-    "device": "cuda:0",
-    "no_cache": true,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
--- a/results/bloom/bloom-1b1/bloom-1b1_pawsx_0-shot.json
+++ b/results/bloom/bloom-1b1/bloom-1b1_pawsx_0-shot.json
-{
-  "results": {
-    "pawsx_es": {
-      "acc": 0.515,
-      "acc_stderr": 0.011178102477052804
-    },
-    "pawsx_zh": {
-      "acc": 0.4895,
-      "acc_stderr": 0.011180669867648657
-    },
-    "pawsx_fr": {
-      "acc": 0.4615,
-      "acc_stderr": 0.011149934327957058
-    },
-    "pawsx_ko": {
-      "acc": 0.499,
-      "acc_stderr": 0.01118311365477017
-    },
-    "pawsx_de": {
-      "acc": 0.4695,
-      "acc_stderr": 0.011162310405413175
-    },
-    "pawsx_ja": {
-      "acc": 0.484,
-      "acc_stderr": 0.011177408788874897
-    },
-    "pawsx_en": {
-      "acc": 0.5245,
-      "acc_stderr": 0.011169702598013186
-    }
-  },
-  "versions": {
-    "pawsx_es": 0,
-    "pawsx_zh": 0,
-    "pawsx_fr": 0,
-    "pawsx_ko": 0,
-    "pawsx_de": 0,
-    "pawsx_ja": 0,
-    "pawsx_en": 0
-  },
-  "config": {
-    "model": "hf-causal-experimental",
-    "model_args": "pretrained=bigscience/bloom-1b1",
-    "num_fewshot": 0,
-    "batch_size": "auto",
-    "device": "cuda",
-    "no_cache": true,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
--- a/results/bloom/bloom-1b1/bloom-1b1_question_answering_0-shot.json
+++ b/results/bloom/bloom-1b1/bloom-1b1_question_answering_0-shot.json
-{
-  "results": {
-    "truthfulqa_mc": {
-      "mc1": 0.2533659730722154,
-      "mc1_stderr": 0.01522589934082683,
-      "mc2": 0.4179977378869182,
-      "mc2_stderr": 0.014601549068840484
-    },
-    "webqs": {
-      "acc": 0.013779527559055118,
-      "acc_stderr": 0.002586718737195641
-    },
-    "logiqa": {
-      "acc": 0.1889400921658986,
-      "acc_stderr": 0.01535436463822078,
-      "acc_norm": 0.2565284178187404,
-      "acc_norm_stderr": 0.017129443327887562
-    },
-    "squad2": {
-      "exact": 4.169123220752969,
-      "f1": 6.5956997780058355,
-      "HasAns_exact": 2.192982456140351,
-      "HasAns_f1": 7.05309437656277,
-      "NoAns_exact": 6.139613120269134,
-      "NoAns_f1": 6.139613120269134,
-      "best_exact": 50.07159100480081,
-      "best_f1": 50.07159100480081
-    },
-    "headqa_es": {
-      "acc": 0.24434719183078046,
-      "acc_stderr": 0.008207488987159709,
-      "acc_norm": 0.2830051057622174,
-      "acc_norm_stderr": 0.008604004902114394
-    },
-    "headqa_en": {
-      "acc": 0.26440554339897887,
-      "acc_stderr": 0.008423643607316284,
-      "acc_norm": 0.30488694383661563,
-      "acc_norm_stderr": 0.008793112278191295
-    },
-    "triviaqa": {
-      "acc": 0.026783346592415803,
-      "acc_stderr": 0.001517985028991893
-    }
-  },
-  "versions": {
-    "truthfulqa_mc": 1,
-    "webqs": 0,
-    "logiqa": 0,
-    "squad2": 1,
-    "headqa_es": 0,
-    "headqa_en": 0,
-    "triviaqa": 1
-  },
-  "config": {
-    "model": "hf-causal-experimental",
-    "model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
-    "num_fewshot": 0,
-    "batch_size": "auto",
-    "device": "cuda:0",
-    "no_cache": true,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}
--- a/results/bloom/bloom-1b1/bloom-1b1_reading_comprehension_0-shot.json
+++ b/results/bloom/bloom-1b1/bloom-1b1_reading_comprehension_0-shot.json
-{
-  "results": {
-    "drop": {
-      "em": 0.03313758389261745,
-      "em_stderr": 0.0018330841858875643,
-      "f1": 0.08634542785234882,
-      "f1_stderr": 0.0022136353860709133
-    },
-    "coqa": {
-      "f1": 0.4557083534540516,
-      "f1_stderr": 0.01876948425119881,
-      "em": 0.3298333333333334,
-      "em_stderr": 0.019473215823053027
-    },
-    "race": {
-      "acc": 0.3263157894736842,
-      "acc_stderr": 0.014510987877134932
-    }
-  },
-  "versions": {
-    "drop": 1,
-    "coqa": 1,
-    "race": 1
-  },
-  "config": {
-    "model": "hf-causal-experimental",
-    "model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
-    "num_fewshot": 0,
-    "batch_size": "auto",
-    "device": "cuda:0",
-    "no_cache": true,
-    "limit": null,
-    "bootstrap_iters": 100000,
-    "description_dict": {}
-  }
-}