Commit a702689d authored by Alexander's avatar Alexander
Browse files

merge with upstream

parents 8d66cfef 008fc2a2
{
"results": {
"xstory_cloze_ar": {
"acc": 0.500992720052945,
"acc_stderr": 0.012867099955422925
},
"xstory_cloze_id": {
"acc": 0.5400397088021178,
"acc_stderr": 0.012825802370083988
},
"xstory_cloze_sw": {
"acc": 0.5307743216412971,
"acc_stderr": 0.01284273034058578
},
"xstory_cloze_en": {
"acc": 0.6055592322964924,
"acc_stderr": 0.012577106513936133
},
"xstory_cloze_te": {
"acc": 0.5585704831237591,
"acc_stderr": 0.012778538985880637
},
"xstory_cloze_zh": {
"acc": 0.5327597617471873,
"acc_stderr": 0.012839477563855915
},
"xstory_cloze_my": {
"acc": 0.514890800794176,
"acc_stderr": 0.012861417842074004
},
"xstory_cloze_hi": {
"acc": 0.5228325612177366,
"acc_stderr": 0.01285370238487085
},
"xstory_cloze_ru": {
"acc": 0.5618795499669094,
"acc_stderr": 0.01276820661627776
},
"xstory_cloze_es": {
"acc": 0.5506287227001986,
"acc_stderr": 0.012800991591293383
},
"xstory_cloze_eu": {
"acc": 0.5314361350099271,
"acc_stderr": 0.012841668760976905
}
},
"versions": {
"xstory_cloze_ar": 0,
"xstory_cloze_id": 0,
"xstory_cloze_sw": 0,
"xstory_cloze_en": 0,
"xstory_cloze_te": 0,
"xstory_cloze_zh": 0,
"xstory_cloze_my": 0,
"xstory_cloze_hi": 0,
"xstory_cloze_ru": 0,
"xstory_cloze_es": 0,
"xstory_cloze_eu": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=facebook/xglm-564M,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xwinograd_pt": {
"acc": 0.5855513307984791,
"acc_stderr": 0.030434573161228055
},
"xwinograd_zh": {
"acc": 0.6567460317460317,
"acc_stderr": 0.0211700809891982
},
"xwinograd_ru": {
"acc": 0.5904761904761905,
"acc_stderr": 0.02775082824017435
},
"xwinograd_fr": {
"acc": 0.5783132530120482,
"acc_stderr": 0.054534284852951115
},
"xwinograd_en": {
"acc": 0.6262365591397849,
"acc_stderr": 0.01003574358830904
},
"xwinograd_jp": {
"acc": 0.5453597497393118,
"acc_stderr": 0.01608765437474968
}
},
"versions": {
"xwinograd_pt": 0,
"xwinograd_zh": 0,
"xwinograd_ru": 0,
"xwinograd_fr": 0,
"xwinograd_en": 0,
"xwinograd_jp": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=facebook/xglm-564M,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
# xglm-7.5B
## xglm-7.5B_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |28.75|± | 1.32|
| | |acc_norm|31.91|± | 1.36|
|arc_easy | 0|acc |62.37|± | 0.99|
| | |acc_norm|58.63|± | 1.01|
|boolq | 1|acc |60.18|± | 0.86|
|copa | 0|acc |79.00|± | 4.09|
|hellaswag | 0|acc |45.69|± | 0.50|
| | |acc_norm|61.23|± | 0.49|
|mc_taco | 0|em |13.81| | |
| | |f1 |47.92| | |
|openbookqa | 0|acc |25.40|± | 1.95|
| | |acc_norm|35.80|± | 2.15|
|piqa | 0|acc |73.94|± | 1.02|
| | |acc_norm|74.92|± | 1.01|
|prost | 0|acc |25.89|± | 0.32|
| | |acc_norm|26.36|± | 0.32|
|swag | 0|acc |50.51|± | 0.35|
| | |acc_norm|69.23|± | 0.33|
|winogrande | 0|acc |57.85|± | 1.39|
|wsc273 | 0|acc |75.82|± | 2.60|
## xglm-7.5B_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 0.15|± | 0.11|
## xglm-7.5B_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 5.42|± | 0.23|
| | |f1 | 8.96|± | 0.26|
|gsm8k | 0|acc | 0.23|± | 0.13|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.00|± | 0.00|
|math_geometry | 1|acc | 0.00|± | 0.00|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.00|± | 0.00|
|math_prealgebra | 1|acc | 0.00|± | 0.00|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |23.99|± | 0.78|
| | |acc_norm|23.52|± | 0.78|
## xglm-7.5B_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |55.90|± | 1.11|
|pawsx_en| 0|acc |58.85|± | 1.10|
|pawsx_es| 0|acc |52.80|± | 1.12|
|pawsx_fr| 0|acc |51.80|± | 1.12|
|pawsx_ja| 0|acc |52.00|± | 1.12|
|pawsx_ko| 0|acc |45.95|± | 1.11|
|pawsx_zh| 0|acc |51.30|± | 1.12|
## xglm-7.5B_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 61.2|± | 2.18|
|xcopa_ht| 0|acc | 57.4|± | 2.21|
|xcopa_id| 0|acc | 69.4|± | 2.06|
|xcopa_it| 0|acc | 63.6|± | 2.15|
|xcopa_qu| 0|acc | 48.8|± | 2.24|
|xcopa_sw| 0|acc | 60.0|± | 2.19|
|xcopa_ta| 0|acc | 54.4|± | 2.23|
|xcopa_th| 0|acc | 59.4|± | 2.20|
|xcopa_tr| 0|acc | 58.4|± | 2.21|
|xcopa_vi| 0|acc | 70.2|± | 2.05|
|xcopa_zh| 0|acc | 63.8|± | 2.15|
## xglm-7.5B_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.37|± | 0.67|
|xnli_bg| 0|acc |44.89|± | 0.70|
|xnli_de| 0|acc |48.98|± | 0.71|
|xnli_el| 0|acc |40.66|± | 0.69|
|xnli_en| 0|acc |53.85|± | 0.70|
|xnli_es| 0|acc |47.70|± | 0.71|
|xnli_fr| 0|acc |46.95|± | 0.71|
|xnli_hi| 0|acc |47.21|± | 0.71|
|xnli_ru| 0|acc |46.33|± | 0.70|
|xnli_sw| 0|acc |45.83|± | 0.70|
|xnli_th| 0|acc |43.71|± | 0.70|
|xnli_tr| 0|acc |46.27|± | 0.70|
|xnli_ur| 0|acc |42.10|± | 0.70|
|xnli_vi| 0|acc |46.33|± | 0.70|
|xnli_zh| 0|acc |35.37|± | 0.68|
## xglm-7.5B_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |56.19|± | 1.28|
|xstory_cloze_en| 0|acc |69.82|± | 1.18|
|xstory_cloze_es| 0|acc |64.06|± | 1.23|
|xstory_cloze_eu| 0|acc |57.71|± | 1.27|
|xstory_cloze_hi| 0|acc |58.77|± | 1.27|
|xstory_cloze_id| 0|acc |62.94|± | 1.24|
|xstory_cloze_my| 0|acc |57.11|± | 1.27|
|xstory_cloze_ru| 0|acc |63.53|± | 1.24|
|xstory_cloze_sw| 0|acc |59.30|± | 1.26|
|xstory_cloze_te| 0|acc |60.23|± | 1.26|
|xstory_cloze_zh| 0|acc |58.90|± | 1.27|
## xglm-7.5B_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |79.48|± | 0.84|
|xwinograd_fr| 0|acc |65.06|± | 5.27|
|xwinograd_jp| 0|acc |64.96|± | 1.54|
|xwinograd_pt| 0|acc |67.30|± | 2.90|
|xwinograd_ru| 0|acc |63.17|± | 2.72|
|xwinograd_zh| 0|acc |72.82|± | 1.98|
{
"results": {
"openbookqa": {
"acc": 0.254,
"acc_stderr": 0.019486596801643385,
"acc_norm": 0.358,
"acc_norm_stderr": 0.02146143486285912
},
"winogrande": {
"acc": 0.5785319652722968,
"acc_stderr": 0.0138780723774976
},
"arc_easy": {
"acc": 0.6237373737373737,
"acc_stderr": 0.009940646221513789,
"acc_norm": 0.5862794612794613,
"acc_norm_stderr": 0.010105878530238135
},
"copa": {
"acc": 0.79,
"acc_stderr": 0.040936018074033256
},
"mc_taco": {
"em": 0.13813813813813813,
"f1": 0.479152974631639
},
"wsc273": {
"acc": 0.7582417582417582,
"acc_stderr": 0.02596031999685269
},
"hellaswag": {
"acc": 0.45688109938259314,
"acc_stderr": 0.004971192387202445,
"acc_norm": 0.6123282214698267,
"acc_norm_stderr": 0.004862232790041574
},
"boolq": {
"acc": 0.6018348623853211,
"acc_stderr": 0.008561755594317445
},
"swag": {
"acc": 0.505148455463361,
"acc_stderr": 0.003534904635576977,
"acc_norm": 0.692292312306308,
"acc_norm_stderr": 0.003263207195550976
},
"piqa": {
"acc": 0.7393906420021763,
"acc_stderr": 0.010241826155811627,
"acc_norm": 0.749183895538629,
"acc_norm_stderr": 0.010113869547069046
},
"prost": {
"acc": 0.2588599487617421,
"acc_stderr": 0.0032000423309913543,
"acc_norm": 0.26361016225448336,
"acc_norm_stderr": 0.0032189046983713983
},
"arc_challenge": {
"acc": 0.28754266211604096,
"acc_stderr": 0.013226719056266129,
"acc_norm": 0.3191126279863481,
"acc_norm_stderr": 0.013621696119173304
}
},
"versions": {
"openbookqa": 0,
"winogrande": 0,
"arc_easy": 0,
"copa": 0,
"mc_taco": 0,
"wsc273": 0,
"hellaswag": 0,
"boolq": 1,
"swag": 0,
"piqa": 0,
"prost": 0,
"arc_challenge": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=facebook/xglm-7.5B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"gsm8k": {
"acc": 0.001516300227445034,
"acc_stderr": 0.0010717793485492655
}
},
"versions": {
"gsm8k": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=facebook/xglm-7.5B,use_accelerate=True",
"num_fewshot": 8,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"math_num_theory": {
"acc": 0.0,
"acc_stderr": 0.0
},
"gsm8k": {
"acc": 0.002274450341167551,
"acc_stderr": 0.0013121578148674316
},
"math_geometry": {
"acc": 0.0,
"acc_stderr": 0.0
},
"drop": {
"em": 0.05421560402684564,
"em_stderr": 0.002318984649948223,
"f1": 0.08962458053691245,
"f1_stderr": 0.0026401926224488034
},
"math_prealgebra": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_counting_and_prob": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_precalc": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_intermediate_algebra": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_algebra": {
"acc": 0.0,
"acc_stderr": 0.0
},
"mathqa": {
"acc": 0.23986599664991626,
"acc_stderr": 0.007816818250028128,
"acc_norm": 0.23517587939698492,
"acc_norm_stderr": 0.0077638612776946255
}
},
"versions": {
"math_num_theory": 1,
"gsm8k": 0,
"math_geometry": 1,
"drop": 1,
"math_prealgebra": 1,
"math_counting_and_prob": 1,
"math_precalc": 1,
"math_intermediate_algebra": 1,
"math_algebra": 1,
"mathqa": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=facebook/xglm-7.5B,use_accelerate=True",
"num_fewshot": 5,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"pawsx_en": {
"acc": 0.5885,
"acc_stderr": 0.011006563824537298
},
"pawsx_es": {
"acc": 0.528,
"acc_stderr": 0.011165587094621537
},
"pawsx_fr": {
"acc": 0.518,
"acc_stderr": 0.011175886999478619
},
"pawsx_zh": {
"acc": 0.513,
"acc_stderr": 0.01117935548207038
},
"pawsx_ja": {
"acc": 0.52,
"acc_stderr": 0.011174185930778312
},
"pawsx_de": {
"acc": 0.559,
"acc_stderr": 0.011105006104468736
},
"pawsx_ko": {
"acc": 0.4595,
"acc_stderr": 0.011146389370464362
}
},
"versions": {
"pawsx_en": 0,
"pawsx_es": 0,
"pawsx_fr": 0,
"pawsx_zh": 0,
"pawsx_ja": 0,
"pawsx_de": 0,
"pawsx_ko": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=facebook/xglm-7.5B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xcopa_et": {
"acc": 0.612,
"acc_stderr": 0.021814300984787635
},
"xcopa_th": {
"acc": 0.594,
"acc_stderr": 0.02198396209008634
},
"xcopa_qu": {
"acc": 0.488,
"acc_stderr": 0.02237662679792717
},
"xcopa_ta": {
"acc": 0.544,
"acc_stderr": 0.02229623834840705
},
"xcopa_zh": {
"acc": 0.638,
"acc_stderr": 0.0215136625275824
},
"xcopa_vi": {
"acc": 0.702,
"acc_stderr": 0.02047511809298897
},
"xcopa_sw": {
"acc": 0.6,
"acc_stderr": 0.021930844120728505
},
"xcopa_it": {
"acc": 0.636,
"acc_stderr": 0.021539170637317685
},
"xcopa_tr": {
"acc": 0.584,
"acc_stderr": 0.022064943313928848
},
"xcopa_id": {
"acc": 0.694,
"acc_stderr": 0.0206295699983454
},
"xcopa_ht": {
"acc": 0.574,
"acc_stderr": 0.022136577335085637
}
},
"versions": {
"xcopa_et": 0,
"xcopa_th": 0,
"xcopa_qu": 0,
"xcopa_ta": 0,
"xcopa_zh": 0,
"xcopa_vi": 0,
"xcopa_sw": 0,
"xcopa_it": 0,
"xcopa_tr": 0,
"xcopa_id": 0,
"xcopa_ht": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=facebook/xglm-7.5B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xnli_ar": {
"acc": 0.3337325349301397,
"acc_stderr": 0.00666266628252267
},
"xnli_bg": {
"acc": 0.4489021956087824,
"acc_stderr": 0.007027723874210379
},
"xnli_de": {
"acc": 0.48982035928143713,
"acc_stderr": 0.0070632481147059134
},
"xnli_el": {
"acc": 0.40658682634730536,
"acc_stderr": 0.006940323712177368
},
"xnli_en": {
"acc": 0.5385229540918164,
"acc_stderr": 0.0070437128985425335
},
"xnli_es": {
"acc": 0.47704590818363274,
"acc_stderr": 0.007057263845316342
},
"xnli_fr": {
"acc": 0.4694610778443114,
"acc_stderr": 0.007051522651006734
},
"xnli_hi": {
"acc": 0.4720558882235529,
"acc_stderr": 0.007053670508441103
},
"xnli_ru": {
"acc": 0.46327345309381235,
"acc_stderr": 0.007045628330322907
},
"xnli_sw": {
"acc": 0.45828343313373254,
"acc_stderr": 0.007040080446339805
},
"xnli_th": {
"acc": 0.437125748502994,
"acc_stderr": 0.007008633817895695
},
"xnli_tr": {
"acc": 0.4626746506986028,
"acc_stderr": 0.007045000071900887
},
"xnli_ur": {
"acc": 0.42095808383233535,
"acc_stderr": 0.006975878576227385
},
"xnli_vi": {
"acc": 0.46327345309381235,
"acc_stderr": 0.007045628330322896
},
"xnli_zh": {
"acc": 0.3536926147704591,
"acc_stderr": 0.006755492859492898
}
},
"versions": {
"xnli_ar": 0,
"xnli_bg": 0,
"xnli_de": 0,
"xnli_el": 0,
"xnli_en": 0,
"xnli_es": 0,
"xnli_fr": 0,
"xnli_hi": 0,
"xnli_ru": 0,
"xnli_sw": 0,
"xnli_th": 0,
"xnli_tr": 0,
"xnli_ur": 0,
"xnli_vi": 0,
"xnli_zh": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=facebook/xglm-7.5B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xstory_cloze_es": {
"acc": 0.6406353408338848,
"acc_stderr": 0.012347659802101675
},
"xstory_cloze_zh": {
"acc": 0.5890138980807412,
"acc_stderr": 0.012661578894368948
},
"xstory_cloze_sw": {
"acc": 0.5929847782925215,
"acc_stderr": 0.012642664836816926
},
"xstory_cloze_en": {
"acc": 0.6982131039046989,
"acc_stderr": 0.011812877848905303
},
"xstory_cloze_hi": {
"acc": 0.5876902713434812,
"acc_stderr": 0.012667694122397068
},
"xstory_cloze_ar": {
"acc": 0.5618795499669094,
"acc_stderr": 0.012768206616277757
},
"xstory_cloze_eu": {
"acc": 0.5771012574454004,
"acc_stderr": 0.0127132250091262
},
"xstory_cloze_id": {
"acc": 0.6293845135671741,
"acc_stderr": 0.012428861084065903
},
"xstory_cloze_ru": {
"acc": 0.6353408338848445,
"acc_stderr": 0.012386781532906161
},
"xstory_cloze_te": {
"acc": 0.6022501654533422,
"acc_stderr": 0.012595197856703525
},
"xstory_cloze_my": {
"acc": 0.57114493712773,
"acc_stderr": 0.01273620271314778
}
},
"versions": {
"xstory_cloze_es": 0,
"xstory_cloze_zh": 0,
"xstory_cloze_sw": 0,
"xstory_cloze_en": 0,
"xstory_cloze_hi": 0,
"xstory_cloze_ar": 0,
"xstory_cloze_eu": 0,
"xstory_cloze_id": 0,
"xstory_cloze_ru": 0,
"xstory_cloze_te": 0,
"xstory_cloze_my": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=facebook/xglm-7.5B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"xwinograd_zh": {
"acc": 0.7281746031746031,
"acc_stderr": 0.01983712759311063
},
"xwinograd_ru": {
"acc": 0.6317460317460317,
"acc_stderr": 0.027219500732466696
},
"xwinograd_pt": {
"acc": 0.6730038022813688,
"acc_stderr": 0.028982074243683254
},
"xwinograd_en": {
"acc": 0.7948387096774193,
"acc_stderr": 0.008376626547826555
},
"xwinograd_jp": {
"acc": 0.6496350364963503,
"acc_stderr": 0.01541389159576608
},
"xwinograd_fr": {
"acc": 0.6506024096385542,
"acc_stderr": 0.05265151356440471
}
},
"versions": {
"xwinograd_zh": 0,
"xwinograd_ru": 0,
"xwinograd_pt": 0,
"xwinograd_en": 0,
"xwinograd_jp": 0,
"xwinograd_fr": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=facebook/xglm-7.5B,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
......@@ -9,7 +9,12 @@ from lm_eval import tasks, utils
seq2seq_models = ["google/flan-t5-small"]
causal_models = ["gpt2", "facebook/opt-125m", "EleutherAI/gpt-neo-125m", "EleutherAI/pythia-160m"]
causal_models = [
"gpt2",
"facebook/opt-125m",
"EleutherAI/gpt-neo-125m",
"EleutherAI/pythia-160m",
]
model_names = seq2seq_models + causal_models
......@@ -50,22 +55,41 @@ def eval_models(args, branch=None):
results = {}
for model in args.models:
model_type = "hf-causal-experimental" if model in causal_models \
else "hf-seq2seq" if model in seq2seq_models else args.model
model_type = (
"hf-causal-experimental"
if model in causal_models
else "hf-seq2seq"
if model in seq2seq_models
else args.model
)
model_args = f"pretrained={model},{args.model_args}"
# TODO: split_and_pad_windows in AutoSeq2SeqLM doesn't exist, #527
tasks = args.tasks if model in causal_models or model_type == "hf-causal-experimental" \
tasks = (
args.tasks
if model in causal_models or model_type == "hf-causal-experimental"
else list(filter(lambda task: task not in perplexity_tasks, args.tasks))
)
# TODO: OOM with auto for seq2seq models, also can OOM with llama
batch_size = args.batch_size if model in causal_models or model_type == "hf-causal-experimental" \
else 64 if args.batch_size == "auto" else args.batch_size
output_path = f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
command = f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} " \
f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} " \
f"--batch_size {batch_size} --no_cache --output_path {output_path}"
print(f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}")
batch_size = (
args.batch_size
if model in causal_models or model_type == "hf-causal-experimental"
else 64
if args.batch_size == "auto"
else args.batch_size
)
output_path = (
f"data/regression/{int(start_time)}-{branch}-{Path(model).name}.json"
)
command = (
f"python3 main.py --model {model_type} --model_args {model_args} --tasks {','.join(tasks)} "
f"--num_fewshot {args.num_fewshot}{'' if args.limit is None else f' --limit {args.limit}'} "
f"--batch_size {batch_size} --no_cache --output_path {output_path}"
)
print(
f"{'=' * 80}\nEvaluating {model} on {', '.join(tasks)} at {branch} with:\n\n{command}\n{'=' * 80}"
)
ret = os.system(command)
......@@ -108,13 +132,25 @@ def format_diff(args, results1, results2, model, task):
def main():
args = parse_args()
args.branches = args.branches.split(",") if type(args.branches) == str else args.branches
args.branches = (
args.branches.split(",") if type(args.branches) == str else args.branches
)
args.models = args.models.split(",") if type(args.models) == str else args.models
args.tasks = tasks.ALL_TASKS if args.tasks == "all_tasks" \
else utils.pattern_match(args.tasks.split(",") if type(args.tasks) == str else args.tasks, tasks.ALL_TASKS)
args.tasks = (
tasks.ALL_TASKS
if args.tasks == "all_tasks"
else utils.pattern_match(
args.tasks.split(",") if type(args.tasks) == str else args.tasks,
tasks.ALL_TASKS,
)
)
global initial_branch
initial_branch = subprocess.check_output("git branch --show-current", shell=True).decode("ascii").strip()
initial_branch = (
subprocess.check_output("git branch --show-current", shell=True)
.decode("ascii")
.strip()
)
# TODO: implement proper timing for each task
# TODO: reduce IO by sharing tasks between models?
......@@ -132,10 +168,16 @@ def main():
print(f"|task|{'|'.join(map(lambda model: Path(model).name, args.models))}|")
print(f"|--|{'--|' * len(args.models)}")
for task in args.tasks:
print(f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|")
print(
f"|{task} ({initial_branch})|{'|'.join(map(lambda model: format_value(args, results, model, task), args.models))}|"
)
for branch, branch_results, branch_runtime in runs:
print(f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|")
print(f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|")
print(
f"|{task} ({branch})|{'|'.join(map(lambda model: format_value(args, branch_results, model, task), args.models))}|"
)
print(
f"|{task} (diff)|{'|'.join(map(lambda model: format_diff(args, results, branch_results, model, task), args.models))}|"
)
print("")
print("|branch|runtime|%|")
......
......@@ -12,7 +12,9 @@ setuptools.setup(
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/EleutherAI/lm-evaluation-harness",
packages=setuptools.find_packages(),
packages=setuptools.find_packages(exclude=["scripts.*", "scripts"]),
package_data={"lm_eval": ["**/*.json"]},
include_package_data=True,
classifiers=[
"Development Status :: 3 - Alpha",
"Programming Language :: Python :: 3",
......@@ -22,6 +24,7 @@ setuptools.setup(
python_requires=">=3.8",
install_requires=[
"datasets>=2.0.0",
"einops",
"jsonlines",
"numexpr",
"openai>=0.6.4",
......
import unittest
from unittest.mock import patch
import hashlib
import json
import os
import pickle
from lm_eval.models.gguf import GGUFLM
# Endpoint for a hosted GGML/GGUF inference API. Only used to construct the
# GGUFLM client in these tests; gguf_completion is patched, so no real
# network traffic is exercised.
base_url = "https://matthoffner-ggml-llm-api.hf.space"
def gguf_completion_mock(base_url, **kwargs):
    """Deterministic stand-in for ``GGUFLM.gguf_completion``.

    Returns a canned completion payload shaped like the real API response:
    a fixed token logprob of -1.2345, and — when a ``stop`` kwarg is given —
    a ``text`` field of ``"generated text until <stop>"``. Results are cached
    to a pickle file keyed by a SHA-256 of the call parameters, so repeated
    identical calls replay the same fixture.

    Args:
        base_url: Endpoint the real client would have been called with
            (only used as part of the cache key here).
        **kwargs: Arbitrary request parameters; ``stop`` selects the
            text-bearing response variant.

    Returns:
        dict: ``{"choices": [...]}`` mimicking the completion API response.
    """
    # Build a stable cache key from all call parameters; sort_keys makes the
    # JSON serialization (and therefore the digest) order-independent.
    # NOTE: previously this digest was bound to the name `hash`, shadowing
    # the builtin — renamed to avoid that.
    hash_kwargs = {"base_url": base_url, **kwargs}
    cache_key = hashlib.sha256(
        json.dumps(hash_kwargs, sort_keys=True).encode("utf-8")
    ).hexdigest()

    fname = f"./tests/testdata/ggml_test_{cache_key}.pkl"

    if os.path.exists(fname):
        # Cache hit: replay the previously recorded fixture.
        with open(fname, "rb") as fh:
            return pickle.load(fh)
    else:
        print("The file does not exist, attempting to write...")
        if "stop" in kwargs:
            # greedy_until-style request: include generated text up to the
            # stop sequence.
            result = {
                "choices": [
                    {
                        "text": f"generated text until {kwargs['stop']}",
                        "logprobs": {"token_logprobs": [-1.2345]},
                        "finish_reason": "length",
                    }
                ]
            }
        else:
            # loglikelihood-style request: logprobs only, no text.
            result = {
                "choices": [
                    {
                        "logprobs": {"token_logprobs": [-1.2345]},
                        "finish_reason": "length",
                    }
                ]
            }

        # Best-effort fixture write; a failure here must not break the test,
        # so the error is reported and the in-memory result returned anyway.
        try:
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            print("Writing file at", fname)
            with open(fname, "wb") as fh:
                pickle.dump(result, fh)
            print("File written successfully")
        except Exception as e:
            print("File writing failed:", e)

        return result
class GGUFLMTest(unittest.TestCase):
    """Unit tests for the GGUFLM request methods.

    The HTTP layer (``gguf_completion``) is replaced by
    ``gguf_completion_mock`` so every test runs offline against
    deterministic fixture responses.
    """

    @patch(
        'lm_eval.models.gguf.GGUFLM.gguf_completion',
        side_effect=gguf_completion_mock,
    )
    def test_loglikelihood(self, mock_completion):
        lm = GGUFLM(base_url)
        # Two (context, continuation) pairs.
        reqs = [("context1", "continuation1"), ("context2", "continuation2")]
        observed = lm.loglikelihood(reqs)
        # Each request yields the mocked logprob and is_greedy=True.
        expected = [(-1.2345, True), (-1.2345, True)]
        self.assertEqual(observed, expected)

    @patch(
        'lm_eval.models.gguf.GGUFLM.gguf_completion',
        side_effect=gguf_completion_mock,
    )
    def test_greedy_until(self, mock_completion):
        lm = GGUFLM(base_url)
        # Generation requests with per-request stop sequences.
        reqs = [("input1", {"until": "stop1"}), ("input2", {"until": "stop2"})]
        observed = lm.greedy_until(reqs)
        # The mock echoes the stop sequence into the generated text.
        expected = ["generated text until stop1", "generated text until stop2"]
        self.assertEqual(observed, expected)

    @patch(
        'lm_eval.models.gguf.GGUFLM.gguf_completion',
        side_effect=gguf_completion_mock,
    )
    def test_loglikelihood_rolling(self, mock_completion):
        lm = GGUFLM(base_url)
        # Plain strings for rolling loglikelihood.
        reqs = ["input1", "input2"]
        observed = lm.loglikelihood_rolling(reqs)
        expected = [(-1.2345, True), (-1.2345, True)]
        self.assertEqual(observed, expected)
if __name__ == "__main__":
    # Allow running this test module directly (outside a pytest/unittest runner).
    unittest.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment