# gaoqiong / lm-evaluation-harness — Commit 24e3e3fa

Unverified commit `24e3e3fa`, authored Jun 16, 2023 by Lintang Sutawika, committed by GitHub on Jun 16, 2023.

> Merge pull request #590 from gakada/big-refactor-merge
>
> Merge master into big-refactor

Parents: `3fc5bedc`, `b7c3580a`

Changes: 202. Showing 20 changed files with 1009 additions and 52 deletions (+1009, -52).
- `.github/workflows/pull_request.yml` (+1, -1)
- `.pre-commit-config.yaml` (+2, -2)
- `CODEOWNERS` (+1, -1)
- `README.md` (+1, -1)
- `ignore.txt` (+3, -0)
- `lm_eval/evaluator.py` (+40, -6)
- `lm_eval/models/anthropic_llms.py` (+112, -0)
- `lm_eval/models/textsynth.py` (+2, -1)
- `lm_eval/utils.py` (+87, -4)
- `main.py` (+39, -36)
- `results/bloom/bloom-1b1/README.md` (+147, -0)
- `results/bloom/bloom-1b1/bloom-1b1_common_sense_reasoning_0-shot.json` (+91, -0)
- `results/bloom/bloom-1b1/bloom-1b1_gsm8k_8-shot.json` (+22, -0)
- `results/bloom/bloom-1b1/bloom-1b1_mathematical_reasoning_few_shot_5-shot.json` (+71, -0)
- `results/bloom/bloom-1b1/bloom-1b1_pawsx_0-shot.json` (+52, -0)
- `results/bloom/bloom-1b1/bloom-1b1_question_answering_0-shot.json` (+66, -0)
- `results/bloom/bloom-1b1/bloom-1b1_reading_comprehension_0-shot.json` (+36, -0)
- `results/bloom/bloom-1b1/bloom-1b1_xcopa_0-shot.json` (+72, -0)
- `results/bloom/bloom-1b1/bloom-1b1_xnli_0-shot.json` (+92, -0)
- `results/bloom/bloom-1b1/bloom-1b1_xstory_cloze_0-shot.json` (+72, -0)
## .github/workflows/pull_request.yml

```diff
@@ -9,5 +9,5 @@ jobs:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
         with:
-          python-version: 3.8
+          python-version: 3.9
       - uses: pre-commit/action@v2.0.3
```
## .pre-commit-config.yaml

```diff
@@ -12,7 +12,7 @@ repos:
       - id: check-merge-conflict
       - id: check-symlinks
       - id: check-yaml
-        args: ['--unsafe']
+        args: ["--unsafe"]
       - id: destroyed-symlinks
       - id: detect-private-key
       - id: end-of-file-fixer
@@ -33,7 +33,7 @@ repos:
     rev: 22.3.0
     hooks:
       - id: black
-        language_version: python3.8
+        language_version: python3.9
   - repo: https://github.com/codespell-project/codespell
     rev: v2.1.0
     hooks:
```
## CODEOWNERS

```diff
-* @jon-tow @StellaAthena
+* @haileyschoelkopf @lintangsutawika
```
## README.md
## ignore.txt

```diff
 ROUGE
 rouge
 nin
+maka
+mor
+te
```
## lm_eval/evaluator.py

```diff
@@ -37,14 +37,16 @@ def simple_evaluate(
     tasks=[],
     num_fewshot=0,
     batch_size=None,
+    max_batch_size=None,
     device=None,
     no_cache=False,
     limit=None,
     bootstrap_iters=100000,
     check_integrity=False,
     decontamination_ngrams_path=None,
+    write_out=False,
+    output_base_path=None,
 ):
     """Instantiate and evaluate a model on a list of tasks.

     :param model: Union[str, LM]
@@ -56,18 +58,24 @@ def simple_evaluate(
         List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
     :param num_fewshot: int
         Number of examples in few-shot context
-    :param batch_size: int, optional
+    :param batch_size: int or str, optional
         Batch size for model
+    :param max_batch_size: int, optional
+        Maximal batch size to try with automatic batch size detection
     :param device: str, optional
         PyTorch device (e.g. "cpu" or "cuda:0") for running models
     :param no_cache: bool
         Whether or not to cache
-    :param limit: int, optional
-        Limit the number of examples per task (only use this for testing)
+    :param limit: int or float, optional
+        Limit the number of examples per task (only use this for testing), If <1, limit is a percentage of the total number of examples.
     :param bootstrap_iters:
         Number of iterations for bootstrap statistics
     :param check_integrity: bool
         Whether to run the relevant part of the test suite for the tasks
+    :param write_out: bool
+        If True, write details about prompts and logits to json for all tasks
+    :param output_base_path: str, optional
+        Directory to which detailed eval info will be written. Defaults to present working dir.
     :return
         Dictionary of results
     """
@@ -80,7 +88,12 @@ def simple_evaluate(
         if model_args is None:
             model_args = ""
         lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
-            model_args, {"batch_size": batch_size, "device": device}
+            model_args,
+            {
+                "batch_size": batch_size,
+                "max_batch_size": max_batch_size,
+                "device": device,
+            },
         )
     else:
         assert isinstance(model, lm_eval.api.model.LM)
@@ -97,15 +110,22 @@ def simple_evaluate(
         limit=limit,
         bootstrap_iters=bootstrap_iters,
         decontamination_ngrams_path=decontamination_ngrams_path,
+        write_out=write_out,
+        output_base_path=output_base_path,
     )

     if lm.rank == 0:
         # add info about the model and few shot config
         results["config"] = {
-            "model": model,
+            "model": model
+            if isinstance(model, str)
+            else model.model.config._name_or_path,
             "model_args": model_args,
             "num_fewshot": num_fewshot,
             "batch_size": batch_size,
+            "batch_sizes": list(lm.batch_sizes.values())
+            if hasattr(lm, "batch_sizes")
+            else [],
             "device": device,
             "no_cache": no_cache,
             "limit": limit,
@@ -127,6 +147,8 @@ def evaluate(
     limit=None,
     bootstrap_iters=100000,
     decontamination_ngrams_path=None,
+    write_out=False,
+    output_base_path=None,
 ):
     """Instantiate and evaluate a model on a list of tasks.
@@ -140,6 +162,10 @@ def evaluate(
         Limit the number of examples per task (only use this for testing)
     :param bootstrap_iters:
         Number of iterations for bootstrap statistics
+    :param write_out: bool
+        If True, write all prompts, logits and metrics to json for offline analysis
+    :param output_base_path: str, optional
+        Directory to which detailed eval info will be written. Defaults to present working dir
     :return
         Dictionary of results
     """
@@ -166,6 +192,14 @@ def evaluate(
         # rnd = random.Random()
         # rnd.seed(42)
         # rnd.shuffle(task_docs)

+        if limit is not None:
+            if task.has_test_docs():
+                task_docs = task.test_docs()
+            elif task.has_validation_docs():
+                task_docs = task.validation_docs()
+            else:
+                raise RuntimeError("Task has neither test_docs nor validation_docs")
+            limit = int(len(task_docs) * limit) if limit < 1.0 else int(limit)
         task.build_all_requests(limit=limit, rank=lm.rank, world_size=lm.world_size)
```
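The `limit` handling in `evaluate` now accepts either an absolute example count or a fraction: a float below 1.0 is interpreted as a percentage of the task's documents. A minimal sketch of that resolution logic (the helper name `resolve_limit` is mine, not from the diff):

```python
def resolve_limit(limit, num_docs):
    # Mirrors the merged logic: a float < 1.0 is a fraction of the
    # task's documents, anything else an absolute example count.
    if limit is None:
        return None
    return int(num_docs * limit) if limit < 1.0 else int(limit)


# 10% of 2,000 docs -> 200 examples; an integer passes through as-is.
assert resolve_limit(0.1, 2000) == 200
assert resolve_limit(50, 2000) == 50
assert resolve_limit(None, 2000) is None
```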
## lm_eval/models/anthropic_llms.py (new file, mode 100644)

```python
import os
import time

from tqdm import tqdm

from lm_eval.base import BaseLM


def anthropic_completion(client, model, prompt, max_tokens_to_sample, temperature, stop):
    """Query Anthropic API for completion.

    Retry with back-off until they respond
    """
    import anthropic

    backoff_time = 3
    while True:
        try:
            response = client.completion(
                prompt=f"{anthropic.HUMAN_PROMPT} {prompt}{anthropic.AI_PROMPT}",
                model=model,
                # NOTE: Claude really likes to do CoT, and overly aggressive stop sequences
                # (e.g. gsm8k's ":") may truncate a lot of the input.
                stop_sequences=[anthropic.HUMAN_PROMPT] + stop,
                max_tokens_to_sample=max_tokens_to_sample,
                temperature=temperature,
            )
            print(response)
            return response["completion"]
        except RuntimeError:
            # TODO: I don't actually know what error Anthropic raises when it times out
            # So err update this error when we find out.
            import traceback

            traceback.print_exc()
            time.sleep(backoff_time)
            backoff_time *= 1.5


class AnthropicLM(BaseLM):
    REQ_CHUNK_SIZE = 20

    def __init__(self, model):
        """
        :param model: str
            Anthropic model e.g. claude-instant-v1
        """
        super().__init__()
        import anthropic

        self.model = model
        self.client = anthropic.Client(os.environ["ANTHROPIC_API_KEY"])

    @property
    def eot_token_id(self):
        raise NotImplementedError("No idea about anthropic tokenization.")

    @property
    def max_length(self):
        return 2048

    @property
    def max_gen_toks(self):
        return 256

    @property
    def batch_size(self):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()

    @property
    def device(self):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()

    def tok_encode(self, string: str):
        raise NotImplementedError("No idea about anthropic tokenization.")

    def tok_decode(self, tokens):
        raise NotImplementedError("No idea about anthropic tokenization.")

    def _loglikelihood_tokens(self, requests, disable_tqdm=False):
        raise NotImplementedError("No support for logits.")

    def greedy_until(self, requests):
        if not requests:
            return []

        res = []
        for request in tqdm(requests):
            inp = request[0]
            request_args = request[1]
            until = request_args["until"]
            response = anthropic_completion(
                client=self.client,
                model=self.model,
                prompt=inp,
                max_tokens_to_sample=self.max_gen_toks,
                temperature=0.0,
                stop=until,
            )
            res.append(response)
        return res

    def _model_call(self, inps):
        # Isn't used because we override _loglikelihood_tokens
        raise NotImplementedError()

    def _model_generate(self, context, max_length, eos_token_id):
        # Isn't used because we override greedy_until
        raise NotImplementedError()
```
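The retry loop in `anthropic_completion` is an exponential back-off: sleep 3 seconds after the first failure, then multiply the delay by 1.5 each time. The pattern in isolation, with the sleep injected so it can be exercised without waiting (names here are illustrative; the merged code also retries indefinitely, while this sketch caps attempts):

```python
import time


def retry_with_backoff(fn, initial_delay=3.0, factor=1.5, max_attempts=5, sleep=None):
    """Call fn() until it succeeds, sleeping initial_delay, then
    initial_delay * factor, and so on, between RuntimeError failures."""
    sleep = sleep or time.sleep
    delay = initial_delay
    for attempt in range(max_attempts):
        try:
            return fn()
        except RuntimeError:
            if attempt == max_attempts - 1:
                raise
            sleep(delay)
            delay *= factor


# A function that fails twice, then succeeds.
calls = {"n": 0}

def flaky():
    calls["n"] += 1
    if calls["n"] < 3:
        raise RuntimeError("transient")
    return "ok"

delays = []
assert retry_with_backoff(flaky, sleep=delays.append) == "ok"
assert delays == [3.0, 4.5]  # 3s, then 3 * 1.5
```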
## lm_eval/models/textsynth.py

```diff
@@ -125,7 +125,8 @@ class TextSynthLM(LM):
         res = []
         for request in tqdm(requests):
             inp = request[0]
-            until = request[1]
+            request_args = request[1]
+            until = request_args["until"]
             response = textsynth_completion(
                 url=self.api_url + "/v1/engines/" + self.engine + "/completions",
                 headers={"Authorization": "Bearer " + self.api_key},
```
## lm_eval/utils.py

```diff
@@ -8,13 +8,19 @@ import functools
 import subprocess
 import collections
 import importlib.util
+import fnmatch

-from typing import List
+from typing import List, Union

+import gc
+import torch
 from omegaconf import OmegaConf
 from jinja2 import BaseLoader, Environment, StrictUndefined
 from itertools import islice

+from lm_eval.logger import eval_logger


 class ExitCodeError(Exception):
     pass
@@ -25,6 +31,29 @@ def sh(x):
         raise ExitCodeError()


+def escaped_split(text, sep_char, maxsplit=-1):
+    """Split text into a list on occurrences of the given separation
+    character `sep_char`. The separation character may be escaped by a
+    backslash to avoid splitting at that location.
+
+    The separation character must be a string of size 1.
+
+    If `maxsplit` is given, at most `maxsplit` splits are done (thus,
+    the list will have at most `maxsplit + 1` elements). If `maxsplit`
+    is not specified or less than 0, then there is no limit on the
+    number of splits (all possible splits are made).
+    """
+    assert (
+        len(sep_char) == 1
+    ), "separation string must be a single character for escaped splitting"
+
+    if maxsplit == 0:
+        return text
+
+    maxsplit = max(0, maxsplit)
+
+    return re.split(r"(?<!\\)" + sep_char, text, maxsplit)
+
+
 def simple_parse_args_string(args_string):
     """
     Parses something like
@@ -44,11 +73,11 @@ def join_iters(iters):
         yield from iter


-def chunks(iter, n):
+def chunks(iter, n=0, fn=None):
     arr = []
-    for x in iter:
+    for i, x in enumerate(iter):
         arr.append(x)
-        if len(arr) == n:
+        if len(arr) == (fn(i) if fn else n):
             yield arr
             arr = []
@@ -65,6 +94,35 @@ def group(arr, fn):
     return list(res.values())


+class MultiChoice:
+    def __init__(self, choices):
+        self.choices = choices
+
+    # Simple wildcard support (linux filename patterns)
+    def __contains__(self, values):
+        for value in values.split(","):
+            if len(fnmatch.filter(self.choices, value)) == 0:
+                eval_logger.warning("{} is not in task list.".format(value))
+                eval_logger.info(f"Available tasks to choose:")
+                for choice in self.choices:
+                    eval_logger.info(f"  - {choice}")
+        return True
+
+    def __iter__(self):
+        for choice in self.choices:
+            yield choice
+
+
+# Returns a list containing all values of the source_list that
+# match at least one of the patterns
+def pattern_match(patterns, source_list):
+    task_names = set()
+    for pattern in patterns:
+        for matching in fnmatch.filter(source_list, pattern):
+            task_names.add(matching)
+    return sorted(list(task_names))
+
+
 def general_detokenize(string):
     string = string.replace(" n't", "n't")
     string = string.replace(" )", ")")
@@ -122,6 +180,26 @@ def make_disjoint_window(pair):
     return a[: len(a) - (len(b) - 1)], b


+def select_continuation_from_batch_left_padding(
+    generations: Union[List[List[int]], torch.Tensor], max_context_size: int
+):
+    """Select the continuation from the batch, removing prompts of different lengths.
+
+    Args:
+        generations (Union[List[List[int]], torch.Tensor]):
+            A tensor or list-of-lists of shape [batch_size, sequence length].
+        max_context_size (int):
+            The size of the biggest context; generations will proceed from that
+            index.
+
+    Example:
+        PAD PAD Continue : The dog chased the cat  [every day of the week]
+        Riddle me this   : The dog chased the cat  [yesterday] PAD PAD PAD PAD
+
+    Output:
+        [every day of the week]
+        [yesterday] PAD PAD PAD PAD
+    """
+    return generations[:, max_context_size:]
+
+
 class Reorderer:
     def __init__(self, arr, fn):
         self.size = len(arr)
@@ -336,3 +414,8 @@ def create_iterator(raw_iterator, rank, world_size, limit=None):
     among ranks in multigpu setting or only pulling a sample of documents
     """
     return islice(raw_iterator, rank, limit, world_size)
+
+
+def clear_torch_cache():
+    gc.collect()
+    torch.cuda.empty_cache()
```
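The `pattern_match` helper added to `lm_eval/utils.py` resolves shell-style wildcards against the task registry and returns a sorted, de-duplicated list. Its behavior is easy to check in isolation (task names below are just sample values):

```python
import fnmatch


# Same helper as added to lm_eval/utils.py in this commit.
def pattern_match(patterns, source_list):
    task_names = set()
    for pattern in patterns:
        for matching in fnmatch.filter(source_list, pattern):
            task_names.add(matching)
    return sorted(list(task_names))


tasks = ["xnli_en", "xnli_fr", "xcopa_zh", "gsm8k"]
assert pattern_match(["xnli_*"], tasks) == ["xnli_en", "xnli_fr"]
# Duplicate matches collapse, and the result is sorted.
assert pattern_match(["xnli_*", "gsm8k"], tasks) == ["gsm8k", "xnli_en", "xnli_fr"]
```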
## main.py

```diff
@@ -13,54 +13,41 @@ from lm_eval.logger import eval_logger
 os.environ["TOKENIZERS_PARALLELISM"] = "false"


-class MultiChoice:
-    def __init__(self, choices):
-        self.choices = choices
-
-    # Simple wildcard support (linux filename patterns)
-    def __contains__(self, values):
-        for value in values.split(","):
-            if len(fnmatch.filter(self.choices, value)) == 0:
-                eval_logger.warning("{} is not in task list.".format(value))
-                eval_logger.info(f"Available tasks to choose:")
-                for choice in self.choices:
-                    eval_logger.info(f"  - {choice}")
-        return True
-
-    def __iter__(self):
-        for choice in self.choices:
-            yield choice
-
-
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model", required=True)
     parser.add_argument("--model_args", default="")
-    parser.add_argument("--tasks", default=None, choices=MultiChoice(sorted(ALL_TASKS)))
+    parser.add_argument(
+        "--tasks", default=None, choices=utils.MultiChoice(sorted(ALL_TASKS))
+    )
     parser.add_argument("--config", default=None)
-    parser.add_argument("--provide_description", action="store_true")
     parser.add_argument("--num_fewshot", type=int, default=0)
     parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument(
+        "--max_batch_size",
+        type=int,
+        default=None,
+        help="Maximal batch size to try with --batch_size auto",
+    )
     parser.add_argument("--device", type=str, default=None)
     parser.add_argument("--output_path", default=None)
-    parser.add_argument("--limit", type=int, default=None)
+    parser.add_argument(
+        "--limit",
+        type=float,
+        default=None,
+        help="Limit the number of examples per task. "
+        "If <1, limit is a percentage of the total number of examples.",
+    )
-    parser.add_argument("--data_sampling", type=float, default=None)
     parser.add_argument("--no_cache", action="store_true")
     parser.add_argument("--decontamination_ngrams_path", default=None)
     parser.add_argument("--description_dict_path", default=None)
     parser.add_argument("--check_integrity", action="store_true")
+    parser.add_argument("--write_out", action="store_true", default=False)
+    parser.add_argument("--output_base_path", type=str, default=None)

     return parser.parse_args()


-# Returns a list containing all values of the source_list that
-# match at least one of the patterns
-def pattern_match(patterns, source_list):
-    task_names = set()
-    for pattern in patterns:
-        for matching in fnmatch.filter(source_list, pattern):
-            task_names.add(matching)
-    return sorted(list(task_names))
-
-
 def main():
     args = parse_args()
@@ -70,7 +57,9 @@ def main():
             "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
         )

-    if args.tasks is not None:
+    if args.tasks is None:
+        task_names = ALL_TASKS
+    else:
         if os.path.isdir(args.tasks):
             import glob
@@ -81,7 +70,7 @@ def main():
                 task_names.append(config)
         else:
             tasks_list = args.tasks.split(",")
-            task_names = pattern_match(tasks_list, ALL_TASKS)
+            task_names = utils.pattern_match(tasks_list, ALL_TASKS)
             for task in [task for task in tasks_list if task not in task_names]:
                 if os.path.isfile(task):
                     config = utils.load_yaml_config(task)
@@ -89,23 +78,37 @@ def main():
     eval_logger.info(f"Selected Tasks: {task_names}")

+    # TODO: description_dict?
+    # description_dict = {}
+    # if args.description_dict_path:
+    #     with open(args.description_dict_path, "r") as f:
+    #         description_dict = json.load(f)
+
     results = evaluator.simple_evaluate(
         model=args.model,
         model_args=args.model_args,
         tasks=task_names,
         num_fewshot=args.num_fewshot,
         batch_size=args.batch_size,
+        max_batch_size=args.max_batch_size,
         device=args.device,
         no_cache=args.no_cache,
         limit=args.limit,
+        # description_dict=description_dict,
         decontamination_ngrams_path=args.decontamination_ngrams_path,
         check_integrity=args.check_integrity,
+        write_out=args.write_out,
+        output_base_path=args.output_base_path,
     )

     if results is not None:
         samples = results.pop("samples")
         dumped = json.dumps(results, indent=2)
         print(dumped)
+        batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))

         if args.output_path:
             os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
@@ -127,8 +130,8 @@ def main():
             f.write_all(samples[task_name])

     print(
-        f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
-        f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
+        f"{args.model} ({args.model_args}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
+        f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
     )
     print(evaluator.make_table(results))
```
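`--tasks` validation goes through `utils.MultiChoice`, whose `__contains__` splits the argument on commas and wildcard-matches each entry against the task list. Note that the merged implementation only logs a warning for unknown names and still returns `True`, so argparse's `choices` check never actually rejects a value. A self-contained sketch with the logger swapped for `print`:

```python
import fnmatch


class MultiChoice:
    """Mirrors the class moved into lm_eval.utils; eval_logger replaced
    with print so this sketch runs standalone."""

    def __init__(self, choices):
        self.choices = choices

    # Simple wildcard support (linux filename patterns)
    def __contains__(self, values):
        for value in values.split(","):
            if len(fnmatch.filter(self.choices, value)) == 0:
                print(f"{value} is not in task list.")
        return True

    def __iter__(self):
        for choice in self.choices:
            yield choice


mc = MultiChoice(["xnli_en", "xnli_fr", "gsm8k"])
assert "xnli_*,gsm8k" in mc
assert "not_a_task" in mc  # warns, but still accepted
assert list(mc) == ["xnli_en", "xnli_fr", "gsm8k"]
```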
## results/bloom/bloom-1b1/README.md (new file, mode 100644)
# bloom-1b1
## bloom-1b1_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |23.63|± | 1.24|
| | |acc_norm|25.68|± | 1.28|
|arc_easy | 0|acc |51.47|± | 1.03|
| | |acc_norm|45.45|± | 1.02|
|boolq | 1|acc |59.08|± | 0.86|
|copa | 0|acc |68.00|± | 4.69|
|hellaswag | 0|acc |34.63|± | 0.47|
| | |acc_norm|41.77|± | 0.49|
|mc_taco | 0|em |14.49| | |
| | |f1 |32.43| | |
|openbookqa | 0|acc |19.60|± | 1.78|
| | |acc_norm|29.40|± | 2.04|
|piqa | 0|acc |67.14|± | 1.10|
| | |acc_norm|67.14|± | 1.10|
|prost | 0|acc |23.41|± | 0.31|
| | |acc_norm|30.50|± | 0.34|
|swag | 0|acc |43.43|± | 0.35|
| | |acc_norm|58.28|± | 0.35|
|winogrande | 0|acc |54.93|± | 1.40|
|wsc273 | 0|acc |68.50|± | 2.82|
## bloom-1b1_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 0.83|± | 0.25|
## bloom-1b1_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 1.38|± | 0.12|
| | |f1 | 4.01|± | 0.15|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.21|± | 0.21|
|math_geometry | 1|acc | 0.21|± | 0.21|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.19|± | 0.19|
|math_prealgebra | 1|acc | 0.11|± | 0.11|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |23.55|± | 0.78|
| | |acc_norm|23.62|± | 0.78|
## bloom-1b1_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |46.95|± | 1.12|
|pawsx_en| 0|acc |52.45|± | 1.12|
|pawsx_es| 0|acc |51.50|± | 1.12|
|pawsx_fr| 0|acc |46.15|± | 1.11|
|pawsx_ja| 0|acc |48.40|± | 1.12|
|pawsx_ko| 0|acc |49.90|± | 1.12|
|pawsx_zh| 0|acc |48.95|± | 1.12|
## bloom-1b1_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |26.44|± | 0.84|
| | |acc_norm |30.49|± | 0.88|
|headqa_es | 0|acc |24.43|± | 0.82|
| | |acc_norm |28.30|± | 0.86|
|logiqa | 0|acc |18.89|± | 1.54|
| | |acc_norm |25.65|± | 1.71|
|squad2 | 1|exact | 4.17| | |
| | |f1 | 6.60| | |
| | |HasAns_exact| 2.19| | |
| | |HasAns_f1 | 7.05| | |
| | |NoAns_exact | 6.14| | |
| | |NoAns_f1 | 6.14| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 2.68|± | 0.15|
|truthfulqa_mc| 1|mc1 |25.34|± | 1.52|
| | |mc2 |41.80|± | 1.46|
|webqs | 0|acc | 1.38|± | 0.26|
## bloom-1b1_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |45.57|± | 1.88|
| | |em |32.98|± | 1.95|
|drop| 1|em | 3.31|± | 0.18|
| | |f1 | 8.63|± | 0.22|
|race| 1|acc |32.63|± | 1.45|
## bloom-1b1_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 50.6|± | 2.24|
|xcopa_ht| 0|acc | 53.0|± | 2.23|
|xcopa_id| 0|acc | 64.8|± | 2.14|
|xcopa_it| 0|acc | 50.8|± | 2.24|
|xcopa_qu| 0|acc | 51.2|± | 2.24|
|xcopa_sw| 0|acc | 54.4|± | 2.23|
|xcopa_ta| 0|acc | 57.0|± | 2.22|
|xcopa_th| 0|acc | 53.2|± | 2.23|
|xcopa_tr| 0|acc | 53.0|± | 2.23|
|xcopa_vi| 0|acc | 62.4|± | 2.17|
|xcopa_zh| 0|acc | 59.4|± | 2.20|
## bloom-1b1_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.93|± | 0.67|
|xnli_bg| 0|acc |34.13|± | 0.67|
|xnli_de| 0|acc |39.64|± | 0.69|
|xnli_el| 0|acc |34.03|± | 0.67|
|xnli_en| 0|acc |51.48|± | 0.71|
|xnli_es| 0|acc |47.98|± | 0.71|
|xnli_fr| 0|acc |47.15|± | 0.71|
|xnli_hi| 0|acc |42.32|± | 0.70|
|xnli_ru| 0|acc |40.46|± | 0.69|
|xnli_sw| 0|acc |35.29|± | 0.68|
|xnli_th| 0|acc |33.75|± | 0.67|
|xnli_tr| 0|acc |34.79|± | 0.67|
|xnli_ur| 0|acc |37.33|± | 0.68|
|xnli_vi| 0|acc |44.45|± | 0.70|
|xnli_zh| 0|acc |36.23|± | 0.68|
## bloom-1b1_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |52.88|± | 1.28|
|xstory_cloze_en| 0|acc |62.54|± | 1.25|
|xstory_cloze_es| 0|acc |58.31|± | 1.27|
|xstory_cloze_eu| 0|acc |54.33|± | 1.28|
|xstory_cloze_hi| 0|acc |55.53|± | 1.28|
|xstory_cloze_id| 0|acc |57.91|± | 1.27|
|xstory_cloze_my| 0|acc |46.19|± | 1.28|
|xstory_cloze_ru| 0|acc |48.25|± | 1.29|
|xstory_cloze_sw| 0|acc |50.56|± | 1.29|
|xstory_cloze_te| 0|acc |56.39|± | 1.28|
|xstory_cloze_zh| 0|acc |58.04|± | 1.27|
## bloom-1b1_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |69.98|± | 0.95|
|xwinograd_fr| 0|acc |66.27|± | 5.22|
|xwinograd_jp| 0|acc |52.87|± | 1.61|
|xwinograd_pt| 0|acc |63.12|± | 2.98|
|xwinograd_ru| 0|acc |54.29|± | 2.81|
|xwinograd_zh| 0|acc |69.25|± | 2.06|
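The accompanying JSON files store metrics as fractions, while the tables above show percentages rounded to two decimals. The conversion can be checked against the recorded values (helper name is mine):

```python
def as_table_value(fraction):
    # Convert a stored fraction (e.g. 0.5908256...) to the percentage
    # shown in the README tables (e.g. 59.08).
    return round(fraction * 100, 2)


# boolq acc and arc_challenge acc from the JSON below match the table rows.
assert as_table_value(0.5908256880733945) == 59.08
assert as_table_value(0.2363481228668942) == 23.63
assert as_table_value(0.2568259385665529) == 25.68
```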
results/bloom/bloom-1b1/bloom-1b1_common_sense_reasoning_0-shot.json

{
  "results": {
    "boolq": {"acc": 0.5908256880733945, "acc_stderr": 0.008599563442397352},
    "arc_easy": {"acc": 0.5147306397306397, "acc_stderr": 0.010255329977562096, "acc_norm": 0.45454545454545453, "acc_norm_stderr": 0.010217299762709435},
    "openbookqa": {"acc": 0.196, "acc_stderr": 0.017770751227744862, "acc_norm": 0.294, "acc_norm_stderr": 0.020395095484936614},
    "hellaswag": {"acc": 0.3463453495319657, "acc_stderr": 0.004748324319714264, "acc_norm": 0.4177454690300737, "acc_norm_stderr": 0.004921798492608764},
    "swag": {"acc": 0.43431970408877335, "acc_stderr": 0.0035044592489844794, "acc_norm": 0.5828251524542637, "acc_norm_stderr": 0.0034862531772295617},
    "arc_challenge": {"acc": 0.2363481228668942, "acc_stderr": 0.012414960524301834, "acc_norm": 0.2568259385665529, "acc_norm_stderr": 0.0127669237941168},
    "mc_taco": {"em": 0.1448948948948949, "f1": 0.32425976796237205},
    "wsc273": {"acc": 0.684981684981685, "acc_stderr": 0.028165854394193602},
    "winogrande": {"acc": 0.5493291239147593, "acc_stderr": 0.013983928869040239},
    "prost": {"acc": 0.23409479077711356, "acc_stderr": 0.003093545711826552, "acc_norm": 0.3049743808710504, "acc_norm_stderr": 0.003363606918420179},
    "copa": {"acc": 0.68, "acc_stderr": 0.04688261722621504},
    "piqa": {"acc": 0.6713819368879217, "acc_stderr": 0.010959127105167048, "acc_norm": 0.6713819368879217, "acc_norm_stderr": 0.010959127105167044}
  },
  "versions": {"boolq": 1, "arc_easy": 0, "openbookqa": 0, "hellaswag": 0, "swag": 0, "arc_challenge": 0, "mc_taco": 0, "wsc273": 0, "winogrande": 0, "prost": 0, "copa": 0, "piqa": 0},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda:0",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
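Each results file in this commit follows the same three-block schema: "results" (per-task metric values), "versions" (task version numbers), and "config" (the run settings). A minimal sketch of flattening such a report into per-task rows, assuming only the schema shown above (the small literal below mirrors it; it is not a full file):

```python
def flatten_results(report: dict) -> list:
    """Flatten a harness result dict into (task, metric, value) rows."""
    rows = []
    for task, metrics in report["results"].items():
        for metric, value in metrics.items():
            rows.append((task, metric, value))
    return rows

# Small literal mirroring the schema of the JSON files in this commit;
# in practice the dict would come from json.load(open(path)).
report = {
    "results": {"copa": {"acc": 0.68, "acc_stderr": 0.04688261722621504}},
    "versions": {"copa": 0},
    "config": {"num_fewshot": 0},
}
for task, metric, value in flatten_results(report):
    print(f"{task}\t{metric}\t{value:.4f}")
```

The same loop works unchanged for every file below, since only the task names and metric keys vary.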
results/bloom/bloom-1b1/bloom-1b1_gsm8k_8-shot.json

{
  "results": {
    "gsm8k": {"acc": 0.008339651250947688, "acc_stderr": 0.002504942226860508}
  },
  "versions": {"gsm8k": 0},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
    "num_fewshot": 8,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/bloom/bloom-1b1/bloom-1b1_mathematical_reasoning_few_shot_5-shot.json

{
  "results": {
    "mathqa": {"acc": 0.2355108877721943, "acc_stderr": 0.007767687364650971, "acc_norm": 0.23618090452261306, "acc_norm_stderr": 0.0077753193787470495},
    "gsm8k": {"acc": 0.0, "acc_stderr": 0.0},
    "drop": {"em": 0.013842281879194632, "em_stderr": 0.001196510970060749, "f1": 0.040085989932885986, "f1_stderr": 0.0014841664758736023},
    "math_geometry": {"acc": 0.0020876826722338203, "acc_stderr": 0.0020876826722338315},
    "math_counting_and_prob": {"acc": 0.002109704641350211, "acc_stderr": 0.002109704641350211},
    "math_prealgebra": {"acc": 0.001148105625717566, "acc_stderr": 0.0011481056257175708},
    "math_num_theory": {"acc": 0.001851851851851852, "acc_stderr": 0.0018518518518518448},
    "math_precalc": {"acc": 0.0, "acc_stderr": 0.0},
    "math_algebra": {"acc": 0.0, "acc_stderr": 0.0},
    "math_intermediate_algebra": {"acc": 0.0, "acc_stderr": 0.0}
  },
  "versions": {"mathqa": 0, "gsm8k": 0, "drop": 1, "math_geometry": 1, "math_counting_and_prob": 1, "math_prealgebra": 1, "math_num_theory": 1, "math_precalc": 1, "math_algebra": 1, "math_intermediate_algebra": 1},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
    "num_fewshot": 5,
    "batch_size": "auto",
    "device": "cuda:0",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/bloom/bloom-1b1/bloom-1b1_pawsx_0-shot.json

{
  "results": {
    "pawsx_es": {"acc": 0.515, "acc_stderr": 0.011178102477052804},
    "pawsx_zh": {"acc": 0.4895, "acc_stderr": 0.011180669867648657},
    "pawsx_fr": {"acc": 0.4615, "acc_stderr": 0.011149934327957058},
    "pawsx_ko": {"acc": 0.499, "acc_stderr": 0.01118311365477017},
    "pawsx_de": {"acc": 0.4695, "acc_stderr": 0.011162310405413175},
    "pawsx_ja": {"acc": 0.484, "acc_stderr": 0.011177408788874897},
    "pawsx_en": {"acc": 0.5245, "acc_stderr": 0.011169702598013186}
  },
  "versions": {"pawsx_es": 0, "pawsx_zh": 0, "pawsx_fr": 0, "pawsx_ko": 0, "pawsx_de": 0, "pawsx_ja": 0, "pawsx_en": 0},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-1b1",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/bloom/bloom-1b1/bloom-1b1_question_answering_0-shot.json

{
  "results": {
    "truthfulqa_mc": {"mc1": 0.2533659730722154, "mc1_stderr": 0.01522589934082683, "mc2": 0.4179977378869182, "mc2_stderr": 0.014601549068840484},
    "webqs": {"acc": 0.013779527559055118, "acc_stderr": 0.002586718737195641},
    "logiqa": {"acc": 0.1889400921658986, "acc_stderr": 0.01535436463822078, "acc_norm": 0.2565284178187404, "acc_norm_stderr": 0.017129443327887562},
    "squad2": {"exact": 4.169123220752969, "f1": 6.5956997780058355, "HasAns_exact": 2.192982456140351, "HasAns_f1": 7.05309437656277, "NoAns_exact": 6.139613120269134, "NoAns_f1": 6.139613120269134, "best_exact": 50.07159100480081, "best_f1": 50.07159100480081},
    "headqa_es": {"acc": 0.24434719183078046, "acc_stderr": 0.008207488987159709, "acc_norm": 0.2830051057622174, "acc_norm_stderr": 0.008604004902114394},
    "headqa_en": {"acc": 0.26440554339897887, "acc_stderr": 0.008423643607316284, "acc_norm": 0.30488694383661563, "acc_norm_stderr": 0.008793112278191295},
    "triviaqa": {"acc": 0.026783346592415803, "acc_stderr": 0.001517985028991893}
  },
  "versions": {"truthfulqa_mc": 1, "webqs": 0, "logiqa": 0, "squad2": 1, "headqa_es": 0, "headqa_en": 0, "triviaqa": 1},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda:0",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/bloom/bloom-1b1/bloom-1b1_reading_comprehension_0-shot.json

{
  "results": {
    "drop": {"em": 0.03313758389261745, "em_stderr": 0.0018330841858875643, "f1": 0.08634542785234882, "f1_stderr": 0.0022136353860709133},
    "coqa": {"f1": 0.4557083534540516, "f1_stderr": 0.01876948425119881, "em": 0.3298333333333334, "em_stderr": 0.019473215823053027},
    "race": {"acc": 0.3263157894736842, "acc_stderr": 0.014510987877134932}
  },
  "versions": {"drop": 1, "coqa": 1, "race": 1},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda:0",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/bloom/bloom-1b1/bloom-1b1_xcopa_0-shot.json

{
  "results": {
    "xcopa_id": {"acc": 0.648, "acc_stderr": 0.02138004238594605},
    "xcopa_ht": {"acc": 0.53, "acc_stderr": 0.022342748192502843},
    "xcopa_it": {"acc": 0.508, "acc_stderr": 0.02238020883492804},
    "xcopa_et": {"acc": 0.506, "acc_stderr": 0.022381462412439324},
    "xcopa_ta": {"acc": 0.57, "acc_stderr": 0.02216263442665284},
    "xcopa_th": {"acc": 0.532, "acc_stderr": 0.022337186479044296},
    "xcopa_sw": {"acc": 0.544, "acc_stderr": 0.022296238348407056},
    "xcopa_zh": {"acc": 0.594, "acc_stderr": 0.02198396209008634},
    "xcopa_qu": {"acc": 0.512, "acc_stderr": 0.02237662679792717},
    "xcopa_tr": {"acc": 0.53, "acc_stderr": 0.02234274819250285},
    "xcopa_vi": {"acc": 0.624, "acc_stderr": 0.021683827539286115}
  },
  "versions": {"xcopa_id": 0, "xcopa_ht": 0, "xcopa_it": 0, "xcopa_et": 0, "xcopa_ta": 0, "xcopa_th": 0, "xcopa_sw": 0, "xcopa_zh": 0, "xcopa_qu": 0, "xcopa_tr": 0, "xcopa_vi": 0},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-1b1",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/bloom/bloom-1b1/bloom-1b1_xnli_0-shot.json

{
  "results": {
    "xnli_sw": {"acc": 0.3528942115768463, "acc_stderr": 0.0067520304764183674},
    "xnli_bg": {"acc": 0.3413173652694611, "acc_stderr": 0.006699490620395283},
    "xnli_el": {"acc": 0.3403193612774451, "acc_stderr": 0.006694754901092155},
    "xnli_hi": {"acc": 0.4231536926147705, "acc_stderr": 0.006980774514705842},
    "xnli_th": {"acc": 0.3375249500998004, "acc_stderr": 0.00668131870192652},
    "xnli_ar": {"acc": 0.3393213572854291, "acc_stderr": 0.006689986106838006},
    "xnli_de": {"acc": 0.3964071856287425, "acc_stderr": 0.0069114198150005334},
    "xnli_ru": {"acc": 0.40459081836327343, "acc_stderr": 0.006934900899149144},
    "xnli_vi": {"acc": 0.44451097804391215, "acc_stderr": 0.00702107269988888},
    "xnli_tr": {"acc": 0.34790419161676644, "acc_stderr": 0.006729921818907745},
    "xnli_ur": {"acc": 0.37325349301397204, "acc_stderr": 0.0068339592620100505},
    "xnli_fr": {"acc": 0.47145708582834334, "acc_stderr": 0.007053191822382807},
    "xnli_en": {"acc": 0.5147704590818363, "acc_stderr": 0.007061629189884944},
    "xnli_es": {"acc": 0.47984031936127747, "acc_stderr": 0.00705896771560341},
    "xnli_zh": {"acc": 0.36227544910179643, "acc_stderr": 0.006791418670232308}
  },
  "versions": {"xnli_sw": 0, "xnli_bg": 0, "xnli_el": 0, "xnli_hi": 0, "xnli_th": 0, "xnli_ar": 0, "xnli_de": 0, "xnli_ru": 0, "xnli_vi": 0, "xnli_tr": 0, "xnli_ur": 0, "xnli_fr": 0, "xnli_en": 0, "xnli_es": 0, "xnli_zh": 0},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-1b1",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
results/bloom/bloom-1b1/bloom-1b1_xstory_cloze_0-shot.json

{
  "results": {
    "xstory_cloze_te": {"acc": 0.5638649900727994, "acc_stderr": 0.012761730431435764},
    "xstory_cloze_ar": {"acc": 0.528788881535407, "acc_stderr": 0.012845779070719484},
    "xstory_cloze_zh": {"acc": 0.5804103242885507, "acc_stderr": 0.01269964226820075},
    "xstory_cloze_ru": {"acc": 0.4824619457313038, "acc_stderr": 0.012859207453266304},
    "xstory_cloze_en": {"acc": 0.6254136333553938, "acc_stderr": 0.012455787254852474},
    "xstory_cloze_id": {"acc": 0.5790866975512905, "acc_stderr": 0.012705145598630695},
    "xstory_cloze_my": {"acc": 0.4619457313037723, "acc_stderr": 0.012829804720321695},
    "xstory_cloze_sw": {"acc": 0.5056254136333554, "acc_stderr": 0.012866310923072511},
    "xstory_cloze_es": {"acc": 0.5830575777630708, "acc_stderr": 0.01268835412160781},
    "xstory_cloze_hi": {"acc": 0.5552614162806089, "acc_stderr": 0.012788295970207786},
    "xstory_cloze_eu": {"acc": 0.5433487756452681, "acc_stderr": 0.012818676452481956}
  },
  "versions": {"xstory_cloze_te": 0, "xstory_cloze_ar": 0, "xstory_cloze_zh": 0, "xstory_cloze_ru": 0, "xstory_cloze_en": 0, "xstory_cloze_id": 0, "xstory_cloze_my": 0, "xstory_cloze_sw": 0, "xstory_cloze_es": 0, "xstory_cloze_hi": 0, "xstory_cloze_eu": 0},
  "config": {
    "model": "hf-causal-experimental",
    "model_args": "pretrained=bigscience/bloom-1b1",
    "num_fewshot": 0,
    "batch_size": "auto",
    "device": "cuda",
    "no_cache": true,
    "limit": null,
    "bootstrap_iters": 100000,
    "description_dict": {}
  }
}
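The multilingual suites (pawsx, xcopa, xnli, xstory_cloze) report one accuracy per language, so a common summary is an unweighted macro-average over a suite. A minimal sketch, using three of the xstory_cloze 0-shot numbers reported above (subset chosen only for brevity):

```python
def macro_avg_acc(results: dict, prefix: str) -> float:
    """Unweighted mean of 'acc' over all tasks whose name starts with prefix."""
    accs = [m["acc"] for task, m in results.items() if task.startswith(prefix)]
    return sum(accs) / len(accs)

# Subset of the xstory_cloze 0-shot "results" block above.
results = {
    "xstory_cloze_en": {"acc": 0.6254136333553938},
    "xstory_cloze_ru": {"acc": 0.4824619457313038},
    "xstory_cloze_zh": {"acc": 0.5804103242885507},
}
print(f"xstory_cloze macro-avg acc: {macro_avg_acc(results, 'xstory_cloze'):.4f}")
```

Note this weights every language equally regardless of its test-set size; a sample-weighted average would need the per-language example counts, which these files do not record.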