Commit 4a0b0d6e authored by lintangsutawika

Merge branch 'gakada-big-refactor-merge' into big-refactor

parents 6ae376e3 c490f165
# v1.0 Tasks
This list keeps track of which tasks' implementations have been ported to YAML / v2.0 of the Eval Harness.
-Boxes should be checked iff tasks are implemented in v2.0 and tested for regression. Tasks should be struck through if checked *against original introducing paper* implementation or popularizing implementation.
+Boxes should be checked iff tasks are implemented in the refactor and tested for regression. Tasks should be struck through if checked *against original introducing paper* implementation or popularizing implementation.
- [ ] Glue
-- [ ] SuperGlue
+- [x] SuperGlue
- [ ] CoQA
- [ ] DROP
- [x] ~~Lambada~~
@@ -31,7 +31,7 @@ Boxes should be checked iff tasks are implemented in v2.0 and tested for regression
- [ ] WebQs
- [ ] WSC273
- [ ] Winogrande
-- [ ] ANLI
+- [x] ANLI
- [ ] Hendrycks Ethics
- [ ] TruthfulQA
- [ ] MuTual
...
@@ -3,6 +3,7 @@ from typing import List, Union
from .gsm8k import *
from .triviaqa import *
+from .glue import *

from lm_eval import utils
from lm_eval.logger import eval_logger
@@ -12,6 +13,7 @@ from lm_eval.api.registry import (
    register_group,
    TASK_REGISTRY,
    GROUP_REGISTRY,
+    ALL_TASKS,
)
@@ -38,6 +40,9 @@ def include_task_folder(task_dir):
                    )
                    if "task" in config:
+                        # task_name = "{}:{}".format(
+                        #     get_task_name_from_config(config), config["task"]
+                        # )
                        task_name = "{}".format(config["task"])
                        register_task(task_name)(SubClass)
@@ -62,7 +67,7 @@ def get_task(task_name, config):
        return TASK_REGISTRY[task_name](config=config)
    except KeyError:
        eval_logger.info("Available tasks:")
-        eval_logger.info(ALL_TASKS)
+        eval_logger.info(list(TASK_REGISTRY) + list(GROUP_REGISTRY))
        raise KeyError(f"Missing task {task_name}")
...
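For context on the logging change above: `TASK_REGISTRY` maps task names to task classes and `GROUP_REGISTRY` maps group names to member task lists, so concatenating their keys reproduces what the old `ALL_TASKS` snapshot held. A minimal sketch of the lookup (the toy registry contents are made up):

```python
# Toy stand-ins for lm_eval.api.registry contents (hypothetical values).
TASK_REGISTRY = {"gsm8k": object(), "wikitext": object()}
GROUP_REGISTRY = {"pile": ["pile_arxiv", "pile_europarl"]}

def get_task_sketch(task_name):
    try:
        return TASK_REGISTRY[task_name]
    except KeyError:
        # Mirror the change above: list both registries directly rather
        # than relying on a module-level ALL_TASKS constant.
        print("Available tasks:", list(TASK_REGISTRY) + list(GROUP_REGISTRY))
        raise

get_task_sketch("gsm8k")        # resolves
# get_task_sketch("missing")    # would print available tasks, then re-raise
```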
@@ -43,4 +43,4 @@ Homepage: https://github.com/openai/grade-school-math
- [ ] Variant with Calculator (see https://github.com/openai/grade-school-math/blob/master/grade_school_math/calculator.py for example implementation)
- [ ] Using Verifiers
-- [ ] Majority voting "without CoT"
+- [ ] Majority voting "without CoT"
\ No newline at end of file
@@ -9,7 +9,7 @@ generation_kwargs:
    - "\n\n"
  do_sample: true
  temperature: 0.2
-repeats: 8
+repeats: 64
filter_list:
  - name: "score-first" # pick only the first response, and report metrics on that
    filter:
...
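Bumping `repeats` from 8 to 64 draws 64 samples per question; the `score-first` filter then reports metrics on only the first sample, while other entries in `filter_list` (e.g. majority voting) can aggregate all of them. A minimal sketch of the two aggregation policies, assuming `responses` is a list of per-document sample lists:

```python
from collections import Counter

def score_first(responses):
    # Keep only the first of the `repeats` generations per document.
    return [samples[0] for samples in responses]

def majority_vote(responses):
    # Hypothetical aggregation: report the most common answer per document.
    return [Counter(samples).most_common(1)[0][0] for samples in responses]

responses = [["42", "41", "42"], ["7", "7", "9"]]
print(score_first(responses))    # ['42', '7']
print(majority_vote(responses))  # ['42', '7']
```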
include: pile_arxiv.yaml
task: pile_europarl
dataset_name: pile_europarl
\ No newline at end of file

include: pile_arxiv.yaml
task: pile_gutenberg
dataset_name: pile_gutenberg
\ No newline at end of file

include: pile_arxiv.yaml
task: pile_pubmed-abstracts
dataset_name: pile_pubmed-abstracts

include: pile_arxiv.yaml
task: pile_pubmed-central
dataset_name: pile_pubmed-central

include: pile_arxiv.yaml
task: pile_stackexchange
dataset_name: pile_stackexchange

include: pile_arxiv.yaml
task: pile_ubuntu-irc
dataset_name: pile_ubuntu-irc

include: pile_arxiv.yaml
task: pile_uspto
dataset_name: pile_uspto

include: pile_arxiv.yaml
task: pile_wikipedia
dataset_name: pile_wikipedia
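Each of the Pile subset configs above reuses `pile_arxiv.yaml` via `include:` and overrides only `task` and `dataset_name`. A minimal sketch of how such include-resolution could work (the loader details here are assumptions, not the harness's actual implementation):

```python
import os
import yaml

def load_yaml_config_sketch(path):
    """Load a YAML config, recursively merging in its `include:` base."""
    with open(path) as f:
        config = yaml.safe_load(f)
    base_name = config.pop("include", None)
    if base_name:
        # Resolve the included file relative to this config, then let
        # local keys (task, dataset_name, ...) override the base's.
        base = load_yaml_config_sketch(
            os.path.join(os.path.dirname(path), base_name)
        )
        base.update(config)
        config = base
    return config
```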
@@ -28,9 +28,16 @@ Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-
### Subtasks
+* `wikitext`: measure perplexity on the Wikitext dataset, via rolling loglikelihoods.

### Checklist
-- [x] Is in Eval-harness v1.0 ?
-- [x] Has been checked for regression from v1.0?
-- [ ] Has been checked for equivalence with original paper methodology?
-- [ ] "Main" checked variant clearly denoted?
+* [x] Is the task an existing benchmark in the literature?
+* [x] Have you referenced the original paper that introduced the task?
+* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+If other tasks on this dataset are already supported:
+* [x] Is the "Main" variant of this task clearly denoted?
+* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
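"Rolling loglikelihoods" means every token of a document is scored by sliding a max-length window over it. A minimal sketch using the window helpers from `lm_eval.utils` (the toy token ids and parameters are made up; a real run tokenizes the document and scores each pair with the model):

```python
from lm_eval.utils import get_rolling_token_windows, make_disjoint_window

tokens = list(range(12))  # stand-in for a tokenized document

windows = [
    make_disjoint_window(pair)
    for pair in get_rolling_token_windows(
        token_list=tokens,
        prefix_token=0,   # e.g. an EOT token prepended as initial context
        max_seq_len=6,
        context_len=1,
    )
]
# Each (context, continuation) pair is scored with a loglikelihood request;
# summing the continuation loglikelihoods over all windows gives the
# document-level loglikelihood from which perplexity is derived.
for context, continuation in windows:
    print(context, "->", continuation)
```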
@@ -8,13 +8,19 @@ import functools
import subprocess
import collections
import importlib.util
+import fnmatch

-from typing import List
+from typing import List, Union

+import gc
+import torch

from omegaconf import OmegaConf
from jinja2 import BaseLoader, Environment, StrictUndefined
from itertools import islice

+from lm_eval.logger import eval_logger

class ExitCodeError(Exception):
    pass
@@ -25,6 +31,29 @@ def sh(x):
        raise ExitCodeError()

+def escaped_split(text, sep_char, maxsplit=-1):
+    """Split text into a list on occurrences of the given separation
+    character `sep_char`. The separation character may be escaped by a
+    backslash to avoid splitting at that location.
+
+    The separation character must be a string of size 1.
+
+    If `maxsplit` is given, at most `maxsplit` splits are done (thus,
+    the list will have at most `maxsplit + 1` elements). If `maxsplit`
+    is not specified or less than 0, then there is no limit on the
+    number of splits (all possible splits are made).
+    """
+    assert (
+        len(sep_char) == 1
+    ), "separation string must be a single character for escaped splitting"
+
+    if maxsplit == 0:
+        return text
+
+    maxsplit = max(0, maxsplit)
+
+    return re.split(r"(?<!\\)" + sep_char, text, maxsplit)
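A quick illustration of the escaping behavior, assuming the function above is importable from `lm_eval.utils`:

```python
from lm_eval.utils import escaped_split

# A backslash-escaped comma is not treated as a separator.
assert escaped_split(r"a,b\,c", ",") == ["a", r"b\,c"]
# maxsplit caps the number of splits, as with re.split.
assert escaped_split("x,y,z", ",", maxsplit=1) == ["x", "y,z"]
```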
def simple_parse_args_string(args_string):
    """
    Parses something like
@@ -44,11 +73,11 @@ def join_iters(iters):
        yield from iter

-def chunks(iter, n):
+def chunks(iter, n=0, fn=None):
    arr = []
-    for x in iter:
+    for i, x in enumerate(iter):
        arr.append(x)
-        if len(arr) == n:
+        if len(arr) == (fn(i) if fn else n):
            yield arr
            arr = []
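The new `fn` parameter lets the chunk size depend on the element index, which is what size-aware batching needs; with `fn=None` the behavior reduces to the old fixed-size `n`. A small illustration (the function is completed with the usual trailing yield of any leftover elements, which the diff truncates):

```python
def chunks(iter, n=0, fn=None):
    arr = []
    for i, x in enumerate(iter):
        arr.append(x)
        if len(arr) == (fn(i) if fn else n):
            yield arr
            arr = []
    if arr:  # flush any leftover partial chunk
        yield arr

assert list(chunks(range(5), n=2)) == [[0, 1], [2, 3], [4]]
# Hypothetical policy: small chunks first, then larger ones.
assert list(chunks(range(6), fn=lambda i: 2 if i < 2 else 4)) == [
    [0, 1],
    [2, 3, 4, 5],
]
```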
@@ -65,6 +94,35 @@ def group(arr, fn):
    return list(res.values())

+class MultiChoice:
+    def __init__(self, choices):
+        self.choices = choices
+
+    # Simple wildcard support (linux filename patterns)
+    def __contains__(self, values):
+        for value in values.split(","):
+            if len(fnmatch.filter(self.choices, value)) == 0:
+                eval_logger.warning("{} is not in task list.".format(value))
+                eval_logger.info(f"Available tasks to choose:")
+                for choice in self.choices:
+                    eval_logger.info(f"  - {choice}")
+        return True
+
+    def __iter__(self):
+        for choice in self.choices:
+            yield choice
+
+# Returns a list containing all values of the source_list that
+# match at least one of the patterns
+def pattern_match(patterns, source_list):
+    task_names = set()
+    for pattern in patterns:
+        for matching in fnmatch.filter(source_list, pattern):
+            task_names.add(matching)
+    return sorted(list(task_names))
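Both helpers accept fnmatch-style (shell filename) wildcards, so `--tasks` patterns can select whole task families; assuming they land in `lm_eval.utils` as above:

```python
from lm_eval.utils import pattern_match

all_tasks = ["gsm8k", "pile_arxiv", "pile_europarl", "wikitext"]
assert pattern_match(["pile_*"], all_tasks) == ["pile_arxiv", "pile_europarl"]
assert pattern_match(["gsm8k", "wiki*"], all_tasks) == ["gsm8k", "wikitext"]
```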
def general_detokenize(string):
    string = string.replace(" n't", "n't")
    string = string.replace(" )", ")")
@@ -110,8 +168,8 @@ def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
        window_end = predicted + window_pred_len

        yield (
-            token_list[window_end - max_seq_len - 1 : window_end - 1],
-            token_list[window_end - window_pred_len : window_end],
+            token_list[window_end - max_seq_len - 1: window_end - 1],
+            token_list[window_end - window_pred_len: window_end],
        )

        predicted += window_pred_len
@@ -122,6 +180,26 @@ def make_disjoint_window(pair):
    return a[: len(a) - (len(b) - 1)], b

+def select_continuation_from_batch_left_padding(
+    generations: Union[List[List[int]], torch.Tensor], max_context_size: int
+):
+    """Select the continuation from the batch, removing prompts of different lengths.
+
+    Args:
+        generations (Union[List[List[int]], torch.Tensor]):
+            A tensor or list-of-lists of shape [batch_size, sequence length].
+        max_context_size (int):
+            The size of the biggest context; generations will proceed from that
+            index.
+
+    Example:
+        PAD PAD Continue : The dog chased the cat [every day of the week]
+        Riddle me this   : The dog chased the cat [yesterday] PAD PAD PAD PAD
+
+    Output:
+        [every day of the week]
+        [yesterday] PAD PAD PAD PAD
+    """
+    return generations[:, max_context_size:]
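Because left padding aligns every prompt's last token at the same index, slicing past the longest context isolates the generated continuations for the whole batch. A toy illustration (the token ids are made up):

```python
import torch

from lm_eval.utils import select_continuation_from_batch_left_padding

batch = torch.tensor([
    [0, 0, 11, 12, 21, 22, 23],    # PAD PAD ctx ctx | generated...
    [13, 14, 15, 16, 31, 32, 33],  # ctx ctx ctx ctx | generated...
])
print(select_continuation_from_batch_left_padding(batch, max_context_size=4))
# tensor([[21, 22, 23],
#         [31, 32, 33]])
```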
class Reorderer:
    def __init__(self, arr, fn):
        self.size = len(arr)
@@ -336,3 +414,8 @@ def create_iterator(raw_iterator, rank, world_size, limit=None):
    among ranks in multigpu setting or only pulling a sample of documents
    """
    return islice(raw_iterator, rank, limit, world_size)

+def clear_torch_cache():
+    gc.collect()
+    torch.cuda.empty_cache()
import os
import json
-import fnmatch
import argparse

from lm_eval import evaluator, utils
-from lm_eval.api.registry import GROUP_REGISTRY, TASK_REGISTRY
+from lm_eval.api.registry import ALL_TASKS
from lm_eval.logger import eval_logger

os.environ["TOKENIZERS_PARALLELISM"] = "false"

-ALL_TASKS = sorted(list(TASK_REGISTRY.keys()) + list(GROUP_REGISTRY.keys()))

-class MultiChoice:
-    def __init__(self, choices):
-        self.choices = choices
-
-    # Simple wildcard support (linux filename patterns)
-    def __contains__(self, values):
-        for value in values.split(","):
-            if len(fnmatch.filter(self.choices, value)) == 0:
-                eval_logger.warning("{} is not in task list.".format(value))
-                eval_logger.info(f"Available tasks to choose:")
-                # for choice in self.choices:
-                #     eval_logger.info(f"  {choice}")
-                eval_logger.info(ALL_TASKS)
-        return True
-
-    def __iter__(self):
-        for choice in self.choices:
-            yield choice

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True)
    parser.add_argument("--model_args", default="")
-    parser.add_argument("--tasks", default=None, choices=MultiChoice(ALL_TASKS))
+    parser.add_argument("--tasks", default=None, choices=utils.MultiChoice(sorted(ALL_TASKS)))
    parser.add_argument("--config", default=None)
-    parser.add_argument("--provide_description", action="store_true")
    parser.add_argument("--num_fewshot", type=int, default=0)
    parser.add_argument("--batch_size", type=int, default=1)
+    parser.add_argument("--max_batch_size", type=int, default=None,
+                        help="Maximal batch size to try with --batch_size auto")
    parser.add_argument("--device", type=str, default=None)
    parser.add_argument("--output_path", default=None)
-    parser.add_argument("--limit", type=int, default=None)
+    parser.add_argument("--limit", type=float, default=None,
+                        help="Limit the number of examples per task. "
+                        "If <1, limit is a percentage of the total number of examples.")
+    parser.add_argument("--data_sampling", type=float, default=None)
    parser.add_argument("--no_cache", action="store_true")
    parser.add_argument("--decontamination_ngrams_path", default=None)
    parser.add_argument("--description_dict_path", default=None)
    parser.add_argument("--check_integrity", action="store_true")
+    parser.add_argument("--write_out", action="store_true", default=False)
+    parser.add_argument("--output_base_path", type=str, default=None)

    return parser.parse_args()
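With `type=float`, `--limit 0.1` means 10% of each task's examples while `--limit 100` means 100 examples. A sketch of that interpretation (the helper name is hypothetical; the actual logic lives in the evaluator):

```python
def resolve_limit(limit, num_docs):
    # Hypothetical helper: fractional limits are percentages of the
    # task's document count, integral limits are absolute counts.
    if limit is None:
        return num_docs
    return int(num_docs * limit) if limit < 1.0 else int(limit)

assert resolve_limit(None, 2000) == 2000
assert resolve_limit(0.1, 2000) == 200
assert resolve_limit(100, 2000) == 100
```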
-# Returns a list containing all values of the source_list that
-# match at least one of the patterns
-def pattern_match(patterns, source_list):
-    task_names = set()
-    for pattern in patterns:
-        for matching in fnmatch.filter(source_list, pattern):
-            task_names.add(matching)
-    return sorted(list(task_names))

def main():
    args = parse_args()
@@ -68,7 +43,9 @@ def main():
        "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
    )

-    if args.tasks is not None:
+    if args.tasks is None:
+        task_names = ALL_TASKS
+    else:
        if os.path.isdir(args.tasks):
            import glob
@@ -79,7 +56,7 @@ def main():
                task_names.append(config)
        else:
            tasks_list = args.tasks.split(",")
-            task_names = pattern_match(tasks_list, ALL_TASKS)
+            task_names = utils.pattern_match(tasks_list, ALL_TASKS)
            for task in [task for task in tasks_list if task not in task_names]:
                if os.path.isfile(task):
                    config = utils.load_yaml_config(task)
@@ -87,28 +64,42 @@ def main():
    eval_logger.info(f"Selected Tasks: {task_names}")

+    # TODO: description_dict?
+    # description_dict = {}
+    # if args.description_dict_path:
+    #     with open(args.description_dict_path, "r") as f:
+    #         description_dict = json.load(f)

    results = evaluator.simple_evaluate(
        model=args.model,
        model_args=args.model_args,
        tasks=task_names,
        num_fewshot=args.num_fewshot,
        batch_size=args.batch_size,
+        max_batch_size=args.max_batch_size,
        device=args.device,
+        no_cache=args.no_cache,
        limit=args.limit,
+        # description_dict=description_dict,
        decontamination_ngrams_path=args.decontamination_ngrams_path,
        check_integrity=args.check_integrity,
+        write_out=args.write_out,
+        output_base_path=args.output_base_path,
    )

    if results is not None:
        dumped = json.dumps(results, indent=2)
        print(dumped)

        if args.output_path:
+            os.makedirs(os.path.dirname(args.output_path), exist_ok=True)
            with open(args.output_path, "w") as f:
                f.write(dumped)

+    batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))
    print(
-        f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, "
-        f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
+        f"{args.model} ({args.model_args}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, "
+        f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}"
    )
    print(evaluator.make_table(results))
...
# bloom-1b1
## bloom-1b1_common_sense_reasoning_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|--------|----:|---|-----:|
|arc_challenge| 0|acc |23.63|± | 1.24|
| | |acc_norm|25.68|± | 1.28|
|arc_easy | 0|acc |51.47|± | 1.03|
| | |acc_norm|45.45|± | 1.02|
|boolq | 1|acc |59.08|± | 0.86|
|copa | 0|acc |68.00|± | 4.69|
|hellaswag | 0|acc |34.63|± | 0.47|
| | |acc_norm|41.77|± | 0.49|
|mc_taco | 0|em |14.49| | |
| | |f1 |32.43| | |
|openbookqa | 0|acc |19.60|± | 1.78|
| | |acc_norm|29.40|± | 2.04|
|piqa | 0|acc |67.14|± | 1.10|
| | |acc_norm|67.14|± | 1.10|
|prost | 0|acc |23.41|± | 0.31|
| | |acc_norm|30.50|± | 0.34|
|swag | 0|acc |43.43|± | 0.35|
| | |acc_norm|58.28|± | 0.35|
|winogrande | 0|acc |54.93|± | 1.40|
|wsc273 | 0|acc |68.50|± | 2.82|
## bloom-1b1_gsm8k_8-shot.json
|Task |Version|Metric|Value| |Stderr|
|-----|------:|------|----:|---|-----:|
|gsm8k| 0|acc | 0.83|± | 0.25|
## bloom-1b1_mathematical_reasoning_few_shot_5-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------------------|------:|--------|----:|---|-----:|
|drop | 1|em | 1.38|± | 0.12|
| | |f1 | 4.01|± | 0.15|
|gsm8k | 0|acc | 0.00|± | 0.00|
|math_algebra | 1|acc | 0.00|± | 0.00|
|math_counting_and_prob | 1|acc | 0.21|± | 0.21|
|math_geometry | 1|acc | 0.21|± | 0.21|
|math_intermediate_algebra| 1|acc | 0.00|± | 0.00|
|math_num_theory | 1|acc | 0.19|± | 0.19|
|math_prealgebra | 1|acc | 0.11|± | 0.11|
|math_precalc | 1|acc | 0.00|± | 0.00|
|mathqa | 0|acc |23.55|± | 0.78|
| | |acc_norm|23.62|± | 0.78|
## bloom-1b1_pawsx_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|pawsx_de| 0|acc |46.95|± | 1.12|
|pawsx_en| 0|acc |52.45|± | 1.12|
|pawsx_es| 0|acc |51.50|± | 1.12|
|pawsx_fr| 0|acc |46.15|± | 1.11|
|pawsx_ja| 0|acc |48.40|± | 1.12|
|pawsx_ko| 0|acc |49.90|± | 1.12|
|pawsx_zh| 0|acc |48.95|± | 1.12|
## bloom-1b1_question_answering_0-shot.json
| Task |Version| Metric |Value| |Stderr|
|-------------|------:|------------|----:|---|-----:|
|headqa_en | 0|acc |26.44|± | 0.84|
| | |acc_norm |30.49|± | 0.88|
|headqa_es | 0|acc |24.43|± | 0.82|
| | |acc_norm |28.30|± | 0.86|
|logiqa | 0|acc |18.89|± | 1.54|
| | |acc_norm |25.65|± | 1.71|
|squad2 | 1|exact | 4.17| | |
| | |f1 | 6.60| | |
| | |HasAns_exact| 2.19| | |
| | |HasAns_f1 | 7.05| | |
| | |NoAns_exact | 6.14| | |
| | |NoAns_f1 | 6.14| | |
| | |best_exact |50.07| | |
| | |best_f1 |50.07| | |
|triviaqa | 1|acc | 2.68|± | 0.15|
|truthfulqa_mc| 1|mc1 |25.34|± | 1.52|
| | |mc2 |41.80|± | 1.46|
|webqs | 0|acc | 1.38|± | 0.26|
## bloom-1b1_reading_comprehension_0-shot.json
|Task|Version|Metric|Value| |Stderr|
|----|------:|------|----:|---|-----:|
|coqa| 1|f1 |45.57|± | 1.88|
| | |em |32.98|± | 1.95|
|drop| 1|em | 3.31|± | 0.18|
| | |f1 | 8.63|± | 0.22|
|race| 1|acc |32.63|± | 1.45|
## bloom-1b1_xcopa_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|--------|------:|------|----:|---|-----:|
|xcopa_et| 0|acc | 50.6|± | 2.24|
|xcopa_ht| 0|acc | 53.0|± | 2.23|
|xcopa_id| 0|acc | 64.8|± | 2.14|
|xcopa_it| 0|acc | 50.8|± | 2.24|
|xcopa_qu| 0|acc | 51.2|± | 2.24|
|xcopa_sw| 0|acc | 54.4|± | 2.23|
|xcopa_ta| 0|acc | 57.0|± | 2.22|
|xcopa_th| 0|acc | 53.2|± | 2.23|
|xcopa_tr| 0|acc | 53.0|± | 2.23|
|xcopa_vi| 0|acc | 62.4|± | 2.17|
|xcopa_zh| 0|acc | 59.4|± | 2.20|
## bloom-1b1_xnli_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|-------|------:|------|----:|---|-----:|
|xnli_ar| 0|acc |33.93|± | 0.67|
|xnli_bg| 0|acc |34.13|± | 0.67|
|xnli_de| 0|acc |39.64|± | 0.69|
|xnli_el| 0|acc |34.03|± | 0.67|
|xnli_en| 0|acc |51.48|± | 0.71|
|xnli_es| 0|acc |47.98|± | 0.71|
|xnli_fr| 0|acc |47.15|± | 0.71|
|xnli_hi| 0|acc |42.32|± | 0.70|
|xnli_ru| 0|acc |40.46|± | 0.69|
|xnli_sw| 0|acc |35.29|± | 0.68|
|xnli_th| 0|acc |33.75|± | 0.67|
|xnli_tr| 0|acc |34.79|± | 0.67|
|xnli_ur| 0|acc |37.33|± | 0.68|
|xnli_vi| 0|acc |44.45|± | 0.70|
|xnli_zh| 0|acc |36.23|± | 0.68|
## bloom-1b1_xstory_cloze_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|---------------|------:|------|----:|---|-----:|
|xstory_cloze_ar| 0|acc |52.88|± | 1.28|
|xstory_cloze_en| 0|acc |62.54|± | 1.25|
|xstory_cloze_es| 0|acc |58.31|± | 1.27|
|xstory_cloze_eu| 0|acc |54.33|± | 1.28|
|xstory_cloze_hi| 0|acc |55.53|± | 1.28|
|xstory_cloze_id| 0|acc |57.91|± | 1.27|
|xstory_cloze_my| 0|acc |46.19|± | 1.28|
|xstory_cloze_ru| 0|acc |48.25|± | 1.29|
|xstory_cloze_sw| 0|acc |50.56|± | 1.29|
|xstory_cloze_te| 0|acc |56.39|± | 1.28|
|xstory_cloze_zh| 0|acc |58.04|± | 1.27|
## bloom-1b1_xwinograd_0-shot.json
| Task |Version|Metric|Value| |Stderr|
|------------|------:|------|----:|---|-----:|
|xwinograd_en| 0|acc |69.98|± | 0.95|
|xwinograd_fr| 0|acc |66.27|± | 5.22|
|xwinograd_jp| 0|acc |52.87|± | 1.61|
|xwinograd_pt| 0|acc |63.12|± | 2.98|
|xwinograd_ru| 0|acc |54.29|± | 2.81|
|xwinograd_zh| 0|acc |69.25|± | 2.06|
{
"results": {
"boolq": {
"acc": 0.5908256880733945,
"acc_stderr": 0.008599563442397352
},
"arc_easy": {
"acc": 0.5147306397306397,
"acc_stderr": 0.010255329977562096,
"acc_norm": 0.45454545454545453,
"acc_norm_stderr": 0.010217299762709435
},
"openbookqa": {
"acc": 0.196,
"acc_stderr": 0.017770751227744862,
"acc_norm": 0.294,
"acc_norm_stderr": 0.020395095484936614
},
"hellaswag": {
"acc": 0.3463453495319657,
"acc_stderr": 0.004748324319714264,
"acc_norm": 0.4177454690300737,
"acc_norm_stderr": 0.004921798492608764
},
"swag": {
"acc": 0.43431970408877335,
"acc_stderr": 0.0035044592489844794,
"acc_norm": 0.5828251524542637,
"acc_norm_stderr": 0.0034862531772295617
},
"arc_challenge": {
"acc": 0.2363481228668942,
"acc_stderr": 0.012414960524301834,
"acc_norm": 0.2568259385665529,
"acc_norm_stderr": 0.0127669237941168
},
"mc_taco": {
"em": 0.1448948948948949,
"f1": 0.32425976796237205
},
"wsc273": {
"acc": 0.684981684981685,
"acc_stderr": 0.028165854394193602
},
"winogrande": {
"acc": 0.5493291239147593,
"acc_stderr": 0.013983928869040239
},
"prost": {
"acc": 0.23409479077711356,
"acc_stderr": 0.003093545711826552,
"acc_norm": 0.3049743808710504,
"acc_norm_stderr": 0.003363606918420179
},
"copa": {
"acc": 0.68,
"acc_stderr": 0.04688261722621504
},
"piqa": {
"acc": 0.6713819368879217,
"acc_stderr": 0.010959127105167048,
"acc_norm": 0.6713819368879217,
"acc_norm_stderr": 0.010959127105167044
}
},
"versions": {
"boolq": 1,
"arc_easy": 0,
"openbookqa": 0,
"hellaswag": 0,
"swag": 0,
"arc_challenge": 0,
"mc_taco": 0,
"wsc273": 0,
"winogrande": 0,
"prost": 0,
"copa": 0,
"piqa": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"gsm8k": {
"acc": 0.008339651250947688,
"acc_stderr": 0.002504942226860508
}
},
"versions": {
"gsm8k": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
"num_fewshot": 8,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"mathqa": {
"acc": 0.2355108877721943,
"acc_stderr": 0.007767687364650971,
"acc_norm": 0.23618090452261306,
"acc_norm_stderr": 0.0077753193787470495
},
"gsm8k": {
"acc": 0.0,
"acc_stderr": 0.0
},
"drop": {
"em": 0.013842281879194632,
"em_stderr": 0.001196510970060749,
"f1": 0.040085989932885986,
"f1_stderr": 0.0014841664758736023
},
"math_geometry": {
"acc": 0.0020876826722338203,
"acc_stderr": 0.0020876826722338315
},
"math_counting_and_prob": {
"acc": 0.002109704641350211,
"acc_stderr": 0.002109704641350211
},
"math_prealgebra": {
"acc": 0.001148105625717566,
"acc_stderr": 0.0011481056257175708
},
"math_num_theory": {
"acc": 0.001851851851851852,
"acc_stderr": 0.0018518518518518448
},
"math_precalc": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_algebra": {
"acc": 0.0,
"acc_stderr": 0.0
},
"math_intermediate_algebra": {
"acc": 0.0,
"acc_stderr": 0.0
}
},
"versions": {
"mathqa": 0,
"gsm8k": 0,
"drop": 1,
"math_geometry": 1,
"math_counting_and_prob": 1,
"math_prealgebra": 1,
"math_num_theory": 1,
"math_precalc": 1,
"math_algebra": 1,
"math_intermediate_algebra": 1
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1,use_accelerate=True",
"num_fewshot": 5,
"batch_size": "auto",
"device": "cuda:0",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}
{
"results": {
"pawsx_es": {
"acc": 0.515,
"acc_stderr": 0.011178102477052804
},
"pawsx_zh": {
"acc": 0.4895,
"acc_stderr": 0.011180669867648657
},
"pawsx_fr": {
"acc": 0.4615,
"acc_stderr": 0.011149934327957058
},
"pawsx_ko": {
"acc": 0.499,
"acc_stderr": 0.01118311365477017
},
"pawsx_de": {
"acc": 0.4695,
"acc_stderr": 0.011162310405413175
},
"pawsx_ja": {
"acc": 0.484,
"acc_stderr": 0.011177408788874897
},
"pawsx_en": {
"acc": 0.5245,
"acc_stderr": 0.011169702598013186
}
},
"versions": {
"pawsx_es": 0,
"pawsx_zh": 0,
"pawsx_fr": 0,
"pawsx_ko": 0,
"pawsx_de": 0,
"pawsx_ja": 0,
"pawsx_en": 0
},
"config": {
"model": "hf-causal-experimental",
"model_args": "pretrained=bigscience/bloom-1b1",
"num_fewshot": 0,
"batch_size": "auto",
"device": "cuda",
"no_cache": true,
"limit": null,
"bootstrap_iters": 100000,
"description_dict": {}
}
}