Commit 6a6a0ebb authored by Benjamin Fattori

Merge remote-tracking branch 'upstream/big-refactor' into big-refactor-autobatching

parents e4acfcaa 2820042d
name: Tasks Modified
on:
push:
branches:
- big-refactor
pull_request:
branches:
- big-refactor
workflow_dispatch:
jobs:
changed_files:
runs-on: ubuntu-latest # windows-latest || macos-latest
name: Scan for changed tasks
steps:
- name: checkout
uses: actions/checkout@v3
with:
fetch-depth: 0 # or "2" to fetch only the preceding commit
# Example 1
- name: Check task folders
id: changed-tasks
uses: tj-actions/changed-files@v37.1.2
with:
files_yaml: |
tasks:
- lm_eval/tasks/**
api:
- lm_eval/api/**
write_output_files: true
- name: Run Tests
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
run: |
echo "TASK_FILES=.github/outputs/tasks_all_changed_and_modified_files.txt" >> "$GITHUB_ENV" # env var name illustrative; quote $GITHUB_ENV so the env file is appended, not a literal file
echo "One or more test file(s) has changed."
echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
- name: Set up Python 3.9
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
uses: actions/setup-python@v4
with:
python-version: 3.9
cache: 'pip'
- name: Install dependencies
if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
run: |
python -m pip install --upgrade pip
pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
# Install optional git dependencies
# pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
# if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Test with pytest
if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
run: python -m pytest tests/test_tasks.py -s -vv -n=auto --new_task
- name: Test more tasks with pytest
env:
API: true
if: steps.changed-tasks.outputs.api_any_modified == 'true'
run: python -m pytest tests/test_api.py -s -vv -n=auto --new_task
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
name: Build
on:
push:
branches: [ master ]
pull_request:
branches: [ master ]
jobs:
build:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Cache
uses: actions/cache@v2.1.3
with:
# A list of files, directories, and wildcard patterns to cache and restore
path: |
~/.cache
# An explicit key for restoring and saving the cache
key: evaldata-cache-4
- name: Set up Python 3.9
uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install flake8 pytest pytest-cov
pip install -e .[dev,multilingual]
# Install optional git dependencies
pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test with pytest
run: |
pytest -vv --cov=lm_eval/ tests/
- name: Upload to codecov
run: |
bash <(curl -s https://codecov.io/bash) -t $CODECOV_TOKEN
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: Unit Tests
on:
push:
branches:
- big-refactor
pull_request:
branches:
- big-refactor
workflow_dispatch:
jobs:
linter:
name: Linters
runs-on: ubuntu-latest
timeout-minutes: 20
steps:
- name: Checkout Code
uses: actions/checkout@v3
- name: Set up Python 3.9
uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Install dependencies
run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
- name: Lint with pylint
run: python -m pylint --disable=all -e W0311 --jobs=0 --indent-string=' ' **/*.py
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=F,E9,E71,E72,E501,E112,E113,W6 --extend-ignore=F541 --show-source --statistics --exit-zero
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Lint with mypy
run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
testcpu:
name: CPU Tests
runs-on: ubuntu-latest
timeout-minutes: 20
steps:
- name: Checkout Code
uses: actions/checkout@v3
- name: Set up Python 3.9
uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
# Install optional git dependencies
# pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
# if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Test with pytest
run: python -m pytest -s -v -n=auto --ignore=tests/tests_master --ignore=tests/extra
......@@ -32,6 +32,7 @@ Prompting / in-context formatting options:
- **use_prompt** (`str`, *optional*) — Name of the prompt in promptsource to use. If defined, this overrides `doc_to_text` and `doc_to_target`, and `template_aliases` goes unused.
- **doc_to_text** (`Union[Callable, str]`, *optional*) — Jinja2 template, f-string, or function to process a sample into the appropriate input for the model.
- **doc_to_target** (`Union[Callable, str]`, *optional*) — Jinja2 template, f-string, or function to process a sample into the appropriate target output for the model.
- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2 template, f-string, or function to process a sample into the list of possible choices for `multiple_choice` tasks.
- **gold_alias** (`str`, *optional*, defaults to `None`) — If provided, generates the reference answer that is scored against. Useful when `doc_to_target` must produce the "target string" appended to each fewshot exemplar's input: `doc_to_target` then formats the fewshot examples, while `gold_alias` supplies the `gold` value passed to the metric function.
- **fewshot_delimiter** (`str`, *optional*, defaults to "\n\n") — String to insert between few-shot examples.
- **target_delimiter** (`str`, *optional*, defaults to `" "`) — String to insert between input and target output for the datapoint being tested.
......
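To make the interplay of these options concrete, here is a minimal, illustrative sketch (not the library's implementation; all names below are ours) of how `doc_to_text`, `doc_to_target`, `target_delimiter`, and `fewshot_delimiter` combine into a fewshot context:

```python
# Minimal sketch, assuming callable doc_to_text/doc_to_target converters;
# this mirrors the Sampler logic shown later in this diff, but it is not
# the library implementation itself.
def build_fewshot_context(
    docs,
    doc_to_text,              # doc -> model input string
    doc_to_target,            # doc -> target string
    target_delimiter=" ",     # default per the docs above
    fewshot_delimiter="\n\n", # default per the docs above
):
    # Each exemplar is "<input><target_delimiter><target>"; exemplars are
    # joined, and the block is terminated, by fewshot_delimiter.
    examples = [
        doc_to_text(d) + target_delimiter + doc_to_target(d) for d in docs
    ]
    return fewshot_delimiter.join(examples) + fewshot_delimiter


docs = [{"q": "2+2?", "a": "4"}, {"q": "3+3?", "a": "6"}]
ctx = build_fewshot_context(docs, lambda d: f"Q: {d['q']}\nA:", lambda d: d["a"])
# ctx == "Q: 2+2?\nA: 4\n\nQ: 3+3?\nA: 6\n\n"
```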
......@@ -4,13 +4,13 @@ from typing import Literal, Tuple
@dataclass
class Instance:
request_type: str = Literal[
"loglikelihood", "loglikelihood_rolling", "greedy_until"
]
doc: dict = None
arguments: tuple = None
idx: int = None
metadata: tuple = Tuple[str, int, int] # TODO: better typehints here
request_type: Literal["loglikelihood", "loglikelihood_rolling", "greedy_until"]
doc: dict
arguments: tuple
idx: int
metadata: Tuple[str, int, int] = field(
default_factory=lambda: (None, None, None)
) # TODO: better typehints here
resps: list = field(default_factory=list)
filtered_resps: dict = field(default_factory=dict)
......
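A hedged construction example for the reworked dataclass (import path assumed): the first four fields are now required, and `metadata` falls back to `(None, None, None)` via its `default_factory`:

```python
from lm_eval.api.instance import Instance  # import path assumed

inst = Instance(
    request_type="loglikelihood",
    doc={"question": "2+2?", "answer": "4"},
    arguments=("Q: 2+2?\nA:", " 4"),  # (context, continuation) pair
    idx=0,
)
assert inst.metadata == (None, None, None)  # filled by default_factory
assert inst.resps == [] and inst.filtered_resps == {}
```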
......@@ -114,6 +114,8 @@ class LM(abc.ABC):
additional_config = {} if additional_config is None else additional_config
args = utils.simple_parse_args_string(arg_string)
args2 = {k: v for k, v in additional_config.items() if v is not None}
if args2.get("device") == "mps" or args.get("device") == "mps":
args["dtype"] = "float32"
return cls(**args, **args2)
@property
......
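As a usage sketch of the `mps` override (model name and import path assumed, not prescribed by this diff): requesting `device=mps` through either the parsed argument string or `additional_config` now pins `dtype` to `float32` before `cls(**args, **args2)` runs:

```python
from lm_eval.models.huggingface import HFLM  # import path assumed

# Because "device=mps" appears in the parsed args, create_from_arg_string
# injects dtype="float32" before instantiating the model class.
lm = HFLM.create_from_arg_string(
    "pretrained=EleutherAI/pythia-70m,device=mps",
    additional_config=None,
)
```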
......@@ -10,6 +10,10 @@ class Sampler:
self.target_delimiter = self.config.target_delimiter
self.fewshot_delimiter = self.config.fewshot_delimiter
self.doc_to_text = self.task.doc_to_text
self.doc_to_target = self.task.doc_to_target
self.doc_to_choice = self.task.doc_to_choice
self.docs = docs # HF dataset split, provided by task._fewshot_docs()
if fewshot_indices: # subset fewshot docs to the given indices
self.docs = self.docs.select(fewshot_indices)
......@@ -34,16 +38,29 @@ class Sampler:
self.fewshot_delimiter.join(
[
# TODO: is separating doc_to_text and doc_to_target by one space always desired?
self.task.doc_to_text(doc)
(
self.doc_to_text(doc)
if (
self.config.doc_to_choice is None
or type(self.doc_to_text(doc)) is str
)
else self.doc_to_choice(doc)[self.doc_to_text(doc)]
)
+ self.target_delimiter
+ self.task.doc_to_target(doc)
+ (
self.doc_to_target(doc)
if (
self.config.doc_to_choice is None
or type(self.doc_to_target(doc)) is str
)
else self.doc_to_choice(doc)[self.doc_to_target(doc)]
)
for doc in selected_docs
]
)
+ self.fewshot_delimiter
)
# only returns the fewshot context! Does not append the document; do this outside the object
return labeled_examples
def sample(self, n):
......
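The `fewshot_indices` subsetting above relies on `datasets.Dataset.select`, which keeps only the rows at the given indices; a small self-contained illustration:

```python
from datasets import Dataset

pool = Dataset.from_dict({"q": ["a", "b", "c", "d"], "ans": ["1", "2", "3", "4"]})
subset = pool.select([0, 2])  # keep rows 0 and 2 of the fewshot pool
assert subset["q"] == ["a", "c"]
```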
......@@ -8,6 +8,7 @@ import evaluate
import random
import itertools
import functools
from tqdm import tqdm
import datasets
import numpy as np
......@@ -27,6 +28,7 @@ from lm_eval.api.metrics import (
mean,
weighted_perplexity,
bits_per_byte,
metric_max_over_ground_truths,
)
from lm_eval.api.registry import (
get_metric,
......@@ -43,7 +45,6 @@ ALL_OUTPUT_TYPES = [
"multiple_choice",
"loglikelihood_rolling",
"greedy_until",
"winograd_schema",
]
......@@ -64,9 +65,10 @@ class TaskConfig(dict):
fewshot_split: str = None # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
# formatting / prompting options.
# see docs/advanced_task_guide.md for more info
template_aliases: str = ""
template_aliases: Union[str, list] = None
doc_to_text: Union[Callable, str] = None
doc_to_target: Union[Callable, str] = None
doc_to_choice: Union[Callable, str, dict, list] = None
gold_alias: Union[Callable, str] = None
use_prompt: str = None
description: str = ""
......@@ -76,8 +78,6 @@ class TaskConfig(dict):
num_fewshot: int = 0
# scoring options
metric_list: str = None
gold_alias: Union[Callable, str] = None
create_choices: Union[Callable, str] = None
output_type: str = "greedy_until"
generation_kwargs: dict = None
repeats: int = 1
......@@ -217,8 +217,8 @@ class Task(abc.ABC):
self._filters.append(filter_pipeline)
self.sampler = samplers.Sampler(
list(self.fewshot_docs()), self, rnd=random.Random()
) # TODO: pass the correct docs in here
list(self.fewshot_docs()), self, rnd=random.Random(1234)
)
def download(self, data_dir=None, cache_dir=None, download_mode=None):
"""Downloads and returns the task dataset.
......@@ -316,18 +316,6 @@ class Task(abc.ABC):
"""
return doc
def create_choices(self, doc):
if self._config.create_choices is None:
return ast.literal_eval(
utils.apply_template(
self._config.template_aliases + "{{answer_choices}}", doc
)
)
elif type(self._config.create_choices) == str:
return utils.apply_template(self._config.create_choices, doc)
else:
return self._config.create_choices(doc)
@property
def instances(self):
"""After calling `task.build_all_requests()`, tasks
......@@ -366,13 +354,18 @@ class Task(abc.ABC):
False
), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
eval_logger.info(
f"Building contexts for task '{self._config.task}' on rank {rank}..."
)
instances = []
for doc_id, doc in utils.create_iterator(
enumerate(docs), rank, world_size, limit
):
# sample fewshot context #TODO: need to offset doc_id by rank now!
fewshot_ctx = self.fewshot_context(
doc, self._config.num_fewshot, rnd=random.Random()
doc,
self._config.num_fewshot,
)
# TODO: we should override self._config.repeats if doing greedy gen so users don't waste time+compute
......@@ -453,7 +446,7 @@ class Task(abc.ABC):
return len(re.split(r"\s+", doc))
@utils.positional_deprecated
def fewshot_context(self, doc, num_fewshot, rnd=None):
def fewshot_context(self, doc, num_fewshot):
"""Returns a fewshot context string that is made up of a prepended description
(if provided), the `num_fewshot` number of examples, and an appended prompt example.
......@@ -461,15 +454,9 @@ class Task(abc.ABC):
The document as returned from training_docs, validation_docs, or test_docs.
:param num_fewshot: int
The number of fewshot examples to provide in the returned context string.
:param rnd: random.Random
The pseudo-random number generator used to randomly sample examples.
WARNING: This is currently a required arg although it's optionalized with a default `None`.
:returns: str
The fewshot context.
"""
assert (
rnd is not None
), "A `random.Random` generator argument must be provided to `rnd`"
if num_fewshot == 0:
# always prepend the (possibly empty) task description
......@@ -480,7 +467,10 @@ class Task(abc.ABC):
)
example = self.doc_to_text(doc)
return labeled_examples + example
if type(example) == str:
return labeled_examples + example
elif type(example) == list:
return [labeled_examples + ex for ex in example]
def apply_filters(self):
......@@ -625,9 +615,43 @@ class ConfigurableTask(Task):
if self.fewshot_docs() is not None:
self.sampler = samplers.Sampler(
list(self.fewshot_docs()), self, rnd=random.Random()
list(self.fewshot_docs()), self, rnd=random.Random(1234)
)
if self._config.template_aliases is not None:
for key, alias in self._config.template_aliases:
self.dataset = self.dataset.rename_column(key, alias) # rename_column is not in-place; reassign
if self.has_test_docs():
docs = self.test_docs()
elif self.has_validation_docs():
docs = self.validation_docs()
else:
assert (
False
), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
# Test One Doc
self.features = list(docs.features.keys())
self.multiple_input = 0
self.multiple_target = 0
test_doc = docs[0]
test_text = self.doc_to_text(test_doc)
test_target = self.doc_to_target(test_doc)
if self._config.doc_to_choice is not None:
test_choice = self.doc_to_choice(test_doc)
if type(test_choice) is not list:
eval_logger.error("doc_to_choice must return list")
else:
num_choice = len(test_choice)
if type(test_text) is int:
self.multiple_input = num_choice
if type(test_target) is list:
self.multiple_target = len(test_target)
def download(self, dataset_kwargs=None):
self.dataset = datasets.load_dataset(
......@@ -683,7 +707,12 @@ class ConfigurableTask(Task):
def doc_to_decontamination_query(self, doc):
if self._config.should_decontaminate:
return utils.apply_template(self._config.doc_to_decontamination_query, doc)
if self._config.doc_to_decontamination_query in self.features:
return doc[self._config.doc_to_decontamination_query]
else:
return ast.literal_eval(
utils.apply_template(self._config.doc_to_decontamination_query, doc)
)
def _process_doc(self, doc):
"""
......@@ -703,11 +732,24 @@ class ConfigurableTask(Task):
else:
doc_to_text = self._config.doc_to_text
if type(doc_to_text) == str:
return utils.apply_template(doc_to_text, doc)
if type(doc_to_text) == int:
return doc_to_text
elif type(doc_to_text) == str:
if doc_to_text in self.features:
# if self._config.doc_to_choice is not None:
# return self.doc_to_choice(doc)[doc[doc_to_text]]
# else:
return doc[doc_to_text]
else:
text_string = utils.apply_template(doc_to_text, doc)
if text_string.isdigit():
return ast.literal_eval(text_string)
else:
return text_string
elif callable(doc_to_text):
return doc_to_text(doc)
if hasattr(doc_to_text, "apply"):
# Used when applying a Promptsource template
elif hasattr(doc_to_text, "apply"):
return doc_to_text.apply(doc)[0]
else:
print(type(doc_to_text))
......@@ -720,15 +762,50 @@ class ConfigurableTask(Task):
else:
doc_to_target = self._config.doc_to_target
if type(doc_to_target) == str:
return utils.apply_template(doc_to_target, doc)
if type(doc_to_target) == int:
return doc_to_target
elif type(doc_to_target) == str:
if doc_to_target in self.features:
# if self._config.doc_to_choice is not None:
# return self.doc_to_choice(doc)[doc[doc_to_target]]
# else:
return doc[doc_to_target]
else:
target_string = utils.apply_template(doc_to_target, doc)
if target_string.isdigit():
return ast.literal_eval(target_string)
else:
return target_string
elif callable(doc_to_target):
return doc_to_target(doc)
# Used when applying a Promptsource template
elif hasattr(doc_to_target, "apply"):
return doc_to_target.apply(doc)[1]
else:
raise TypeError
def doc_to_choice(self, doc):
if self.prompt is not None:
doc_to_choice = self.prompt
elif self._config.doc_to_choice is None:
eval_logger.error("doc_to_choice was called but not set in config")
else:
doc_to_choice = self._config.doc_to_choice
if type(doc_to_choice) == str:
return ast.literal_eval(utils.apply_template(doc_to_choice, doc))
elif type(doc_to_choice) == list:
return doc_to_choice
elif type(doc_to_choice) == dict:
return list(doc_to_choice.values())
elif callable(doc_to_choice):
return doc_to_choice(doc)
elif hasattr(doc_to_choice, "get_answer_choices_list"):
return doc_to_choice.get_answer_choices_list(doc)
else:
raise TypeError
def gold_alias(self, doc):
# returns a version of the gold target answer to a document,
# which should be passed into metric for scoring as the ground truth.
......@@ -756,19 +833,25 @@ class ConfigurableTask(Task):
elif self.OUTPUT_TYPE == "loglikelihood_rolling":
arguments = (self.doc_to_target(doc),)
elif self.OUTPUT_TYPE == "multiple_choice":
# we pass the user-defined answer_choices var (in aliases) and translate the result to a Python list.
# TODO: any cleaner way to do this?
choices = self.create_choices(doc)
choices = self.doc_to_choice(doc)
if self.multiple_input:
# If there are multiple inputs, choices are placed in the ctx
cont = self.doc_to_target(doc)
arguments = [(ctx, " {}".format(cont)) for ctx in choices]
else:
# Otherwise they are placed in the continuation
arguments = [(ctx, " {}".format(cont)) for cont in choices]
request_list = [
Instance(
request_type="loglikelihood",
doc=doc,
arguments=(ctx, " {}".format(choice)),
arguments=arg,
idx=i,
**kwargs,
)
for i, choice in enumerate(choices)
for i, arg in enumerate(arguments)
]
# TODO: we should raise a warning telling users this will at most ~2x runtime.
if "acc_mutual_info" in self._metric_fn_list.keys():
......@@ -795,26 +878,6 @@ class ConfigurableTask(Task):
elif self.OUTPUT_TYPE == "greedy_until":
arguments = (ctx, self._config.generation_kwargs)
elif self.OUTPUT_TYPE == "winograd_schema":
# similar to multiple_choice task type except each request contains
# multiple differing contexts with the same continuation
contexts = self.create_choices(doc)
choice = self.doc_to_target(doc)
request_list = [
Instance(
request_type="loglikelihood",
doc=doc,
arguments=(context, " {}".format(choice)),
idx=i,
**kwargs,
)
for i, context in enumerate(contexts)
]
return request_list
return Instance(
request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs
)
......@@ -857,13 +920,11 @@ class ConfigurableTask(Task):
elif self.OUTPUT_TYPE == "multiple_choice":
lls, is_greedy = zip(*results)
if self._config.gold_alias is not None:
gold = int(self.gold_alias(doc))
else:
gold = int(self.doc_to_target(doc))
# retrieve choices in List[str] form, to compute choice lengths, etc.
choices = self.create_choices(doc)
choices = self.doc_to_choice(doc)
completion_len = np.array([float(len(i)) for i in choices])
if (
2 * len(choices) == len(lls)
and "acc_mutual_info" in self._metric_fn_list.keys()
......@@ -876,23 +937,33 @@ class ConfigurableTask(Task):
lls = lls[::2]
pred = np.argmax(lls)
pred_norm = np.argmax(lls / completion_len)
acc = 1.0 if np.argmax(lls) == gold else 0.0
completion_len = np.array([float(len(i)) for i in choices])
acc_norm = 1.0 if np.argmax(lls / completion_len) == gold else 0.0
if self.multiple_input:
gold = self.doc_to_text(doc)
else:
gold = self.doc_to_target(doc)
if type(gold) is str:
gold = choices.index(gold)
if self.multiple_target:
acc = 1.0 if pred in gold else 0.0
acc_norm = 1.0 if pred_norm in gold else 0.0
exact_match = int(any([is_greedy[i] for i in gold]))
else:
acc = 1.0 if pred == gold else 0.0
acc_norm = 1.0 if pred_norm == gold else 0.0
# TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
exact_match = int(is_greedy[gold])
result_dict = {
**({"acc": acc} if "acc" in use_metric else {}),
**({"f1": (gold, pred)} if "f1" in use_metric else {}),
**({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
**({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
**({"exact_match": exact_match} if "exact_match" in use_metric else {}),
}
if "exact_match" in self._metric_fn_list.keys():
# TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
is_greedy = is_greedy[gold] # take value for the gold answer
result_dict["exact_match"] = int(is_greedy)
if "acc_mutual_info" in use_metric:
lls_mutual_info = [
ll_c - ll_u for ll_c, ll_u in zip(lls, lls_unconditional)
......@@ -900,40 +971,45 @@ class ConfigurableTask(Task):
acc_mutual_info = 1.0 if np.argmax(lls_mutual_info) == gold else 0.0
result_dict["acc_mutual_info"] = acc_mutual_info
elif self.OUTPUT_TYPE == "winograd_schema":
lls, is_greedy = zip(*results)
if self._config.gold_alias is not None:
gold = int(self.gold_alias(doc))
else:
gold = int(self.doc_to_target(doc))
pred = np.argmax(lls)
acc = 1.0 if np.argmax(lls) == gold else 0.0
result_dict = {
**({"acc": acc} if "acc" in use_metric else {}),
}
elif self.OUTPUT_TYPE == "greedy_until":
if self._config.gold_alias is not None:
gold = self.gold_alias(doc)
else:
gold = self.doc_to_target(doc)
gold = self.doc_to_target(doc)
for key, result in zip(self._metric_fn_list.keys(), results):
_dict = self._metric_fn_list[key](
references=[gold],
predictions=[result],
**self._metric_fn_kwargs[key],
)
if self.multiple_target:
# in the case where we have multiple targets,
# return true if any are true
# TODO: this may break for multiple_target with metrics that are not 0-or-1
scores = []
for gold_option in gold:
res = self._metric_fn_list[key](
references=[gold_option],
predictions=[result],
**self._metric_fn_kwargs[key],
)
if isinstance(res, dict):
# TODO: this handles the case where HF evaluate returns a dict.
res = res[key]
scores.append(res)
if any(scores):
result = 1.0
else:
result = 0.0
else:
result = self._metric_fn_list[key](
references=[gold],
predictions=[result],
**self._metric_fn_kwargs[key],
)
result_dict = {**result_dict, **_dict}
if isinstance(result, dict):
result_dict.update(result)
else:
result_dict[key] = result
else:
raise ValueError(
f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
"'loglikelihood', 'loglikelihood_rolling', 'greedy_until', 'multiple_choice' or 'winograd_schema' ",
"'loglikelihood', 'loglikelihood_rolling', 'greedy_until' or 'multiple_choice'",
)
return result_dict
......@@ -1004,13 +1080,10 @@ class PerplexityTask(Task):
assert k == 0
return []
def fewshot_context(self, doc, num_fewshot, rnd=None):
def fewshot_context(self, doc, num_fewshot):
assert (
num_fewshot == 0
), "The number of fewshot examples must be 0 for perplexity tasks."
assert (
rnd is not None
), "A `random.Random` generator argument must be provided to `rnd`."
return ""
......
......@@ -45,6 +45,7 @@ def simple_evaluate(
check_integrity=False,
decontamination_ngrams_path=None,
write_out=False,
log_samples=True,
):
"""Instantiate and evaluate a model on a list of tasks.
......@@ -72,12 +73,17 @@ def simple_evaluate(
:param check_integrity: bool
Whether to run the relevant part of the test suite for the tasks
:param write_out: bool
If True, write details about prompts and logits to json for all tasks
If True, write out an example document and model input for checking task integrity
:param log_samples: bool
If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
:return
Dictionary of results
"""
random.seed(1234)
random.seed(0)
np.random.seed(1234)
torch.manual_seed(
1234
) # TODO: this may affect training runs that are run with evaluation mid-run.
assert tasks != [], "No tasks specified"
......@@ -118,6 +124,7 @@ def simple_evaluate(
bootstrap_iters=bootstrap_iters,
decontamination_ngrams_path=decontamination_ngrams_path,
write_out=write_out,
log_samples=log_samples,
)
if lm.rank == 0:
......@@ -154,6 +161,7 @@ def evaluate(
bootstrap_iters=100000,
decontamination_ngrams_path=None,
write_out=False,
log_samples=True,
):
"""Instantiate and evaluate a model on a list of tasks.
......@@ -168,7 +176,9 @@ def evaluate(
:param bootstrap_iters:
Number of iterations for bootstrap statistics
:param write_out: bool
If True, write all prompts, logits and metrics to json for offline analysis
If True, write out an example document and model input for checking task integrity
:param log_samples: bool
If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
:return
Dictionary of results
"""
......@@ -213,7 +223,10 @@ def evaluate(
# aggregate Instances by LM method requested to get output.
reqtype = (
"loglikelihood"
if (task.OUTPUT_TYPE == "multiple_choice" or task.OUTPUT_TYPE == "winograd_schema")
if (
task.OUTPUT_TYPE == "multiple_choice"
or task.OUTPUT_TYPE == "winograd_schema"
)
else task.OUTPUT_TYPE
) # TODO: this is hacky, fix in task.py
requests[reqtype].extend(task.instances)
......@@ -279,17 +292,18 @@ def evaluate(
metrics = task.process_results(
doc, [req.filtered_resps[key] for req in requests]
)
target = task.doc_to_target(doc)
example = {
"doc_id": doc_id,
"doc": doc,
"target": target,
"arguments": requests[0].args,
"resps": [req.resps for req in requests],
"filtered_resps": [req.filtered_resps[key] for req in requests],
}
example.update(metrics)
samples[task_name].append(example)
if log_samples:
target = task.doc_to_target(doc)
example = {
"doc_id": doc_id,
"doc": doc,
"target": target,
"arguments": [req.args for req in requests],
"resps": [req.resps for req in requests],
"filtered_resps": [req.filtered_resps[key] for req in requests],
}
example.update(metrics)
samples[task_name].append(example)
for metric, value in metrics.items():
vals[(task_name, key, metric)].append(value)
......@@ -359,12 +373,15 @@ def evaluate(
if stderr is not None:
results[task_name][metric + "_stderr" + "," + key] = stderr(items)
return {
results_dict = {
"results": dict(results),
"configs": dict(configs),
"versions": dict(versions),
"samples": samples,
}
if log_samples:
results_dict["samples"] = dict(samples)
return results_dict
else:
return None
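A hedged sketch of the new `log_samples` flag (top-level `simple_evaluate` re-export assumed): when disabled, the per-sample records are omitted from the returned dictionary, and non-primary ranks receive `None`:

```python
import lm_eval  # assumed to re-export simple_evaluate from lm_eval.evaluator

results = lm_eval.simple_evaluate(
    model="dummy",      # the stub model registered just below in this diff
    tasks=["anli_r1"],
    log_samples=False,  # skip collecting per-sample records
)
if results is not None:  # rank 0 only; other ranks get None
    assert "samples" not in results
```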
......@@ -6,7 +6,7 @@ from lm_eval.api.registry import register_model
@register_model("dummy")
class DummyLM(LM):
def __init__(self):
pass
super().__init__()
@classmethod
def create_from_arg_string(cls, arg_string, additional_config=None):
......
......@@ -71,6 +71,7 @@ class HFLM(LM):
max_batch_size: Optional[int] = 64,
low_cpu_mem_usage: Optional[bool] = True,
trust_remote_code: Optional[bool] = False,
use_fast_tokenizer: Optional[bool] = True,
# arguments used for splitting a model across GPUs naively.
# only used if `parallelize=True`.
parallelize: Optional[bool] = False,
......@@ -99,7 +100,7 @@ class HFLM(LM):
if not (parallelize or accelerator.num_processes > 1):
# use user-passed device
device_list = set(
["cuda", "cpu"]
["cuda", "cpu", "mps"]
+ [f"cuda:{i}" for i in range(torch.cuda.device_count())]
)
if device:
......@@ -107,6 +108,10 @@ class HFLM(LM):
device = int(device)
self._device = torch.device(device)
eval_logger.info(f"Using device '{device}'")
if device == "mps":
eval_logger.info(
"MPS is still in beta and only supports float32; setting dtype to float32."
)
else:
eval_logger.info("Device not specified")
eval_logger.info(f"Cuda Available? {torch.cuda.is_available()}")
......@@ -217,6 +222,7 @@ class HFLM(LM):
pretrained if tokenizer is None else tokenizer,
revision=revision,
trust_remote_code=trust_remote_code,
use_fast=use_fast_tokenizer,
)
self.vocab_size = self.tokenizer.vocab_size
......
......@@ -24,21 +24,18 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [x] HellaSwag
- [x] SWAG
- [x] OpenBookQA
- [x] RACE
- [ ] LogiQA (WIP)
- [x] HellaSwag
- [x] SWAG
- [x] OpenBookQA
- [ ] SQuADv2 (WIP)
- [x] RACE
- [x] HeadQA (WIP)
- [x] HeadQA
- [ ] MathQA (WIP)
- [ ] WebQs
- [ ] WSC273
- [x] Winogrande
- [x] ANLI
- [x] Hendrycks Ethics (missing some tasks/metrics, see PR 660: <https://github.com/EleutherAI/lm-evaluation-harness/pull/660> for more info)
- [ ] TruthfulQA
- [x] TruthfulQA (mc1)
- [ ] TruthfulQA (mc2)
- [ ] TruthfulQA (gen)
- [ ] MuTual
- [ ] Hendrycks Math (WIP)
- [ ] Asdiv (WIP)
......@@ -51,7 +48,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
- [ ] BLiMP
- [x] ToxiGen
- [ ] StoryCloze
- [ ] NaturalQs
- [ ] NaturalQs (WIP)
- [ ] CrowS-Pairs
- [ ] XCopa
- [ ] BIG-Bench
......
# ANLI
### Paper
Title: `Adversarial NLI: A New Benchmark for Natural Language Understanding`
Abstract: `https://arxiv.org/pdf/1910.14599.pdf`
Adversarial NLI (ANLI) is a dataset collected via an iterative, adversarial human-and-model-in-the-loop procedure. It consists of three rounds that progressively increase in difficulty and complexity, and each question-answer pair includes annotator-provided explanations.
Homepage: `https://github.com/facebookresearch/anli`
### Citation
```
@inproceedings{nie-etal-2020-adversarial,
title = "Adversarial {NLI}: A New Benchmark for Natural Language Understanding",
author = "Nie, Yixin and
Williams, Adina and
Dinan, Emily and
Bansal, Mohit and
Weston, Jason and
Kiela, Douwe",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
year = "2020",
publisher = "Association for Computational Linguistics",
}
```
### Subtasks
The following tasks are defined in this folder:
* `anli_r1`: The data collected adversarially in the first round.
* `anli_r2`: The data collected adversarially in the second round, after training on the previous round's data.
* `anli_r3`: The data collected adversarially in the third round, after training on the previous multiple rounds of data.
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group:
- multiple_choice
- natural_language_inference
- nli
- adversarial
task: anli_r1
dataset_path: anli
dataset_name: null
output_type: multiple_choice
training_split: train_r1
validation_split: dev_r1
test_split: test_r1
doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither?\nAnswer:"
# True = entailment
# False = contradiction
# Neither = neutral
doc_to_target: "{{['True', 'Neither', 'False'][label]}}"
doc_to_choice:
- "True"
- "Neither"
- "False"
should_decontaminate: true
doc_to_decontamination_query: premise
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
group:
- multiple_choice
- natural_language_inference
- nli
- adversarial
task: anli_r2
dataset_path: anli
dataset_name: null
output_type: multiple_choice
training_split: train_r2
validation_split: dev_r2
test_split: test_r2
doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither?\nAnswer:"
# True = entailment
# False = contradiction
# Neither = neutral
doc_to_target: "{{['True', 'Neither', 'False'][label]}}"
doc_to_choice:
- "True"
- "Neither"
- "False"
should_decontaminate: true
doc_to_decontamination_query: premise
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
group:
- multiple_choice
- natural_language_inference
- nli
- adversarial
task: anli_r3
dataset_path: anli
dataset_name: null
output_type: multiple_choice
training_split: train_r3
validation_split: dev_r3
test_split: test_r3
doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither?\nAnswer:"
# True = entailment
# False = contradiction
# Neither = neutral
doc_to_target: "{{['True', 'Neither', 'False'][label]}}"
doc_to_choice:
- "True"
- "Neither"
- "False"
should_decontaminate: true
doc_to_decontamination_query: premise
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
include: arc_easy.yaml
group:
- ai2_arc
- multiple_choice
task: arc_challenge
dataset_path: ai2_arc
dataset_name: ARC-Challenge
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
template_aliases: "{% set answer_choices = choices['text'] %}{% set gold = choices.label.index(answerKey) %}" # set the list of possible answer choices, and set what this doc's gold answer is (set what ds column used, and what)
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{answer_choices[gold]}}"
gold_alias: "{{gold}}" # this will be cast to an int.
metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
# - metric: acc_mutual_info
# aggregation: mean
# higher_is_better: true
......@@ -8,10 +8,11 @@ output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
template_aliases: "{% set answer_choices = choices['text'] %}{% set gold = choices.label.index(answerKey) %}" # set the list of possible answer choices, and set what this doc's gold answer is (set what ds column used, and what)
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{answer_choices[gold]}}"
gold_alias: "{{gold}}" # this will be cast to an int.
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
should_decontaminate: true
doc_to_decontamination_query: "Question: {{question}}\nAnswer:"
metric_list:
- metric: acc
aggregation: mean
......
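Following the string branch of `ConfigurableTask.doc_to_choice` shown earlier, the new `doc_to_choice`/`doc_to_target` templates are rendered with Jinja2 and then parsed with `ast.literal_eval`; a hedged standalone sketch (assuming `utils.apply_template` is a thin Jinja2 wrapper):

```python
import ast
from jinja2 import Template

# A toy ai2_arc-style document; field names follow the dataset schema above.
doc = {
    "question": "Which gas do plants absorb?",
    "choices": {"text": ["CO2", "O2", "N2", "H2"], "label": ["A", "B", "C", "D"]},
    "answerKey": "A",
}

# doc_to_choice: "{{choices.text}}" renders to "['CO2', 'O2', 'N2', 'H2']",
# which ast.literal_eval turns back into a Python list.
choices = ast.literal_eval(Template("{{choices.text}}").render(doc))
# doc_to_target: "{{choices.label.index(answerKey)}}" renders to "0",
# which the isdigit() branch casts to an int.
target = ast.literal_eval(Template("{{choices.label.index(answerKey)}}").render(doc))
assert choices == ["CO2", "O2", "N2", "H2"] and target == 0
```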
group:
- glue-promptsource
task: qnli
dataset_path: glue
dataset_name: qnli
output_type: multiple_choice
training_split: train
validation_split: validation
use_prompt: "promptsource:have all you need"
metric_list:
- metric: acc
......@@ -7,10 +7,11 @@ output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
template_aliases: "{% set answer_choices = answers|map(attribute='atext')|list %}{% set gold = ra - 1 %}" # set the list of possible answer choices, and set what this doc's gold label idx is
doc_to_text: "Question: {{qtext}}\nAnswer:"
doc_to_target: "{{answer_choices[gold]}}"
gold_alias: "{{gold}}" # this will be cast to an int.
doc_to_target: "{{ra - 1}}"
doc_to_choice: "{{answers|map(attribute='atext')|list}}" # this will be cast to an int.
should_decontaminate: true
doc_to_decontamination_query: query
metric_list:
- metric: acc
aggregation: mean
......