Commit 6a6a0ebb authored by Benjamin Fattori

Merge remote-tracking branch 'upstream/big-refactor' into big-refactor-autobatching

parents e4acfcaa 2820042d
name: Tasks Modified
on:
  push:
    branches:
      - big-refactor
  pull_request:
    branches:
      - big-refactor
  workflow_dispatch:
jobs:
  changed_files:
    runs-on: ubuntu-latest  # windows-latest || macos-latest
    name: Scan for changed tasks
    steps:
      - name: checkout
        uses: actions/checkout@v3
        with:
          fetch-depth: 0  # OR "2" -> To retrieve the preceding commit.
      # Example 1
      - name: Check task folders
        id: changed-tasks
        uses: tj-actions/changed-files@v37.1.2
        with:
          files_yaml: |
            tasks:
              - lm_eval/tasks/**
            api:
              - lm_eval/api/**
          write_output_files: true
      - name: Run Tests
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        run: |
          echo .github/outputs/tasks_all_changed_and_modified_files.txt >> 'GITHUB_ENV'
          echo "One or more test file(s) has changed."
          echo "List of all the files that have changed: ${{ steps.changed-tasks.outputs.tasks_all_modified_files }}"
      - name: Set up Python 3.9
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
          cache: 'pip'
      - name: Install dependencies
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true'
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
          # Install optional git dependencies
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Test with pytest
        if: steps.changed-tasks.outputs.tasks_any_modified == 'true'
        run: python -m pytest tests/test_tasks.py -s -vv -n=auto --new_task
      - name: Test more tasks with pytest
        env:
          API: true
        if: steps.changed-tasks.outputs.api_any_modified == 'true'
        run: python -m pytest tests/test_api.py -s -vv -n=auto --new_task
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
name: Build
on:
  push:
    branches: [ master ]
  pull_request:
    branches: [ master ]
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - name: Cache
        uses: actions/cache@v2.1.3
        with:
          # A list of files, directories, and wildcard patterns to cache and restore
          path: |
            ~/.cache
          # An explicit key for restoring and saving the cache
          key: evaldata-cache-4
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install flake8 pytest pytest-cov
          pip install -e .[dev,multilingual]
          # Install optional git dependencies
          pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      - name: Test with pytest
        run: |
          pytest -vv --cov=lm_eval/ tests/
      - name: Upload to codecov
        run: |
          bash <(curl -s https://codecov.io/bash) -t $CODECOV_TOKEN
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
name: Unit Tests
on:
  push:
    branches:
      - big-refactor
  pull_request:
    branches:
      - big-refactor
  workflow_dispatch:
jobs:
  linter:
    name: Linters
    runs-on: ubuntu-latest
    timeout-minutes: 20
    steps:
      - name: Checkout Code
        uses: actions/checkout@v3
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
      - name: Install dependencies
        run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
      - name: Lint with pylint
        run: python -m pylint --disable=all -e W0311 --jobs=0 --indent-string=' ' **/*.py
      - name: Lint with flake8
        run: |
          # stop the build if there are Python syntax errors or undefined names
          flake8 . --count --select=F,E9,E71,E72,E501,E112,E113,W6 --extend-ignore=F541 --show-source --statistics --exit-zero
          # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
          flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
      - name: Lint with mypy
        run: mypy . --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
  testcpu:
    name: CPU Tests
    runs-on: ubuntu-latest
    timeout-minutes: 20
    steps:
      - name: Checkout Code
        uses: actions/checkout@v3
      - name: Set up Python 3.9
        uses: actions/setup-python@v4
        with:
          python-version: 3.9
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
          # Install optional git dependencies
          # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt
          # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
      - name: Test with pytest
        run: python -m pytest -s -v -n=auto --ignore=tests/tests_master --ignore=tests/extra
@@ -32,6 +32,7 @@ Prompting / in-context formatting options:
 - **use_prompt** (`str`, *optional*) — Name of prompt in promptsource to use. if defined, will overwrite doc_to_text and doc_to_target and make template_aliases unused.
 - **doc_to_text** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the appropriate input for the model
 - **doc_to_target** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the appropriate target output for the model.
+- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into possible choices for `multiple_choice`
 - **gold_alias** (`str`, *optional*, defaults to None) — if provided, used to generate the reference answer that is scored against. Used in cases where `doc_to_target` should be the "target string" format appended to each example's input for a fewshot exemplar, so doc_to_target is used for fewshot examples, but the input to the metric function as `gold` is from `gold_alias`.
 - **fewshot_delimiter** (`str`, *optional*, defaults to "\n\n") — String to insert between few-shot examples.
 - **target_delimiter** (`str`, *optional*, defaults to `" "`) — String to insert between input and target output for the datapoint being tested.
...
@@ -4,13 +4,13 @@ from typing import Literal, Tuple
 @dataclass
 class Instance:
-    request_type: str = Literal[
-        "loglikelihood", "loglikelihood_rolling", "greedy_until"
-    ]
-    doc: dict = None
-    arguments: tuple = None
-    idx: int = None
-    metadata: tuple = Tuple[str, int, int]  # TODO: better typehints here
+    request_type: Literal["loglikelihood", "loglikelihood_rolling", "greedy_until"]
+    doc: dict
+    arguments: tuple
+    idx: int
+    metadata: Tuple[str, int, int] = field(
+        default_factory=lambda: (None, None, None)
+    )  # TODO: better typehints here
     resps: list = field(default_factory=list)
     filtered_resps: dict = field(default_factory=dict)
...
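For orientation, a hedged construction sketch against the new dataclass shape; the module path and the meaning of the `metadata` tuple are assumptions, not taken from this diff:

```python
# Minimal sketch: the first four fields are now required, metadata defaults to (None, None, None).
from lm_eval.api.instance import Instance  # assumed module path

inst = Instance(
    request_type="loglikelihood",
    doc={"question": "2+2=", "answer": "4"},      # illustrative document
    arguments=("Question: 2+2=\nAnswer:", " 4"),  # (context, continuation)
    idx=0,
    metadata=("my_task", 0, 1),                   # assumed to mean (task_name, doc_id, repeats)
)
print(inst.metadata, inst.resps)                  # ('my_task', 0, 1) []
```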
@@ -114,6 +114,8 @@ class LM(abc.ABC):
         additional_config = {} if additional_config is None else additional_config
         args = utils.simple_parse_args_string(arg_string)
         args2 = {k: v for k, v in additional_config.items() if v is not None}
+        if args2.get("device") == "mps" or args.get("device") == "mps":
+            args["dtype"] = "float32"
         return cls(**args, **args2)

     @property
...
@@ -10,6 +10,10 @@ class Sampler:
         self.target_delimiter = self.config.target_delimiter
         self.fewshot_delimiter = self.config.fewshot_delimiter

+        self.doc_to_text = self.task.doc_to_text
+        self.doc_to_target = self.task.doc_to_target
+        self.doc_to_choice = self.task.doc_to_choice
+
         self.docs = docs  # HF dataset split, provided by task._fewshot_docs()
         if fewshot_indices:  # subset few-shot docs from
             self.docs = self.docs.select(fewshot_indices)
@@ -34,16 +38,29 @@ class Sampler:
             self.fewshot_delimiter.join(
                 [
                     # TODO: is separating doc_to_text and doc_to_target by one space always desired?
-                    self.task.doc_to_text(doc)
+                    (
+                        self.doc_to_text(doc)
+                        if (
+                            self.config.doc_to_choice is None
+                            or type(self.doc_to_text(doc)) is str
+                        )
+                        else self.doc_to_choice(doc)[self.doc_to_text(doc)]
+                    )
                     + self.target_delimiter
-                    + self.task.doc_to_target(doc)
+                    + (
+                        self.doc_to_target(doc)
+                        if (
+                            self.config.doc_to_choice is None
+                            or type(self.doc_to_target(doc)) is str
+                        )
+                        else self.doc_to_choice(doc)[self.doc_to_target(doc)]
+                    )
                     for doc in selected_docs
                 ]
             )
             + self.fewshot_delimiter
         )
+        # only returns the fewshot context! Does not append the document, do this outside the object
         return labeled_examples

     def sample(self, n):
...
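The branching added above boils down to one rule: if `doc_to_text`/`doc_to_target` yields an index instead of a string, the few-shot example substitutes the corresponding entry of `doc_to_choice`. A standalone illustration of that rule (plain Python, not the Sampler API):

```python
# Standalone illustration of the index-vs-string rule used in the fewshot context above.
def resolve(value, choices):
    """Strings pass through unchanged; integers are treated as an index into the choice list."""
    return value if isinstance(value, str) else choices[value]

choices = ["True", "Neither", "False"]              # e.g. what doc_to_choice(doc) could return
print(resolve("Question: ...\nAnswer:", choices))   # -> the string itself
print(resolve(1, choices))                          # -> "Neither"
```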
@@ -8,6 +8,7 @@ import evaluate
 import random
 import itertools
 import functools
+from tqdm import tqdm
 import datasets
 import numpy as np
@@ -27,6 +28,7 @@ from lm_eval.api.metrics import (
     mean,
     weighted_perplexity,
     bits_per_byte,
+    metric_max_over_ground_truths,
 )
 from lm_eval.api.registry import (
     get_metric,
@@ -43,7 +45,6 @@ ALL_OUTPUT_TYPES = [
     "multiple_choice",
     "loglikelihood_rolling",
     "greedy_until",
-    "winograd_schema",
 ]
@@ -64,9 +65,10 @@ class TaskConfig(dict):
     fewshot_split: str = None  # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
     # formatting / prompting options.
     # see docs/advanced_task_guide.md for more info
-    template_aliases: str = ""
+    template_aliases: Union[str, list] = None
     doc_to_text: Union[Callable, str] = None
     doc_to_target: Union[Callable, str] = None
+    doc_to_choice: Union[Callable, str, dict, list] = None
     gold_alias: Union[Callable, str] = None
     use_prompt: str = None
     description: str = ""
@@ -76,8 +78,6 @@ class TaskConfig(dict):
     num_fewshot: int = 0
     # scoring options
     metric_list: str = None
-    gold_alias: Union[Callable, str] = None
-    create_choices: Union[Callable, str] = None
     output_type: str = "greedy_until"
     generation_kwargs: dict = None
     repeats: int = 1
@@ -217,8 +217,8 @@ class Task(abc.ABC):
             self._filters.append(filter_pipeline)

         self.sampler = samplers.Sampler(
-            list(self.fewshot_docs()), self, rnd=random.Random()
-        )  # TODO: pass the correct docs in here
+            list(self.fewshot_docs()), self, rnd=random.Random(1234)
+        )

     def download(self, data_dir=None, cache_dir=None, download_mode=None):
         """Downloads and returns the task dataset.
@@ -316,18 +316,6 @@ class Task(abc.ABC):
         """
         return doc

-    def create_choices(self, doc):
-        if self._config.create_choices is None:
-            return ast.literal_eval(
-                utils.apply_template(
-                    self._config.template_aliases + "{{answer_choices}}", doc
-                )
-            )
-        elif type(self._config.create_choices) == str:
-            return utils.apply_template(self._config.create_choices, doc)
-        else:
-            return self._config.create_choices(doc)
-
     @property
     def instances(self):
         """After calling `task.build_all_requests()`, tasks
@@ -366,13 +354,18 @@
                 False
             ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"

+        eval_logger.info(
+            f"Building contexts for task '{self._config.task}' on rank {rank}..."
+        )
+
         instances = []
         for doc_id, doc in utils.create_iterator(
             enumerate(docs), rank, world_size, limit
         ):
             # sample fewshot context #TODO: need to offset doc_id by rank now!
             fewshot_ctx = self.fewshot_context(
-                doc, self._config.num_fewshot, rnd=random.Random()
+                doc,
+                self._config.num_fewshot,
             )

             # TODO: we should override self._config.repeats if doing greedy gen so users don't waste time+compute
@@ -453,7 +446,7 @@
         return len(re.split(r"\s+", doc))

     @utils.positional_deprecated
-    def fewshot_context(self, doc, num_fewshot, rnd=None):
+    def fewshot_context(self, doc, num_fewshot):
         """Returns a fewshot context string that is made up of a prepended description
         (if provided), the `num_fewshot` number of examples, and an appended prompt example.
@@ -461,15 +454,9 @@
             The document as returned from training_docs, validation_docs, or test_docs.
         :param num_fewshot: int
             The number of fewshot examples to provide in the returned context string.
-        :param rnd: random.Random
-            The pseudo-random number generator used to randomly sample examples.
-            WARNING: This is currently a required arg although it's optionalized with a default `None`.
         :returns: str
             The fewshot context.
         """
-        assert (
-            rnd is not None
-        ), "A `random.Random` generator argument must be provided to `rnd`"

         if num_fewshot == 0:
             # always prepend the (possibly empty) task description
@@ -480,7 +467,10 @@
             )

         example = self.doc_to_text(doc)
-        return labeled_examples + example
+        if type(example) == str:
+            return labeled_examples + example
+        elif type(example) == list:
+            return [labeled_examples + ex for ex in example]

     def apply_filters(self):
@@ -625,9 +615,43 @@ class ConfigurableTask(Task):
         if self.fewshot_docs() is not None:
             self.sampler = samplers.Sampler(
-                list(self.fewshot_docs()), self, rnd=random.Random()
+                list(self.fewshot_docs()), self, rnd=random.Random(1234)
             )

+        if self._config.template_aliases is not None:
+            for key, alias in self._config.template_aliases:
+                self.dataset.rename_column(key, alias)
+
+        if self.has_test_docs():
+            docs = self.test_docs()
+        elif self.has_validation_docs():
+            docs = self.validation_docs()
+        else:
+            assert (
+                False
+            ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
+
+        # Test One Doc
+        self.features = list(docs.features.keys())
+        self.multiple_input = 0
+        self.multiple_target = 0
+        test_doc = docs[0]
+        test_text = self.doc_to_text(test_doc)
+        test_target = self.doc_to_target(test_doc)
+
+        if self._config.doc_to_choice is not None:
+            test_choice = self.doc_to_choice(test_doc)
+            if type(test_choice) is not list:
+                eval_logger.error("doc_to_choice must return list")
+            else:
+                num_choice = len(test_choice)
+
+            if type(test_text) is int:
+                self.multiple_input = num_choice
+
+        if type(test_target) is list:
+            self.multiple_target = len(test_target)
+
     def download(self, dataset_kwargs=None):
         self.dataset = datasets.load_dataset(
@@ -683,7 +707,12 @@ class ConfigurableTask(Task):
     def doc_to_decontamination_query(self, doc):
         if self._config.should_decontaminate:
-            return utils.apply_template(self._config.doc_to_decontamination_query, doc)
+            if self._config.doc_to_decontamination_query in self.features:
+                return doc[self._config.doc_to_decontamination_query]
+            else:
+                return ast.literal_eval(
+                    utils.apply_template(self._config.doc_to_decontamination_query, doc)
+                )

     def _process_doc(self, doc):
         """
@@ -703,11 +732,24 @@ class ConfigurableTask(Task):
         else:
             doc_to_text = self._config.doc_to_text

-        if type(doc_to_text) == str:
-            return utils.apply_template(doc_to_text, doc)
+        if type(doc_to_text) == int:
+            return doc_to_text
+        elif type(doc_to_text) == str:
+            if doc_to_text in self.features:
+                # if self._config.doc_to_choice is not None:
+                #     return self.doc_to_choice(doc)[doc[doc_to_text]]
+                # else:
+                return doc[doc_to_text]
+            else:
+                text_string = utils.apply_template(doc_to_text, doc)
+                if text_string.isdigit():
+                    return ast.literal_eval(text_string)
+                else:
+                    return text_string
         elif callable(doc_to_text):
             return doc_to_text(doc)
-        if hasattr(doc_to_text, "apply"):
+        # Used when applying a Promptsource template
+        elif hasattr(doc_to_text, "apply"):
             return doc_to_text.apply(doc)[0]
         else:
             print(type(doc_to_text))
@@ -720,15 +762,50 @@ class ConfigurableTask(Task):
         else:
             doc_to_target = self._config.doc_to_target

-        if type(doc_to_target) == str:
-            return utils.apply_template(doc_to_target, doc)
+        if type(doc_to_target) == int:
+            return doc_to_target
+        elif type(doc_to_target) == str:
+            if doc_to_target in self.features:
+                # if self._config.doc_to_choice is not None:
+                #     return self.doc_to_choice(doc)[doc[doc_to_target]]
+                # else:
+                return doc[doc_to_target]
+            else:
+                target_string = utils.apply_template(doc_to_target, doc)
+                if target_string.isdigit():
+                    return ast.literal_eval(target_string)
+                else:
+                    return target_string
         elif callable(doc_to_target):
             return doc_to_target(doc)
+        # Used when applying a Promptsource template
         elif hasattr(doc_to_target, "apply"):
             return doc_to_target.apply(doc)[1]
         else:
             raise TypeError

+    def doc_to_choice(self, doc):
+        if self.prompt is not None:
+            doc_to_choice = self.prompt
+        elif self._config.doc_to_choice is None:
+            eval_logger.error("doc_to_choice was called but not set in config")
+        else:
+            doc_to_choice = self._config.doc_to_choice
+
+        if type(doc_to_choice) == str:
+            return ast.literal_eval(utils.apply_template(doc_to_choice, doc))
+        elif type(doc_to_choice) == list:
+            return doc_to_choice
+        elif type(doc_to_choice) == dict:
+            return list(doc_to_choice.values())
+        elif callable(doc_to_choice):
+            return doc_to_choice(doc)
+        elif hasattr(doc_to_choice, "get_answer_choices_list"):
+            return doc_to_choice.get_answer_choices_list(doc)
+        else:
+            raise TypeError
+
     def gold_alias(self, doc):
         # returns a version of the gold target answer to a document,
         # which should be passed into metric for scoring as the ground truth.
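As a reference for the four config forms the new `doc_to_choice` accepts, a hedged sketch of what each one yields; the document and templates are invented, and plain `jinja2` stands in for `utils.apply_template`, which is assumed to be Jinja2-based:

```python
import ast
from jinja2 import Template  # stand-in for utils.apply_template (assumed Jinja2-based)

doc = {"choices": ["True", "Neither", "False"], "label": 1}

# str: rendered with Jinja2, then parsed as a Python list literal
print(ast.literal_eval(Template("{{choices}}").render(**doc)))   # ['True', 'Neither', 'False']

# list: returned unchanged for every document
print(["True", "Neither", "False"])

# dict: only the values are used, in insertion order
print(list({"entailment": "True", "neutral": "Neither", "contradiction": "False"}.values()))

# callable: simply called on the document
print((lambda d: d["choices"])(doc))
```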
@@ -756,19 +833,25 @@ class ConfigurableTask(Task):
         elif self.OUTPUT_TYPE == "loglikelihood_rolling":
             arguments = (self.doc_to_target(doc),)
         elif self.OUTPUT_TYPE == "multiple_choice":
-            # we pass the user-defined answer_choices var (in aliases) and translate the result to a Python list.
-            # TODO: any cleaner way to do this?
-            choices = self.create_choices(doc)
+            choices = self.doc_to_choice(doc)
+            if self.multiple_input:
+                # If there are multiple inputs, choices are placed in the ctx
+                cont = self.doc_to_target(doc)
+                arguments = [(ctx, " {}".format(cont)) for ctx in choices]
+            else:
+                # Otherwise they are placed in the continuation
+                arguments = [(ctx, " {}".format(cont)) for cont in choices]
             request_list = [
                 Instance(
                     request_type="loglikelihood",
                     doc=doc,
-                    arguments=(ctx, " {}".format(choice)),
+                    arguments=arg,
                     idx=i,
                     **kwargs,
                 )
-                for i, choice in enumerate(choices)
+                for i, arg in enumerate(arguments)
             ]
             # TODO: we should raise a warning telling users this will at most ~2x runtime.
             if "acc_mutual_info" in self._metric_fn_list.keys():
@@ -795,26 +878,6 @@ class ConfigurableTask(Task):
         elif self.OUTPUT_TYPE == "greedy_until":
             arguments = (ctx, self._config.generation_kwargs)

-        elif self.OUTPUT_TYPE == "winograd_schema":
-            # similar to multiple_choice task type except each request contains
-            # multiple differing contexts with the same continuation
-            contexts = self.create_choices(doc)
-            choice = self.doc_to_target(doc)
-            request_list = [
-                Instance(
-                    request_type="loglikelihood",
-                    doc=doc,
-                    arguments=(context, " {}".format(choice)),
-                    idx=i,
-                    **kwargs,
-                )
-                for i, context in enumerate(contexts)
-            ]
-            return request_list
-
         return Instance(
             request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs
         )
@@ -857,13 +920,11 @@ class ConfigurableTask(Task):
         elif self.OUTPUT_TYPE == "multiple_choice":
             lls, is_greedy = zip(*results)
-            if self._config.gold_alias is not None:
-                gold = int(self.gold_alias(doc))
-            else:
-                gold = int(self.doc_to_target(doc))
-
             # retrieve choices in List[str] form, to compute choice lengths, etc.
-            choices = self.create_choices(doc)
+            choices = self.doc_to_choice(doc)
+            completion_len = np.array([float(len(i)) for i in choices])
+
             if (
                 2 * len(choices) == len(lls)
                 and "acc_mutual_info" in self._metric_fn_list.keys()
@@ -876,23 +937,33 @@ class ConfigurableTask(Task):
                 lls = lls[::2]

             pred = np.argmax(lls)
+            pred_norm = np.argmax(lls / completion_len)

-            acc = 1.0 if np.argmax(lls) == gold else 0.0
-            completion_len = np.array([float(len(i)) for i in choices])
-            acc_norm = 1.0 if np.argmax(lls / completion_len) == gold else 0.0
+            if self.multiple_input:
+                gold = self.doc_to_text(doc)
+            else:
+                gold = self.doc_to_target(doc)
+
+            if type(gold) is str:
+                gold = choices.index(gold)
+
+            if self.multiple_target:
+                acc = 1.0 if pred in gold else 0.0
+                acc_norm = 1.0 if pred_norm in gold else 0.0
+                exact_match = int(any([is_greedy[i] for i in gold]))
+            else:
+                acc = 1.0 if pred == gold else 0.0
+                acc_norm = 1.0 if pred_norm == gold else 0.0
+                # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
+                exact_match = int(is_greedy[gold])

             result_dict = {
                 **({"acc": acc} if "acc" in use_metric else {}),
                 **({"f1": (gold, pred)} if "f1" in use_metric else {}),
                 **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
                 **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
+                **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
             }

-            if "exact_match" in self._metric_fn_list.keys():
-                # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
-                is_greedy = is_greedy[gold]  # take value for the gold answer
-                result_dict["exact_match"] = int(is_greedy)
-
             if "acc_mutual_info" in use_metric:
                 lls_mutual_info = [
                     ll_c - ll_u for ll_c, ll_u in zip(lls, lls_unconditional)
@@ -900,40 +971,45 @@ class ConfigurableTask(Task):
                 acc_mutual_info = 1.0 if np.argmax(lls_mutual_info) == gold else 0.0
                 result_dict["acc_mutual_info"] = acc_mutual_info

-        elif self.OUTPUT_TYPE == "winograd_schema":
-            lls, is_greedy = zip(*results)
-
-            if self._config.gold_alias is not None:
-                gold = int(self.gold_alias(doc))
-            else:
-                gold = int(self.doc_to_target(doc))
-
-            pred = np.argmax(lls)
-            acc = 1.0 if np.argmax(lls) == gold else 0.0
-
-            result_dict = {
-                **({"acc": acc} if "acc" in use_metric else {}),
-            }
-
         elif self.OUTPUT_TYPE == "greedy_until":
-            if self._config.gold_alias is not None:
-                gold = self.gold_alias(doc)
-            else:
-                gold = self.doc_to_target(doc)
+            gold = self.doc_to_target(doc)

             for key, result in zip(self._metric_fn_list.keys(), results):
-                _dict = self._metric_fn_list[key](
-                    references=[gold],
-                    predictions=[result],
-                    **self._metric_fn_kwargs[key],
-                )
-
-                result_dict = {**result_dict, **_dict}
+                if self.multiple_target:
+                    # in the case where we have multiple targets,
+                    # return true if any are true
+                    # TODO: this may break for multipLe_target, non zero-or-1 metrics
+                    scores = []
+                    for gold_option in gold:
+                        res = self._metric_fn_list[key](
+                            references=[gold_option],
+                            predictions=[result],
+                            **self._metric_fn_kwargs[key],
+                        )
+                        if isinstance(res, dict):
+                            # TODO: this handles the case where HF evaluate returns a dict.
+                            res = res[key]
+                        scores.append(res)
+                    if any(scores):
+                        result = 1.0
+                    else:
+                        result = 0.0
+                else:
+                    result = self._metric_fn_list[key](
+                        references=[gold],
+                        predictions=[result],
+                        **self._metric_fn_kwargs[key],
+                    )
+
+                if isinstance(result, dict):
+                    result_dict.update(result)
+                else:
+                    result_dict[key] = result
         else:
             raise ValueError(
                 f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
-                "'loglikelihood', 'loglikelihood_rolling', 'greedy_until', 'multiple_choice' or 'winograd_schema' ",
+                "'loglikelihood', 'loglikelihood_rolling', 'greedy_until' or 'multiple_choice'",
             )

         return result_dict
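A small worked example of the multiple-choice scoring path above, with made-up log-likelihoods, showing how `acc` and `acc_norm` fall out of `pred` and `pred_norm`:

```python
import numpy as np

choices = ["True", "Neither", "False"]
lls = np.array([-12.3, -9.1, -15.8])         # made-up per-choice log-likelihoods
completion_len = np.array([float(len(c)) for c in choices])
gold = 1                                     # index of the correct choice

pred = np.argmax(lls)                        # raw argmax -> 1
pred_norm = np.argmax(lls / completion_len)  # length-normalized argmax -> 1

acc = 1.0 if pred == gold else 0.0
acc_norm = 1.0 if pred_norm == gold else 0.0
print(acc, acc_norm)                         # 1.0 1.0
```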
@@ -1004,13 +1080,10 @@ class PerplexityTask(Task):
         assert k == 0
         return []

-    def fewshot_context(self, doc, num_fewshot, rnd=None):
+    def fewshot_context(self, doc, num_fewshot):
         assert (
             num_fewshot == 0
         ), "The number of fewshot examples must be 0 for perplexity tasks."
-        assert (
-            rnd is not None
-        ), "A `random.Random` generator argument must be provided to `rnd`."

         return ""
...
@@ -45,6 +45,7 @@ def simple_evaluate(
     check_integrity=False,
     decontamination_ngrams_path=None,
     write_out=False,
+    log_samples=True,
 ):
     """Instantiate and evaluate a model on a list of tasks.
@@ -72,12 +73,17 @@ def simple_evaluate(
     :param check_integrity: bool
         Whether to run the relevant part of the test suite for the tasks
     :param write_out: bool
-        If True, write details about prompts and logits to json for all tasks
+        If True, write out an example document and model input for checking task integrity
+    :param log_samples: bool
+        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :return
        Dictionary of results
    """
-    random.seed(1234)
+    random.seed(0)
     np.random.seed(1234)
+    torch.manual_seed(
+        1234
+    )  # TODO: this may affect training runs that are run with evaluation mid-run.

     assert tasks != [], "No tasks specified"
@@ -118,6 +124,7 @@ def simple_evaluate(
             bootstrap_iters=bootstrap_iters,
             decontamination_ngrams_path=decontamination_ngrams_path,
             write_out=write_out,
+            log_samples=log_samples,
         )

     if lm.rank == 0:
@@ -154,6 +161,7 @@ def evaluate(
     bootstrap_iters=100000,
     decontamination_ngrams_path=None,
     write_out=False,
+    log_samples=True,
 ):
     """Instantiate and evaluate a model on a list of tasks.
@@ -168,7 +176,9 @@ def evaluate(
     :param bootstrap_iters:
         Number of iterations for bootstrap statistics
     :param write_out: bool
-        If True, write all prompts, logits and metrics to json for offline analysis
+        If True, write out an example document and model input for checking task integrity
+    :param log_samples: bool
+        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
     :return
         Dictionary of results
     """
@@ -213,7 +223,10 @@ def evaluate(
         # aggregate Instances by LM method requested to get output.
         reqtype = (
             "loglikelihood"
-            if (task.OUTPUT_TYPE == "multiple_choice" or task.OUTPUT_TYPE == "winograd_schema")
+            if (
+                task.OUTPUT_TYPE == "multiple_choice"
+                or task.OUTPUT_TYPE == "winograd_schema"
+            )
             else task.OUTPUT_TYPE
         )  # TODO: this is hacky, fix in task.py
         requests[reqtype].extend(task.instances)
@@ -279,17 +292,18 @@ def evaluate(
             metrics = task.process_results(
                 doc, [req.filtered_resps[key] for req in requests]
             )
-            target = task.doc_to_target(doc)
-            example = {
-                "doc_id": doc_id,
-                "doc": doc,
-                "target": target,
-                "arguments": requests[0].args,
-                "resps": [req.resps for req in requests],
-                "filtered_resps": [req.filtered_resps[key] for req in requests],
-            }
-            example.update(metrics)
-            samples[task_name].append(example)
+            if log_samples:
+                target = task.doc_to_target(doc)
+                example = {
+                    "doc_id": doc_id,
+                    "doc": doc,
+                    "target": target,
+                    "arguments": [req.args for req in requests],
+                    "resps": [req.resps for req in requests],
+                    "filtered_resps": [req.filtered_resps[key] for req in requests],
+                }
+                example.update(metrics)
+                samples[task_name].append(example)
             for metric, value in metrics.items():
                 vals[(task_name, key, metric)].append(value)
@@ -359,12 +373,15 @@ def evaluate(
         if stderr is not None:
             results[task_name][metric + "_stderr" + "," + key] = stderr(items)

-        return {
+        results_dict = {
             "results": dict(results),
             "configs": dict(configs),
             "versions": dict(versions),
-            "samples": samples,
         }
+        if log_samples:
+            results_dict["samples"] = dict(samples)
+
+        return results_dict
     else:
         return None
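A hedged usage sketch for the new `log_samples` flag; apart from `log_samples` and `write_out`, the keyword names and the registered model name are assumptions based on the upstream harness, not shown in this diff:

```python
# Sketch only: model/tasks keyword names and the "hf" model name are assumed,
# not confirmed by this diff; log_samples is the flag added above.
import lm_eval.evaluator as evaluator

results = evaluator.simple_evaluate(
    model="hf",                       # assumed registered name of the HF backend
    model_args="pretrained=gpt2",
    tasks=["arc_easy"],
    num_fewshot=0,
    log_samples=False,                # skip per-sample collection
)
print(results["results"])             # aggregate metrics per task
print("samples" in results)           # False when log_samples=False
```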
@@ -6,7 +6,7 @@ from lm_eval.api.registry import register_model
 @register_model("dummy")
 class DummyLM(LM):
     def __init__(self):
-        pass
+        super().__init__()

     @classmethod
     def create_from_arg_string(cls, arg_string, additional_config=None):
...
@@ -71,6 +71,7 @@ class HFLM(LM):
         max_batch_size: Optional[int] = 64,
         low_cpu_mem_usage: Optional[bool] = True,
         trust_remote_code: Optional[bool] = False,
+        use_fast_tokenizer: Optional[bool] = True,
         # arguments used for splitting a model across GPUs naively.
         # only used if `parallelize=True`.
         parallelize: Optional[bool] = False,
@@ -99,7 +100,7 @@ class HFLM(LM):
         if not (parallelize or accelerator.num_processes > 1):
             # use user-passed device
             device_list = set(
-                ["cuda", "cpu"]
+                ["cuda", "cpu", "mps"]
                 + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
             )
             if device:
@@ -107,6 +108,10 @@ class HFLM(LM):
                     device = int(device)
                 self._device = torch.device(device)
                 eval_logger.info(f"Using device '{device}'")
+                if device == "mps":
+                    eval_logger.info(
+                        "MPS is still in beta and only supports float32; setting dtype to float32."
+                    )
             else:
                 eval_logger.info("Device not specified")
                 eval_logger.info(f"Cuda Available? {torch.cuda.is_available()}")
@@ -217,6 +222,7 @@ class HFLM(LM):
             pretrained if tokenizer is None else tokenizer,
             revision=revision,
             trust_remote_code=trust_remote_code,
+            use_fast=use_fast_tokenizer,
         )
         self.vocab_size = self.tokenizer.vocab_size
...
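A hedged sketch of how the two additions surface when constructing the wrapper directly; `use_fast_tokenizer` and `device="mps"` come from this diff, while the module path and the remaining keyword arguments are assumed from the upstream harness:

```python
# Sketch, not a tested invocation; see the lead-in for which names are assumptions.
from lm_eval.models.huggingface import HFLM  # assumed module path

lm = HFLM(
    pretrained="gpt2",          # assumed kwarg from the upstream harness
    device="mps",               # now accepted; the wrapper logs that MPS implies float32
    use_fast_tokenizer=False,   # passed through to the tokenizer as use_fast=False
)
```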
@@ -24,21 +24,18 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [x] HellaSwag
 - [x] SWAG
 - [x] OpenBookQA
-- [x] RACE
-- [ ] LogiQA (WIP)
-- [x] HellaSwag
-- [x] SWAG
-- [x] OpenBookQA
 - [ ] SQuADv2 (WIP)
 - [x] RACE
-- [x] HeadQA (WIP)
+- [x] HeadQA
 - [ ] MathQA (WIP)
 - [ ] WebQs
 - [ ] WSC273
 - [x] Winogrande
 - [x] ANLI
 - [x] Hendrycks Ethics (missing some tasks/metrics, see PR 660: <https://github.com/EleutherAI/lm-evaluation-harness/pull/660> for more info)
-- [ ] TruthfulQA
+- [x] TruthfulQA (mc1)
+- [ ] TruthfulQA (mc2)
+- [ ] TruthfulQA (gen)
 - [ ] MuTual
 - [ ] Hendrycks Math (WIP)
 - [ ] Asdiv (WIP)
@@ -51,7 +48,7 @@ Boxes should be checked iff tasks are implemented in the refactor and tested for
 - [ ] BLiMP
 - [x] ToxiGen
 - [ ] StoryCloze
-- [ ] NaturalQs
+- [ ] NaturalQs (WIP)
 - [ ] CrowS-Pairs
 - [ ] XCopa
 - [ ] BIG-Bench
...
# ANLI
### Paper
Title: `Adversarial NLI: A New Benchmark for Natural Language Understanding`
Abstract: `https://arxiv.org/pdf/1910.14599.pdf`
Adversarial NLI (ANLI) is a dataset collected via an iterative, adversarial
human-and-model-in-the-loop procedure. It consists of three rounds that progressively
increase in difficulty and complexity, and each question-answer includes annotator-
provided explanations.
Homepage: `https://github.com/facebookresearch/anli`
### Citation
```
@inproceedings{nie-etal-2020-adversarial,
title = "Adversarial {NLI}: A New Benchmark for Natural Language Understanding",
author = "Nie, Yixin and
Williams, Adina and
Dinan, Emily and
Bansal, Mohit and
Weston, Jason and
Kiela, Douwe",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
year = "2020",
publisher = "Association for Computational Linguistics",
}
```
### Subtasks
List or describe tasks defined in this folder, and their names here:
* `anli_r1`: The data collected adversarially in the first round.
* `anli_r2`: The data collected adversarially in the second round, after training on the previous round's data.
* `anli_r3`: The data collected adversarially in the third round, after training on the previous multiple rounds of data.
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group:
- multiple_choice
- natural_language_inference
- nli
- adverserial
task: anli_r1
dataset_path: anli
dataset_name: null
output_type: multiple_choice
training_split: train_r1
validation_split: dev_r1
test_split: test_r1
doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither?\nAnswer:"
# True = entailment
# False = contradiction
# Neither = neutral
doc_to_target: "{{['True', 'Neither', 'False'][label]}}"
doc_to_choice:
- "True"
- "Neither"
- "False"
should_decontaminate: true
doc_to_decontamination_query: premise
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
group:
- multiple_choice
- natural_language_inference
- nli
- adverserial
task: anli_r2
dataset_path: anli
dataset_name: null
output_type: multiple_choice
training_split: train_r2
validation_split: dev_r2
test_split: test_r2
doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither?\nAnswer:"
# True = entailment
# False = contradiction
# Neither = neutral
doc_to_target: "{{['True', 'Neither', 'False'][label]}}"
doc_to_choice:
- "True"
- "Neither"
- "False"
should_decontaminate: true
doc_to_decontamination_query: premise
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
group:
- multiple_choice
- natural_language_inference
- nli
- adverserial
task: anli_r3
dataset_path: anli
dataset_name: null
output_type: multiple_choice
training_split: train_r3
validation_split: dev_r3
test_split: test_r3
doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither?\nAnswer:"
# True = entailment
# False = contradiction
# Neither = neutral
doc_to_target: "{{['True', 'Neither', 'False'][label]}}"
doc_to_choice:
- "True"
- "Neither"
- "False"
should_decontaminate: true
doc_to_decontamination_query: premise
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
+include: arc_easy.yaml
 group:
   - ai2_arc
   - multiple_choice
 task: arc_challenge
 dataset_path: ai2_arc
 dataset_name: ARC-Challenge
-output_type: multiple_choice
-training_split: train
-validation_split: validation
-test_split: test
-template_aliases: "{% set answer_choices = choices['text'] %}{% set gold = choices.label.index(answerKey) %}" # set the list of possible answer choices, and set what this doc's gold answer is (set what ds column used, and what)
-doc_to_text: "Question: {{question}}\nAnswer:"
-doc_to_target: "{{answer_choices[gold]}}"
-gold_alias: "{{gold}}" # this will be cast to an int.
-metric_list:
-  - metric: acc
-    aggregation: mean
-    higher_is_better: true
-  - metric: acc_norm
-    aggregation: mean
-    higher_is_better: true
-  # - metric: acc_mutual_info
-  #   aggregation: mean
-  #   higher_is_better: true
@@ -8,10 +8,11 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: test
-template_aliases: "{% set answer_choices = choices['text'] %}{% set gold = choices.label.index(answerKey) %}" # set the list of possible answer choices, and set what this doc's gold answer is (set what ds column used, and what)
 doc_to_text: "Question: {{question}}\nAnswer:"
-doc_to_target: "{{answer_choices[gold]}}"
-gold_alias: "{{gold}}" # this will be cast to an int.
+doc_to_target: "{{choices.label.index(answerKey)}}"
+doc_to_choice: "{{choices.text}}"
+should_decontaminate: true
+doc_to_decontamination_query: "Question: {{question}}\nAnswer:"
 metric_list:
   - metric: acc
     aggregation: mean
...
group:
- glue-promptsource
task: qnli
dataset_path: glue
dataset_name: qnli
output_type: multiple_choice
training_split: train
validation_split: validation
use_prompt: "promptsource:have all you need"
metric_list:
- metric: acc
@@ -7,10 +7,11 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: test
-template_aliases: "{% set answer_choices = answers|map(attribute='atext')|list %}{% set gold = ra - 1 %}" # set the list of possible answer choices, and set what this doc's gold label idx is
 doc_to_text: "Question: {{qtext}}\nAnswer:"
-doc_to_target: "{{answer_choices[gold]}}"
-gold_alias: "{{gold}}" # this will be cast to an int.
+doc_to_target: "{{ra - 1}}"
+doc_to_choice: "{{answers|map(attribute='atext')|list}}" # this will be cast to an int.
+should_decontaminate: true
+doc_to_decontamination_query: query
 metric_list:
   - metric: acc
     aggregation: mean
...