Unverified commit 2c20cd1f authored by Lintang Sutawika, committed by GitHub

Merge pull request #671 from EleutherAI/revamp-process

Revamp process
parents 6862fa7d 0dadc92a
@@ -32,6 +32,7 @@ Prompting / in-context formatting options:
 - **use_prompt** (`str`, *optional*) — Name of prompt in promptsource to use. If defined, this will overwrite `doc_to_text` and `doc_to_target` and make `template_aliases` unused.
 - **doc_to_text** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the appropriate input for the model.
 - **doc_to_target** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the appropriate target output for the model.
+- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2, f-string, or function to process a sample into the list of possible answer choices for `multiple_choice` tasks.
 - **gold_alias** (`str`, *optional*, defaults to `None`) — If provided, used to generate the reference answer that is scored against. Used in cases where `doc_to_target` should be the "target string" appended to each fewshot exemplar's input, while the reference passed to the metric function as `gold` comes from `gold_alias`.
 - **fewshot_delimiter** (`str`, *optional*, defaults to `"\n\n"`) — String to insert between few-shot examples.
 - **target_delimiter** (`str`, *optional*, defaults to `" "`) — String to insert between input and target output for the datapoint being tested.
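To make the interplay of these three fields concrete, here is a minimal, illustrative sketch (not part of the PR or of lm-eval itself): a hypothetical doc plus Jinja2 values for `doc_to_text`, `doc_to_target`, and `doc_to_choice`, rendered directly with jinja2 roughly the way a templated field resolves.

```python
# Illustrative only (not part of this PR): a hypothetical doc and the three
# prompt-formatting fields, rendered with jinja2 roughly the way a templated
# field resolves inside the harness.
import ast

from jinja2 import Template

doc = {
    "question": "What is the boiling point of water at sea level?",
    "choices": ["90 C", "100 C", "110 C"],
    "label": 1,
}

doc_to_text = "Question: {{question}}\nAnswer:"
doc_to_target = "{{label}}"    # renders to a digit string -> treated as an index
doc_to_choice = "{{choices}}"  # renders to the list's repr -> parsed back into a list

context = Template(doc_to_text).render(**doc)
gold = int(Template(doc_to_target).render(**doc))
choices = ast.literal_eval(Template(doc_to_choice).render(**doc))

print(context)        # Question: What is the boiling point of water at sea level?\nAnswer:
print(choices[gold])  # 100 C
```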
...
@@ -10,6 +10,10 @@ class Sampler:
         self.target_delimiter = self.config.target_delimiter
         self.fewshot_delimiter = self.config.fewshot_delimiter
 
+        self.doc_to_text = self.task.doc_to_text
+        self.doc_to_target = self.task.doc_to_target
+        self.doc_to_choice = self.task.doc_to_choice
+
         self.docs = docs  # HF dataset split, provided by task._fewshot_docs()
         if fewshot_indices:  # subset few-shot docs from
             self.docs = self.docs.select(fewshot_indices)
@@ -34,16 +38,29 @@ class Sampler:
             self.fewshot_delimiter.join(
                 [
                     # TODO: is separating doc_to_text and doc_to_target by one space always desired?
-                    self.task.doc_to_text(doc)
+                    (
+                        self.doc_to_text(doc)
+                        if (
+                            self.config.doc_to_choice is None
+                            or type(self.doc_to_text(doc)) is str
+                        )
+                        else self.doc_to_choice(doc)[self.doc_to_text(doc)]
+                    )
                     + self.target_delimiter
-                    + self.task.doc_to_target(doc)
+                    + (
+                        self.doc_to_target(doc)
+                        if (
+                            self.config.doc_to_choice is None
+                            or type(self.doc_to_target(doc)) is str
+                        )
+                        else self.doc_to_choice(doc)[self.doc_to_target(doc)]
+                    )
                     for doc in selected_docs
                 ]
             )
             + self.fewshot_delimiter
         )
+        # only returns the fewshot context! Does not append the document, do this outside the object
         return labeled_examples
 
     def sample(self, n):
...
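A rough sketch of what the updated `Sampler` logic above produces, assuming the default delimiters and two invented fewshot docs whose `doc_to_target` is an integer index into `doc_to_choice` (this is a hand-written mirror, not the Sampler class itself):

```python
# Sketch only (invented docs, default delimiters): how the new fewshot join behaves
# when doc_to_target is an index and doc_to_choice supplies the answer strings.
target_delimiter = " "
fewshot_delimiter = "\n\n"

fewshot_docs = [
    {"question": "2 + 2 =", "choices": ["3", "4"], "label": 1},
    {"question": "3 + 5 =", "choices": ["7", "8"], "label": 1},
]

def doc_to_text(doc):
    return f"Question: {doc['question']}\nAnswer:"

def doc_to_target(doc):
    return doc["label"]  # an index, not a string

def doc_to_choice(doc):
    return doc["choices"]

labeled_examples = (
    fewshot_delimiter.join(
        # index into the choice list whenever the target is not already a string
        doc_to_text(doc) + target_delimiter + doc_to_choice(doc)[doc_to_target(doc)]
        for doc in fewshot_docs
    )
    + fewshot_delimiter
)
print(repr(labeled_examples))
# 'Question: 2 + 2 =\nAnswer: 4\n\nQuestion: 3 + 5 =\nAnswer: 8\n\n'
```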
@@ -28,6 +28,7 @@ from lm_eval.api.metrics import (
     mean,
     weighted_perplexity,
     bits_per_byte,
+    metric_max_over_ground_truths,
 )
 from lm_eval.api.registry import (
     get_metric,
@@ -44,7 +45,6 @@ ALL_OUTPUT_TYPES = [
     "multiple_choice",
     "loglikelihood_rolling",
     "greedy_until",
-    "winograd_schema",
 ]
@@ -65,9 +65,10 @@ class TaskConfig(dict):
     fewshot_split: str = None  # TODO: assert that this is not None if num_fewshot > 0 (?); assert whether this is the same split as the one being evaluated (?)
     # formatting / prompting options.
     # see docs/advanced_task_guide.md for more info
-    template_aliases: str = ""
+    template_aliases: Union[str, list] = None
     doc_to_text: Union[Callable, str] = None
     doc_to_target: Union[Callable, str] = None
+    doc_to_choice: Union[Callable, str, dict, list] = None
     gold_alias: Union[Callable, str] = None
     use_prompt: str = None
     description: str = ""
@@ -77,8 +78,6 @@ class TaskConfig(dict):
     num_fewshot: int = 0
     # scoring options
     metric_list: str = None
-    gold_alias: Union[Callable, str] = None
-    create_choices: Union[Callable, str] = None
     output_type: str = "greedy_until"
     generation_kwargs: dict = None
     repeats: int = 1
@@ -317,18 +316,6 @@ class Task(abc.ABC):
         """
         return doc
 
-    def create_choices(self, doc):
-        if self._config.create_choices is None:
-            return ast.literal_eval(
-                utils.apply_template(
-                    self._config.template_aliases + "{{answer_choices}}", doc
-                )
-            )
-        elif type(self._config.create_choices) == str:
-            return utils.apply_template(self._config.create_choices, doc)
-        else:
-            return self._config.create_choices(doc)
-
     @property
     def instances(self):
         """After calling `task.build_all_requests()`, tasks
@@ -480,7 +467,10 @@
             )
 
         example = self.doc_to_text(doc)
-        return labeled_examples + example
+        if type(example) == str:
+            return labeled_examples + example
+        elif type(example) == list:
+            return [labeled_examples + ex for ex in example]
 
     def apply_filters(self):
@@ -628,6 +618,40 @@ class ConfigurableTask(Task):
                 list(self.fewshot_docs()), self, rnd=random.Random(1234)
             )
 
+        if self._config.template_aliases is not None:
+            for key, alias in self._config.template_aliases:
+                self.dataset.rename_column(key, alias)
+
+        if self.has_test_docs():
+            docs = self.test_docs()
+        elif self.has_validation_docs():
+            docs = self.validation_docs()
+        else:
+            assert (
+                False
+            ), f"Task dataset (path={self.DATASET_PATH}, name={self.DATASET_NAME}) must have valid or test docs!"
+
+        # Test One Doc
+        self.features = list(docs.features.keys())
+        self.multiple_input = 0
+        self.multiple_target = 0
+        test_doc = docs[0]
+        test_text = self.doc_to_text(test_doc)
+        test_target = self.doc_to_target(test_doc)
+
+        if self._config.doc_to_choice is not None:
+            test_choice = self.doc_to_choice(test_doc)
+            if type(test_choice) is not list:
+                eval_logger.error("doc_to_choice must return list")
+            else:
+                num_choice = len(test_choice)
+
+            if type(test_text) is int:
+                self.multiple_input = num_choice
+
+        if type(test_target) is list:
+            self.multiple_target = len(test_target)
+
     def download(self, dataset_kwargs=None):
         self.dataset = datasets.load_dataset(
@@ -683,7 +707,12 @@ class ConfigurableTask(Task):
     def doc_to_decontamination_query(self, doc):
         if self._config.should_decontaminate:
-            return utils.apply_template(self._config.doc_to_decontamination_query, doc)
+            if self._config.doc_to_decontamination_query in self.features:
+                return doc[self._config.doc_to_decontamination_query]
+            else:
+                return ast.literal_eval(
+                    utils.apply_template(self._config.doc_to_decontamination_query, doc)
+                )
 
     def _process_doc(self, doc):
         """
@@ -703,11 +732,24 @@
         else:
             doc_to_text = self._config.doc_to_text
 
-        if type(doc_to_text) == str:
-            return utils.apply_template(doc_to_text, doc)
+        if type(doc_to_text) == int:
+            return doc_to_text
+        elif type(doc_to_text) == str:
+            if doc_to_text in self.features:
+                # if self._config.doc_to_choice is not None:
+                #     return self.doc_to_choice(doc)[doc[doc_to_text]]
+                # else:
+                return doc[doc_to_text]
+            else:
+                text_string = utils.apply_template(doc_to_text, doc)
+                if text_string.isdigit():
+                    return ast.literal_eval(text_string)
+                else:
+                    return text_string
         elif callable(doc_to_text):
             return doc_to_text(doc)
-        if hasattr(doc_to_text, "apply"):
+        # Used when applying a Promptsource template
+        elif hasattr(doc_to_text, "apply"):
             return doc_to_text.apply(doc)[0]
         else:
             print(type(doc_to_text))
@@ -720,15 +762,50 @@
         else:
             doc_to_target = self._config.doc_to_target
 
-        if type(doc_to_target) == str:
-            return utils.apply_template(doc_to_target, doc)
+        if type(doc_to_target) == int:
+            return doc_to_target
+        elif type(doc_to_target) == str:
+            if doc_to_target in self.features:
+                # if self._config.doc_to_choice is not None:
+                #     return self.doc_to_choice(doc)[doc[doc_to_target]]
+                # else:
+                return doc[doc_to_target]
+            else:
+                target_string = utils.apply_template(doc_to_target, doc)
+                if target_string.isdigit():
+                    return ast.literal_eval(target_string)
+                else:
+                    return target_string
         elif callable(doc_to_target):
             return doc_to_target(doc)
+        # Used when applying a Promptsource template
         elif hasattr(doc_to_target, "apply"):
             return doc_to_target.apply(doc)[1]
         else:
             raise TypeError
 
+    def doc_to_choice(self, doc):
+        if self.prompt is not None:
+            doc_to_choice = self.prompt
+        elif self._config.doc_to_choice is None:
+            eval_logger.error("doc_to_choice was called but not set in config")
+        else:
+            doc_to_choice = self._config.doc_to_choice
+
+        if type(doc_to_choice) == str:
+            return ast.literal_eval(utils.apply_template(doc_to_choice, doc))
+        elif type(doc_to_choice) == list:
+            return doc_to_choice
+        elif type(doc_to_choice) == dict:
+            return list(doc_to_choice.values())
+        elif callable(doc_to_choice):
+            return doc_to_choice(doc)
+        elif hasattr(doc_to_choice, "get_answer_choices_list"):
+            return doc_to_choice.get_answer_choices_list(doc)
+        else:
+            raise TypeError
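For reference, a standalone mirror of the dispatch in `doc_to_choice` above (hypothetical docs and configs, not the class method itself): a Jinja2 string is rendered and then `literal_eval`'d, a list passes through unchanged, a dict contributes its values, and a callable is invoked on the doc.

```python
# Standalone sketch of the doc_to_choice dispatch above; the doc and configs are invented.
import ast

from jinja2 import Template

doc = {"endings": ["to the park", "to the moon"]}

configs = [
    "{{endings}}",           # Jinja2 string  -> per-doc list
    ["yes", "no"],           # literal list   -> same choices for every doc
    {0: "no", 1: "yes"},     # dict           -> list of its values
    lambda d: d["endings"],  # callable       -> invoked on the doc
]

for cfg in configs:
    if isinstance(cfg, str):
        choices = ast.literal_eval(Template(cfg).render(**doc))
    elif isinstance(cfg, list):
        choices = cfg
    elif isinstance(cfg, dict):
        choices = list(cfg.values())
    else:
        choices = cfg(doc)
    print(choices)
```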
 
     def gold_alias(self, doc):
         # returns a version of the gold target answer to a document,
         # which should be passed into metric for scoring as the ground truth.
@@ -756,19 +833,25 @@ class ConfigurableTask(Task):
         elif self.OUTPUT_TYPE == "loglikelihood_rolling":
             arguments = (self.doc_to_target(doc),)
         elif self.OUTPUT_TYPE == "multiple_choice":
-            # we pass the user-defined answer_choices var (in aliases) and translate the result to a Python list.
-            # TODO: any cleaner way to do this?
-            choices = self.create_choices(doc)
+            choices = self.doc_to_choice(doc)
+            if self.multiple_input:
+                # If there are multiple inputs, choices are placed in the ctx
+                cont = self.doc_to_target(doc)
+                arguments = [(ctx, " {}".format(cont)) for ctx in choices]
+            else:
+                # Otherwise they are placed in the continuation
+                arguments = [(ctx, " {}".format(cont)) for cont in choices]
 
             request_list = [
                 Instance(
                     request_type="loglikelihood",
                     doc=doc,
-                    arguments=(ctx, " {}".format(choice)),
+                    arguments=arg,
                     idx=i,
                     **kwargs,
                 )
-                for i, choice in enumerate(choices)
+                for i, arg in enumerate(arguments)
             ]
             # TODO: we should raise a warning telling users this will at most ~2x runtime.
             if "acc_mutual_info" in self._metric_fn_list.keys():
@@ -795,26 +878,6 @@ class ConfigurableTask(Task):
         elif self.OUTPUT_TYPE == "greedy_until":
             arguments = (ctx, self._config.generation_kwargs)
 
-        elif self.OUTPUT_TYPE == "winograd_schema":
-            # similar to multiple_choice task type except each request contains
-            # multiple differing contexts with the same continuation
-            contexts = self.create_choices(doc)
-            choice = self.doc_to_target(doc)
-
-            request_list = [
-                Instance(
-                    request_type="loglikelihood",
-                    doc=doc,
-                    arguments=(context, " {}".format(choice)),
-                    idx=i,
-                    **kwargs,
-                )
-                for i, context in enumerate(contexts)
-            ]
-            return request_list
-
         return Instance(
             request_type=self.OUTPUT_TYPE, doc=doc, arguments=arguments, idx=0, **kwargs
         )
@@ -857,13 +920,11 @@ class ConfigurableTask(Task):
         elif self.OUTPUT_TYPE == "multiple_choice":
             lls, is_greedy = zip(*results)
 
-            if self._config.gold_alias is not None:
-                gold = int(self.gold_alias(doc))
-            else:
-                gold = int(self.doc_to_target(doc))
-
             # retrieve choices in List[str] form, to compute choice lengths, etc.
-            choices = self.create_choices(doc)
+            choices = self.doc_to_choice(doc)
+            completion_len = np.array([float(len(i)) for i in choices])
+
             if (
                 2 * len(choices) == len(lls)
                 and "acc_mutual_info" in self._metric_fn_list.keys()
@@ -876,10 +937,21 @@ class ConfigurableTask(Task):
                 lls = lls[::2]
 
             pred = np.argmax(lls)
+            pred_norm = np.argmax(lls / completion_len)
 
-            acc = 1.0 if np.argmax(lls) == gold else 0.0
-            completion_len = np.array([float(len(i)) for i in choices])
-            acc_norm = 1.0 if np.argmax(lls / completion_len) == gold else 0.0
+            if self.multiple_input:
+                gold = self.doc_to_text(doc)
+            else:
+                gold = self.doc_to_target(doc)
+
+            if type(gold) is str:
+                gold = choices.index(gold)
+
+            if self.multiple_target:
+                acc = 1.0 if pred in gold else 0.0
+                acc_norm = 1.0 if pred_norm in gold else 0.0
+            else:
+                acc = 1.0 if pred == gold else 0.0
+                acc_norm = 1.0 if pred_norm == gold else 0.0
 
             result_dict = {
                 **({"acc": acc} if "acc" in use_metric else {}),
@@ -900,40 +972,45 @@ class ConfigurableTask(Task):
                 acc_mutual_info = 1.0 if np.argmax(lls_mutual_info) == gold else 0.0
                 result_dict["acc_mutual_info"] = acc_mutual_info
 
-        elif self.OUTPUT_TYPE == "winograd_schema":
-            lls, is_greedy = zip(*results)
-
-            if self._config.gold_alias is not None:
-                gold = int(self.gold_alias(doc))
-            else:
-                gold = int(self.doc_to_target(doc))
-
-            pred = np.argmax(lls)
-            acc = 1.0 if np.argmax(lls) == gold else 0.0
-
-            result_dict = {
-                **({"acc": acc} if "acc" in use_metric else {}),
-            }
-
         elif self.OUTPUT_TYPE == "greedy_until":
-            if self._config.gold_alias is not None:
-                gold = self.gold_alias(doc)
-            else:
-                gold = self.doc_to_target(doc)
+            gold = self.doc_to_target(doc)
 
             for key, result in zip(self._metric_fn_list.keys(), results):
-                _dict = self._metric_fn_list[key](
-                    references=[gold],
-                    predictions=[result],
-                    **self._metric_fn_kwargs[key],
-                )
-
-                result_dict = {**result_dict, **_dict}
+                if self.multiple_target:
+                    # in the case where we have multiple targets,
+                    # return true if any are true
+                    # TODO: this may break for multiple_target, non zero-or-1 metrics
+                    scores = []
+                    for gold_option in gold:
+                        res = self._metric_fn_list[key](
+                            references=[gold_option],
+                            predictions=[result],
+                            **self._metric_fn_kwargs[key],
+                        )
+                        if isinstance(res, dict):
+                            # TODO: this handles the case where HF evaluate returns a dict.
+                            res = res[key]
+                        scores.append(res)
+                    if any(scores):
+                        result = 1.0
+                    else:
+                        result = 0.0
+                else:
+                    result = self._metric_fn_list[key](
+                        references=[gold],
+                        predictions=[result],
+                        **self._metric_fn_kwargs[key],
+                    )
+
+                if isinstance(result, dict):
+                    result_dict.update(result)
+                else:
+                    result_dict[key] = result
 
         else:
             raise ValueError(
                 f"Passed invalid output_type '{self.OUTPUT_TYPE}' ! Please use one of ",
-                "'loglikelihood', 'loglikelihood_rolling', 'greedy_until', 'multiple_choice' or 'winograd_schema' ",
+                "'loglikelihood', 'loglikelihood_rolling', 'greedy_until' or 'multiple_choice'",
             )
 
         return result_dict
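A scoring sketch with made-up numbers (not lm-eval code) that mirrors the `multiple_choice` branch above: `pred` is the log-likelihood argmax, `pred_norm` is the argmax after dividing by character length, and a string gold is mapped to its index in the choice list. For `multiple_target` docs, and for the `greedy_until` branch shown above, the prediction instead counts as correct if it matches any of the gold options.

```python
# Made-up numbers, not lm-eval code: mirrors the multiple_choice scoring above.
import numpy as np

choices = ["no", "yes", "maybe"]
lls = np.array([-4.2, -3.1, -6.0])  # one loglikelihood per choice
completion_len = np.array([float(len(c)) for c in choices])

pred = np.argmax(lls)                        # index of the most likely choice
pred_norm = np.argmax(lls / completion_len)  # length-normalized argmax

gold = "yes"
if isinstance(gold, str):      # a string gold is mapped to its choice index
    gold = choices.index(gold)

acc = 1.0 if pred == gold else 0.0
acc_norm = 1.0 if pred_norm == gold else 0.0
print(acc, acc_norm)  # 1.0 1.0
```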
...
+include: arc_easy.yaml
 group:
   - ai2_arc
   - multiple_choice
 task: arc_challenge
 dataset_path: ai2_arc
 dataset_name: ARC-Challenge
-output_type: multiple_choice
-training_split: train
-validation_split: validation
-test_split: test
-template_aliases: "{% set answer_choices = choices['text'] %}{% set gold = choices.label.index(answerKey) %}" # set the list of possible answer choices, and set what this doc's gold answer is (set what ds column used, and what)
-doc_to_text: "Question: {{question}}\nAnswer:"
-doc_to_target: "{{answer_choices[gold]}}"
-gold_alias: "{{gold}}" # this will be cast to an int.
-metric_list:
-  - metric: acc
-    aggregation: mean
-    higher_is_better: true
-  - metric: acc_norm
-    aggregation: mean
-    higher_is_better: true
-  # - metric: acc_mutual_info
-  #   aggregation: mean
-  #   higher_is_better: true
@@ -8,10 +8,11 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: test
-template_aliases: "{% set answer_choices = choices['text'] %}{% set gold = choices.label.index(answerKey) %}" # set the list of possible answer choices, and set what this doc's gold answer is (set what ds column used, and what)
 doc_to_text: "Question: {{question}}\nAnswer:"
-doc_to_target: "{{answer_choices[gold]}}"
-gold_alias: "{{gold}}" # this will be cast to an int.
+doc_to_target: "{{choices.label.index(answerKey)}}"
+doc_to_choice: "{{choices.text}}"
+should_decontaminate: true
+doc_to_decontamination_query: "Question: {{question}}\nAnswer:"
 metric_list:
   - metric: acc
     aggregation: mean
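To see what the rewritten ARC config above resolves to, here is a hypothetical ARC-style record (field names follow the config, values are invented), rendered directly with jinja2: `doc_to_target` becomes an integer index into the `doc_to_choice` list.

```python
# Hypothetical ARC-style record; illustrates the new doc_to_target / doc_to_choice templates.
import ast

from jinja2 import Template

doc = {
    "question": "Which gas do plants absorb from the atmosphere?",
    "choices": {
        "text": ["Oxygen", "Carbon dioxide", "Nitrogen", "Helium"],
        "label": ["A", "B", "C", "D"],
    },
    "answerKey": "B",
}

target = Template("{{choices.label.index(answerKey)}}").render(**doc)
choice_list = ast.literal_eval(Template("{{choices.text}}").render(**doc))

print(target)                    # "1" -> a digit string, used as an index
print(choice_list[int(target)])  # Carbon dioxide
```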
...
group:
  - glue-promptsource
task: qnli
dataset_path: glue
dataset_name: qnli
output_type: multiple_choice
training_split: train
validation_split: validation
use_prompt: "promptsource:have all you need"
metric_list:
  - metric: acc
@@ -7,10 +7,11 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: test
-template_aliases: "{% set answer_choices = answers|map(attribute='atext')|list %}{% set gold = ra - 1 %}" # set the list of possible answer choices, and set what this doc's gold label idx is
 doc_to_text: "Question: {{qtext}}\nAnswer:"
-doc_to_target: "{{answer_choices[gold]}}"
-gold_alias: "{{gold}}" # this will be cast to an int.
+doc_to_target: "{{ra - 1}}"
+doc_to_choice: "{{answers|map(attribute='atext')|list}}" # this will be cast to an int.
+should_decontaminate: true
+doc_to_decontamination_query: query
 metric_list:
   - metric: acc
     aggregation: mean
...
@@ -7,10 +7,9 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: null
-template_aliases: "{% set gold = label | int %}{% set answer_choices = endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', '  ', ' ')|list %}"
 doc_to_text: "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace('  ', ' ')}}"
-doc_to_target: "{{answer_choices[gold]}}"
-gold_alias: "{{gold}}"
+doc_to_target: "{{label}}"
+doc_to_choice: "{{endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', '  ', ' ')|list}}"
 metric_list:
   - metric: acc
     aggregation: mean
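A rough Python equivalent of the filter chain in `doc_to_choice` above, using invented example strings (`regex_replace` in the template is a custom filter, not a Jinja2 builtin): trim, turn " [title]" into ". ", strip bracketed tags, and collapse double spaces.

```python
# Illustrative Python counterpart of the HellaSwag-style Jinja filter chain above.
import re

def clean(text: str) -> str:
    text = text.strip()                    # |trim
    text = text.replace(" [title]", ". ")  # |replace(' [title]', '. ')
    text = re.sub(r"\[.*?\]", "", text)    # |regex_replace('\\[.*?\\]', '')
    return text.replace("  ", " ")         # |replace('  ', ' ')

endings = [
    "passes the ball to a teammate. [header] The crowd cheers ",
    "walks away [title] everyone claps",
]
print([clean(e) for e in endings])
# ['passes the ball to a teammate. The crowd cheers', 'walks away. everyone claps']
```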
...
@@ -6,9 +6,8 @@ dataset_name: commonsense
 output_type: multiple_choice
 training_split: train
 test_split: test
-template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
 doc_to_text: "{{input}}\nQuestion: Is this wrong?\nAnswer:"
-doc_to_target: "{{answer_choices[label]}}"
-gold_alias: "{{label}}" # this will be cast to an int.
+doc_to_target: label
+doc_to_choice: ['no', 'yes']
 metric_list:
   - metric: acc
-group:
-  - hendrycks_ethics
+include: commonsense.yaml
 task: ethics_deontology
 dataset_path: hails/hendrycks_ethics
 dataset_name: deontology
-output_type: multiple_choice
-training_split: train
-test_split: test
-template_aliases: "{% set answer_choices = ['unreasonable', 'reasonable'] %}{% if excuse is not defined %}{% set excuse = '' %}{% endif %}"
 doc_to_text: "Question: Would most people believe this reasonable or unreasonable to say? \"{{scenario}} {{excuse.rstrip()}}\"\nAnswer:"
-doc_to_target: "{{answer_choices[label]}}"
-gold_alias: "{{label}}" # this will be cast to an int.
+doc_to_target: label
+doc_to_choice: ['unreasonable', 'reasonable']
-metric_list:
-  - metric: acc
 # TODO: implement exact-match metric for this subset
@@ -4,5 +4,5 @@ group:
 task: ethics_justice
 dataset_name: justice
 output_type: multiple_choice
+doc_to_text: "Question: Would most people believe this reasonable or unreasonable to say? \"{{scenario}}\"\nAnswer:"
 # TODO: impl. exact match for this and deontology
+include: commonsense.yaml
 group:
   - hendrycks_ethics
 task: ethics_utilitarianism
@@ -6,9 +7,8 @@ dataset_name: utilitarianism
 output_type: multiple_choice
 training_split: train
 test_split: test
-template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
 doc_to_text: !function utils.doc_to_text
 doc_to_target: !function utils.doc_to_target
-gold_alias: !function utils.gold_alias
+doc_to_choice: ['no', 'yes']
 metric_list:
   - metric: acc
@@ -15,23 +15,11 @@ def _preproc_doc(doc):
     return doc
 
-def _yesno(x):
-    if x:
-        return "yes"
-    else:
-        return "no"
-
 def doc_to_text(doc):
     doc = _preproc_doc(doc)
     return f"Scenario 1: {doc['scenarios'][0]}\nScenario 2: {doc['scenarios'][1]}\nQuestion: Is Scenario 1 preferable?\nAnswer:"
 
 def doc_to_target(doc):
-    doc = _preproc_doc(doc)
-    return _yesno(doc["label"])
-
-def gold_alias(doc):
     doc = _preproc_doc(doc)
     return doc["label"]
+include: commonsense.yaml
 group:
   - hendrycks_ethics
 task: ethics_virtue
-dataset_path: hails/hendrycks_ethics
 dataset_name: virtue
-output_type: multiple_choice
-training_split: train
-test_split: test
-template_aliases: "{% set answer_choices = ['no', 'yes'] %}"
 doc_to_text: "Sentence: {{scenario}}\nQuestion: Does the character in this sentence exhibit the trait \"{{trait}}\"?\nAnswer:"
-doc_to_target: "{{answer_choices[label]}}"
-gold_alias: "{{label}}" # this will be cast to an int.
+doc_to_target: label
+doc_to_choice: ['no', 'yes']
-metric_list:
-  - metric: acc
@@ -7,10 +7,11 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: test
-create_choices: !function utils.create_choices # create list of answer choices
 doc_to_text: "Question: {{Problem}}\nAnswer:"
-doc_to_target: !function utils.doc_to_target
-gold_alias: "{{['a', 'b', 'c', 'd', 'e'].index(correct)}}" # this will be cast to an int.
+doc_to_target: "{{['a', 'b', 'c', 'd', 'e'].index(correct)}}"
+doc_to_choice: !function utils.doc_to_choice
+should_decontaminate: true
+doc_to_decontamination_query: "Question: {{Problem}}\nAnswer:"
 metric_list:
   - metric: acc
     aggregation: mean
...
 import re
 
-def create_choices(doc):
+def doc_to_choice(doc):
     choices = [
         c[4:].rstrip(" ,")
         for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", doc["options"])
     ]
     return choices
-
-def doc_to_target(doc):
-    choices = create_choices(doc)
-    return choices[["a", "b", "c", "d", "e"].index(doc["correct"])]
@@ -7,11 +7,11 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: test
-template_aliases: "{% set answer_choices = choices['text'] %}{% set gold = choices.label.index(answerKey.lstrip()) %}" # set the list of possible answer choices, and set what this doc's gold answer is (set what ds column used, and what)
-doc_to_text: "{{question_stem}}"
-doc_to_target: "{{gold}}" # this will be cast to an int.
+doc_to_text: question_stem
+doc_to_target: "{{choices.label.index(answerKey.lstrip())}}"
+doc_to_choice: "{{choices.text}}"
 should_decontaminate: true
-doc_to_decontamination_query: "{{question_stem}}"
+doc_to_decontamination_query: question_stem
 metric_list:
   - metric: acc
     aggregation: mean
...
@@ -7,10 +7,11 @@ output_type: multiple_choice
 training_split: train
 validation_split: validation
 test_split: null
-template_aliases: "{% set question = goal %}{% set answer_choices = [sol1, sol2] %}{% set gold = label %}" # set the list of possible answer choices, and set what this doc's gold label idx is
-doc_to_text: "Question: {{question}}\nAnswer:"
-doc_to_target: "{{answer_choices[gold]}}"
-gold_alias: "{{gold}}" # this will be cast to an int.
+doc_to_text: "Question: {{goal}}\nAnswer:"
+doc_to_target: label
+doc_to_choice: "{{[sol1, sol2]}}"
+should_decontaminate: true
+doc_to_decontamination_query: goal
 metric_list:
   - metric: acc
     aggregation: mean
...
@@ -5,10 +5,9 @@ dataset_path: corypaik/prost
 dataset_name: null
 output_type: multiple_choice
 test_split: test
-template_aliases: "{% set answer_choices = [A, B, C, D] %}{% set gold = label %}" # set the list of possible answer choices, and set what this doc's gold answer is (set what ds column used, and what)
 doc_to_text: "{{context}}\nQuestion: {{ex_question}}\nAnswer:"
-doc_to_target: "{{answer_choices[gold]}}"
-gold_alias: "{{gold}}" # this will be cast to an int.
+doc_to_target: label
+doc_to_choice: "{{[A, B, C, D]}}"
 should_decontaminate: true
 doc_to_decontamination_query: "{{context}}\nQuestion: {{ex_question}}\nAnswer:"
 metric_list:
...
@@ -7,10 +7,9 @@ output_type: multiple_choice
 training_split: null
 validation_split: null
 test_split: train
-template_aliases: "{% set answer_choices = ['yes', 'no', 'maybe'] %}{% set gold = final_decision %}"
 doc_to_text: !function preprocess_pubmedqa.doc_to_text
-doc_to_target: !function preprocess_pubmedqa.doc_to_target
-gold_alias: !function preprocess_pubmedqa.gold_alias
+doc_to_target: final_decision
+doc_to_choice: ["yes", "no", "maybe"]
 metric_list:
   - metric: acc
     aggregation: mean
...