Commit c0d5a660 authored by lintangsutawika

resolved conflict

parents f7b81bd4 0d1ef037
......@@ -197,6 +197,8 @@ It is on our roadmap to create task variants designed to enable models which do
A number of other libraries contain scripts for calling the eval harness through their library. These include [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/blob/main/eval_tasks/eval_adapter.py), [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/blob/main/examples/MoE/readme_evalharness.md), and [mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/blob/master/eval_harness.py).
To create your own custom integration you can follow instructions from [this tutorial](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage).
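For a quick sketch of that programmatic route (the model string, task name, and keyword arguments below are only illustrative and may vary between harness versions):

```python
import lm_eval

# Evaluate a small Hugging Face model on a single task; swap in your own
# model_args and task list as needed.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["arc_challenge"],
    num_fewshot=0,
)
print(results["results"]["arc_challenge"])
```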
### Additional Features
If you have a Metal-compatible Mac, you can run the eval harness using the MPS backend by replacing `--device cuda:0` with `--device mps` (requires PyTorch version 2.1 or higher).
......
......@@ -2,6 +2,7 @@ import logging
import math
import random
from collections.abc import Iterable
from collections import defaultdict
import evaluate
import numpy as np
......@@ -110,6 +111,39 @@ def ter(items):
return sacrebleu.corpus_ter(preds, refs).score
@register_aggregation("brier_score")
def brier_score(items):
    # Items are (one-hot gold, predicted probability distribution) pairs.
    # Certain datasets like arc_easy can have a different number of choices
    # per document, so items are grouped by choice count before averaging.
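    # Example with hypothetical values: a single 3-choice item
    #   (one-hot gold, predicted probs) = ([1.0, 0.0, 0.0], [0.7, 0.2, 0.1])
    # contributes sum((p - g) ** 2) = 0.3**2 + 0.2**2 + 0.1**2 = 0.14 to the
    # mean of its choice-count group; group means are then combined as a
    # size-weighted average.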
golds, predictions = list(zip(*items))
pred_group = defaultdict(list)
gold_group = defaultdict(list)
for gold, pred in zip(golds, predictions):
pred_group[len(pred)].append(pred)
gold_group[len(pred)].append(gold)
total_size = 0
average = 0
for g, p in zip(gold_group.values(), pred_group.values()):
_p = np.array(p)
_g = np.array(g)
average += np.mean(np.sum((_p - _g) ** 2, axis=1)) * len(_g)
total_size += len(_g)
return average / total_size
@register_metric(
metric="brier_score",
higher_is_better=False,
output_type=["multiple_choice"],
aggregation="brier_score",
)
def brier_score_fn(items): # This is a passthrough function
return items
@register_metric(
metric="acc",
higher_is_better=True,
......
......@@ -1016,9 +1016,16 @@ class ConfigurableTask(Task):
if self.OUTPUT_TYPE == "loglikelihood":
results = results[0]
ll, is_greedy = results
prob_norm = np.exp(ll)
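# exp(loglikelihood) recovers the probability of the target continuation;
# the brier_score entry below pairs gold index 0 with this single-element
# probability vector.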
return {
**({"perplexity": ll} if "perplexity" in use_metric else {}),
**({"acc": int(is_greedy)} if "acc" in use_metric else {}),
**(
{"brier_score": (0, [prob_norm])} # Gold is Index 0
if "brier_score" in use_metric
else {}
),
}
elif self.OUTPUT_TYPE == "loglikelihood_rolling":
(loglikelihood,) = results
......@@ -1097,12 +1104,22 @@ class ConfigurableTask(Task):
# TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
exact_match = int(is_greedy[gold]) if gold != -100 else 0
prob_norm = utils.softmax(lls)
# TODO use keyword arguments to the metric?
# gold, pred, norm stuff, the original lls,
result_dict = {
**({"acc": acc} if "acc" in use_metric else {}),
**({"f1": (gold, pred)} if "f1" in use_metric else {}),
**({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
**({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
**({"exact_match": exact_match} if "exact_match" in use_metric else {}),
**(
# {"brier_score": (gold, prob_norm)}
{"brier_score": [np.eye(len(prob_norm))[gold], prob_norm]}
if "brier_score" in use_metric
else {}
),
}
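# For the Brier entry above: with, e.g., four answer choices and gold == 2,
# np.eye(len(prob_norm))[gold] is the one-hot vector [0., 0., 1., 0.], which
# the "brier_score" aggregation compares against the softmax-normalized
# per-choice probabilities in prob_norm.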
if "acc_mutual_info" in use_metric:
......
......@@ -486,18 +486,24 @@ def evaluate(
# For unweighted averaging, use:
# current_size = 1
# TODO: Scores like brier_score have no stderr for individual
# tasks, since the score is itself an aggregation, but it is
# possible to calculate the stderr over groups.
all_stderr = []
for metric in [
key for key in metrics.keys() if "_stderr" not in key
]:
stderr = "_stderr,".join(metric.split(","))
stderr_score = results[task][stderr]
if stderr_score == "N/A":
    var_score = "N/A"
else:
    var_score = stderr_score**2
    all_stderr.append(stderr)
metric_score = results[task][metric]
if metric in results[group]:
results[group][metric] = (
......@@ -505,15 +511,20 @@ def evaluate(
+ metric_score * current_size
) / (total_size + current_size)
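# e.g., with hypothetical values: a running group mean of 0.60 over
# total_size = 100 docs merged with a task mean of 0.66 over
# current_size = 50 docs gives (0.60 * 100 + 0.66 * 50) / 150 = 0.62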
# $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$
# where n = total_size, m = current_size, s_x^2 is the group's running
# variance (results[group][stderr]) and s_y^2 = var_score.
if var_score == "N/A":
    results[group][stderr] = "N/A"
else:
    results[group][stderr] = (
        (total_size - 1) * results[group][stderr]
        + (current_size - 1) * var_score
    ) / (
        total_size + current_size - 1
    ) + total_size * current_size / (
        (total_size + current_size)
        * (total_size + current_size - 1)
    ) * (results[group][metric] - metric_score) ** 2
else:
results[group][metric] = metric_score
results[group][stderr] = var_score
......
Investigate effect of letter options (see the sketch after these notes)
- (A)
- A)
- A.
- A\t
- (a)
- a)
- a.
- a\t
Answer types:
- letters only
- original option
- just letter
- letters + continuation
- original option
- just letter
- continuation
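The task configs that follow wire their prompts and answer choices to a shared `../styles` module through `!function` hooks. That module is not part of this diff, so the following is only a rough, hypothetical sketch of what such helpers could look like (function names taken from the YAML below, bodies assumed from the option formats listed above):

```python
# Hypothetical sketch of ../styles.py; the real implementations are not shown
# in this commit, only the function names referenced by the task YAMLs.
LETTERS = ["A", "B", "C", "D", "E"]


def template_01(doc) -> str:
    # doc_to_text: question, "(A) ..."-style options, then an answer cue.
    options = "\n".join(
        f"({letter}) {text}"
        for letter, text in zip(LETTERS, doc["choices"]["text"])
    )
    return f"Question: {doc['question']}\n{options}\nAnswer:"


def choice_01a(doc) -> list:
    # doc_to_choice: answer rendered as the bracketed letter only.
    return [f"({letter})" for letter, _ in zip(LETTERS, doc["choices"]["text"])]


def choice_01b(doc) -> list:
    # doc_to_choice: answer rendered as the original option text.
    return doc["choices"]["text"]


def choice_01c(doc) -> list:
    # doc_to_choice: answer rendered as letter plus option text.
    return [
        f"({letter}) {text}"
        for letter, text in zip(LETTERS, doc["choices"]["text"])
    ]
```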
group:
- ai2_arc
dataset_path: ai2_arc
dataset_name: ARC-Challenge
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
should_decontaminate: true
doc_to_decontamination_query: "Question: {{question}}\nAnswer:"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: brier_score
    aggregation: brier_score
    higher_is_better: false
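# brier_score pairs with the brier_score aggregation added in this commit;
# individual tasks report no stderr for it since the score is only computed
# at aggregation time.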
group: arc_challenge_alt_ov
task:
- arc_challenge_alt_ov_01
- arc_challenge_alt_ov_02
- arc_challenge_alt_ov_03
- arc_challenge_alt_ov_04
- arc_challenge_alt_ov_05
- arc_challenge_alt_ov_06
- arc_challenge_alt_ov_07
- arc_challenge_alt_ov_08
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_01
task: arc_challenge_alt_ov_01a
doc_to_text: !function ../styles.template_01
doc_to_choice: !function ../styles.choice_01a
doc_to_decontamination_query: !function ../styles.template_01
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_01
task: arc_challenge_alt_ov_01b
doc_to_text: !function ../styles.template_01
doc_to_choice: !function ../styles.choice_01b
doc_to_decontamination_query: !function ../styles.template_01
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_01
task: arc_challenge_alt_ov_01c
doc_to_text: !function ../styles.template_01
doc_to_choice: !function ../styles.choice_01c
doc_to_decontamination_query: !function ../styles.template_01
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_02
task: arc_challenge_alt_ov_02a
doc_to_text: !function ../styles.template_02
doc_to_choice: !function ../styles.choice_02a
doc_to_decontamination_query: !function ../styles.template_02
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_02
task: arc_challenge_alt_ov_02b
doc_to_text: !function ../styles.template_02
doc_to_choice: !function ../styles.choice_02b
doc_to_decontamination_query: !function ../styles.template_02
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_02
task: arc_challenge_alt_ov_02c
doc_to_text: !function ../styles.template_02
doc_to_choice: !function ../styles.choice_02c
doc_to_decontamination_query: !function ../styles.template_02
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_03
task: arc_challenge_alt_ov_03a
doc_to_text: !function ../styles.template_03
doc_to_choice: !function ../styles.choice_03a
doc_to_decontamination_query: !function ../styles.template_03
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_03
task: arc_challenge_alt_ov_03b
doc_to_text: !function ../styles.template_03
doc_to_choice: !function ../styles.choice_03b
doc_to_decontamination_query: !function ../styles.template_03
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_03
task: arc_challenge_alt_ov_03c
doc_to_text: !function ../styles.template_03
doc_to_choice: !function ../styles.choice_03c
doc_to_decontamination_query: !function ../styles.template_03
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_04
task: arc_challenge_alt_ov_04a
doc_to_text: !function ../styles.template_04
doc_to_choice: !function ../styles.choice_04a
doc_to_decontamination_query: !function ../styles.template_04
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_04
task: arc_challenge_alt_ov_04b
doc_to_text: !function ../styles.template_04
doc_to_choice: !function ../styles.choice_04b
doc_to_decontamination_query: !function ../styles.template_04
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_04
task: arc_challenge_alt_ov_04c
doc_to_text: !function ../styles.template_04
doc_to_choice: !function ../styles.choice_04c
doc_to_decontamination_query: !function ../styles.template_04
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_05
task: arc_challenge_alt_ov_05a
doc_to_text: !function ../styles.template_05
doc_to_choice: !function ../styles.choice_05a
doc_to_decontamination_query: !function ../styles.template_05