"git@developer.sourcefind.cn:gaoqiong/migraphx.git" did not exist on "5bf067ed802c2b8735c474a43409e58cb68c79cd"
Commit c0d5a660 authored by lintangsutawika

resolved conflict

parents f7b81bd4 0d1ef037
@@ -197,6 +197,8 @@ It is on our roadmap to create task variants designed to enable models which do
A number of other libraries contain scripts for calling the eval harness through their library. These include [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/blob/main/eval_tasks/eval_adapter.py), [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/blob/main/examples/MoE/readme_evalharness.md), and [mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/blob/master/eval_harness.py).
To create your own custom integration you can follow instructions from [this tutorial](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage).
### Additional Features
If you have a Metal-compatible Mac, you can run the eval harness using the MPS back-end by replacing `--device cuda:0` with `--device mps` (requires PyTorch version 2.1 or higher).
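To see what the external-library integration looks like in practice, here is a minimal sketch assuming the `simple_evaluate` entry point described in the tutorial linked above, with `device="mps"` standing in for the `--device mps` CLI flag; the model and task names are placeholders and argument names may differ between harness versions.

```python
import lm_eval

# Hedged sketch: assumes lm_eval.simple_evaluate as documented in interface.md;
# the model and task choices here are placeholders.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["arc_challenge"],
    device="mps",  # or "cuda:0" / "cpu"
)
print(results["results"])
```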
......
@@ -2,6 +2,7 @@ import logging
import math
import random
from collections.abc import Iterable
from collections import defaultdict

import evaluate
import numpy as np
@@ -110,6 +111,39 @@ def ter(items):
    return sacrebleu.corpus_ter(preds, refs).score
@register_aggregation("brier_score")
def brier_score(items): # This is a passthrough function
# Certain datasets like arc_easy can have a different number of choices.
golds, predictions = list(zip(*items))
pred_group = defaultdict(list)
gold_group = defaultdict(list)
for gold, pred in zip(golds, predictions):
pred_group[len(pred)].append(pred)
gold_group[len(pred)].append(gold)
total_size = 0
average = 0
for g, p in zip(gold_group.values(), pred_group.values()):
_p = np.array(p)
_g = np.array(g)
average += np.mean(np.sum((_p - _g) ** 2, axis=1)) * len(_g)
total_size += len(_g)
return average / total_size
@register_metric(
metric="brier_score",
higher_is_better=False,
output_type=["multiple_choice"],
aggregation="brier_score",
)
def brier_score_fn(items): # This is a passthrough function
return items
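To make the new aggregation's contract concrete, the toy check below feeds it hand-made items, where each item is a (one-hot gold vector, predicted probability vector) pair; items with different choice counts land in separate groups and the group means are size-weighted, as in `brier_score` above. The numbers are invented for illustration.

```python
import numpy as np

# Hypothetical inputs: two 4-choice items and one 3-choice item.
items = [
    (np.eye(4)[1], np.array([0.1, 0.7, 0.1, 0.1])),  # confident and correct
    (np.eye(4)[0], np.array([0.4, 0.3, 0.2, 0.1])),  # barely correct
    (np.eye(3)[2], np.array([0.2, 0.2, 0.6])),       # 3-choice item, separate group
]
# Groups by choice count, averages within each group, then size-weights:
# ((0.12 + 0.50) / 2 * 2 + 0.24 * 1) / 3 ≈ 0.287
print(brier_score(items))
```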
@register_metric(
    metric="acc",
    higher_is_better=True,
......
@@ -1016,9 +1016,16 @@ class ConfigurableTask(Task):
        if self.OUTPUT_TYPE == "loglikelihood":
            results = results[0]
            ll, is_greedy = results

            prob_norm = np.exp(ll)

            return {
                **({"perplexity": ll} if "perplexity" in use_metric else {}),
                **({"acc": int(is_greedy)} if "acc" in use_metric else {}),
                **(
                    {"brier_score": (0, [prob_norm])}  # Gold is Index 0
                    if "brier_score" in use_metric
                    else {}
                ),
            }
        elif self.OUTPUT_TYPE == "loglikelihood_rolling":
            (loglikelihood,) = results
@@ -1097,12 +1104,22 @@ class ConfigurableTask(Task):
            # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
            exact_match = int(is_greedy[gold]) if gold != -100 else 0

            prob_norm = utils.softmax(lls)

            # TODO use keyword arguments to the metric?
            # gold, pred, norm stuff, the original lls,
            result_dict = {
                **({"acc": acc} if "acc" in use_metric else {}),
                **({"f1": (gold, pred)} if "f1" in use_metric else {}),
                **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
                **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
                **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
                **(
                    # {"brier_score": (gold, prob_norm)}
                    {"brier_score": [np.eye(len(prob_norm))[gold], prob_norm]}
                    if "brier_score" in use_metric
                    else {}
                ),
            }

            if "acc_mutual_info" in use_metric:
......
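For the multiple_choice branch above, the pair stored under "brier_score" is a one-hot gold vector plus softmax-normalized choice probabilities. The sketch below reproduces that pairing with a local stand-in for `utils.softmax` (assumed to exponentiate and normalize the per-choice log-likelihoods); the numbers are invented.

```python
import numpy as np

def softmax(x):
    # Local stand-in for utils.softmax, assumed to exponentiate and normalize.
    z = np.exp(np.asarray(x) - np.max(x))
    return z / z.sum()

lls = [-2.3, -0.4, -3.1, -2.9]  # hypothetical per-choice log-likelihoods
gold = 1                        # index of the correct choice
prob_norm = softmax(lls)
one_hot_gold = np.eye(len(prob_norm))[gold]
# (one_hot_gold, prob_norm) is the item consumed by the brier_score aggregation.
```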
@@ -486,18 +486,24 @@ def evaluate(
                # For unweighted averaging, use:
                # current_size = 1

                # TODO: Tasks like brier score for individual
                # tasks have no stderr since the score is
                # itself an aggregation. But it's possible to
                # calculate the stderr over groups
                all_stderr = []
                for metric in [
                    key for key in metrics.keys() if "_stderr" not in key
                ]:
                    stderr = "_stderr,".join(metric.split(","))
                    stderr_score = results[task][stderr]
                    if stderr_score == "N/A":
                        var_score = "N/A"
                    else:
                        var_score = stderr_score**2
                    all_stderr.append(stderr)

                    metric_score = results[task][metric]
                    if metric in results[group]:
                        results[group][metric] = (
@@ -505,15 +511,20 @@ def evaluate(
                            + metric_score * current_size
                        ) / (total_size + current_size)
                        # $$s_z^2 = \frac{(n-1) s_x^2 + (m-1) s_y^2}{n+m-1} + \frac{nm(\bar x - \bar y)^2}{(n+m)(n+m-1)}.$$
                        if var_score == "N/A":
                            results[group][stderr] = "N/A"
                        else:
                            results[group][stderr] = (
                                (total_size - 1) * results[group][stderr]
                                + (current_size - 1) * var_score
                            ) / (
                                total_size + current_size - 1
                            ) + total_size * current_size / (
                                (total_size + current_size)
                                * (total_size + current_size - 1)
                            ) * (
                                results[group][metric] - metric_score
                            ) ** 2
                    else:
                        results[group][metric] = metric_score
                        results[group][stderr] = var_score
......
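The grouped update above implements the pooled-variance identity quoted in the comment, with the extra branches propagating "N/A" stderrs instead of silently treating them as zero. The helper below restates the identity in isolation (the function name and signature are mine, not the harness's):

```python
def pool_mean_var(mean_x, var_x, n, mean_y, var_y, m):
    """Combine the mean and variance of two samples of sizes n and m.

    s_z^2 = ((n - 1) * s_x^2 + (m - 1) * s_y^2) / (n + m - 1)
            + n * m * (mean_x - mean_y) ** 2 / ((n + m) * (n + m - 1))
    """
    mean_z = (n * mean_x + m * mean_y) / (n + m)
    var_z = ((n - 1) * var_x + (m - 1) * var_y) / (n + m - 1) + (
        n * m * (mean_x - mean_y) ** 2
    ) / ((n + m) * (n + m - 1))
    return mean_z, var_z
```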
Investigate effect of letter options
- (A)
- A)
- A.
- A\t
- (a)
- a)
- a.
- a\t

Answer types:
- letters only
  - original option
  - just letter
- letters + continuation
  - original option
  - just letter
  - continuation
group:
  - ai2_arc
dataset_path: ai2_arc
dataset_name: ARC-Challenge
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: test
doc_to_text: "Question: {{question}}\nAnswer:"
doc_to_target: "{{choices.label.index(answerKey)}}"
doc_to_choice: "{{choices.text}}"
should_decontaminate: true
doc_to_decontamination_query: "Question: {{question}}\nAnswer:"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
  - metric: brier_score
    aggregation: brier_score
    higher_is_better: false
group: arc_challenge_alt_ov
task:
  - arc_challenge_alt_ov_01
  - arc_challenge_alt_ov_02
  - arc_challenge_alt_ov_03
  - arc_challenge_alt_ov_04
  - arc_challenge_alt_ov_05
  - arc_challenge_alt_ov_06
  - arc_challenge_alt_ov_07
  - arc_challenge_alt_ov_08
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_01
task: arc_challenge_alt_ov_01a
doc_to_text: !function ../styles.template_01
doc_to_choice: !function ../styles.choice_01a
doc_to_decontamination_query: !function ../styles.template_01
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_01
task: arc_challenge_alt_ov_01b
doc_to_text: !function ../styles.template_01
doc_to_choice: !function ../styles.choice_01b
doc_to_decontamination_query: !function ../styles.template_01
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_01
task: arc_challenge_alt_ov_01c
doc_to_text: !function ../styles.template_01
doc_to_choice: !function ../styles.choice_01c
doc_to_decontamination_query: !function ../styles.template_01
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_02
task: arc_challenge_alt_ov_02a
doc_to_text: !function ../styles.template_02
doc_to_choice: !function ../styles.choice_02a
doc_to_decontamination_query: !function ../styles.template_02
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_02
task: arc_challenge_alt_ov_02b
doc_to_text: !function ../styles.template_02
doc_to_choice: !function ../styles.choice_02b
doc_to_decontamination_query: !function ../styles.template_02
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_02
task: arc_challenge_alt_ov_02c
doc_to_text: !function ../styles.template_02
doc_to_choice: !function ../styles.choice_02c
doc_to_decontamination_query: !function ../styles.template_02
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_03
task: arc_challenge_alt_ov_03a
doc_to_text: !function ../styles.template_03
doc_to_choice: !function ../styles.choice_03a
doc_to_decontamination_query: !function ../styles.template_03
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_03
task: arc_challenge_alt_ov_03b
doc_to_text: !function ../styles.template_03
doc_to_choice: !function ../styles.choice_03b
doc_to_decontamination_query: !function ../styles.template_03
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_03
task: arc_challenge_alt_ov_03c
doc_to_text: !function ../styles.template_03
doc_to_choice: !function ../styles.choice_03c
doc_to_decontamination_query: !function ../styles.template_03
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_04
task: arc_challenge_alt_ov_04a
doc_to_text: !function ../styles.template_04
doc_to_choice: !function ../styles.choice_04a
doc_to_decontamination_query: !function ../styles.template_04
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_04
task: arc_challenge_alt_ov_04b
doc_to_text: !function ../styles.template_04
doc_to_choice: !function ../styles.choice_04b
doc_to_decontamination_query: !function ../styles.template_04
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_04
task: arc_challenge_alt_ov_04c
doc_to_text: !function ../styles.template_04
doc_to_choice: !function ../styles.choice_04c
doc_to_decontamination_query: !function ../styles.template_04
include: ../_arc_challenge_alt_yaml
group: arc_challenge_alt_ov_05
task: arc_challenge_alt_ov_05a
doc_to_text: !function ../styles.template_05
doc_to_choice: !function ../styles.choice_05a
doc_to_decontamination_query: !function ../styles.template_05
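The `!function ../styles.*` hooks in these configs point to callables in a `styles.py` module that is not part of this diff. As a purely hypothetical sketch of what one template/choice pair could look like, assuming the usual convention that `doc_to_text` receives a dataset row and returns the prompt string while `doc_to_choice` returns the list of answer strings:

```python
# Hypothetical sketch of styles.py; the real module is not shown in this diff.
def template_01(doc):
    # Style 01 is assumed here to be the "(A)" letter format from the notes above.
    letters = [f"({chr(ord('A') + i)})" for i in range(len(doc["choices"]["text"]))]
    options = "\n".join(
        f"{letter} {text}" for letter, text in zip(letters, doc["choices"]["text"])
    )
    return f"Question: {doc['question']}\n{options}\nAnswer:"


def choice_01a(doc):
    # Answer type "letters only": the model is scored on "(A)", "(B)", ...
    return [f"({chr(ord('A') + i)})" for i in range(len(doc["choices"]["text"]))]
```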