Unverified commit cda25fef, authored by Lintang Sutawika and committed by GitHub
Browse files

Merge branch 'main' into standardize_metrics

parents dfb41835 4d10ad56
...@@ -13,4 +13,4 @@ metric_list: ...@@ -13,4 +13,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
- version: 0.0 version: 0.0
...@@ -13,4 +13,4 @@ metric_list: ...@@ -13,4 +13,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
- version: 0.0 version: 0.0
import yaml import yaml
import inspect
import datasets import datasets
from tqdm import tqdm from tqdm import tqdm
def main() -> None: def main() -> None:
dataset_path = "EleutherAI/advanced_ai_risk" dataset_path = "EleutherAI/advanced_ai_risk"
for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()): for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()):
file_name = f"{task}.yaml" file_name = f"{task}.yaml"
......
...@@ -11,4 +11,4 @@ doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}" ...@@ -11,4 +11,4 @@ doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}"
metric_list: metric_list:
- metric: acc - metric: acc
metadata: metadata:
- version: 0.0 version: 0.0
import yaml import yaml
import inspect
import datasets import datasets
from tqdm import tqdm from tqdm import tqdm
def main() -> None: def main() -> None:
dataset_path = "EleutherAI/persona" dataset_path = "EleutherAI/persona"
for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()): for task in tqdm(datasets.get_dataset_infos(dataset_path).keys()):
file_name = f"{task}.yaml" file_name = f"{task}.yaml"
......
...@@ -9,4 +9,4 @@ doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" ...@@ -9,4 +9,4 @@ doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
metric_list: metric_list:
- metric: acc - metric: acc
metadata: metadata:
- version: 0.0 version: 0.0
...@@ -11,4 +11,4 @@ doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" ...@@ -11,4 +11,4 @@ doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
metric_list: metric_list:
- metric: acc - metric: acc
metadata: metadata:
- version: 0.0 version: 0.0
...@@ -11,4 +11,4 @@ doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" ...@@ -11,4 +11,4 @@ doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
metric_list: metric_list:
- metric: acc - metric: acc
metadata: metadata:
- version: 0.0 version: 0.0
...@@ -11,4 +11,4 @@ doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" ...@@ -11,4 +11,4 @@ doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
metric_list: metric_list:
- metric: acc - metric: acc
metadata: metadata:
- version: 0.0 version: 0.0
...@@ -9,4 +9,4 @@ doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" ...@@ -9,4 +9,4 @@ doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
metric_list: metric_list:
- metric: acc - metric: acc
metadata: metadata:
- version: 0.0 version: 0.0
...@@ -22,4 +22,4 @@ metric_list: ...@@ -22,4 +22,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
- version: 2.0 version: 2.0
...@@ -29,4 +29,4 @@ metric_list: ...@@ -29,4 +29,4 @@ metric_list:
regexes_to_ignore: regexes_to_ignore:
- "\ban|a|the\b" - "\ban|a|the\b"
metadata: metadata:
- version: 0.0 version: 0.0
...@@ -18,4 +18,4 @@ metric_list: ...@@ -18,4 +18,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
- version: 1.0 version: 1.0
import argparse import argparse
from typing import Dict, List
import yaml import yaml
......
...@@ -17,4 +17,4 @@ metric_list: ...@@ -17,4 +17,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
- version: 0.0 version: 0.0
...@@ -20,4 +20,4 @@ metric_list: ...@@ -20,4 +20,4 @@ metric_list:
aggregation: bits_per_byte aggregation: bits_per_byte
higher_is_better: false higher_is_better: false
metadata: metadata:
- version: 2.0 version: 2.0
...@@ -18,4 +18,4 @@ metric_list: ...@@ -18,4 +18,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
- version: 1.0 version: 1.0
...@@ -42,4 +42,4 @@ metric_list: ...@@ -42,4 +42,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
- version: 0.0 version: 0.0
...@@ -16,4 +16,4 @@ metric_list: ...@@ -16,4 +16,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
- version: 1.0 version: 1.0
def doc_to_text(doc) -> str: def doc_to_text(doc) -> str:
ctxs = "\n".join(doc["CONTEXTS"]) ctxs = "\n".join(doc["CONTEXTS"])
return "Abstract: {}\nQuestion: {}\nAnswer:".format( return "Abstract: {}\nQuestion: {}\nAnswer:".format(
ctxs, doc["QUESTION"], doc["final_decision"] ctxs,
doc["QUESTION"],
) )
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment