Unverified commit cda25fef, authored by Lintang Sutawika and committed by GitHub
Browse files

Merge branch 'main' into standardize_metrics

parents dfb41835 4d10ad56
......@@ -19,4 +19,4 @@ metric_list:
aggregation: mean
higher_is_better: true
metadata:
- version: 1.0
version: 1.0
......@@ -12,4 +12,4 @@ doc_to_choice: ['no', 'yes']
metric_list:
- metric: acc
metadata:
- version: 1.0
version: 1.0
......@@ -5,5 +5,5 @@ doc_to_text: "Question: Would most people believe this reasonable or unreasonabl
doc_to_target: label
doc_to_choice: ['unreasonable', 'reasonable']
metadata:
- version: 1.0
version: 1.0
# TODO: implement exact-match metric for this subset
......@@ -6,4 +6,4 @@ dataset_name: justice
doc_to_text: "Question: Would most people believe this reasonable or unreasonable to say? \"{{scenario}}\"\nAnswer:"
# TODO: impl. exact match for this and deontology
metadata:
- version: 1.0
version: 1.0
......@@ -9,4 +9,4 @@ doc_to_choice: ['no', 'yes']
metric_list:
- metric: acc
metadata:
- version: 1.0
version: 1.0
......@@ -13,4 +13,4 @@
# - metric: acc
# TODO: we want this to be implemented as a winograd_schema task type, actually
# metadata:
# - version: 1.0
# version: 1.0
......@@ -7,4 +7,4 @@ doc_to_text: "Sentence: {{scenario}}\nQuestion: Does the character in this sente
doc_to_target: label
doc_to_choice: ['no', 'yes']
metadata:
- version: 1.0
version: 1.0
......@@ -26,4 +26,4 @@ metric_list:
aggregation: !function utils.agg_inst_level_acc
higher_is_better: true
metadata:
- version: 1.0
version: 1.0
......@@ -78,8 +78,7 @@ INSTRUCTION_CONFLICTS = {
# _KEYWORD + "key_sentences": instructions.KeySentenceChecker,
_KEYWORD + "forbidden_words": {_KEYWORD + "forbidden_words"},
_KEYWORD + "letter_frequency": {_KEYWORD + "letter_frequency"},
_LANGUAGE
+ "response_language": {
_LANGUAGE + "response_language": {
_LANGUAGE + "response_language",
_FORMAT + "multiple_sections",
_KEYWORD + "existence",
......@@ -90,16 +89,14 @@ INSTRUCTION_CONFLICTS = {
_CHANGE_CASES + "english_lowercase",
},
_LENGTH + "number_sentences": {_LENGTH + "number_sentences"},
_LENGTH
+ "number_paragraphs": {
_LENGTH + "number_paragraphs": {
_LENGTH + "number_paragraphs",
_LENGTH + "nth_paragraph_first_word",
_LENGTH + "number_sentences",
_LENGTH + "nth_paragraph_first_word",
},
_LENGTH + "number_words": {_LENGTH + "number_words"},
_LENGTH
+ "nth_paragraph_first_word": {
_LENGTH + "nth_paragraph_first_word": {
_LENGTH + "nth_paragraph_first_word",
_LENGTH + "number_paragraphs",
},
......@@ -110,23 +107,20 @@ INSTRUCTION_CONFLICTS = {
# _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph,
_FORMAT + "constrained_response": set(INSTRUCTION_DICT.keys()),
_FORMAT + "number_highlighted_sections": {_FORMAT + "number_highlighted_sections"},
_FORMAT
+ "multiple_sections": {
_FORMAT + "multiple_sections": {
_FORMAT + "multiple_sections",
_LANGUAGE + "response_language",
_FORMAT + "number_highlighted_sections",
},
# TODO(tianjianlu): Re-enable rephrasing with preprocessing the message.
# _FORMAT + "rephrase": instructions.RephraseChecker,
_FORMAT
+ "json_format": set(INSTRUCTION_DICT.keys()).difference(
_FORMAT + "json_format": set(INSTRUCTION_DICT.keys()).difference(
{_KEYWORD + "forbidden_words", _KEYWORD + "existence"}
),
_FORMAT + "title": {_FORMAT + "title"},
# TODO(tianjianlu): Re-enable with specific prompts.
# _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker,
_COMBINATION
+ "two_responses": set(INSTRUCTION_DICT.keys()).difference(
_COMBINATION + "two_responses": set(INSTRUCTION_DICT.keys()).difference(
{
_KEYWORD + "forbidden_words",
_KEYWORD + "existence",
......@@ -135,20 +129,17 @@ INSTRUCTION_CONFLICTS = {
_PUNCTUATION + "no_comma",
}
),
_COMBINATION
+ "repeat_prompt": set(INSTRUCTION_DICT.keys()).difference(
_COMBINATION + "repeat_prompt": set(INSTRUCTION_DICT.keys()).difference(
{_KEYWORD + "existence", _FORMAT + "title", _PUNCTUATION + "no_comma"}
),
_STARTEND + "end_checker": {_STARTEND + "end_checker"},
_CHANGE_CASES
+ "capital_word_frequency": {
_CHANGE_CASES + "capital_word_frequency": {
_CHANGE_CASES + "capital_word_frequency",
_CHANGE_CASES + "english_lowercase",
_CHANGE_CASES + "english_capital",
},
_CHANGE_CASES + "english_capital": {_CHANGE_CASES + "english_capital"},
_CHANGE_CASES
+ "english_lowercase": {
_CHANGE_CASES + "english_lowercase": {
_CHANGE_CASES + "english_lowercase",
_CHANGE_CASES + "english_capital",
},
......
......@@ -17,7 +17,6 @@
import functools
import random
import re
from typing import List
import immutabledict
import nltk
......
......@@ -6,10 +6,9 @@ validation_split: dev
test_split: test
fewshot_split: dev
output_type: multiple_choice
process_docs: !function utils.process_docs
doc_to_text: "{{question}}"
doc_to_choice: "{{choices}}"
doc_to_target: "{{gold}}"
doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:"
doc_to_choice: ["A", "B", "C", "D"]
doc_to_target: "{{['A', 'B', 'C', 'D'][answer-1]}}"
metric_list:
- metric: acc
aggregation: mean
......@@ -18,4 +17,4 @@ metric_list:
aggregation: mean
higher_is_better: true
metadata:
- version: 0.0
version: 1.1
"dataset_name": "Agricultural Sciences"
"dataset_name": "Agricultural-Sciences"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_agricultural_sciences"
"dataset_name": "Aviation Engineering and Maintenance"
"dataset_name": "Aviation-Engineering-and-Maintenance"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_aviation_engineering_and_maintenance"
"dataset_name": "Chemical Engineering"
"dataset_name": "Chemical-Engineering"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_chemical_engineering"
"dataset_name": "Civil Engineering"
"dataset_name": "Civil-Engineering"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_civil_engineering"
"dataset_name": "Computer Science"
"dataset_name": "Computer-Science"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_computer_science"
"dataset_name": "Criminal Law"
"dataset_name": "Criminal-Law"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_criminal_law"
"dataset_name": "Electrical Engineering"
"dataset_name": "Electrical-Engineering"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_electrical_engineering"
"dataset_name": "Electronics Engineering"
"dataset_name": "Electronics-Engineering"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_electronics_engineering"
"dataset_name": "Energy Management"
"dataset_name": "Energy-Management"
"include": "_default_kmmlu_yaml"
"task": "kmmlu_energy_management"
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment