Unverified Commit cda25fef authored by Lintang Sutawika, committed by GitHub
Browse files

Merge branch 'main' into standardize_metrics

parents dfb41835 4d10ad56
...@@ -19,4 +19,4 @@ metric_list: ...@@ -19,4 +19,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
- version: 1.0 version: 1.0
...@@ -12,4 +12,4 @@ doc_to_choice: ['no', 'yes'] ...@@ -12,4 +12,4 @@ doc_to_choice: ['no', 'yes']
metric_list: metric_list:
- metric: acc - metric: acc
metadata: metadata:
- version: 1.0 version: 1.0
...@@ -5,5 +5,5 @@ doc_to_text: "Question: Would most people believe this reasonable or unreasonabl ...@@ -5,5 +5,5 @@ doc_to_text: "Question: Would most people believe this reasonable or unreasonabl
doc_to_target: label doc_to_target: label
doc_to_choice: ['unreasonable', 'reasonable'] doc_to_choice: ['unreasonable', 'reasonable']
metadata: metadata:
- version: 1.0 version: 1.0
# TODO: implement exact-match metric for this subset # TODO: implement exact-match metric for this subset
...@@ -6,4 +6,4 @@ dataset_name: justice ...@@ -6,4 +6,4 @@ dataset_name: justice
doc_to_text: "Question: Would most people believe this reasonable or unreasonable to say? \"{{scenario}}\"\nAnswer:" doc_to_text: "Question: Would most people believe this reasonable or unreasonable to say? \"{{scenario}}\"\nAnswer:"
# TODO: impl. exact match for this and deontology # TODO: impl. exact match for this and deontology
metadata: metadata:
- version: 1.0 version: 1.0
...@@ -9,4 +9,4 @@ doc_to_choice: ['no', 'yes'] ...@@ -9,4 +9,4 @@ doc_to_choice: ['no', 'yes']
metric_list: metric_list:
- metric: acc - metric: acc
metadata: metadata:
- version: 1.0 version: 1.0
...@@ -13,4 +13,4 @@ ...@@ -13,4 +13,4 @@
# - metric: acc # - metric: acc
# TODO: we want this to be implemented as a winograd_schema task type, actually # TODO: we want this to be implemented as a winograd_schema task type, actually
# metadata: # metadata:
# - version: 1.0 # version: 1.0
...@@ -7,4 +7,4 @@ doc_to_text: "Sentence: {{scenario}}\nQuestion: Does the character in this sente ...@@ -7,4 +7,4 @@ doc_to_text: "Sentence: {{scenario}}\nQuestion: Does the character in this sente
doc_to_target: label doc_to_target: label
doc_to_choice: ['no', 'yes'] doc_to_choice: ['no', 'yes']
metadata: metadata:
- version: 1.0 version: 1.0
...@@ -26,4 +26,4 @@ metric_list: ...@@ -26,4 +26,4 @@ metric_list:
aggregation: !function utils.agg_inst_level_acc aggregation: !function utils.agg_inst_level_acc
higher_is_better: true higher_is_better: true
metadata: metadata:
- version: 1.0 version: 1.0
...@@ -78,8 +78,7 @@ INSTRUCTION_CONFLICTS = { ...@@ -78,8 +78,7 @@ INSTRUCTION_CONFLICTS = {
# _KEYWORD + "key_sentences": instructions.KeySentenceChecker, # _KEYWORD + "key_sentences": instructions.KeySentenceChecker,
_KEYWORD + "forbidden_words": {_KEYWORD + "forbidden_words"}, _KEYWORD + "forbidden_words": {_KEYWORD + "forbidden_words"},
_KEYWORD + "letter_frequency": {_KEYWORD + "letter_frequency"}, _KEYWORD + "letter_frequency": {_KEYWORD + "letter_frequency"},
_LANGUAGE _LANGUAGE + "response_language": {
+ "response_language": {
_LANGUAGE + "response_language", _LANGUAGE + "response_language",
_FORMAT + "multiple_sections", _FORMAT + "multiple_sections",
_KEYWORD + "existence", _KEYWORD + "existence",
...@@ -90,16 +89,14 @@ INSTRUCTION_CONFLICTS = { ...@@ -90,16 +89,14 @@ INSTRUCTION_CONFLICTS = {
_CHANGE_CASES + "english_lowercase", _CHANGE_CASES + "english_lowercase",
}, },
_LENGTH + "number_sentences": {_LENGTH + "number_sentences"}, _LENGTH + "number_sentences": {_LENGTH + "number_sentences"},
_LENGTH _LENGTH + "number_paragraphs": {
+ "number_paragraphs": {
_LENGTH + "number_paragraphs", _LENGTH + "number_paragraphs",
_LENGTH + "nth_paragraph_first_word", _LENGTH + "nth_paragraph_first_word",
_LENGTH + "number_sentences", _LENGTH + "number_sentences",
_LENGTH + "nth_paragraph_first_word", _LENGTH + "nth_paragraph_first_word",
}, },
_LENGTH + "number_words": {_LENGTH + "number_words"}, _LENGTH + "number_words": {_LENGTH + "number_words"},
_LENGTH _LENGTH + "nth_paragraph_first_word": {
+ "nth_paragraph_first_word": {
_LENGTH + "nth_paragraph_first_word", _LENGTH + "nth_paragraph_first_word",
_LENGTH + "number_paragraphs", _LENGTH + "number_paragraphs",
}, },
...@@ -110,23 +107,20 @@ INSTRUCTION_CONFLICTS = { ...@@ -110,23 +107,20 @@ INSTRUCTION_CONFLICTS = {
# _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph, # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph,
_FORMAT + "constrained_response": set(INSTRUCTION_DICT.keys()), _FORMAT + "constrained_response": set(INSTRUCTION_DICT.keys()),
_FORMAT + "number_highlighted_sections": {_FORMAT + "number_highlighted_sections"}, _FORMAT + "number_highlighted_sections": {_FORMAT + "number_highlighted_sections"},
_FORMAT _FORMAT + "multiple_sections": {
+ "multiple_sections": {
_FORMAT + "multiple_sections", _FORMAT + "multiple_sections",
_LANGUAGE + "response_language", _LANGUAGE + "response_language",
_FORMAT + "number_highlighted_sections", _FORMAT + "number_highlighted_sections",
}, },
# TODO(tianjianlu): Re-enable rephrasing with preprocessing the message. # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message.
# _FORMAT + "rephrase": instructions.RephraseChecker, # _FORMAT + "rephrase": instructions.RephraseChecker,
_FORMAT _FORMAT + "json_format": set(INSTRUCTION_DICT.keys()).difference(
+ "json_format": set(INSTRUCTION_DICT.keys()).difference(
{_KEYWORD + "forbidden_words", _KEYWORD + "existence"} {_KEYWORD + "forbidden_words", _KEYWORD + "existence"}
), ),
_FORMAT + "title": {_FORMAT + "title"}, _FORMAT + "title": {_FORMAT + "title"},
# TODO(tianjianlu): Re-enable with specific prompts. # TODO(tianjianlu): Re-enable with specific prompts.
# _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker, # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker,
_COMBINATION _COMBINATION + "two_responses": set(INSTRUCTION_DICT.keys()).difference(
+ "two_responses": set(INSTRUCTION_DICT.keys()).difference(
{ {
_KEYWORD + "forbidden_words", _KEYWORD + "forbidden_words",
_KEYWORD + "existence", _KEYWORD + "existence",
...@@ -135,20 +129,17 @@ INSTRUCTION_CONFLICTS = { ...@@ -135,20 +129,17 @@ INSTRUCTION_CONFLICTS = {
_PUNCTUATION + "no_comma", _PUNCTUATION + "no_comma",
} }
), ),
_COMBINATION _COMBINATION + "repeat_prompt": set(INSTRUCTION_DICT.keys()).difference(
+ "repeat_prompt": set(INSTRUCTION_DICT.keys()).difference(
{_KEYWORD + "existence", _FORMAT + "title", _PUNCTUATION + "no_comma"} {_KEYWORD + "existence", _FORMAT + "title", _PUNCTUATION + "no_comma"}
), ),
_STARTEND + "end_checker": {_STARTEND + "end_checker"}, _STARTEND + "end_checker": {_STARTEND + "end_checker"},
_CHANGE_CASES _CHANGE_CASES + "capital_word_frequency": {
+ "capital_word_frequency": {
_CHANGE_CASES + "capital_word_frequency", _CHANGE_CASES + "capital_word_frequency",
_CHANGE_CASES + "english_lowercase", _CHANGE_CASES + "english_lowercase",
_CHANGE_CASES + "english_capital", _CHANGE_CASES + "english_capital",
}, },
_CHANGE_CASES + "english_capital": {_CHANGE_CASES + "english_capital"}, _CHANGE_CASES + "english_capital": {_CHANGE_CASES + "english_capital"},
_CHANGE_CASES _CHANGE_CASES + "english_lowercase": {
+ "english_lowercase": {
_CHANGE_CASES + "english_lowercase", _CHANGE_CASES + "english_lowercase",
_CHANGE_CASES + "english_capital", _CHANGE_CASES + "english_capital",
}, },
......
...@@ -17,7 +17,6 @@ ...@@ -17,7 +17,6 @@
import functools import functools
import random import random
import re import re
from typing import List
import immutabledict import immutabledict
import nltk import nltk
......
...@@ -6,10 +6,9 @@ validation_split: dev ...@@ -6,10 +6,9 @@ validation_split: dev
test_split: test test_split: test
fewshot_split: dev fewshot_split: dev
output_type: multiple_choice output_type: multiple_choice
process_docs: !function utils.process_docs doc_to_text: "{{question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n정답:"
doc_to_text: "{{question}}" doc_to_choice: ["A", "B", "C", "D"]
doc_to_choice: "{{choices}}" doc_to_target: "{{['A', 'B', 'C', 'D'][answer-1]}}"
doc_to_target: "{{gold}}"
metric_list: metric_list:
- metric: acc - metric: acc
aggregation: mean aggregation: mean
...@@ -18,4 +17,4 @@ metric_list: ...@@ -18,4 +17,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
- version: 0.0 version: 1.1
"dataset_name": "Agricultural Sciences" "dataset_name": "Agricultural-Sciences"
"include": "_default_kmmlu_yaml" "include": "_default_kmmlu_yaml"
"task": "kmmlu_agricultural_sciences" "task": "kmmlu_agricultural_sciences"
"dataset_name": "Aviation Engineering and Maintenance" "dataset_name": "Aviation-Engineering-and-Maintenance"
"include": "_default_kmmlu_yaml" "include": "_default_kmmlu_yaml"
"task": "kmmlu_aviation_engineering_and_maintenance" "task": "kmmlu_aviation_engineering_and_maintenance"
"dataset_name": "Chemical Engineering" "dataset_name": "Chemical-Engineering"
"include": "_default_kmmlu_yaml" "include": "_default_kmmlu_yaml"
"task": "kmmlu_chemical_engineering" "task": "kmmlu_chemical_engineering"
"dataset_name": "Civil Engineering" "dataset_name": "Civil-Engineering"
"include": "_default_kmmlu_yaml" "include": "_default_kmmlu_yaml"
"task": "kmmlu_civil_engineering" "task": "kmmlu_civil_engineering"
"dataset_name": "Computer Science" "dataset_name": "Computer-Science"
"include": "_default_kmmlu_yaml" "include": "_default_kmmlu_yaml"
"task": "kmmlu_computer_science" "task": "kmmlu_computer_science"
"dataset_name": "Criminal Law" "dataset_name": "Criminal-Law"
"include": "_default_kmmlu_yaml" "include": "_default_kmmlu_yaml"
"task": "kmmlu_criminal_law" "task": "kmmlu_criminal_law"
"dataset_name": "Electrical Engineering" "dataset_name": "Electrical-Engineering"
"include": "_default_kmmlu_yaml" "include": "_default_kmmlu_yaml"
"task": "kmmlu_electrical_engineering" "task": "kmmlu_electrical_engineering"
"dataset_name": "Electronics Engineering" "dataset_name": "Electronics-Engineering"
"include": "_default_kmmlu_yaml" "include": "_default_kmmlu_yaml"
"task": "kmmlu_electronics_engineering" "task": "kmmlu_electronics_engineering"
"dataset_name": "Energy Management" "dataset_name": "Energy-Management"
"include": "_default_kmmlu_yaml" "include": "_default_kmmlu_yaml"
"task": "kmmlu_energy_management" "task": "kmmlu_energy_management"
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment