Unverified Commit cda25fef authored by Lintang Sutawika's avatar Lintang Sutawika Committed by GitHub
Browse files

Merge branch 'main' into standardize_metrics

parents dfb41835 4d10ad56
...@@ -19,4 +19,4 @@ metric_list: ...@@ -19,4 +19,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
- version: 2.0 version: 2.0
...@@ -7,7 +7,7 @@ def doc_to_text(doc): ...@@ -7,7 +7,7 @@ def doc_to_text(doc):
# Given a passage p, the conversation history {q1, a1, . . . qi−1, ai−1} # Given a passage p, the conversation history {q1, a1, . . . qi−1, ai−1}
# and a question qi, the task is to predict the answer ai # and a question qi, the task is to predict the answer ai
doc_text = doc["story"] + "\n\n" doc_text = doc["story"] + "\n\n"
for (q, a) in zip_longest( for q, a in zip_longest(
doc["questions"]["input_text"], doc["answers"]["input_text"][:-1] doc["questions"]["input_text"], doc["answers"]["input_text"][:-1]
): # omit target answer ai ): # omit target answer ai
question = f"Q: {q}\n\n" question = f"Q: {q}\n\n"
...@@ -17,7 +17,6 @@ def doc_to_text(doc): ...@@ -17,7 +17,6 @@ def doc_to_text(doc):
def doc_to_target(doc): def doc_to_target(doc):
turn_id = len(doc["questions"]["input_text"]) turn_id = len(doc["questions"]["input_text"])
# Returns unique answers and valid alternatives (Some questions in CoQA have multiple valid answers). # Returns unique answers and valid alternatives (Some questions in CoQA have multiple valid answers).
answers = [] answers = []
...@@ -71,7 +70,6 @@ def compute_scores(gold_list, pred): ...@@ -71,7 +70,6 @@ def compute_scores(gold_list, pred):
def process_results(doc, results): def process_results(doc, results):
gold_list = doc_to_target(doc) gold_list = doc_to_target(doc)
pred = results[0].strip().split("\n")[0] pred = results[0].strip().split("\n")[0]
......
...@@ -20,4 +20,4 @@ metric_list: ...@@ -20,4 +20,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: false higher_is_better: false
metadata: metadata:
- version: 1.0 version: 1.0
...@@ -14,4 +14,4 @@ metric_list: ...@@ -14,4 +14,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
- version: 0.0 version: 0.0
...@@ -21,7 +21,6 @@ def parse_args(): ...@@ -21,7 +21,6 @@ def parse_args():
if __name__ == "__main__": if __name__ == "__main__":
args = parse_args() args = parse_args()
# get filename of base_yaml so we can `"include": ` it in our other YAMLs. # get filename of base_yaml so we can `"include": ` it in our other YAMLs.
...@@ -30,7 +29,6 @@ if __name__ == "__main__": ...@@ -30,7 +29,6 @@ if __name__ == "__main__":
base_yaml = yaml.full_load(f) base_yaml = yaml.full_load(f)
for name in tqdm(SUBSETS): for name in tqdm(SUBSETS):
yaml_dict = { yaml_dict = {
"include": base_yaml_name, "include": base_yaml_name,
"task": f"csatqa_{args.task_prefix}_{name}" "task": f"csatqa_{args.task_prefix}_{name}"
......
...@@ -21,4 +21,4 @@ metric_list: ...@@ -21,4 +21,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
- version: 2.0 version: 2.0
...@@ -62,7 +62,6 @@ def parse_answer(answer): ...@@ -62,7 +62,6 @@ def parse_answer(answer):
def process_results(doc, results): def process_results(doc, results):
preds, golds = results, doc["answers"] preds, golds = results, doc["answers"]
max_em = 0 max_em = 0
max_f1 = 0 max_f1 = 0
......
...@@ -12,3 +12,10 @@ metric_list: ...@@ -12,3 +12,10 @@ metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
filter_list:
- name: remove_whitespace
filter:
- function: remove_whitespace
- function: take_first
metadata:
version: 1.0
...@@ -13,4 +13,4 @@ doc_to_decontamination_query: sentence ...@@ -13,4 +13,4 @@ doc_to_decontamination_query: sentence
metric_list: metric_list:
- metric: mcc - metric: mcc
metadata: metadata:
- version: 1.0 version: 1.0
...@@ -11,4 +11,4 @@ doc_to_choice: ["True", "Neither", "False"] ...@@ -11,4 +11,4 @@ doc_to_choice: ["True", "Neither", "False"]
metric_list: metric_list:
- metric: acc - metric: acc
metadata: metadata:
- version: 1.0 version: 1.0
...@@ -12,4 +12,4 @@ metric_list: ...@@ -12,4 +12,4 @@ metric_list:
- metric: acc - metric: acc
- metric: f1 - metric: f1
metadata: metadata:
- version: 1.0 version: 1.0
...@@ -11,4 +11,4 @@ doc_to_choice: ["yes", "no"] ...@@ -11,4 +11,4 @@ doc_to_choice: ["yes", "no"]
metric_list: metric_list:
- metric: acc - metric: acc
metadata: metadata:
- version: 1.0 version: 1.0
...@@ -12,4 +12,4 @@ metric_list: ...@@ -12,4 +12,4 @@ metric_list:
- metric: acc - metric: acc
- metric: f1 - metric: f1
metadata: metadata:
- version: 1.0 version: 1.0
...@@ -11,4 +11,4 @@ doc_to_choice: ["True", "False"] ...@@ -11,4 +11,4 @@ doc_to_choice: ["True", "False"]
metric_list: metric_list:
- metric: acc - metric: acc
metadata: metadata:
- version: 1.0 version: 1.0
...@@ -11,4 +11,4 @@ doc_to_choice: ["negative", "positive"] ...@@ -11,4 +11,4 @@ doc_to_choice: ["negative", "positive"]
metric_list: metric_list:
- metric: acc - metric: acc
metadata: metadata:
- version: 1.0 version: 1.0
...@@ -11,4 +11,4 @@ doc_to_choice: ["False", "True"] ...@@ -11,4 +11,4 @@ doc_to_choice: ["False", "True"]
metric_list: metric_list:
- metric: acc - metric: acc
metadata: metadata:
- version: 2.0 version: 2.0
...@@ -31,4 +31,4 @@ filter_list: ...@@ -31,4 +31,4 @@ filter_list:
- function: "majority_vote" - function: "majority_vote"
- function: "take_first" - function: "take_first"
metadata: metadata:
- version: 0.0 version: 0.0
...@@ -41,4 +41,4 @@ filter_list: ...@@ -41,4 +41,4 @@ filter_list:
regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)." regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)."
- function: "take_first" - function: "take_first"
metadata: metadata:
- version: 0.0 version: 0.0
...@@ -34,4 +34,4 @@ filter_list: ...@@ -34,4 +34,4 @@ filter_list:
regex_pattern: "#### (\\-?[0-9\\.\\,]+)" regex_pattern: "#### (\\-?[0-9\\.\\,]+)"
- function: "take_first" - function: "take_first"
metadata: metadata:
- version: 1.0 version: 1.0
...@@ -20,4 +20,4 @@ metric_list: ...@@ -20,4 +20,4 @@ metric_list:
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata: metadata:
- version: 1.0 version: 1.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment