Commit 835cc40e authored by lintangsutawika
Browse files

merged latest and added altworld files

parents 8da401e0 c9bbec6e
......@@ -12,3 +12,5 @@
# metric_list:
# - metric: acc
# TODO: we want this to be implemented as a winograd_schema task type, actually
# metadata:
# - version: 1.0
......@@ -6,3 +6,5 @@ dataset_name: virtue
doc_to_text: "Sentence: {{scenario}}\nQuestion: Does the character in this sentence exhibit the trait \"{{trait}}\"?\nAnswer:"
doc_to_target: label
doc_to_choice: ['no', 'yes']
metadata:
- version: 1.0
......@@ -16,3 +16,5 @@ metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
- version: 1.0
......@@ -17,3 +17,5 @@ metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
- version: 1.0
......@@ -16,3 +16,5 @@ metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
- version: 1.0
......@@ -17,3 +17,5 @@ metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
- version: 1.0
......@@ -16,3 +16,5 @@ metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
metadata:
- version: 1.0
......@@ -17,3 +17,5 @@ metric_list:
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
- version: 1.0
......@@ -23,3 +23,5 @@ filter_list:
# https://github.com/openai/evals/blob/305b237cdb3884c7ddb6a5d12cb184a83551fcba/evals/api.py#L84
regex_pattern: "^\\s*([A-D])"
- function: "take_first"
metadata:
- version: 0.0
......@@ -17,3 +17,5 @@ metric_list:
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
- version: 0.0
......@@ -8,5 +8,8 @@ metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: acc_norm
aggregation: mean
higher_is_better: true
- metric: brier_score
higher_is_better: false
......@@ -18,3 +18,5 @@ metric_list:
- metric: acc_norm
aggregation: mean
higher_is_better: true
metadata:
- version: 1.0
......@@ -11,3 +11,5 @@ doc_to_decontamination_query: "{{question}} {{sentence}}"
metric_list:
- metric: acc
- metric: f1
metadata:
- version: 1.0
......@@ -25,3 +25,5 @@ metric_list:
higher_is_better: true
ignore_case: true
ignore_punctuation: true
metadata:
- version: 0.0
......@@ -27,3 +27,5 @@ filter_list:
- function: "regex"
regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
- function: "take_first"
metadata:
- version: 0.0
......@@ -27,3 +27,5 @@ filter_list:
- function: "regex"
regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
- function: "take_first"
metadata:
- version: 1.0
......@@ -19,3 +19,5 @@ metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
metadata:
- version: 0.0
......@@ -11,6 +11,5 @@ metric_list:
- metric: acc
aggregation: mean
higher_is_better: true
- metric: brier_score
aggregation: mean
higher_is_better: false
metadata:
- version: 0.0
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -21,3 +21,5 @@ metric_list:
higher_is_better: true
ignore_case: true
ignore_punctuation: true
metadata:
- version: 0.0
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment