Commit a0787a9f authored by baberabb
Browse files

Merge remote-tracking branch 'origin/big-refactor' into big-refactor_dp

parents 6359f083 dc5b3d5d
...@@ -16,3 +16,5 @@ metric_list: ...@@ -16,3 +16,5 @@ metric_list:
- metric: acc - metric: acc
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
- version: 1.0
...@@ -17,3 +17,5 @@ metric_list: ...@@ -17,3 +17,5 @@ metric_list:
- metric: acc_norm - metric: acc_norm
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
- version: 1.0
...@@ -23,3 +23,5 @@ filter_list: ...@@ -23,3 +23,5 @@ filter_list:
# https://github.com/openai/evals/blob/305b237cdb3884c7ddb6a5d12cb184a83551fcba/evals/api.py#L84 # https://github.com/openai/evals/blob/305b237cdb3884c7ddb6a5d12cb184a83551fcba/evals/api.py#L84
regex_pattern: "^\\s*([A-D])" regex_pattern: "^\\s*([A-D])"
- function: "take_first" - function: "take_first"
metadata:
- version: 0.0
...@@ -17,3 +17,5 @@ metric_list: ...@@ -17,3 +17,5 @@ metric_list:
- metric: acc_norm - metric: acc_norm
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
- version: 0.0
...@@ -18,3 +18,5 @@ metric_list: ...@@ -18,3 +18,5 @@ metric_list:
- metric: acc_norm - metric: acc_norm
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
- version: 1.0
...@@ -11,3 +11,5 @@ doc_to_decontamination_query: "{{question}} {{sentence}}" ...@@ -11,3 +11,5 @@ doc_to_decontamination_query: "{{question}} {{sentence}}"
metric_list: metric_list:
- metric: acc - metric: acc
- metric: f1 - metric: f1
metadata:
- version: 1.0
...@@ -25,3 +25,5 @@ metric_list: ...@@ -25,3 +25,5 @@ metric_list:
higher_is_better: true higher_is_better: true
ignore_case: true ignore_case: true
ignore_punctuation: true ignore_punctuation: true
metadata:
- version: 0.0
...@@ -27,3 +27,5 @@ filter_list: ...@@ -27,3 +27,5 @@ filter_list:
- function: "regex" - function: "regex"
regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)" regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
- function: "take_first" - function: "take_first"
metadata:
- version: 0.0
...@@ -27,3 +27,5 @@ filter_list: ...@@ -27,3 +27,5 @@ filter_list:
- function: "regex" - function: "regex"
regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)" regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
- function: "take_first" - function: "take_first"
metadata:
- version: 1.0
...@@ -19,3 +19,5 @@ metric_list: ...@@ -19,3 +19,5 @@ metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
- version: 0.0
...@@ -11,3 +11,5 @@ metric_list: ...@@ -11,3 +11,5 @@ metric_list:
- metric: acc - metric: acc
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
- version: 0.0
...@@ -21,3 +21,5 @@ metric_list: ...@@ -21,3 +21,5 @@ metric_list:
higher_is_better: true higher_is_better: true
ignore_case: true ignore_case: true
ignore_punctuation: true ignore_punctuation: true
metadata:
- version: 0.0
...@@ -21,3 +21,5 @@ metric_list: ...@@ -21,3 +21,5 @@ metric_list:
higher_is_better: true higher_is_better: true
ignore_case: true ignore_case: true
ignore_punctuation: true ignore_punctuation: true
metadata:
- version: 0.0
...@@ -12,3 +12,5 @@ metric_list: ...@@ -12,3 +12,5 @@ metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
- version: 0.0
...@@ -12,3 +12,5 @@ metric_list: ...@@ -12,3 +12,5 @@ metric_list:
- metric: acc_norm - metric: acc_norm
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
- version: 0.0
...@@ -10,3 +10,5 @@ should_decontaminate: true ...@@ -10,3 +10,5 @@ should_decontaminate: true
doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}" doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}"
metric_list: metric_list:
- metric: acc - metric: acc
metadata:
- version: 0.0
...@@ -8,3 +8,5 @@ doc_to_target: 0 ...@@ -8,3 +8,5 @@ doc_to_target: 0
doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
metric_list: metric_list:
- metric: acc - metric: acc
metadata:
- version: 0.0
...@@ -10,3 +10,5 @@ doc_to_target: 0 ...@@ -10,3 +10,5 @@ doc_to_target: 0
doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
metric_list: metric_list:
- metric: acc - metric: acc
metadata:
- version: 0.0
...@@ -10,3 +10,5 @@ doc_to_target: 0 ...@@ -10,3 +10,5 @@ doc_to_target: 0
doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
metric_list: metric_list:
- metric: acc - metric: acc
metadata:
- version: 0.0
...@@ -10,3 +10,5 @@ doc_to_target: 0 ...@@ -10,3 +10,5 @@ doc_to_target: 0
doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
metric_list: metric_list:
- metric: acc - metric: acc
metadata:
- version: 0.0
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment