Commit 176a4b1d authored by lintangsutawika's avatar lintangsutawika
Browse files

Merge branch 'big-refactor' of...

Merge branch 'big-refactor' of https://github.com/EleutherAI/lm-evaluation-harness into num_fewshot_fix
parents c3e3643d dc5b3d5d
...@@ -11,3 +11,5 @@ metric_list: ...@@ -11,3 +11,5 @@ metric_list:
- metric: acc - metric: acc
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
- version: 0.0
...@@ -21,3 +21,5 @@ metric_list: ...@@ -21,3 +21,5 @@ metric_list:
higher_is_better: true higher_is_better: true
ignore_case: true ignore_case: true
ignore_punctuation: true ignore_punctuation: true
metadata:
- version: 0.0
...@@ -21,3 +21,5 @@ metric_list: ...@@ -21,3 +21,5 @@ metric_list:
higher_is_better: true higher_is_better: true
ignore_case: true ignore_case: true
ignore_punctuation: true ignore_punctuation: true
metadata:
- version: 0.0
...@@ -12,3 +12,5 @@ metric_list: ...@@ -12,3 +12,5 @@ metric_list:
- metric: exact_match - metric: exact_match
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
- version: 0.0
...@@ -12,3 +12,5 @@ metric_list: ...@@ -12,3 +12,5 @@ metric_list:
- metric: acc_norm - metric: acc_norm
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
- version: 0.0
...@@ -10,3 +10,5 @@ should_decontaminate: true ...@@ -10,3 +10,5 @@ should_decontaminate: true
doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}" doc_to_decontamination_query: "{{sentence_good}} {{sentence_bad}}"
metric_list: metric_list:
- metric: acc - metric: acc
metadata:
- version: 0.0
...@@ -8,3 +8,5 @@ doc_to_target: 0 ...@@ -8,3 +8,5 @@ doc_to_target: 0
doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
metric_list: metric_list:
- metric: acc - metric: acc
metadata:
- version: 0.0
...@@ -10,3 +10,5 @@ doc_to_target: 0 ...@@ -10,3 +10,5 @@ doc_to_target: 0
doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
metric_list: metric_list:
- metric: acc - metric: acc
metadata:
- version: 0.0
...@@ -10,3 +10,5 @@ doc_to_target: 0 ...@@ -10,3 +10,5 @@ doc_to_target: 0
doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
metric_list: metric_list:
- metric: acc - metric: acc
metadata:
- version: 0.0
...@@ -10,3 +10,5 @@ doc_to_target: 0 ...@@ -10,3 +10,5 @@ doc_to_target: 0
doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
metric_list: metric_list:
- metric: acc - metric: acc
metadata:
- version: 0.0
...@@ -8,3 +8,5 @@ doc_to_target: 0 ...@@ -8,3 +8,5 @@ doc_to_target: 0
doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}" doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
metric_list: metric_list:
- metric: acc - metric: acc
metadata:
- version: 0.0
...@@ -21,3 +21,5 @@ metric_list: ...@@ -21,3 +21,5 @@ metric_list:
- metric: mrr - metric: mrr
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
- version: 2.0
...@@ -28,3 +28,5 @@ metric_list: ...@@ -28,3 +28,5 @@ metric_list:
ignore_punctuation: true ignore_punctuation: true
regexes_to_ignore: regexes_to_ignore:
- "\ban|a|the\b" - "\ban|a|the\b"
metadata:
- version: 0.0
...@@ -17,3 +17,5 @@ metric_list: ...@@ -17,3 +17,5 @@ metric_list:
- metric: acc_norm - metric: acc_norm
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
- version: 1.0
...@@ -16,3 +16,5 @@ metric_list: ...@@ -16,3 +16,5 @@ metric_list:
- metric: acc - metric: acc
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
- version: 0.0
...@@ -19,3 +19,5 @@ metric_list: ...@@ -19,3 +19,5 @@ metric_list:
- metric: bits_per_byte - metric: bits_per_byte
aggregation: bits_per_byte aggregation: bits_per_byte
higher_is_better: false higher_is_better: false
metadata:
- version: 2.0
...@@ -17,3 +17,5 @@ metric_list: ...@@ -17,3 +17,5 @@ metric_list:
- metric: acc_norm - metric: acc_norm
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
- version: 1.0
...@@ -41,3 +41,5 @@ metric_list: ...@@ -41,3 +41,5 @@ metric_list:
- metric: accuracy - metric: accuracy
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
- version: 0.0
...@@ -15,3 +15,5 @@ metric_list: ...@@ -15,3 +15,5 @@ metric_list:
- metric: acc_norm - metric: acc_norm
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
- version: 1.0
...@@ -12,3 +12,5 @@ metric_list: ...@@ -12,3 +12,5 @@ metric_list:
- metric: acc - metric: acc
aggregation: mean aggregation: mean
higher_is_better: true higher_is_better: true
metadata:
- version: 1.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment