Commit e6b798f9 authored by Baber
Browse files

Merge branch 'main' into metrics

# Conflicts:
#	.pre-commit-config.yaml
#	lm_eval/api/task.py
#	lm_eval/models/huggingface.py
#	lm_eval/models/vllm_causallms.py
#	pyproject.toml
parents 14a29ade 4f8195f1
......@@ -34,5 +34,3 @@ metric_list:
ignore_punctuation: true
metadata:
version: 3.0
dataset_kwargs:
trust_remote_code: true
......@@ -17,7 +17,7 @@ class MultiChoiceRegexFilter(RegexFilter):
ignore_punctuation=False,
regexes_to_ignore=None,
) -> None:
"""
r"""
regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
- step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
- step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
......@@ -90,7 +90,7 @@ class MultiChoiceRegexFilter(RegexFilter):
fallback_regex = re.compile("|".join(fallback_regexes))
without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
without_paren_fallback_regex = re.compile(
f":[\s]*({without_paren_fallback_regex})"
rf":[\s]*({without_paren_fallback_regex})"
)
filtered = []
......
......@@ -30,5 +30,3 @@ metric_list:
higher_is_better: true
metadata:
version: 3.0
dataset_kwargs:
trust_remote_code: true
......@@ -17,7 +17,7 @@ class MultiChoiceRegexFilter(RegexFilter):
ignore_punctuation=False,
regexes_to_ignore=None,
) -> None:
"""
r"""
regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure
- step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response.
- step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices.
......@@ -90,7 +90,7 @@ class MultiChoiceRegexFilter(RegexFilter):
fallback_regex = re.compile("|".join(fallback_regexes))
without_paren_fallback_regex = "|".join(without_paren_fallback_regexes)
without_paren_fallback_regex = re.compile(
f":[\s]*({without_paren_fallback_regex})"
rf":[\s]*({without_paren_fallback_regex})"
)
filtered = []
......
......@@ -13,5 +13,3 @@ metric_list:
higher_is_better: true
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true
......@@ -30,5 +30,3 @@ filter_list:
- function: take_first
metadata:
version: 3.0
dataset_kwargs:
trust_remote_code: true
......@@ -44,7 +44,7 @@ if __name__ == "__main__":
line = line.format(lang=lang_abbr)
if "{ans_regex}" in line:
ans_regex = lang_lib_list[-1].replace(
"({})", "\(?([ABCDEFGHIJ])\)?"
"({})", r"\(?([ABCDEFGHIJ])\)?"
)
if lang_abbr == "en":
ans_regex = ans_regex.lstrip("the").strip()
......
......@@ -12,5 +12,3 @@ metric_list:
- metric: acc
metadata:
version: 0.0
dataset_kwargs:
trust_remote_code: true
......@@ -12,5 +12,3 @@ metric_list:
- metric: acc
metadata:
version: 0.0
dataset_kwargs:
trust_remote_code: true
......@@ -12,5 +12,3 @@ metric_list:
- metric: acc
metadata:
version: 0.0
dataset_kwargs:
trust_remote_code: true
......@@ -23,5 +23,3 @@ metric_list:
higher_is_better: true
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true
......@@ -2,8 +2,6 @@ dataset_path: Helsinki-NLP/tatoeba_mt
training_split: validation
test_split: test
output_type: generate_until
dataset_kwargs:
trust_remote_code: true
metric_list:
- metric: bleu
higher_is_better: true
......
......@@ -19,5 +19,3 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -23,5 +23,3 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -12,5 +12,3 @@ metric_list:
higher_is_better: true
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true
......@@ -28,9 +28,7 @@ generation_kwargs:
process_results: !function utils_agieval.non_greedy_robustness_process_results
metric_list:
- metric: non_greedy_accuracy
aggregation: !function utils_agieval.non_greedy_accuracy
aggregation: !function utils_agieval.non_greedy_accuracy
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -27,21 +27,19 @@ generation_kwargs:
process_results: !function utils_agieval.option_order_robustness_process_results
metric_list:
- metric: per_option_accuracy_A
aggregation: !function utils_agieval.per_option_accuracy_a
aggregation: !function utils_agieval.per_option_accuracy_a
higher_is_better: true
- metric: per_option_accuracy_B
aggregation: !function utils_agieval.per_option_accuracy_b
aggregation: !function utils_agieval.per_option_accuracy_b
higher_is_better: true
- metric: per_option_accuracy_C
aggregation: !function utils_agieval.per_option_accuracy_c
aggregation: !function utils_agieval.per_option_accuracy_c
higher_is_better: true
- metric: per_option_accuracy_D
aggregation: !function utils_agieval.per_option_accuracy_d
aggregation: !function utils_agieval.per_option_accuracy_d
higher_is_better: true
- metric: options_consistency_rate
aggregation: !function utils_agieval.options_consistency_rate
aggregation: !function utils_agieval.options_consistency_rate
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -27,39 +27,37 @@ generation_kwargs:
process_results: !function utils_agieval.prompt_robustness_process_results
metric_list:
- metric: 0_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_0
aggregation: !function utils_agieval.per_prompt_accuracy_0
higher_is_better: true
- metric: 1_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_1
aggregation: !function utils_agieval.per_prompt_accuracy_1
higher_is_better: true
- metric: 2_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_2
aggregation: !function utils_agieval.per_prompt_accuracy_2
higher_is_better: true
- metric: 3_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_3
aggregation: !function utils_agieval.per_prompt_accuracy_3
higher_is_better: true
- metric: 4_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_4
aggregation: !function utils_agieval.per_prompt_accuracy_4
higher_is_better: true
- metric: 5_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_5
aggregation: !function utils_agieval.per_prompt_accuracy_5
higher_is_better: true
- metric: 6_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_6
aggregation: !function utils_agieval.per_prompt_accuracy_6
higher_is_better: true
- metric: 7_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_7
aggregation: !function utils_agieval.per_prompt_accuracy_7
higher_is_better: true
- metric: 8_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_8
aggregation: !function utils_agieval.per_prompt_accuracy_8
higher_is_better: true
- metric: 9_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_9
aggregation: !function utils_agieval.per_prompt_accuracy_9
higher_is_better: true
- metric: consistency_rate
aggregation: !function utils_agieval.agi_eval_prompt_consistency_rate
aggregation: !function utils_agieval.agi_eval_prompt_consistency_rate
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -267,12 +267,12 @@ def normalize_answer_string(expr: str) -> str:
"\\\\textit",
]:
expr = expr.replace(surround_str, "")
pattern = f"^{surround_str}" + "\{(?P<text>.+?)\}$"
pattern = f"^{surround_str}" + r"\{(?P<text>.+?)\}$"
m = re.search(pattern, expr)
if m is not None:
expr = m.group("text")
expr = expr.replace("\!", "")
expr = expr.replace(r"\!", "")
expr = expr.replace("\\%", "%")
expr = expr.replace("\\$", "$")
expr = expr.replace("$", "")
......@@ -305,7 +305,7 @@ def normalize_answer_string(expr: str) -> str:
"p.m.",
"PM",
]:
expr = re.sub(f"{unit}(es)?(s)? *(\^[0-9]+)?", "", expr)
expr = re.sub(rf"{unit}(es)?(s)? *(\^[0-9]+)?", "", expr)
if "day" in expr:
days = [
......@@ -326,7 +326,7 @@ def normalize_answer_string(expr: str) -> str:
if not weekday_expressed:
expr = re.sub("day(s)?", "", expr)
expr = re.sub("\^ *\\\\circ", "", expr)
expr = re.sub("\\^ *\\\\circ", "", expr)
if len(expr) > 0 and expr[0] == "{" and expr[-1] == "}":
expr = expr[1:-1]
......
......@@ -18,7 +18,7 @@ dataset_name: algebra
output_type: generate_until
test_split: test
process_docs: !function utils_math.non_greedy_robustness_process_docs
doc_to_text: !function utils_math.math_robustness_doc_to_text
doc_to_text: !function utils_math.math_robustness_doc_to_text
doc_to_target: answer
generation_kwargs:
max_gen_toks: 1024
......@@ -28,9 +28,7 @@ generation_kwargs:
process_results: !function utils_math.non_greedy_robustness_process_results
metric_list:
- metric: non_greedy_accuracy
aggregation: !function utils_math.non_greedy_accuracy
aggregation: !function utils_math.non_greedy_accuracy
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment