Unverified commit 314f7176, authored by Baber Abbasi, committed by GitHub
Browse files

remove trust-remote-code in configs; fix escape sequences (#3180)

* remove trust-remote-code

* add W605 rule
parent 8c6fde08
......@@ -12,5 +12,3 @@ metric_list:
higher_is_better: true
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true
......@@ -28,9 +28,7 @@ generation_kwargs:
process_results: !function utils_agieval.non_greedy_robustness_process_results
metric_list:
- metric: non_greedy_accuracy
aggregation: !function utils_agieval.non_greedy_accuracy
aggregation: !function utils_agieval.non_greedy_accuracy
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -27,21 +27,19 @@ generation_kwargs:
process_results: !function utils_agieval.option_order_robustness_process_results
metric_list:
- metric: per_option_accuracy_A
aggregation: !function utils_agieval.per_option_accuracy_a
aggregation: !function utils_agieval.per_option_accuracy_a
higher_is_better: true
- metric: per_option_accuracy_B
aggregation: !function utils_agieval.per_option_accuracy_b
aggregation: !function utils_agieval.per_option_accuracy_b
higher_is_better: true
- metric: per_option_accuracy_C
aggregation: !function utils_agieval.per_option_accuracy_c
aggregation: !function utils_agieval.per_option_accuracy_c
higher_is_better: true
- metric: per_option_accuracy_D
aggregation: !function utils_agieval.per_option_accuracy_d
aggregation: !function utils_agieval.per_option_accuracy_d
higher_is_better: true
- metric: options_consistency_rate
aggregation: !function utils_agieval.options_consistency_rate
aggregation: !function utils_agieval.options_consistency_rate
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -27,39 +27,37 @@ generation_kwargs:
process_results: !function utils_agieval.prompt_robustness_process_results
metric_list:
- metric: 0_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_0
aggregation: !function utils_agieval.per_prompt_accuracy_0
higher_is_better: true
- metric: 1_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_1
aggregation: !function utils_agieval.per_prompt_accuracy_1
higher_is_better: true
- metric: 2_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_2
aggregation: !function utils_agieval.per_prompt_accuracy_2
higher_is_better: true
- metric: 3_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_3
aggregation: !function utils_agieval.per_prompt_accuracy_3
higher_is_better: true
- metric: 4_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_4
aggregation: !function utils_agieval.per_prompt_accuracy_4
higher_is_better: true
- metric: 5_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_5
aggregation: !function utils_agieval.per_prompt_accuracy_5
higher_is_better: true
- metric: 6_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_6
aggregation: !function utils_agieval.per_prompt_accuracy_6
higher_is_better: true
- metric: 7_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_7
aggregation: !function utils_agieval.per_prompt_accuracy_7
higher_is_better: true
- metric: 8_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_8
aggregation: !function utils_agieval.per_prompt_accuracy_8
higher_is_better: true
- metric: 9_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_9
aggregation: !function utils_agieval.per_prompt_accuracy_9
higher_is_better: true
- metric: consistency_rate
aggregation: !function utils_agieval.agi_eval_prompt_consistency_rate
aggregation: !function utils_agieval.agi_eval_prompt_consistency_rate
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -267,12 +267,12 @@ def normalize_answer_string(expr: str) -> str:
"\\\\textit",
]:
expr = expr.replace(surround_str, "")
pattern = f"^{surround_str}" + "\{(?P<text>.+?)\}$"
pattern = f"^{surround_str}" + r"\{(?P<text>.+?)\}$"
m = re.search(pattern, expr)
if m is not None:
expr = m.group("text")
expr = expr.replace("\!", "")
expr = expr.replace(r"\!", "")
expr = expr.replace("\\%", "%")
expr = expr.replace("\\$", "$")
expr = expr.replace("$", "")
......@@ -305,7 +305,7 @@ def normalize_answer_string(expr: str) -> str:
"p.m.",
"PM",
]:
expr = re.sub(f"{unit}(es)?(s)? *(\^[0-9]+)?", "", expr)
expr = re.sub(rf"{unit}(es)?(s)? *(\^[0-9]+)?", "", expr)
if "day" in expr:
days = [
......@@ -326,7 +326,7 @@ def normalize_answer_string(expr: str) -> str:
if not weekday_expressed:
expr = re.sub("day(s)?", "", expr)
expr = re.sub("\^ *\\\\circ", "", expr)
expr = re.sub("\\^ *\\\\circ", "", expr)
if len(expr) > 0 and expr[0] == "{" and expr[-1] == "}":
expr = expr[1:-1]
......
......@@ -18,7 +18,7 @@ dataset_name: algebra
output_type: generate_until
test_split: test
process_docs: !function utils_math.non_greedy_robustness_process_docs
doc_to_text: !function utils_math.math_robustness_doc_to_text
doc_to_text: !function utils_math.math_robustness_doc_to_text
doc_to_target: answer
generation_kwargs:
max_gen_toks: 1024
......@@ -28,9 +28,7 @@ generation_kwargs:
process_results: !function utils_math.non_greedy_robustness_process_results
metric_list:
- metric: non_greedy_accuracy
aggregation: !function utils_math.non_greedy_accuracy
aggregation: !function utils_math.non_greedy_accuracy
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -18,7 +18,7 @@ process_docs: !function utils_math.prompt_robustness_process_docs
dataset_name: algebra
output_type: generate_until
test_split: test
doc_to_text: !function utils_math.math_robustness_doc_to_text
doc_to_text: !function utils_math.math_robustness_doc_to_text
process_results: !function utils_math.process_results
doc_to_target: answer
generation_kwargs:
......@@ -28,39 +28,37 @@ generation_kwargs:
max_gen_toks: 1024
metric_list:
- metric: 0_accuracy
aggregation: !function utils_math.per_prompt_accuracy_0
aggregation: !function utils_math.per_prompt_accuracy_0
higher_is_better: true
- metric: 1_accuracy
aggregation: !function utils_math.per_prompt_accuracy_1
aggregation: !function utils_math.per_prompt_accuracy_1
higher_is_better: true
- metric: 2_accuracy
aggregation: !function utils_math.per_prompt_accuracy_2
aggregation: !function utils_math.per_prompt_accuracy_2
higher_is_better: true
- metric: 3_accuracy
aggregation: !function utils_math.per_prompt_accuracy_3
aggregation: !function utils_math.per_prompt_accuracy_3
higher_is_better: true
- metric: 4_accuracy
aggregation: !function utils_math.per_prompt_accuracy_4
aggregation: !function utils_math.per_prompt_accuracy_4
higher_is_better: true
- metric: 5_accuracy
aggregation: !function utils_math.per_prompt_accuracy_5
aggregation: !function utils_math.per_prompt_accuracy_5
higher_is_better: true
- metric: 6_accuracy
aggregation: !function utils_math.per_prompt_accuracy_6
aggregation: !function utils_math.per_prompt_accuracy_6
higher_is_better: true
- metric: 7_accuracy
aggregation: !function utils_math.per_prompt_accuracy_7
aggregation: !function utils_math.per_prompt_accuracy_7
higher_is_better: true
- metric: 8_accuracy
aggregation: !function utils_math.per_prompt_accuracy_8
aggregation: !function utils_math.per_prompt_accuracy_8
higher_is_better: true
- metric: 9_accuracy
aggregation: !function utils_math.per_prompt_accuracy_9
aggregation: !function utils_math.per_prompt_accuracy_9
higher_is_better: true
- metric: consistency_rate
aggregation: !function utils_math.math_prompt_consistency_rate
aggregation: !function utils_math.math_prompt_consistency_rate
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -30,9 +30,7 @@ generation_kwargs:
process_results: !function utils_mmlu_pro.non_greedy_robustness_process_results
metric_list:
- metric: non_greedy_macro_accuracy
aggregation: !function utils_mmlu_pro.non_greedy_macro_accuracy
aggregation: !function utils_mmlu_pro.non_greedy_macro_accuracy
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -29,39 +29,37 @@ generation_kwargs:
process_results: !function utils_mmlu_pro.option_order_robustness_process_results
metric_list:
- metric: per_option_macro_accuracy_A
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_a
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_a
higher_is_better: true
- metric: per_option_macro_accuracy_B
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_b
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_b
higher_is_better: true
- metric: per_option_macro_accuracy_C
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_c
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_c
higher_is_better: true
- metric: per_option_macro_accuracy_D
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_d
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_d
higher_is_better: true
- metric: per_option_macro_accuracy_E
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_e
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_e
higher_is_better: true
- metric: per_option_macro_accuracy_F
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_f
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_f
higher_is_better: true
- metric: per_option_macro_accuracy_G
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_g
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_g
higher_is_better: true
- metric: per_option_macro_accuracy_H
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_h
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_h
higher_is_better: true
- metric: per_option_macro_accuracy_I
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_i
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_i
higher_is_better: true
- metric: per_option_macro_accuracy_J
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_j
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_j
higher_is_better: true
- metric: options_consistency_rate
aggregation: !function utils_mmlu_pro.options_consistency_rate
aggregation: !function utils_mmlu_pro.options_consistency_rate
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -29,39 +29,37 @@ generation_kwargs:
process_results: !function utils_mmlu_pro.prompt_robustness_process_results
metric_list:
- metric: 0_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_0
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_0
higher_is_better: true
- metric: 1_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_1
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_1
higher_is_better: true
- metric: 2_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_2
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_2
higher_is_better: true
- metric: 3_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_3
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_3
higher_is_better: true
- metric: 4_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_4
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_4
higher_is_better: true
- metric: 5_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_5
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_5
higher_is_better: true
- metric: 6_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_6
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_6
higher_is_better: true
- metric: 7_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_7
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_7
higher_is_better: true
- metric: 8_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_8
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_8
higher_is_better: true
- metric: 9_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_9
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_9
higher_is_better: true
- metric: consistency_rate
aggregation: !function utils_mmlu_pro.mmlu_pro_prompt_consistency_rate
aggregation: !function utils_mmlu_pro.mmlu_pro_prompt_consistency_rate
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -23,5 +23,3 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation: false
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true
......@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation: false
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true
......@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation: false
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true
......@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation: false
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true
......@@ -16,5 +16,3 @@ metric_list:
- metric: bits_per_byte
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true
......@@ -15,5 +15,3 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -106,7 +106,7 @@ plugins.md029.allow_extended_start_values = true # ol-prefix
plugins.md034.enabled = false # no-bare-urls
[tool.ruff.lint]
extend-select = ["I"]
extend-select = ["I", "W605"]
[tool.ruff.lint.isort]
lines-after-imports = 2
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment