Unverified commit 314f7176, authored by Baber Abbasi and committed by GitHub
Browse files

remove trust-remote-code in configs; fix escape sequences (#3180)

* remove trust-remote-code

* add W605 rule
parent 8c6fde08
...@@ -12,5 +12,3 @@ metric_list: ...@@ -12,5 +12,3 @@ metric_list:
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 2.0 version: 2.0
dataset_kwargs:
trust_remote_code: true
...@@ -28,9 +28,7 @@ generation_kwargs: ...@@ -28,9 +28,7 @@ generation_kwargs:
process_results: !function utils_agieval.non_greedy_robustness_process_results process_results: !function utils_agieval.non_greedy_robustness_process_results
metric_list: metric_list:
- metric: non_greedy_accuracy - metric: non_greedy_accuracy
aggregation: !function utils_agieval.non_greedy_accuracy aggregation: !function utils_agieval.non_greedy_accuracy
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
...@@ -27,21 +27,19 @@ generation_kwargs: ...@@ -27,21 +27,19 @@ generation_kwargs:
process_results: !function utils_agieval.option_order_robustness_process_results process_results: !function utils_agieval.option_order_robustness_process_results
metric_list: metric_list:
- metric: per_option_accuracy_A - metric: per_option_accuracy_A
aggregation: !function utils_agieval.per_option_accuracy_a aggregation: !function utils_agieval.per_option_accuracy_a
higher_is_better: true higher_is_better: true
- metric: per_option_accuracy_B - metric: per_option_accuracy_B
aggregation: !function utils_agieval.per_option_accuracy_b aggregation: !function utils_agieval.per_option_accuracy_b
higher_is_better: true higher_is_better: true
- metric: per_option_accuracy_C - metric: per_option_accuracy_C
aggregation: !function utils_agieval.per_option_accuracy_c aggregation: !function utils_agieval.per_option_accuracy_c
higher_is_better: true higher_is_better: true
- metric: per_option_accuracy_D - metric: per_option_accuracy_D
aggregation: !function utils_agieval.per_option_accuracy_d aggregation: !function utils_agieval.per_option_accuracy_d
higher_is_better: true higher_is_better: true
- metric: options_consistency_rate - metric: options_consistency_rate
aggregation: !function utils_agieval.options_consistency_rate aggregation: !function utils_agieval.options_consistency_rate
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
...@@ -27,39 +27,37 @@ generation_kwargs: ...@@ -27,39 +27,37 @@ generation_kwargs:
process_results: !function utils_agieval.prompt_robustness_process_results process_results: !function utils_agieval.prompt_robustness_process_results
metric_list: metric_list:
- metric: 0_accuracy - metric: 0_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_0 aggregation: !function utils_agieval.per_prompt_accuracy_0
higher_is_better: true higher_is_better: true
- metric: 1_accuracy - metric: 1_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_1 aggregation: !function utils_agieval.per_prompt_accuracy_1
higher_is_better: true higher_is_better: true
- metric: 2_accuracy - metric: 2_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_2 aggregation: !function utils_agieval.per_prompt_accuracy_2
higher_is_better: true higher_is_better: true
- metric: 3_accuracy - metric: 3_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_3 aggregation: !function utils_agieval.per_prompt_accuracy_3
higher_is_better: true higher_is_better: true
- metric: 4_accuracy - metric: 4_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_4 aggregation: !function utils_agieval.per_prompt_accuracy_4
higher_is_better: true higher_is_better: true
- metric: 5_accuracy - metric: 5_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_5 aggregation: !function utils_agieval.per_prompt_accuracy_5
higher_is_better: true higher_is_better: true
- metric: 6_accuracy - metric: 6_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_6 aggregation: !function utils_agieval.per_prompt_accuracy_6
higher_is_better: true higher_is_better: true
- metric: 7_accuracy - metric: 7_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_7 aggregation: !function utils_agieval.per_prompt_accuracy_7
higher_is_better: true higher_is_better: true
- metric: 8_accuracy - metric: 8_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_8 aggregation: !function utils_agieval.per_prompt_accuracy_8
higher_is_better: true higher_is_better: true
- metric: 9_accuracy - metric: 9_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_9 aggregation: !function utils_agieval.per_prompt_accuracy_9
higher_is_better: true higher_is_better: true
- metric: consistency_rate - metric: consistency_rate
aggregation: !function utils_agieval.agi_eval_prompt_consistency_rate aggregation: !function utils_agieval.agi_eval_prompt_consistency_rate
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
...@@ -267,12 +267,12 @@ def normalize_answer_string(expr: str) -> str: ...@@ -267,12 +267,12 @@ def normalize_answer_string(expr: str) -> str:
"\\\\textit", "\\\\textit",
]: ]:
expr = expr.replace(surround_str, "") expr = expr.replace(surround_str, "")
pattern = f"^{surround_str}" + "\{(?P<text>.+?)\}$" pattern = f"^{surround_str}" + r"\{(?P<text>.+?)\}$"
m = re.search(pattern, expr) m = re.search(pattern, expr)
if m is not None: if m is not None:
expr = m.group("text") expr = m.group("text")
expr = expr.replace("\!", "") expr = expr.replace(r"\!", "")
expr = expr.replace("\\%", "%") expr = expr.replace("\\%", "%")
expr = expr.replace("\\$", "$") expr = expr.replace("\\$", "$")
expr = expr.replace("$", "") expr = expr.replace("$", "")
...@@ -305,7 +305,7 @@ def normalize_answer_string(expr: str) -> str: ...@@ -305,7 +305,7 @@ def normalize_answer_string(expr: str) -> str:
"p.m.", "p.m.",
"PM", "PM",
]: ]:
expr = re.sub(f"{unit}(es)?(s)? *(\^[0-9]+)?", "", expr) expr = re.sub(rf"{unit}(es)?(s)? *(\^[0-9]+)?", "", expr)
if "day" in expr: if "day" in expr:
days = [ days = [
...@@ -326,7 +326,7 @@ def normalize_answer_string(expr: str) -> str: ...@@ -326,7 +326,7 @@ def normalize_answer_string(expr: str) -> str:
if not weekday_expressed: if not weekday_expressed:
expr = re.sub("day(s)?", "", expr) expr = re.sub("day(s)?", "", expr)
expr = re.sub("\^ *\\\\circ", "", expr) expr = re.sub("\\^ *\\\\circ", "", expr)
if len(expr) > 0 and expr[0] == "{" and expr[-1] == "}": if len(expr) > 0 and expr[0] == "{" and expr[-1] == "}":
expr = expr[1:-1] expr = expr[1:-1]
......
...@@ -18,7 +18,7 @@ dataset_name: algebra ...@@ -18,7 +18,7 @@ dataset_name: algebra
output_type: generate_until output_type: generate_until
test_split: test test_split: test
process_docs: !function utils_math.non_greedy_robustness_process_docs process_docs: !function utils_math.non_greedy_robustness_process_docs
doc_to_text: !function utils_math.math_robustness_doc_to_text doc_to_text: !function utils_math.math_robustness_doc_to_text
doc_to_target: answer doc_to_target: answer
generation_kwargs: generation_kwargs:
max_gen_toks: 1024 max_gen_toks: 1024
...@@ -28,9 +28,7 @@ generation_kwargs: ...@@ -28,9 +28,7 @@ generation_kwargs:
process_results: !function utils_math.non_greedy_robustness_process_results process_results: !function utils_math.non_greedy_robustness_process_results
metric_list: metric_list:
- metric: non_greedy_accuracy - metric: non_greedy_accuracy
aggregation: !function utils_math.non_greedy_accuracy aggregation: !function utils_math.non_greedy_accuracy
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
...@@ -18,7 +18,7 @@ process_docs: !function utils_math.prompt_robustness_process_docs ...@@ -18,7 +18,7 @@ process_docs: !function utils_math.prompt_robustness_process_docs
dataset_name: algebra dataset_name: algebra
output_type: generate_until output_type: generate_until
test_split: test test_split: test
doc_to_text: !function utils_math.math_robustness_doc_to_text doc_to_text: !function utils_math.math_robustness_doc_to_text
process_results: !function utils_math.process_results process_results: !function utils_math.process_results
doc_to_target: answer doc_to_target: answer
generation_kwargs: generation_kwargs:
...@@ -28,39 +28,37 @@ generation_kwargs: ...@@ -28,39 +28,37 @@ generation_kwargs:
max_gen_toks: 1024 max_gen_toks: 1024
metric_list: metric_list:
- metric: 0_accuracy - metric: 0_accuracy
aggregation: !function utils_math.per_prompt_accuracy_0 aggregation: !function utils_math.per_prompt_accuracy_0
higher_is_better: true higher_is_better: true
- metric: 1_accuracy - metric: 1_accuracy
aggregation: !function utils_math.per_prompt_accuracy_1 aggregation: !function utils_math.per_prompt_accuracy_1
higher_is_better: true higher_is_better: true
- metric: 2_accuracy - metric: 2_accuracy
aggregation: !function utils_math.per_prompt_accuracy_2 aggregation: !function utils_math.per_prompt_accuracy_2
higher_is_better: true higher_is_better: true
- metric: 3_accuracy - metric: 3_accuracy
aggregation: !function utils_math.per_prompt_accuracy_3 aggregation: !function utils_math.per_prompt_accuracy_3
higher_is_better: true higher_is_better: true
- metric: 4_accuracy - metric: 4_accuracy
aggregation: !function utils_math.per_prompt_accuracy_4 aggregation: !function utils_math.per_prompt_accuracy_4
higher_is_better: true higher_is_better: true
- metric: 5_accuracy - metric: 5_accuracy
aggregation: !function utils_math.per_prompt_accuracy_5 aggregation: !function utils_math.per_prompt_accuracy_5
higher_is_better: true higher_is_better: true
- metric: 6_accuracy - metric: 6_accuracy
aggregation: !function utils_math.per_prompt_accuracy_6 aggregation: !function utils_math.per_prompt_accuracy_6
higher_is_better: true higher_is_better: true
- metric: 7_accuracy - metric: 7_accuracy
aggregation: !function utils_math.per_prompt_accuracy_7 aggregation: !function utils_math.per_prompt_accuracy_7
higher_is_better: true higher_is_better: true
- metric: 8_accuracy - metric: 8_accuracy
aggregation: !function utils_math.per_prompt_accuracy_8 aggregation: !function utils_math.per_prompt_accuracy_8
higher_is_better: true higher_is_better: true
- metric: 9_accuracy - metric: 9_accuracy
aggregation: !function utils_math.per_prompt_accuracy_9 aggregation: !function utils_math.per_prompt_accuracy_9
higher_is_better: true higher_is_better: true
- metric: consistency_rate - metric: consistency_rate
aggregation: !function utils_math.math_prompt_consistency_rate aggregation: !function utils_math.math_prompt_consistency_rate
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
...@@ -30,9 +30,7 @@ generation_kwargs: ...@@ -30,9 +30,7 @@ generation_kwargs:
process_results: !function utils_mmlu_pro.non_greedy_robustness_process_results process_results: !function utils_mmlu_pro.non_greedy_robustness_process_results
metric_list: metric_list:
- metric: non_greedy_macro_accuracy - metric: non_greedy_macro_accuracy
aggregation: !function utils_mmlu_pro.non_greedy_macro_accuracy aggregation: !function utils_mmlu_pro.non_greedy_macro_accuracy
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
...@@ -29,39 +29,37 @@ generation_kwargs: ...@@ -29,39 +29,37 @@ generation_kwargs:
process_results: !function utils_mmlu_pro.option_order_robustness_process_results process_results: !function utils_mmlu_pro.option_order_robustness_process_results
metric_list: metric_list:
- metric: per_option_macro_accuracy_A - metric: per_option_macro_accuracy_A
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_a aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_a
higher_is_better: true higher_is_better: true
- metric: per_option_macro_accuracy_B - metric: per_option_macro_accuracy_B
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_b aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_b
higher_is_better: true higher_is_better: true
- metric: per_option_macro_accuracy_C - metric: per_option_macro_accuracy_C
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_c aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_c
higher_is_better: true higher_is_better: true
- metric: per_option_macro_accuracy_D - metric: per_option_macro_accuracy_D
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_d aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_d
higher_is_better: true higher_is_better: true
- metric: per_option_macro_accuracy_E - metric: per_option_macro_accuracy_E
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_e aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_e
higher_is_better: true higher_is_better: true
- metric: per_option_macro_accuracy_F - metric: per_option_macro_accuracy_F
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_f aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_f
higher_is_better: true higher_is_better: true
- metric: per_option_macro_accuracy_G - metric: per_option_macro_accuracy_G
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_g aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_g
higher_is_better: true higher_is_better: true
- metric: per_option_macro_accuracy_H - metric: per_option_macro_accuracy_H
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_h aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_h
higher_is_better: true higher_is_better: true
- metric: per_option_macro_accuracy_I - metric: per_option_macro_accuracy_I
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_i aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_i
higher_is_better: true higher_is_better: true
- metric: per_option_macro_accuracy_J - metric: per_option_macro_accuracy_J
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_j aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_j
higher_is_better: true higher_is_better: true
- metric: options_consistency_rate - metric: options_consistency_rate
aggregation: !function utils_mmlu_pro.options_consistency_rate aggregation: !function utils_mmlu_pro.options_consistency_rate
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
...@@ -29,39 +29,37 @@ generation_kwargs: ...@@ -29,39 +29,37 @@ generation_kwargs:
process_results: !function utils_mmlu_pro.prompt_robustness_process_results process_results: !function utils_mmlu_pro.prompt_robustness_process_results
metric_list: metric_list:
- metric: 0_macro_accuracy - metric: 0_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_0 aggregation: !function utils_mmlu_pro.per_prompt_accuracy_0
higher_is_better: true higher_is_better: true
- metric: 1_macro_accuracy - metric: 1_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_1 aggregation: !function utils_mmlu_pro.per_prompt_accuracy_1
higher_is_better: true higher_is_better: true
- metric: 2_macro_accuracy - metric: 2_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_2 aggregation: !function utils_mmlu_pro.per_prompt_accuracy_2
higher_is_better: true higher_is_better: true
- metric: 3_macro_accuracy - metric: 3_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_3 aggregation: !function utils_mmlu_pro.per_prompt_accuracy_3
higher_is_better: true higher_is_better: true
- metric: 4_macro_accuracy - metric: 4_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_4 aggregation: !function utils_mmlu_pro.per_prompt_accuracy_4
higher_is_better: true higher_is_better: true
- metric: 5_macro_accuracy - metric: 5_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_5 aggregation: !function utils_mmlu_pro.per_prompt_accuracy_5
higher_is_better: true higher_is_better: true
- metric: 6_macro_accuracy - metric: 6_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_6 aggregation: !function utils_mmlu_pro.per_prompt_accuracy_6
higher_is_better: true higher_is_better: true
- metric: 7_macro_accuracy - metric: 7_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_7 aggregation: !function utils_mmlu_pro.per_prompt_accuracy_7
higher_is_better: true higher_is_better: true
- metric: 8_macro_accuracy - metric: 8_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_8 aggregation: !function utils_mmlu_pro.per_prompt_accuracy_8
higher_is_better: true higher_is_better: true
- metric: 9_macro_accuracy - metric: 9_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_9 aggregation: !function utils_mmlu_pro.per_prompt_accuracy_9
higher_is_better: true higher_is_better: true
- metric: consistency_rate - metric: consistency_rate
aggregation: !function utils_mmlu_pro.mmlu_pro_prompt_consistency_rate aggregation: !function utils_mmlu_pro.mmlu_pro_prompt_consistency_rate
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
...@@ -23,5 +23,3 @@ metric_list: ...@@ -23,5 +23,3 @@ metric_list:
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
...@@ -18,5 +18,3 @@ metric_list: ...@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation: false ignore_punctuation: false
metadata: metadata:
version: 2.0 version: 2.0
dataset_kwargs:
trust_remote_code: true
...@@ -18,5 +18,3 @@ metric_list: ...@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation: false ignore_punctuation: false
metadata: metadata:
version: 2.0 version: 2.0
dataset_kwargs:
trust_remote_code: true
...@@ -18,5 +18,3 @@ metric_list: ...@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation: false ignore_punctuation: false
metadata: metadata:
version: 2.0 version: 2.0
dataset_kwargs:
trust_remote_code: true
...@@ -18,5 +18,3 @@ metric_list: ...@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation: false ignore_punctuation: false
metadata: metadata:
version: 2.0 version: 2.0
dataset_kwargs:
trust_remote_code: true
...@@ -16,5 +16,3 @@ metric_list: ...@@ -16,5 +16,3 @@ metric_list:
- metric: bits_per_byte - metric: bits_per_byte
metadata: metadata:
version: 2.0 version: 2.0
dataset_kwargs:
trust_remote_code: true
...@@ -15,5 +15,3 @@ metric_list: ...@@ -15,5 +15,3 @@ metric_list:
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
...@@ -106,7 +106,7 @@ plugins.md029.allow_extended_start_values = true # ol-prefix ...@@ -106,7 +106,7 @@ plugins.md029.allow_extended_start_values = true # ol-prefix
plugins.md034.enabled = false # no-bare-urls plugins.md034.enabled = false # no-bare-urls
[tool.ruff.lint] [tool.ruff.lint]
extend-select = ["I"] extend-select = ["I", "W605"]
[tool.ruff.lint.isort] [tool.ruff.lint.isort]
lines-after-imports = 2 lines-after-imports = 2
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment