Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
314f7176
Unverified
Commit
314f7176
authored
Jul 23, 2025
by
Baber Abbasi
Committed by
GitHub
Jul 23, 2025
Browse files
remove trust-remote-code in configs; fix escape sequences (#3180)
* remove trust-remote-code * add W605 rule
parent
8c6fde08
Changes
98
Show whitespace changes
Inline
Side-by-side
Showing
18 changed files
with
59 additions
and
91 deletions
+59
-91
lm_eval/tasks/race/race.yaml
lm_eval/tasks/race/race.yaml
+0
-2
lm_eval/tasks/score/agi_eval/non_greedy_robustness_agieval_aqua_rat.yaml
...core/agi_eval/non_greedy_robustness_agieval_aqua_rat.yaml
+1
-3
lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_aqua_rat.yaml
...re/agi_eval/option_order_robustness_agieval_aqua_rat.yaml
+5
-7
lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_aqua_rat.yaml
...ks/score/agi_eval/prompt_robustness_agieval_aqua_rat.yaml
+11
-13
lm_eval/tasks/score/math/math_grader.py
lm_eval/tasks/score/math/math_grader.py
+4
-4
lm_eval/tasks/score/math/non_greedy_robustness_math_algebra.yaml
.../tasks/score/math/non_greedy_robustness_math_algebra.yaml
+2
-4
lm_eval/tasks/score/math/prompt_robustness_math_algebra.yaml
lm_eval/tasks/score/math/prompt_robustness_math_algebra.yaml
+12
-14
lm_eval/tasks/score/mmlu_pro/score_non_greedy_robustness_mmlu_pro.yaml
.../score/mmlu_pro/score_non_greedy_robustness_mmlu_pro.yaml
+1
-3
lm_eval/tasks/score/mmlu_pro/score_option_order_robustness_mmlu_pro.yaml
...core/mmlu_pro/score_option_order_robustness_mmlu_pro.yaml
+11
-13
lm_eval/tasks/score/mmlu_pro/score_prompt_robustness_mmlu_pro.yaml
...asks/score/mmlu_pro/score_prompt_robustness_mmlu_pro.yaml
+11
-13
lm_eval/tasks/spanish_bench/flores_es/_flores_common_yaml
lm_eval/tasks/spanish_bench/flores_es/_flores_common_yaml
+0
-2
lm_eval/tasks/unscramble/anagrams1.yaml
lm_eval/tasks/unscramble/anagrams1.yaml
+0
-2
lm_eval/tasks/unscramble/anagrams2.yaml
lm_eval/tasks/unscramble/anagrams2.yaml
+0
-2
lm_eval/tasks/unscramble/cycle_letters.yaml
lm_eval/tasks/unscramble/cycle_letters.yaml
+0
-2
lm_eval/tasks/unscramble/random_insertion.yaml
lm_eval/tasks/unscramble/random_insertion.yaml
+0
-2
lm_eval/tasks/wikitext/wikitext.yaml
lm_eval/tasks/wikitext/wikitext.yaml
+0
-2
lm_eval/tasks/winogrande/default.yaml
lm_eval/tasks/winogrande/default.yaml
+0
-2
pyproject.toml
pyproject.toml
+1
-1
No files found.
lm_eval/tasks/race/race.yaml
View file @
314f7176
...
@@ -12,5 +12,3 @@ metric_list:
...
@@ -12,5 +12,3 @@ metric_list:
higher_is_better
:
true
higher_is_better
:
true
metadata
:
metadata
:
version
:
2.0
version
:
2.0
dataset_kwargs
:
trust_remote_code
:
true
lm_eval/tasks/score/agi_eval/non_greedy_robustness_agieval_aqua_rat.yaml
View file @
314f7176
...
@@ -32,5 +32,3 @@ metric_list:
...
@@ -32,5 +32,3 @@ metric_list:
higher_is_better
:
true
higher_is_better
:
true
metadata
:
metadata
:
version
:
1.0
version
:
1.0
dataset_kwargs
:
trust_remote_code
:
true
lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_aqua_rat.yaml
View file @
314f7176
...
@@ -43,5 +43,3 @@ metric_list:
...
@@ -43,5 +43,3 @@ metric_list:
higher_is_better
:
true
higher_is_better
:
true
metadata
:
metadata
:
version
:
1.0
version
:
1.0
dataset_kwargs
:
trust_remote_code
:
true
lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_aqua_rat.yaml
View file @
314f7176
...
@@ -61,5 +61,3 @@ metric_list:
...
@@ -61,5 +61,3 @@ metric_list:
higher_is_better
:
true
higher_is_better
:
true
metadata
:
metadata
:
version
:
1.0
version
:
1.0
dataset_kwargs
:
trust_remote_code
:
true
lm_eval/tasks/score/math/math_grader.py
View file @
314f7176
...
@@ -267,12 +267,12 @@ def normalize_answer_string(expr: str) -> str:
...
@@ -267,12 +267,12 @@ def normalize_answer_string(expr: str) -> str:
"
\\\\
textit"
,
"
\\\\
textit"
,
]:
]:
expr
=
expr
.
replace
(
surround_str
,
""
)
expr
=
expr
.
replace
(
surround_str
,
""
)
pattern
=
f
"^
{
surround_str
}
"
+
"\{(?P<text>.+?)\}$"
pattern
=
f
"^
{
surround_str
}
"
+
r
"\{(?P<text>.+?)\}$"
m
=
re
.
search
(
pattern
,
expr
)
m
=
re
.
search
(
pattern
,
expr
)
if
m
is
not
None
:
if
m
is
not
None
:
expr
=
m
.
group
(
"text"
)
expr
=
m
.
group
(
"text"
)
expr
=
expr
.
replace
(
"\!"
,
""
)
expr
=
expr
.
replace
(
r
"\!"
,
""
)
expr
=
expr
.
replace
(
"
\\
%"
,
"%"
)
expr
=
expr
.
replace
(
"
\\
%"
,
"%"
)
expr
=
expr
.
replace
(
"
\\
$"
,
"$"
)
expr
=
expr
.
replace
(
"
\\
$"
,
"$"
)
expr
=
expr
.
replace
(
"$"
,
""
)
expr
=
expr
.
replace
(
"$"
,
""
)
...
@@ -305,7 +305,7 @@ def normalize_answer_string(expr: str) -> str:
...
@@ -305,7 +305,7 @@ def normalize_answer_string(expr: str) -> str:
"p.m."
,
"p.m."
,
"PM"
,
"PM"
,
]:
]:
expr
=
re
.
sub
(
f
"
{
unit
}
(es)?(s)? *(\^[0-9]+)?"
,
""
,
expr
)
expr
=
re
.
sub
(
r
f
"
{
unit
}
(es)?(s)? *(\^[0-9]+)?"
,
""
,
expr
)
if
"day"
in
expr
:
if
"day"
in
expr
:
days
=
[
days
=
[
...
@@ -326,7 +326,7 @@ def normalize_answer_string(expr: str) -> str:
...
@@ -326,7 +326,7 @@ def normalize_answer_string(expr: str) -> str:
if
not
weekday_expressed
:
if
not
weekday_expressed
:
expr
=
re
.
sub
(
"day(s)?"
,
""
,
expr
)
expr
=
re
.
sub
(
"day(s)?"
,
""
,
expr
)
expr
=
re
.
sub
(
"\^ *
\\\\
circ"
,
""
,
expr
)
expr
=
re
.
sub
(
"
\
\
^ *
\\\\
circ"
,
""
,
expr
)
if
len
(
expr
)
>
0
and
expr
[
0
]
==
"{"
and
expr
[
-
1
]
==
"}"
:
if
len
(
expr
)
>
0
and
expr
[
0
]
==
"{"
and
expr
[
-
1
]
==
"}"
:
expr
=
expr
[
1
:
-
1
]
expr
=
expr
[
1
:
-
1
]
...
...
lm_eval/tasks/score/math/non_greedy_robustness_math_algebra.yaml
View file @
314f7176
...
@@ -32,5 +32,3 @@ metric_list:
...
@@ -32,5 +32,3 @@ metric_list:
higher_is_better
:
true
higher_is_better
:
true
metadata
:
metadata
:
version
:
1.0
version
:
1.0
dataset_kwargs
:
trust_remote_code
:
true
lm_eval/tasks/score/math/prompt_robustness_math_algebra.yaml
View file @
314f7176
...
@@ -62,5 +62,3 @@ metric_list:
...
@@ -62,5 +62,3 @@ metric_list:
higher_is_better
:
true
higher_is_better
:
true
metadata
:
metadata
:
version
:
1.0
version
:
1.0
dataset_kwargs
:
trust_remote_code
:
true
lm_eval/tasks/score/mmlu_pro/score_non_greedy_robustness_mmlu_pro.yaml
View file @
314f7176
...
@@ -34,5 +34,3 @@ metric_list:
...
@@ -34,5 +34,3 @@ metric_list:
higher_is_better
:
true
higher_is_better
:
true
metadata
:
metadata
:
version
:
1.0
version
:
1.0
dataset_kwargs
:
trust_remote_code
:
true
lm_eval/tasks/score/mmlu_pro/score_option_order_robustness_mmlu_pro.yaml
View file @
314f7176
...
@@ -63,5 +63,3 @@ metric_list:
...
@@ -63,5 +63,3 @@ metric_list:
higher_is_better
:
true
higher_is_better
:
true
metadata
:
metadata
:
version
:
1.0
version
:
1.0
dataset_kwargs
:
trust_remote_code
:
true
lm_eval/tasks/score/mmlu_pro/score_prompt_robustness_mmlu_pro.yaml
View file @
314f7176
...
@@ -63,5 +63,3 @@ metric_list:
...
@@ -63,5 +63,3 @@ metric_list:
higher_is_better
:
true
higher_is_better
:
true
metadata
:
metadata
:
version
:
1.0
version
:
1.0
dataset_kwargs
:
trust_remote_code
:
true
lm_eval/tasks/spanish_bench/flores_es/_flores_common_yaml
View file @
314f7176
...
@@ -23,5 +23,3 @@ metric_list:
...
@@ -23,5 +23,3 @@ metric_list:
higher_is_better: true
higher_is_better: true
metadata:
metadata:
version: 1.0
version: 1.0
dataset_kwargs:
trust_remote_code: true
lm_eval/tasks/unscramble/anagrams1.yaml
View file @
314f7176
...
@@ -18,5 +18,3 @@ metric_list:
...
@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation
:
false
ignore_punctuation
:
false
metadata
:
metadata
:
version
:
2.0
version
:
2.0
dataset_kwargs
:
trust_remote_code
:
true
lm_eval/tasks/unscramble/anagrams2.yaml
View file @
314f7176
...
@@ -18,5 +18,3 @@ metric_list:
...
@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation
:
false
ignore_punctuation
:
false
metadata
:
metadata
:
version
:
2.0
version
:
2.0
dataset_kwargs
:
trust_remote_code
:
true
lm_eval/tasks/unscramble/cycle_letters.yaml
View file @
314f7176
...
@@ -18,5 +18,3 @@ metric_list:
...
@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation
:
false
ignore_punctuation
:
false
metadata
:
metadata
:
version
:
2.0
version
:
2.0
dataset_kwargs
:
trust_remote_code
:
true
lm_eval/tasks/unscramble/random_insertion.yaml
View file @
314f7176
...
@@ -18,5 +18,3 @@ metric_list:
...
@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation
:
false
ignore_punctuation
:
false
metadata
:
metadata
:
version
:
2.0
version
:
2.0
dataset_kwargs
:
trust_remote_code
:
true
lm_eval/tasks/wikitext/wikitext.yaml
View file @
314f7176
...
@@ -16,5 +16,3 @@ metric_list:
...
@@ -16,5 +16,3 @@ metric_list:
-
metric
:
bits_per_byte
-
metric
:
bits_per_byte
metadata
:
metadata
:
version
:
2.0
version
:
2.0
dataset_kwargs
:
trust_remote_code
:
true
lm_eval/tasks/winogrande/default.yaml
View file @
314f7176
...
@@ -15,5 +15,3 @@ metric_list:
...
@@ -15,5 +15,3 @@ metric_list:
higher_is_better
:
true
higher_is_better
:
true
metadata
:
metadata
:
version
:
1.0
version
:
1.0
dataset_kwargs
:
trust_remote_code
:
true
pyproject.toml
View file @
314f7176
...
@@ -106,7 +106,7 @@ plugins.md029.allow_extended_start_values = true # ol-prefix
...
@@ -106,7 +106,7 @@ plugins.md029.allow_extended_start_values = true # ol-prefix
plugins.md034.enabled
=
false
# no-bare-urls
plugins.md034.enabled
=
false
# no-bare-urls
[tool.ruff.lint]
[tool.ruff.lint]
extend-select
=
["I"]
extend-select
=
[
"I"
,
"W605"
]
[tool.ruff.lint.isort]
[tool.ruff.lint.isort]
lines-after-imports
=
2
lines-after-imports
=
2
...
...
Prev
1
2
3
4
5
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment