gaoqiong / lm-evaluation-harness

Commit b58e5556
Authored Jul 27, 2025 by Baber

Merge branch 'main' into tasklist

# Conflicts:
#	pyproject.toml

Parents: 6e1866f5, 4f8195f1
Changes: 340 changed files in total; this page shows 20 changed files with 89 additions and 96 deletions (+89 −96). Several of the paired −/+ lines in the diffs below render identically, so those hunks appear to differ only in whitespace.
Files changed on this page:

  lm_eval/tasks/race/race.yaml                                                +0  −2
  lm_eval/tasks/score/agi_eval/non_greedy_robustness_agieval_aqua_rat.yaml    +1  −3
  lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_aqua_rat.yaml  +5  −7
  lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_aqua_rat.yaml        +11 −13
  lm_eval/tasks/score/math/math_grader.py                                     +4  −4
  lm_eval/tasks/score/math/non_greedy_robustness_math_algebra.yaml            +2  −4
  lm_eval/tasks/score/math/prompt_robustness_math_algebra.yaml                +12 −14
  lm_eval/tasks/score/mmlu_pro/score_non_greedy_robustness_mmlu_pro.yaml      +1  −3
  lm_eval/tasks/score/mmlu_pro/score_option_order_robustness_mmlu_pro.yaml    +11 −13
  lm_eval/tasks/score/mmlu_pro/score_prompt_robustness_mmlu_pro.yaml          +11 −13
  lm_eval/tasks/spanish_bench/flores_es/_flores_common_yaml                   +0  −2
  lm_eval/tasks/unscramble/anagrams1.yaml                                     +0  −2
  lm_eval/tasks/unscramble/anagrams2.yaml                                     +0  −2
  lm_eval/tasks/unscramble/cycle_letters.yaml                                 +0  −2
  lm_eval/tasks/unscramble/random_insertion.yaml                              +0  −2
  lm_eval/tasks/wikitext/wikitext.yaml                                        +0  −2
  lm_eval/tasks/winogrande/default.yaml                                       +0  −2
  lm_eval/utils.py                                                            +20 −3
  pyproject.toml                                                              +6  −3
  tests/test_unitxt_tasks.py                                                  +5  −0
lm_eval/tasks/race/race.yaml
@@ -12,5 +12,3 @@ metric_list:
     higher_is_better: true
 metadata:
   version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/score/agi_eval/non_greedy_robustness_agieval_aqua_rat.yaml
@@ -28,9 +28,7 @@ generation_kwargs:
 process_results: !function utils_agieval.non_greedy_robustness_process_results
 metric_list:
   - metric: non_greedy_accuracy
-    aggregation: !function utils_agieval.non_greedy_accuracy
+    aggregation: !function utils_agieval.non_greedy_accuracy
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/score/agi_eval/option_order_robustness_agieval_aqua_rat.yaml
@@ -27,21 +27,19 @@ generation_kwargs:
 process_results: !function utils_agieval.option_order_robustness_process_results
 metric_list:
   - metric: per_option_accuracy_A
-    aggregation: !function utils_agieval.per_option_accuracy_a
+    aggregation: !function utils_agieval.per_option_accuracy_a
     higher_is_better: true
   - metric: per_option_accuracy_B
-    aggregation: !function utils_agieval.per_option_accuracy_b
+    aggregation: !function utils_agieval.per_option_accuracy_b
     higher_is_better: true
   - metric: per_option_accuracy_C
-    aggregation: !function utils_agieval.per_option_accuracy_c
+    aggregation: !function utils_agieval.per_option_accuracy_c
     higher_is_better: true
   - metric: per_option_accuracy_D
-    aggregation: !function utils_agieval.per_option_accuracy_d
+    aggregation: !function utils_agieval.per_option_accuracy_d
     higher_is_better: true
   - metric: options_consistency_rate
-    aggregation: !function utils_agieval.options_consistency_rate
+    aggregation: !function utils_agieval.options_consistency_rate
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/score/agi_eval/prompt_robustness_agieval_aqua_rat.yaml
@@ -27,39 +27,37 @@ generation_kwargs:
 process_results: !function utils_agieval.prompt_robustness_process_results
 metric_list:
   - metric: 0_accuracy
-    aggregation: !function utils_agieval.per_prompt_accuracy_0
+    aggregation: !function utils_agieval.per_prompt_accuracy_0
     higher_is_better: true
   - metric: 1_accuracy
-    aggregation: !function utils_agieval.per_prompt_accuracy_1
+    aggregation: !function utils_agieval.per_prompt_accuracy_1
     higher_is_better: true
   - metric: 2_accuracy
-    aggregation: !function utils_agieval.per_prompt_accuracy_2
+    aggregation: !function utils_agieval.per_prompt_accuracy_2
     higher_is_better: true
   - metric: 3_accuracy
-    aggregation: !function utils_agieval.per_prompt_accuracy_3
+    aggregation: !function utils_agieval.per_prompt_accuracy_3
     higher_is_better: true
   - metric: 4_accuracy
-    aggregation: !function utils_agieval.per_prompt_accuracy_4
+    aggregation: !function utils_agieval.per_prompt_accuracy_4
     higher_is_better: true
   - metric: 5_accuracy
-    aggregation: !function utils_agieval.per_prompt_accuracy_5
+    aggregation: !function utils_agieval.per_prompt_accuracy_5
     higher_is_better: true
   - metric: 6_accuracy
-    aggregation: !function utils_agieval.per_prompt_accuracy_6
+    aggregation: !function utils_agieval.per_prompt_accuracy_6
     higher_is_better: true
   - metric: 7_accuracy
-    aggregation: !function utils_agieval.per_prompt_accuracy_7
+    aggregation: !function utils_agieval.per_prompt_accuracy_7
     higher_is_better: true
   - metric: 8_accuracy
-    aggregation: !function utils_agieval.per_prompt_accuracy_8
+    aggregation: !function utils_agieval.per_prompt_accuracy_8
     higher_is_better: true
   - metric: 9_accuracy
-    aggregation: !function utils_agieval.per_prompt_accuracy_9
+    aggregation: !function utils_agieval.per_prompt_accuracy_9
     higher_is_better: true
   - metric: consistency_rate
-    aggregation: !function utils_agieval.agi_eval_prompt_consistency_rate
+    aggregation: !function utils_agieval.agi_eval_prompt_consistency_rate
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/score/math/math_grader.py
@@ -267,12 +267,12 @@ def normalize_answer_string(expr: str) -> str:
         "\\textit",
     ]:
         expr = expr.replace(surround_str, "")
-        pattern = f"^{surround_str}" + "\{(?P<text>.+?)\}$"
+        pattern = f"^{surround_str}" + r"\{(?P<text>.+?)\}$"
         m = re.search(pattern, expr)
         if m is not None:
             expr = m.group("text")

-    expr = expr.replace("\!", "")
+    expr = expr.replace(r"\!", "")
     expr = expr.replace("\\%", "%")
     expr = expr.replace("\\$", "$")
     expr = expr.replace("$", "")
@@ -305,7 +305,7 @@ def normalize_answer_string(expr: str) -> str:
         "p.m.",
         "PM",
     ]:
-        expr = re.sub(f"{unit}(es)?(s)? *(\^[0-9]+)?", "", expr)
+        expr = re.sub(rf"{unit}(es)?(s)? *(\^[0-9]+)?", "", expr)
     if "day" in expr:
         days = [
@@ -326,7 +326,7 @@ def normalize_answer_string(expr: str) -> str:
     if not weekday_expressed:
         expr = re.sub("day(s)?", "", expr)
-    expr = re.sub("\^ *\\circ", "", expr)
+    expr = re.sub("\\^ *\\circ", "", expr)
     if len(expr) > 0 and expr[0] == "{" and expr[-1] == "}":
         expr = expr[1:-1]
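The four edits above switch regex patterns to raw strings or escaped backslashes so they no longer rely on invalid string escapes, matching the W605 ruff rule enabled in pyproject.toml below. A minimal sketch (not part of the commit, with a made-up input) of why the two spellings are interchangeable for the regex engine:

import re

# "\\{" (escaped backslash) and r"\{" (raw string) are the same two
# characters at runtime; only sloppier spellings like "\{" trigger
# Python's invalid-escape warning, which ruff reports as W605.
escaped = "\\{(?P<text>.+?)\\}$"
raw = r"\{(?P<text>.+?)\}$"
assert escaped == raw
assert re.search(raw, "{42}").group("text") == "42"  # hypothetical input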
lm_eval/tasks/score/math/non_greedy_robustness_math_algebra.yaml
@@ -18,7 +18,7 @@ dataset_name: algebra
 output_type: generate_until
 test_split: test
 process_docs: !function utils_math.non_greedy_robustness_process_docs
-doc_to_text: !function utils_math.math_robustness_doc_to_text
+doc_to_text: !function utils_math.math_robustness_doc_to_text
 doc_to_target: answer
 generation_kwargs:
   max_gen_toks: 1024
@@ -28,9 +28,7 @@ generation_kwargs:
 process_results: !function utils_math.non_greedy_robustness_process_results
 metric_list:
   - metric: non_greedy_accuracy
-    aggregation: !function utils_math.non_greedy_accuracy
+    aggregation: !function utils_math.non_greedy_accuracy
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/score/math/prompt_robustness_math_algebra.yaml
@@ -18,7 +18,7 @@ process_docs: !function utils_math.prompt_robustness_process_docs
 dataset_name: algebra
 output_type: generate_until
 test_split: test
-doc_to_text: !function utils_math.math_robustness_doc_to_text
+doc_to_text: !function utils_math.math_robustness_doc_to_text
 process_results: !function utils_math.process_results
 doc_to_target: answer
 generation_kwargs:
@@ -28,39 +28,37 @@ generation_kwargs:
   max_gen_toks: 1024
 metric_list:
   - metric: 0_accuracy
-    aggregation: !function utils_math.per_prompt_accuracy_0
+    aggregation: !function utils_math.per_prompt_accuracy_0
     higher_is_better: true
   - metric: 1_accuracy
-    aggregation: !function utils_math.per_prompt_accuracy_1
+    aggregation: !function utils_math.per_prompt_accuracy_1
     higher_is_better: true
   - metric: 2_accuracy
-    aggregation: !function utils_math.per_prompt_accuracy_2
+    aggregation: !function utils_math.per_prompt_accuracy_2
     higher_is_better: true
   - metric: 3_accuracy
-    aggregation: !function utils_math.per_prompt_accuracy_3
+    aggregation: !function utils_math.per_prompt_accuracy_3
     higher_is_better: true
   - metric: 4_accuracy
-    aggregation: !function utils_math.per_prompt_accuracy_4
+    aggregation: !function utils_math.per_prompt_accuracy_4
     higher_is_better: true
   - metric: 5_accuracy
-    aggregation: !function utils_math.per_prompt_accuracy_5
+    aggregation: !function utils_math.per_prompt_accuracy_5
     higher_is_better: true
   - metric: 6_accuracy
-    aggregation: !function utils_math.per_prompt_accuracy_6
+    aggregation: !function utils_math.per_prompt_accuracy_6
     higher_is_better: true
   - metric: 7_accuracy
-    aggregation: !function utils_math.per_prompt_accuracy_7
+    aggregation: !function utils_math.per_prompt_accuracy_7
     higher_is_better: true
   - metric: 8_accuracy
-    aggregation: !function utils_math.per_prompt_accuracy_8
+    aggregation: !function utils_math.per_prompt_accuracy_8
     higher_is_better: true
   - metric: 9_accuracy
-    aggregation: !function utils_math.per_prompt_accuracy_9
+    aggregation: !function utils_math.per_prompt_accuracy_9
     higher_is_better: true
   - metric: consistency_rate
-    aggregation: !function utils_math.math_prompt_consistency_rate
+    aggregation: !function utils_math.math_prompt_consistency_rate
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/score/mmlu_pro/score_non_greedy_robustness_mmlu_pro.yaml
@@ -30,9 +30,7 @@ generation_kwargs:
 process_results: !function utils_mmlu_pro.non_greedy_robustness_process_results
 metric_list:
   - metric: non_greedy_macro_accuracy
-    aggregation: !function utils_mmlu_pro.non_greedy_macro_accuracy
+    aggregation: !function utils_mmlu_pro.non_greedy_macro_accuracy
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/score/mmlu_pro/score_option_order_robustness_mmlu_pro.yaml
@@ -29,39 +29,37 @@ generation_kwargs:
 process_results: !function utils_mmlu_pro.option_order_robustness_process_results
 metric_list:
   - metric: per_option_macro_accuracy_A
-    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_a
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_a
     higher_is_better: true
   - metric: per_option_macro_accuracy_B
-    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_b
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_b
     higher_is_better: true
   - metric: per_option_macro_accuracy_C
-    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_c
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_c
     higher_is_better: true
   - metric: per_option_macro_accuracy_D
-    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_d
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_d
     higher_is_better: true
   - metric: per_option_macro_accuracy_E
-    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_e
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_e
     higher_is_better: true
   - metric: per_option_macro_accuracy_F
-    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_f
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_f
     higher_is_better: true
   - metric: per_option_macro_accuracy_G
-    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_g
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_g
     higher_is_better: true
   - metric: per_option_macro_accuracy_H
-    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_h
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_h
     higher_is_better: true
   - metric: per_option_macro_accuracy_I
-    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_i
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_i
     higher_is_better: true
   - metric: per_option_macro_accuracy_J
-    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_j
+    aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_j
     higher_is_better: true
   - metric: options_consistency_rate
-    aggregation: !function utils_mmlu_pro.options_consistency_rate
+    aggregation: !function utils_mmlu_pro.options_consistency_rate
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/score/mmlu_pro/score_prompt_robustness_mmlu_pro.yaml
@@ -29,39 +29,37 @@ generation_kwargs:
 process_results: !function utils_mmlu_pro.prompt_robustness_process_results
 metric_list:
   - metric: 0_macro_accuracy
-    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_0
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_0
     higher_is_better: true
   - metric: 1_macro_accuracy
-    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_1
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_1
     higher_is_better: true
   - metric: 2_macro_accuracy
-    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_2
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_2
     higher_is_better: true
   - metric: 3_macro_accuracy
-    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_3
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_3
     higher_is_better: true
   - metric: 4_macro_accuracy
-    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_4
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_4
     higher_is_better: true
   - metric: 5_macro_accuracy
-    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_5
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_5
     higher_is_better: true
   - metric: 6_macro_accuracy
-    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_6
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_6
     higher_is_better: true
   - metric: 7_macro_accuracy
-    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_7
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_7
     higher_is_better: true
   - metric: 8_macro_accuracy
-    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_8
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_8
     higher_is_better: true
   - metric: 9_macro_accuracy
-    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_9
+    aggregation: !function utils_mmlu_pro.per_prompt_accuracy_9
     higher_is_better: true
   - metric: consistency_rate
-    aggregation: !function utils_mmlu_pro.mmlu_pro_prompt_consistency_rate
+    aggregation: !function utils_mmlu_pro.mmlu_pro_prompt_consistency_rate
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/spanish_bench/flores_es/_flores_common_yaml
@@ -23,5 +23,3 @@ metric_list:
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/unscramble/anagrams1.yaml
@@ -18,5 +18,3 @@ metric_list:
     ignore_punctuation: false
 metadata:
   version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/unscramble/anagrams2.yaml
@@ -18,5 +18,3 @@ metric_list:
     ignore_punctuation: false
 metadata:
   version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/unscramble/cycle_letters.yaml
@@ -18,5 +18,3 @@ metric_list:
     ignore_punctuation: false
 metadata:
   version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/unscramble/random_insertion.yaml
@@ -18,5 +18,3 @@ metric_list:
     ignore_punctuation: false
 metadata:
   version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/wikitext/wikitext.yaml
@@ -16,5 +16,3 @@ metric_list:
   - metric: bits_per_byte
 metadata:
   version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/tasks/winogrande/default.yaml
@@ -15,5 +15,3 @@ metric_list:
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
lm_eval/utils.py
@@ -8,9 +8,10 @@ import json
 import logging
 import os
 import re

+from collections.abc import Generator
 from dataclasses import asdict, is_dataclass
 from itertools import islice
-from typing import Any, Callable, Generator, List, Optional, Tuple
+from typing import Any, Callable, List, Optional, Tuple

 import numpy as np
 from jinja2 import BaseLoader, Environment, StrictUndefined
@@ -24,12 +25,28 @@ HIGHER_IS_BETTER_SYMBOLS = {
 }


+def wrap_text(string: str, width: int = 140, **kwargs) -> Optional[str]:
+    """
+    Wraps the given string to the specified width.
+    """
+    import textwrap
+
+    return textwrap.fill(
+        inspect.cleandoc(string),
+        width=width,
+        initial_indent="",
+        subsequent_indent=" " * 8,
+        break_long_words=False,
+        break_on_hyphens=False,
+        **kwargs,
+    )
+
+
 def setup_logging(verbosity=logging.INFO):
     # Configure the root logger
     class CustomFormatter(logging.Formatter):
         def format(self, record):
-            if record.name.startswith("lm_eval."):
-                record.name = record.name[len("lm_eval.") :]
+            record.name = record.name.removeprefix("lm_eval.")
             return super().format(record)

     formatter = CustomFormatter(
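For reference, str.removeprefix (Python 3.9+) is a drop-in replacement for the startswith/slice idiom removed above, and is a no-op when the prefix is absent. A quick sketch with made-up logger names:

# Equivalent to: name[len("lm_eval."):] if name.startswith("lm_eval.") else name
name = "lm_eval.models.huggingface"
assert name.removeprefix("lm_eval.") == "models.huggingface"
assert "datasets.builder".removeprefix("lm_eval.") == "datasets.builder"  # unchanged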
pyproject.toml
@@ -21,7 +21,7 @@ license = { "text" = "MIT" }
 dependencies = [
     "accelerate>=0.26.0",
-    "evaluate",
-    "datasets>=2.16.0",
+    "datasets>=2.16.0,<4.0",
+    "evaluate>=0.4.0",
     "jsonlines",
     "numexpr",
@@ -69,6 +69,7 @@ ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"]
 ipex = ["optimum"]
 japanese_leaderboard = ["emoji==2.14.0", "neologdn==0.5.3", "fugashi[unidic-lite]", "rouge_score>=0.1.2"]
+longbench = ["jieba", "fuzzywuzzy", "rouge"]
 libra = ["pymorphy2"]
 mamba = ["mamba_ssm", "causal-conv1d==1.0.2", "torch"]
 math = ["sympy>=1.12", "antlr4-python3-runtime==4.11", "math_verify[antlr4_11_0]"]
 multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
@@ -89,6 +90,8 @@ tasks = [
     "lm_eval[ifeval]",
     "lm_eval[japanese_leaderboard]",
+    "lm_eval[longbench]",
+    "lm_eval[libra]",
     "lm_eval[mamba]",
     "lm_eval[math]",
     "lm_eval[multilingual]",
     "lm_eval[ruler]",
@@ -103,8 +106,8 @@ plugins.md029.allow_extended_start_values = true # ol-prefix
 plugins.md034.enabled = false # no-bare-urls

 [tool.ruff.lint]
-select = ["ASYNC", "B", "C4", "E", "F", "I", "LOG", "PIE", "PTH", "SIM", "UP", "PERF", "ISC001", "ISC002", "ICN001", "C901", "FURB", "RUF"]
-ignore = ["E501", "E111", "E114", "E117", "E501", "PERF203", "B011"]
+select = ["ASYNC", "B", "C4", "E", "F", "I", "LOG", "PIE", "PTH", "SIM", "UP", "PERF", "ISC001", "ISC002", "ICN001", "C901", "FURB", "RUF", "W605"]
+ignore = ["E501", "E111", "E114", "E117", "E501", "PERF203", "B011", "RUF005"]

 [tool.ruff.lint.isort]
 lines-after-imports = 2
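The new "datasets>=2.16.0,<4.0" upper bound pairs with the trust_remote_code removals in the task YAMLs above: datasets 4.x drops loading-script support, which is presumably why the per-task flags are being retired while the pin keeps older script-based datasets working. A hypothetical sanity check, assuming the packaging library is installed, that a resolved environment honors the pin:

from importlib.metadata import version

from packaging.version import Version

# Not part of the commit: verify the installed datasets release satisfies
# the new "<4.0" constraint before running script-based tasks.
assert Version(version("datasets")) < Version("4.0")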
tests/test_unitxt_tasks.py
@@ -7,6 +7,11 @@ from lm_eval.api.task import ConfigurableTask
 from tests.test_tasks import BaseTasks, task_class


+@pytest.fixture()
+def limit() -> int:
+    return 10
+
+
 @pytest.mark.parametrize(
     "task_class",
     task_class(
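The new module-level limit fixture presumably caps each Unitxt task at 10 documents for the shared BaseTasks suite. A minimal sketch of how pytest injects it (the test name here is hypothetical, not from the diff):

import pytest


@pytest.fixture()
def limit() -> int:
    return 10


def test_uses_limit(limit: int) -> None:
    # pytest resolves the `limit` argument by fixture name and injects 10.
    assert limit == 10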