"docs/vscode:/vscode.git/clone" did not exist on "44f011d2241945b173bcfd13545b523e80b806bd"
Commit b58e5556 authored by Baber

Merge branch 'main' into tasklist

# Conflicts:
#	pyproject.toml
parents 6e1866f5 4f8195f1
@@ -12,5 +12,3 @@ metric_list:
     higher_is_better: true
 metadata:
   version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
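
Note: the dropped dataset_kwargs block was forwarded verbatim to datasets.load_dataset. A minimal sketch of what the removed keys amounted to, using a placeholder dataset path rather than the task's real one:

    from datasets import load_dataset

    # before this change, the harness effectively passed the extra keyword:
    ds = load_dataset("org/placeholder-subset", trust_remote_code=True)
    # after it, the keyword is simply omitted:
    ds = load_dataset("org/placeholder-subset")
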
@@ -28,9 +28,7 @@ generation_kwargs:
 process_results: !function utils_agieval.non_greedy_robustness_process_results
 metric_list:
   - metric: non_greedy_accuracy
     aggregation: !function utils_agieval.non_greedy_accuracy
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
@@ -27,21 +27,19 @@ generation_kwargs:
 process_results: !function utils_agieval.option_order_robustness_process_results
 metric_list:
   - metric: per_option_accuracy_A
     aggregation: !function utils_agieval.per_option_accuracy_a
     higher_is_better: true
   - metric: per_option_accuracy_B
     aggregation: !function utils_agieval.per_option_accuracy_b
     higher_is_better: true
   - metric: per_option_accuracy_C
     aggregation: !function utils_agieval.per_option_accuracy_c
     higher_is_better: true
   - metric: per_option_accuracy_D
     aggregation: !function utils_agieval.per_option_accuracy_d
     higher_is_better: true
   - metric: options_consistency_rate
     aggregation: !function utils_agieval.options_consistency_rate
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
...@@ -27,39 +27,37 @@ generation_kwargs: ...@@ -27,39 +27,37 @@ generation_kwargs:
process_results: !function utils_agieval.prompt_robustness_process_results process_results: !function utils_agieval.prompt_robustness_process_results
metric_list: metric_list:
- metric: 0_accuracy - metric: 0_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_0 aggregation: !function utils_agieval.per_prompt_accuracy_0
higher_is_better: true higher_is_better: true
- metric: 1_accuracy - metric: 1_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_1 aggregation: !function utils_agieval.per_prompt_accuracy_1
higher_is_better: true higher_is_better: true
- metric: 2_accuracy - metric: 2_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_2 aggregation: !function utils_agieval.per_prompt_accuracy_2
higher_is_better: true higher_is_better: true
- metric: 3_accuracy - metric: 3_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_3 aggregation: !function utils_agieval.per_prompt_accuracy_3
higher_is_better: true higher_is_better: true
- metric: 4_accuracy - metric: 4_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_4 aggregation: !function utils_agieval.per_prompt_accuracy_4
higher_is_better: true higher_is_better: true
- metric: 5_accuracy - metric: 5_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_5 aggregation: !function utils_agieval.per_prompt_accuracy_5
higher_is_better: true higher_is_better: true
- metric: 6_accuracy - metric: 6_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_6 aggregation: !function utils_agieval.per_prompt_accuracy_6
higher_is_better: true higher_is_better: true
- metric: 7_accuracy - metric: 7_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_7 aggregation: !function utils_agieval.per_prompt_accuracy_7
higher_is_better: true higher_is_better: true
- metric: 8_accuracy - metric: 8_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_8 aggregation: !function utils_agieval.per_prompt_accuracy_8
higher_is_better: true higher_is_better: true
- metric: 9_accuracy - metric: 9_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_9 aggregation: !function utils_agieval.per_prompt_accuracy_9
higher_is_better: true higher_is_better: true
- metric: consistency_rate - metric: consistency_rate
aggregation: !function utils_agieval.agi_eval_prompt_consistency_rate aggregation: !function utils_agieval.agi_eval_prompt_consistency_rate
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
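
Note: each aggregation entry above resolves, via the !function tag, to a Python callable in utils_agieval that reduces the per-document values logged under that metric key to a single float. A minimal sketch of the general shape (the real helpers may carry extra bookkeeping for prompt indices):

    def per_prompt_accuracy_0(items: list[float]) -> float:
        # items: the per-document scores that process_results stored under "0_accuracy"
        return sum(items) / max(len(items), 1)
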
@@ -267,12 +267,12 @@ def normalize_answer_string(expr: str) -> str:
         "\\\\textit",
     ]:
         expr = expr.replace(surround_str, "")
-        pattern = f"^{surround_str}" + "\{(?P<text>.+?)\}$"
+        pattern = f"^{surround_str}" + r"\{(?P<text>.+?)\}$"
         m = re.search(pattern, expr)
         if m is not None:
             expr = m.group("text")
-    expr = expr.replace("\!", "")
+    expr = expr.replace(r"\!", "")
     expr = expr.replace("\\%", "%")
     expr = expr.replace("\\$", "$")
     expr = expr.replace("$", "")
@@ -305,7 +305,7 @@ def normalize_answer_string(expr: str) -> str:
         "p.m.",
         "PM",
     ]:
-        expr = re.sub(f"{unit}(es)?(s)? *(\^[0-9]+)?", "", expr)
+        expr = re.sub(rf"{unit}(es)?(s)? *(\^[0-9]+)?", "", expr)
     if "day" in expr:
         days = [
@@ -326,7 +326,7 @@ def normalize_answer_string(expr: str) -> str:
         if not weekday_expressed:
             expr = re.sub("day(s)?", "", expr)
-    expr = re.sub("\^ *\\\\circ", "", expr)
+    expr = re.sub("\\^ *\\\\circ", "", expr)
     if len(expr) > 0 and expr[0] == "{" and expr[-1] == "}":
         expr = expr[1:-1]
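
Note: the edits above switch regex patterns to raw strings (or double the backslash) so sequences like \{ and \^ reach the regex engine instead of being parsed as invalid string escapes; this matches the W605 rule enabled in pyproject.toml further down. A quick illustration:

    import re

    # "\{" in a plain string literal is an invalid escape sequence (a SyntaxWarning on newer Pythons);
    # a raw string keeps the backslash for the regex engine.
    m = re.search(r"\{(?P<text>.+?)\}$", "\\textbf{42}")
    assert m is not None and m.group("text") == "42"
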
@@ -18,7 +18,7 @@ dataset_name: algebra
 output_type: generate_until
 test_split: test
 process_docs: !function utils_math.non_greedy_robustness_process_docs
 doc_to_text: !function utils_math.math_robustness_doc_to_text
 doc_to_target: answer
 generation_kwargs:
   max_gen_toks: 1024
@@ -28,9 +28,7 @@ generation_kwargs:
 process_results: !function utils_math.non_greedy_robustness_process_results
 metric_list:
   - metric: non_greedy_accuracy
     aggregation: !function utils_math.non_greedy_accuracy
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
@@ -18,7 +18,7 @@ process_docs: !function utils_math.prompt_robustness_process_docs
 dataset_name: algebra
 output_type: generate_until
 test_split: test
 doc_to_text: !function utils_math.math_robustness_doc_to_text
 process_results: !function utils_math.process_results
 doc_to_target: answer
 generation_kwargs:
@@ -28,39 +28,37 @@ generation_kwargs:
   max_gen_toks: 1024
 metric_list:
   - metric: 0_accuracy
     aggregation: !function utils_math.per_prompt_accuracy_0
     higher_is_better: true
   - metric: 1_accuracy
     aggregation: !function utils_math.per_prompt_accuracy_1
     higher_is_better: true
   - metric: 2_accuracy
     aggregation: !function utils_math.per_prompt_accuracy_2
     higher_is_better: true
   - metric: 3_accuracy
     aggregation: !function utils_math.per_prompt_accuracy_3
     higher_is_better: true
   - metric: 4_accuracy
     aggregation: !function utils_math.per_prompt_accuracy_4
     higher_is_better: true
   - metric: 5_accuracy
     aggregation: !function utils_math.per_prompt_accuracy_5
     higher_is_better: true
   - metric: 6_accuracy
     aggregation: !function utils_math.per_prompt_accuracy_6
     higher_is_better: true
   - metric: 7_accuracy
     aggregation: !function utils_math.per_prompt_accuracy_7
     higher_is_better: true
   - metric: 8_accuracy
     aggregation: !function utils_math.per_prompt_accuracy_8
     higher_is_better: true
   - metric: 9_accuracy
     aggregation: !function utils_math.per_prompt_accuracy_9
     higher_is_better: true
   - metric: consistency_rate
     aggregation: !function utils_math.math_prompt_consistency_rate
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
@@ -30,9 +30,7 @@ generation_kwargs:
 process_results: !function utils_mmlu_pro.non_greedy_robustness_process_results
 metric_list:
   - metric: non_greedy_macro_accuracy
     aggregation: !function utils_mmlu_pro.non_greedy_macro_accuracy
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
@@ -29,39 +29,37 @@ generation_kwargs:
 process_results: !function utils_mmlu_pro.option_order_robustness_process_results
 metric_list:
   - metric: per_option_macro_accuracy_A
     aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_a
     higher_is_better: true
   - metric: per_option_macro_accuracy_B
     aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_b
     higher_is_better: true
   - metric: per_option_macro_accuracy_C
     aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_c
     higher_is_better: true
   - metric: per_option_macro_accuracy_D
     aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_d
     higher_is_better: true
   - metric: per_option_macro_accuracy_E
     aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_e
     higher_is_better: true
   - metric: per_option_macro_accuracy_F
     aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_f
     higher_is_better: true
   - metric: per_option_macro_accuracy_G
     aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_g
     higher_is_better: true
   - metric: per_option_macro_accuracy_H
     aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_h
     higher_is_better: true
   - metric: per_option_macro_accuracy_I
     aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_i
     higher_is_better: true
   - metric: per_option_macro_accuracy_J
     aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_j
     higher_is_better: true
   - metric: options_consistency_rate
     aggregation: !function utils_mmlu_pro.options_consistency_rate
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
@@ -29,39 +29,37 @@ generation_kwargs:
 process_results: !function utils_mmlu_pro.prompt_robustness_process_results
 metric_list:
   - metric: 0_macro_accuracy
     aggregation: !function utils_mmlu_pro.per_prompt_accuracy_0
     higher_is_better: true
   - metric: 1_macro_accuracy
     aggregation: !function utils_mmlu_pro.per_prompt_accuracy_1
     higher_is_better: true
   - metric: 2_macro_accuracy
     aggregation: !function utils_mmlu_pro.per_prompt_accuracy_2
     higher_is_better: true
   - metric: 3_macro_accuracy
     aggregation: !function utils_mmlu_pro.per_prompt_accuracy_3
     higher_is_better: true
   - metric: 4_macro_accuracy
     aggregation: !function utils_mmlu_pro.per_prompt_accuracy_4
     higher_is_better: true
   - metric: 5_macro_accuracy
     aggregation: !function utils_mmlu_pro.per_prompt_accuracy_5
     higher_is_better: true
   - metric: 6_macro_accuracy
     aggregation: !function utils_mmlu_pro.per_prompt_accuracy_6
     higher_is_better: true
   - metric: 7_macro_accuracy
     aggregation: !function utils_mmlu_pro.per_prompt_accuracy_7
     higher_is_better: true
   - metric: 8_macro_accuracy
     aggregation: !function utils_mmlu_pro.per_prompt_accuracy_8
     higher_is_better: true
   - metric: 9_macro_accuracy
     aggregation: !function utils_mmlu_pro.per_prompt_accuracy_9
     higher_is_better: true
   - metric: consistency_rate
     aggregation: !function utils_mmlu_pro.mmlu_pro_prompt_consistency_rate
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
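
Note: consistency_rate and options_consistency_rate measure whether the model gives the same answer when only the prompt template or the option order changes. The exact definition lives in the utils_* modules; a hedged sketch of a pairwise version:

    from itertools import combinations

    def consistency_rate(preds_per_question: list[list[str]]) -> float:
        # preds_per_question[i] holds one prediction per prompt variant for question i
        agree = total = 0
        for preds in preds_per_question:
            for a, b in combinations(preds, 2):
                agree += int(a == b)
                total += 1
        return agree / total if total else 0.0
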
@@ -23,5 +23,3 @@ metric_list:
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
@@ -18,5 +18,3 @@ metric_list:
     ignore_punctuation: false
 metadata:
   version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
@@ -18,5 +18,3 @@ metric_list:
     ignore_punctuation: false
 metadata:
   version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
@@ -18,5 +18,3 @@ metric_list:
     ignore_punctuation: false
 metadata:
   version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
@@ -18,5 +18,3 @@ metric_list:
     ignore_punctuation: false
 metadata:
   version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
@@ -16,5 +16,3 @@ metric_list:
   - metric: bits_per_byte
 metadata:
   version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
@@ -15,5 +15,3 @@ metric_list:
     higher_is_better: true
 metadata:
   version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
@@ -8,9 +8,10 @@ import json
 import logging
 import os
 import re
+from collections.abc import Generator
 from dataclasses import asdict, is_dataclass
 from itertools import islice
-from typing import Any, Callable, Generator, List, Optional, Tuple
+from typing import Any, Callable, List, Optional, Tuple

 import numpy as np
 from jinja2 import BaseLoader, Environment, StrictUndefined
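
Note: Generator now comes from collections.abc; the typing alias has been deprecated since Python 3.9 (ruff's UP rules flag it). The annotation works the same way, for example:

    from collections.abc import Generator

    def chunks(seq: list, size: int) -> Generator[list, None, None]:
        for i in range(0, len(seq), size):
            yield seq[i : i + size]
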
@@ -24,12 +25,28 @@ HIGHER_IS_BETTER_SYMBOLS = {
 }

+def wrap_text(string: str, width: int = 140, **kwargs) -> Optional[str]:
+    """
+    Wraps the given string to the specified width.
+    """
+    import textwrap
+
+    return textwrap.fill(
+        inspect.cleandoc(string),
+        width=width,
+        initial_indent="",
+        subsequent_indent=" " * 8,
+        break_long_words=False,
+        break_on_hyphens=False,
+        **kwargs,
+    )
+

 def setup_logging(verbosity=logging.INFO):
     # Configure the root logger
     class CustomFormatter(logging.Formatter):
         def format(self, record):
-            if record.name.startswith("lm_eval."):
-                record.name = record.name[len("lm_eval.") :]
+            record.name = record.name.removeprefix("lm_eval.")
             return super().format(record)

     formatter = CustomFormatter(
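
Note: str.removeprefix is a Python 3.9+ builtin and is a no-op when the prefix is absent, so the startswith check disappears. The new wrap_text helper leans on inspect.cleandoc plus textwrap.fill (it assumes inspect is already imported at module level). For example:

    assert "lm_eval.models.hf".removeprefix("lm_eval.") == "models.hf"
    assert "urllib3.connection".removeprefix("lm_eval.") == "urllib3.connection"  # unchanged

    # wrap_text("   a long, indented message ...", width=60) would dedent the text and
    # re-wrap it at 60 columns, indenting continuation lines by eight spaces.
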
@@ -21,7 +21,7 @@ license = { "text" = "MIT" }
 dependencies = [
     "accelerate>=0.26.0",
     "evaluate",
-    "datasets>=2.16.0",
+    "datasets>=2.16.0,<4.0",
     "evaluate>=0.4.0",
     "jsonlines",
     "numexpr",
@@ -69,6 +69,7 @@ ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"]
 ipex = ["optimum"]
 japanese_leaderboard = ["emoji==2.14.0", "neologdn==0.5.3", "fugashi[unidic-lite]", "rouge_score>=0.1.2"]
 longbench=["jieba", "fuzzywuzzy", "rouge"]
+libra=["pymorphy2"]
 mamba = ["mamba_ssm", "causal-conv1d==1.0.2", "torch"]
 math = ["sympy>=1.12", "antlr4-python3-runtime==4.11", "math_verify[antlr4_11_0]"]
 multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
@@ -89,6 +90,8 @@ tasks = [
     "lm_eval[ifeval]",
     "lm_eval[japanese_leaderboard]",
     "lm_eval[longbench]",
+    "lm_eval[libra]",
+    "lm_eval[mamba]",
     "lm_eval[math]",
     "lm_eval[multilingual]",
     "lm_eval[ruler]",
@@ -103,8 +106,8 @@ plugins.md029.allow_extended_start_values = true # ol-prefix
 plugins.md034.enabled = false # no-bare-urls
 [tool.ruff.lint]
-select = ["ASYNC","B", "C4", "E", "F", "I", "LOG","PIE", "PTH","SIM", "UP", "PERF", "ISC001", "ISC002", "ICN001", "C901","FURB", "RUF"]
+select = ["ASYNC","B", "C4", "E", "F", "I", "LOG","PIE", "PTH","SIM", "UP", "PERF", "ISC001", "ISC002", "ICN001", "C901","FURB", "RUF", "W605"]
-ignore = ["E501", "E111", "E114", "E117", "E501", "PERF203", "B011"]
+ignore = ["E501", "E111", "E114", "E117", "E501", "PERF203", "B011", "RUF005"]
 [tool.ruff.lint.isort]
 lines-after-imports = 2
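
Note: W605 flags invalid escape sequences in string literals (the raw-string fixes above), while the newly ignored RUF005 would otherwise ask for iterable unpacking instead of list concatenation. Roughly:

    import re

    re.findall(r"\d+", "a1b22")   # the r-prefix avoids W605 ("\d" alone is an invalid escape)
    xs = [1, 2, 3]
    ys = xs + [4]                 # RUF005 would suggest [*xs, 4]; that rule is now ignored
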
@@ -7,6 +7,11 @@ from lm_eval.api.task import ConfigurableTask
 from tests.test_tasks import BaseTasks, task_class

+@pytest.fixture()
+def limit() -> int:
+    return 10
+

 @pytest.mark.parametrize(
     "task_class",
     task_class(
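
Note: the new limit fixture pins how many documents a parametrized task test iterates over. A hypothetical consumer (the test name and body are illustrative, not the file's actual tests):

    from itertools import islice

    def test_first_docs_render(task_class, limit: int):
        docs = list(islice(task_class.test_docs(), limit))
        assert len(docs) <= limit
        for doc in docs:
            assert task_class.doc_to_text(doc) is not None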