Commit b58e5556 authored by Baber

Merge branch 'main' into tasklist

# Conflicts:
#	pyproject.toml
parents 6e1866f5 4f8195f1
@@ -12,5 +12,3 @@ metric_list:
higher_is_better: true
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true
@@ -28,9 +28,7 @@ generation_kwargs:
process_results: !function utils_agieval.non_greedy_robustness_process_results
metric_list:
- metric: non_greedy_accuracy
aggregation: !function utils_agieval.non_greedy_accuracy
aggregation: !function utils_agieval.non_greedy_accuracy
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
@@ -27,21 +27,19 @@ generation_kwargs:
process_results: !function utils_agieval.option_order_robustness_process_results
metric_list:
- metric: per_option_accuracy_A
aggregation: !function utils_agieval.per_option_accuracy_a
aggregation: !function utils_agieval.per_option_accuracy_a
higher_is_better: true
- metric: per_option_accuracy_B
aggregation: !function utils_agieval.per_option_accuracy_b
aggregation: !function utils_agieval.per_option_accuracy_b
higher_is_better: true
- metric: per_option_accuracy_C
aggregation: !function utils_agieval.per_option_accuracy_c
aggregation: !function utils_agieval.per_option_accuracy_c
higher_is_better: true
- metric: per_option_accuracy_D
aggregation: !function utils_agieval.per_option_accuracy_d
aggregation: !function utils_agieval.per_option_accuracy_d
higher_is_better: true
- metric: options_consistency_rate
aggregation: !function utils_agieval.options_consistency_rate
aggregation: !function utils_agieval.options_consistency_rate
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
@@ -27,39 +27,37 @@ generation_kwargs:
process_results: !function utils_agieval.prompt_robustness_process_results
metric_list:
- metric: 0_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_0
aggregation: !function utils_agieval.per_prompt_accuracy_0
higher_is_better: true
- metric: 1_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_1
aggregation: !function utils_agieval.per_prompt_accuracy_1
higher_is_better: true
- metric: 2_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_2
aggregation: !function utils_agieval.per_prompt_accuracy_2
higher_is_better: true
- metric: 3_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_3
aggregation: !function utils_agieval.per_prompt_accuracy_3
higher_is_better: true
- metric: 4_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_4
aggregation: !function utils_agieval.per_prompt_accuracy_4
higher_is_better: true
- metric: 5_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_5
aggregation: !function utils_agieval.per_prompt_accuracy_5
higher_is_better: true
- metric: 6_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_6
aggregation: !function utils_agieval.per_prompt_accuracy_6
higher_is_better: true
- metric: 7_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_7
aggregation: !function utils_agieval.per_prompt_accuracy_7
higher_is_better: true
- metric: 8_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_8
aggregation: !function utils_agieval.per_prompt_accuracy_8
higher_is_better: true
- metric: 9_accuracy
aggregation: !function utils_agieval.per_prompt_accuracy_9
aggregation: !function utils_agieval.per_prompt_accuracy_9
higher_is_better: true
- metric: consistency_rate
aggregation: !function utils_agieval.agi_eval_prompt_consistency_rate
aggregation: !function utils_agieval.agi_eval_prompt_consistency_rate
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
@@ -267,12 +267,12 @@ def normalize_answer_string(expr: str) -> str:
"\\\\textit",
]:
expr = expr.replace(surround_str, "")
pattern = f"^{surround_str}" + "\{(?P<text>.+?)\}$"
pattern = f"^{surround_str}" + r"\{(?P<text>.+?)\}$"
m = re.search(pattern, expr)
if m is not None:
expr = m.group("text")
expr = expr.replace("\!", "")
expr = expr.replace(r"\!", "")
expr = expr.replace("\\%", "%")
expr = expr.replace("\\$", "$")
expr = expr.replace("$", "")
@@ -305,7 +305,7 @@ def normalize_answer_string(expr: str) -> str:
"p.m.",
"PM",
]:
expr = re.sub(f"{unit}(es)?(s)? *(\^[0-9]+)?", "", expr)
expr = re.sub(rf"{unit}(es)?(s)? *(\^[0-9]+)?", "", expr)
if "day" in expr:
days = [
@@ -326,7 +326,7 @@ def normalize_answer_string(expr: str) -> str:
if not weekday_expressed:
expr = re.sub("day(s)?", "", expr)
expr = re.sub("\^ *\\\\circ", "", expr)
expr = re.sub("\\^ *\\\\circ", "", expr)
if len(expr) > 0 and expr[0] == "{" and expr[-1] == "}":
expr = expr[1:-1]
......
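For context on the escape-sequence changes in the hunk above, here is a minimal sketch (not part of the patch; the `\textit{...}` pattern and the target string are illustrative stand-ins) of why the raw-string form is preferred, and what the W605 rule added to the ruff config further down in this diff would flag:

```python
import re

# In a normal string literal, "\{" and "\!" are invalid escape sequences: Python keeps
# the backslash but newer interpreters emit a SyntaxWarning, and ruff's W605 rule
# flags them. Raw strings make the backslashes explicit.
pattern = r"^\\textit\{(?P<text>.+?)\}$"      # mirrors the \textit{...} case handled above
m = re.search(pattern, "\\textit{answer}")
print(m.group("text"))                        # -> answer

# The replace() fix is the same idea: r"\!" is unambiguously backslash + "!",
# while "\!" only works by accident because the invalid escape is left intact.
assert r"\!" == "\\!"
```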
@@ -18,7 +18,7 @@ dataset_name: algebra
output_type: generate_until
test_split: test
process_docs: !function utils_math.non_greedy_robustness_process_docs
doc_to_text: !function utils_math.math_robustness_doc_to_text
doc_to_text: !function utils_math.math_robustness_doc_to_text
doc_to_target: answer
generation_kwargs:
max_gen_toks: 1024
@@ -28,9 +28,7 @@ generation_kwargs:
process_results: !function utils_math.non_greedy_robustness_process_results
metric_list:
- metric: non_greedy_accuracy
aggregation: !function utils_math.non_greedy_accuracy
aggregation: !function utils_math.non_greedy_accuracy
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
@@ -18,7 +18,7 @@ process_docs: !function utils_math.prompt_robustness_process_docs
dataset_name: algebra
output_type: generate_until
test_split: test
doc_to_text: !function utils_math.math_robustness_doc_to_text
doc_to_text: !function utils_math.math_robustness_doc_to_text
process_results: !function utils_math.process_results
doc_to_target: answer
generation_kwargs:
@@ -28,39 +28,37 @@ generation_kwargs:
max_gen_toks: 1024
metric_list:
- metric: 0_accuracy
aggregation: !function utils_math.per_prompt_accuracy_0
aggregation: !function utils_math.per_prompt_accuracy_0
higher_is_better: true
- metric: 1_accuracy
aggregation: !function utils_math.per_prompt_accuracy_1
aggregation: !function utils_math.per_prompt_accuracy_1
higher_is_better: true
- metric: 2_accuracy
aggregation: !function utils_math.per_prompt_accuracy_2
aggregation: !function utils_math.per_prompt_accuracy_2
higher_is_better: true
- metric: 3_accuracy
aggregation: !function utils_math.per_prompt_accuracy_3
aggregation: !function utils_math.per_prompt_accuracy_3
higher_is_better: true
- metric: 4_accuracy
aggregation: !function utils_math.per_prompt_accuracy_4
aggregation: !function utils_math.per_prompt_accuracy_4
higher_is_better: true
- metric: 5_accuracy
aggregation: !function utils_math.per_prompt_accuracy_5
aggregation: !function utils_math.per_prompt_accuracy_5
higher_is_better: true
- metric: 6_accuracy
aggregation: !function utils_math.per_prompt_accuracy_6
aggregation: !function utils_math.per_prompt_accuracy_6
higher_is_better: true
- metric: 7_accuracy
aggregation: !function utils_math.per_prompt_accuracy_7
aggregation: !function utils_math.per_prompt_accuracy_7
higher_is_better: true
- metric: 8_accuracy
aggregation: !function utils_math.per_prompt_accuracy_8
aggregation: !function utils_math.per_prompt_accuracy_8
higher_is_better: true
- metric: 9_accuracy
aggregation: !function utils_math.per_prompt_accuracy_9
aggregation: !function utils_math.per_prompt_accuracy_9
higher_is_better: true
- metric: consistency_rate
aggregation: !function utils_math.math_prompt_consistency_rate
aggregation: !function utils_math.math_prompt_consistency_rate
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
@@ -30,9 +30,7 @@ generation_kwargs:
process_results: !function utils_mmlu_pro.non_greedy_robustness_process_results
metric_list:
- metric: non_greedy_macro_accuracy
aggregation: !function utils_mmlu_pro.non_greedy_macro_accuracy
aggregation: !function utils_mmlu_pro.non_greedy_macro_accuracy
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
@@ -29,39 +29,37 @@ generation_kwargs:
process_results: !function utils_mmlu_pro.option_order_robustness_process_results
metric_list:
- metric: per_option_macro_accuracy_A
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_a
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_a
higher_is_better: true
- metric: per_option_macro_accuracy_B
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_b
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_b
higher_is_better: true
- metric: per_option_macro_accuracy_C
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_c
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_c
higher_is_better: true
- metric: per_option_macro_accuracy_D
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_d
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_d
higher_is_better: true
- metric: per_option_macro_accuracy_E
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_e
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_e
higher_is_better: true
- metric: per_option_macro_accuracy_F
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_f
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_f
higher_is_better: true
- metric: per_option_macro_accuracy_G
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_g
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_g
higher_is_better: true
- metric: per_option_macro_accuracy_H
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_h
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_h
higher_is_better: true
- metric: per_option_macro_accuracy_I
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_i
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_i
higher_is_better: true
- metric: per_option_macro_accuracy_J
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_j
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_j
higher_is_better: true
- metric: options_consistency_rate
aggregation: !function utils_mmlu_pro.options_consistency_rate
aggregation: !function utils_mmlu_pro.options_consistency_rate
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
@@ -29,39 +29,37 @@ generation_kwargs:
process_results: !function utils_mmlu_pro.prompt_robustness_process_results
metric_list:
- metric: 0_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_0
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_0
higher_is_better: true
- metric: 1_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_1
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_1
higher_is_better: true
- metric: 2_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_2
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_2
higher_is_better: true
- metric: 3_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_3
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_3
higher_is_better: true
- metric: 4_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_4
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_4
higher_is_better: true
- metric: 5_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_5
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_5
higher_is_better: true
- metric: 6_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_6
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_6
higher_is_better: true
- metric: 7_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_7
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_7
higher_is_better: true
- metric: 8_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_8
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_8
higher_is_better: true
- metric: 9_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_9
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_9
higher_is_better: true
- metric: consistency_rate
aggregation: !function utils_mmlu_pro.mmlu_pro_prompt_consistency_rate
aggregation: !function utils_mmlu_pro.mmlu_pro_prompt_consistency_rate
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
@@ -23,5 +23,3 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation: false
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true
@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation: false
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true
@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation: false
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true
@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation: false
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true
@@ -16,5 +16,3 @@ metric_list:
- metric: bits_per_byte
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true
@@ -15,5 +15,3 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
@@ -8,9 +8,10 @@ import json
import logging
import os
import re
from collections.abc import Generator
from dataclasses import asdict, is_dataclass
from itertools import islice
from typing import Any, Callable, Generator, List, Optional, Tuple
from typing import Any, Callable, List, Optional, Tuple
import numpy as np
from jinja2 import BaseLoader, Environment, StrictUndefined
@@ -24,12 +25,28 @@ HIGHER_IS_BETTER_SYMBOLS = {
}
def wrap_text(string: str, width: int = 140, **kwargs) -> Optional[str]:
"""
Wraps the given string to the specified width.
"""
import textwrap
return textwrap.fill(
inspect.cleandoc(string),
width=width,
initial_indent="",
subsequent_indent=" " * 8,
break_long_words=False,
break_on_hyphens=False,
**kwargs,
)
def setup_logging(verbosity=logging.INFO):
# Configure the root logger
class CustomFormatter(logging.Formatter):
def format(self, record):
if record.name.startswith("lm_eval."):
record.name = record.name[len("lm_eval.") :]
record.name = record.name.removeprefix("lm_eval.")
return super().format(record)
formatter = CustomFormatter(
......
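Two of the changes in the utils hunk above lean on newer standard-library behaviour. A minimal sketch (the `chunked` helper is illustrative and not from the repository):

```python
# str.removeprefix (Python 3.9+) replaces the startswith-then-slice pattern and is a
# no-op when the prefix is absent, so no guard is needed.
assert "lm_eval.tasks.agieval".removeprefix("lm_eval.") == "tasks.agieval"
assert "other.logger".removeprefix("lm_eval.") == "other.logger"

# Generator is now imported from collections.abc; the typing.Generator alias has been
# deprecated in favour of the abc class since Python 3.9 (PEP 585).
from collections.abc import Generator


def chunked(seq: list, size: int) -> Generator[list, None, None]:
    """Yield successive size-sized chunks of seq (illustrative helper)."""
    for i in range(0, len(seq), size):
        yield seq[i : i + size]


assert list(chunked([1, 2, 3, 4, 5], 2)) == [[1, 2], [3, 4], [5]]
```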
@@ -21,7 +21,7 @@ license = { "text" = "MIT" }
dependencies = [
"accelerate>=0.26.0",
"evaluate",
"datasets>=2.16.0",
"datasets>=2.16.0,<4.0",
"evaluate>=0.4.0",
"jsonlines",
"numexpr",
@@ -69,6 +69,7 @@ ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"]
ipex = ["optimum"]
japanese_leaderboard = ["emoji==2.14.0", "neologdn==0.5.3", "fugashi[unidic-lite]", "rouge_score>=0.1.2"]
longbench=["jieba", "fuzzywuzzy", "rouge"]
libra=["pymorphy2"]
mamba = ["mamba_ssm", "causal-conv1d==1.0.2", "torch"]
math = ["sympy>=1.12", "antlr4-python3-runtime==4.11", "math_verify[antlr4_11_0]"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
@@ -89,6 +90,8 @@ tasks = [
"lm_eval[ifeval]",
"lm_eval[japanese_leaderboard]",
"lm_eval[longbench]",
"lm_eval[libra]",
"lm_eval[mamba]",
"lm_eval[math]",
"lm_eval[multilingual]",
"lm_eval[ruler]",
@@ -103,8 +106,8 @@ plugins.md029.allow_extended_start_values = true # ol-prefix
plugins.md034.enabled = false # no-bare-urls
[tool.ruff.lint]
select = ["ASYNC","B", "C4", "E", "F", "I", "LOG","PIE", "PTH","SIM", "UP", "PERF", "ISC001", "ISC002", "ICN001", "C901","FURB", "RUF"]
ignore = ["E501", "E111", "E114", "E117", "E501", "PERF203", "B011"]
select = ["ASYNC","B", "C4", "E", "F", "I", "LOG","PIE", "PTH","SIM", "UP", "PERF", "ISC001", "ISC002", "ICN001", "C901","FURB", "RUF", "W605"]
ignore = ["E501", "E111", "E114", "E117", "E501", "PERF203", "B011", "RUF005"]
[tool.ruff.lint.isort]
lines-after-imports = 2
......
@@ -7,6 +7,11 @@ from lm_eval.api.task import ConfigurableTask
from tests.test_tasks import BaseTasks, task_class
@pytest.fixture()
def limit() -> int:
return 10
@pytest.mark.parametrize(
"task_class",
task_class(
......
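For readers unfamiliar with pytest fixtures, a hypothetical sketch (not from the repository) of how a module-level `limit` fixture like the one added above is consumed: pytest injects it by parameter name, so any test in the module can cap how many documents it processes.

```python
import pytest


@pytest.fixture()
def limit() -> int:
    return 10


def test_respects_limit(limit: int) -> None:
    docs = list(range(100))          # stand-in for task documents
    assert len(docs[:limit]) == limit
```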