Commit b58e5556 authored by Baber
Browse files

Merge branch 'main' into tasklist

# Conflicts:
#	pyproject.toml
parents 6e1866f5 4f8195f1
......@@ -12,5 +12,3 @@ metric_list:
higher_is_better: true
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true
......@@ -32,5 +32,3 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -43,5 +43,3 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -61,5 +61,3 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -267,12 +267,12 @@ def normalize_answer_string(expr: str) -> str:
"\\\\textit",
]:
expr = expr.replace(surround_str, "")
pattern = f"^{surround_str}" + "\{(?P<text>.+?)\}$"
pattern = f"^{surround_str}" + r"\{(?P<text>.+?)\}$"
m = re.search(pattern, expr)
if m is not None:
expr = m.group("text")
expr = expr.replace("\!", "")
expr = expr.replace(r"\!", "")
expr = expr.replace("\\%", "%")
expr = expr.replace("\\$", "$")
expr = expr.replace("$", "")
......@@ -305,7 +305,7 @@ def normalize_answer_string(expr: str) -> str:
"p.m.",
"PM",
]:
expr = re.sub(f"{unit}(es)?(s)? *(\^[0-9]+)?", "", expr)
expr = re.sub(rf"{unit}(es)?(s)? *(\^[0-9]+)?", "", expr)
if "day" in expr:
days = [
......@@ -326,7 +326,7 @@ def normalize_answer_string(expr: str) -> str:
if not weekday_expressed:
expr = re.sub("day(s)?", "", expr)
expr = re.sub("\^ *\\\\circ", "", expr)
expr = re.sub("\\^ *\\\\circ", "", expr)
if len(expr) > 0 and expr[0] == "{" and expr[-1] == "}":
expr = expr[1:-1]
......
......@@ -32,5 +32,3 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -62,5 +62,3 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -34,5 +34,3 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -63,5 +63,3 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -63,5 +63,3 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -23,5 +23,3 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation: false
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true
......@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation: false
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true
......@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation: false
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true
......@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation: false
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true
......@@ -16,5 +16,3 @@ metric_list:
- metric: bits_per_byte
metadata:
version: 2.0
dataset_kwargs:
trust_remote_code: true
......@@ -15,5 +15,3 @@ metric_list:
higher_is_better: true
metadata:
version: 1.0
dataset_kwargs:
trust_remote_code: true
......@@ -8,9 +8,10 @@ import json
import logging
import os
import re
from collections.abc import Generator
from dataclasses import asdict, is_dataclass
from itertools import islice
from typing import Any, Callable, Generator, List, Optional, Tuple
from typing import Any, Callable, List, Optional, Tuple
import numpy as np
from jinja2 import BaseLoader, Environment, StrictUndefined
......@@ -24,12 +25,28 @@ HIGHER_IS_BETTER_SYMBOLS = {
}
def wrap_text(string: str, width: int = 140, **kwargs) -> Optional[str]:
    """
    Wraps the given string to the specified width.

    The input is first cleaned with ``inspect.cleandoc`` (leading blank
    lines and common indentation are removed) and then re-flowed with
    ``textwrap.fill``.

    Args:
        string: The text to wrap.
        width: Maximum length of each output line.
        **kwargs: Extra keyword arguments forwarded to ``textwrap.fill``.
            They take precedence over the defaults used here
            (``initial_indent=""``, ``subsequent_indent`` of 8 spaces,
            ``break_long_words=False``, ``break_on_hyphens=False``).

    Returns:
        The wrapped text as a single newline-joined string.
    """
    import textwrap

    # Merge the defaults with the caller's options instead of passing both as
    # keywords: a caller supplying e.g. ``subsequent_indent`` would otherwise
    # hit "TypeError: got multiple values for keyword argument".
    fill_options = {
        "initial_indent": "",
        "subsequent_indent": " " * 8,
        "break_long_words": False,
        "break_on_hyphens": False,
        **kwargs,
    }
    return textwrap.fill(inspect.cleandoc(string), width=width, **fill_options)
def setup_logging(verbosity=logging.INFO):
# Configure the root logger
class CustomFormatter(logging.Formatter):
def format(self, record):
if record.name.startswith("lm_eval."):
record.name = record.name[len("lm_eval.") :]
record.name = record.name.removeprefix("lm_eval.")
return super().format(record)
formatter = CustomFormatter(
......
......@@ -21,7 +21,7 @@ license = { "text" = "MIT" }
dependencies = [
"accelerate>=0.26.0",
"evaluate",
"datasets>=2.16.0",
"datasets>=2.16.0,<4.0",
"evaluate>=0.4.0",
"jsonlines",
"numexpr",
......@@ -69,6 +69,7 @@ ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"]
ipex = ["optimum"]
japanese_leaderboard = ["emoji==2.14.0", "neologdn==0.5.3", "fugashi[unidic-lite]", "rouge_score>=0.1.2"]
longbench=["jieba", "fuzzywuzzy", "rouge"]
libra=["pymorphy2"]
mamba = ["mamba_ssm", "causal-conv1d==1.0.2", "torch"]
math = ["sympy>=1.12", "antlr4-python3-runtime==4.11", "math_verify[antlr4_11_0]"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
......@@ -89,6 +90,8 @@ tasks = [
"lm_eval[ifeval]",
"lm_eval[japanese_leaderboard]",
"lm_eval[longbench]",
"lm_eval[libra]",
"lm_eval[mamba]",
"lm_eval[math]",
"lm_eval[multilingual]",
"lm_eval[ruler]",
......@@ -103,8 +106,8 @@ plugins.md029.allow_extended_start_values = true # ol-prefix
plugins.md034.enabled = false # no-bare-urls
[tool.ruff.lint]
select = ["ASYNC","B", "C4", "E", "F", "I", "LOG","PIE", "PTH","SIM", "UP", "PERF", "ISC001", "ISC002", "ICN001", "C901","FURB", "RUF"]
ignore = ["E501", "E111", "E114", "E117", "E501", "PERF203", "B011"]
select = ["ASYNC","B", "C4", "E", "F", "I", "LOG","PIE", "PTH","SIM", "UP", "PERF", "ISC001", "ISC002", "ICN001", "C901","FURB", "RUF", "W605"]
ignore = ["E501", "E111", "E114", "E117", "E501", "PERF203", "B011", "RUF005"]
[tool.ruff.lint.isort]
lines-after-imports = 2
......
......@@ -7,6 +7,11 @@ from lm_eval.api.task import ConfigurableTask
from tests.test_tasks import BaseTasks, task_class
@pytest.fixture()
def limit() -> int:
    """Fixture that supplies the constant integer 10 under the name ``limit``."""
    # NOTE(review): presumably the per-task document cap for the tests that
    # request this fixture — confirm against the test bodies.
    fixed_limit = 10
    return fixed_limit
@pytest.mark.parametrize(
"task_class",
task_class(
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment