Commit e6b798f9 authored by Baber's avatar Baber
Browse files

Merge branch 'main' into metrics

# Conflicts:
#	.pre-commit-config.yaml
#	lm_eval/api/task.py
#	lm_eval/models/huggingface.py
#	lm_eval/models/vllm_causallms.py
#	pyproject.toml
parents 14a29ade 4f8195f1
...@@ -18,7 +18,7 @@ process_docs: !function utils_math.prompt_robustness_process_docs ...@@ -18,7 +18,7 @@ process_docs: !function utils_math.prompt_robustness_process_docs
dataset_name: algebra dataset_name: algebra
output_type: generate_until output_type: generate_until
test_split: test test_split: test
doc_to_text: !function utils_math.math_robustness_doc_to_text doc_to_text: !function utils_math.math_robustness_doc_to_text
process_results: !function utils_math.process_results process_results: !function utils_math.process_results
doc_to_target: answer doc_to_target: answer
generation_kwargs: generation_kwargs:
...@@ -28,39 +28,37 @@ generation_kwargs: ...@@ -28,39 +28,37 @@ generation_kwargs:
max_gen_toks: 1024 max_gen_toks: 1024
metric_list: metric_list:
- metric: 0_accuracy - metric: 0_accuracy
aggregation: !function utils_math.per_prompt_accuracy_0 aggregation: !function utils_math.per_prompt_accuracy_0
higher_is_better: true higher_is_better: true
- metric: 1_accuracy - metric: 1_accuracy
aggregation: !function utils_math.per_prompt_accuracy_1 aggregation: !function utils_math.per_prompt_accuracy_1
higher_is_better: true higher_is_better: true
- metric: 2_accuracy - metric: 2_accuracy
aggregation: !function utils_math.per_prompt_accuracy_2 aggregation: !function utils_math.per_prompt_accuracy_2
higher_is_better: true higher_is_better: true
- metric: 3_accuracy - metric: 3_accuracy
aggregation: !function utils_math.per_prompt_accuracy_3 aggregation: !function utils_math.per_prompt_accuracy_3
higher_is_better: true higher_is_better: true
- metric: 4_accuracy - metric: 4_accuracy
aggregation: !function utils_math.per_prompt_accuracy_4 aggregation: !function utils_math.per_prompt_accuracy_4
higher_is_better: true higher_is_better: true
- metric: 5_accuracy - metric: 5_accuracy
aggregation: !function utils_math.per_prompt_accuracy_5 aggregation: !function utils_math.per_prompt_accuracy_5
higher_is_better: true higher_is_better: true
- metric: 6_accuracy - metric: 6_accuracy
aggregation: !function utils_math.per_prompt_accuracy_6 aggregation: !function utils_math.per_prompt_accuracy_6
higher_is_better: true higher_is_better: true
- metric: 7_accuracy - metric: 7_accuracy
aggregation: !function utils_math.per_prompt_accuracy_7 aggregation: !function utils_math.per_prompt_accuracy_7
higher_is_better: true higher_is_better: true
- metric: 8_accuracy - metric: 8_accuracy
aggregation: !function utils_math.per_prompt_accuracy_8 aggregation: !function utils_math.per_prompt_accuracy_8
higher_is_better: true higher_is_better: true
- metric: 9_accuracy - metric: 9_accuracy
aggregation: !function utils_math.per_prompt_accuracy_9 aggregation: !function utils_math.per_prompt_accuracy_9
higher_is_better: true higher_is_better: true
- metric: consistency_rate - metric: consistency_rate
aggregation: !function utils_math.math_prompt_consistency_rate aggregation: !function utils_math.math_prompt_consistency_rate
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
...@@ -30,9 +30,7 @@ generation_kwargs: ...@@ -30,9 +30,7 @@ generation_kwargs:
process_results: !function utils_mmlu_pro.non_greedy_robustness_process_results process_results: !function utils_mmlu_pro.non_greedy_robustness_process_results
metric_list: metric_list:
- metric: non_greedy_macro_accuracy - metric: non_greedy_macro_accuracy
aggregation: !function utils_mmlu_pro.non_greedy_macro_accuracy aggregation: !function utils_mmlu_pro.non_greedy_macro_accuracy
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
...@@ -29,39 +29,37 @@ generation_kwargs: ...@@ -29,39 +29,37 @@ generation_kwargs:
process_results: !function utils_mmlu_pro.option_order_robustness_process_results process_results: !function utils_mmlu_pro.option_order_robustness_process_results
metric_list: metric_list:
- metric: per_option_macro_accuracy_A - metric: per_option_macro_accuracy_A
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_a aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_a
higher_is_better: true higher_is_better: true
- metric: per_option_macro_accuracy_B - metric: per_option_macro_accuracy_B
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_b aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_b
higher_is_better: true higher_is_better: true
- metric: per_option_macro_accuracy_C - metric: per_option_macro_accuracy_C
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_c aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_c
higher_is_better: true higher_is_better: true
- metric: per_option_macro_accuracy_D - metric: per_option_macro_accuracy_D
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_d aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_d
higher_is_better: true higher_is_better: true
- metric: per_option_macro_accuracy_E - metric: per_option_macro_accuracy_E
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_e aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_e
higher_is_better: true higher_is_better: true
- metric: per_option_macro_accuracy_F - metric: per_option_macro_accuracy_F
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_f aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_f
higher_is_better: true higher_is_better: true
- metric: per_option_macro_accuracy_G - metric: per_option_macro_accuracy_G
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_g aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_g
higher_is_better: true higher_is_better: true
- metric: per_option_macro_accuracy_H - metric: per_option_macro_accuracy_H
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_h aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_h
higher_is_better: true higher_is_better: true
- metric: per_option_macro_accuracy_I - metric: per_option_macro_accuracy_I
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_i aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_i
higher_is_better: true higher_is_better: true
- metric: per_option_macro_accuracy_J - metric: per_option_macro_accuracy_J
aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_j aggregation: !function utils_mmlu_pro.per_option_macro_accuracy_j
higher_is_better: true higher_is_better: true
- metric: options_consistency_rate - metric: options_consistency_rate
aggregation: !function utils_mmlu_pro.options_consistency_rate aggregation: !function utils_mmlu_pro.options_consistency_rate
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
...@@ -29,39 +29,37 @@ generation_kwargs: ...@@ -29,39 +29,37 @@ generation_kwargs:
process_results: !function utils_mmlu_pro.prompt_robustness_process_results process_results: !function utils_mmlu_pro.prompt_robustness_process_results
metric_list: metric_list:
- metric: 0_macro_accuracy - metric: 0_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_0 aggregation: !function utils_mmlu_pro.per_prompt_accuracy_0
higher_is_better: true higher_is_better: true
- metric: 1_macro_accuracy - metric: 1_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_1 aggregation: !function utils_mmlu_pro.per_prompt_accuracy_1
higher_is_better: true higher_is_better: true
- metric: 2_macro_accuracy - metric: 2_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_2 aggregation: !function utils_mmlu_pro.per_prompt_accuracy_2
higher_is_better: true higher_is_better: true
- metric: 3_macro_accuracy - metric: 3_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_3 aggregation: !function utils_mmlu_pro.per_prompt_accuracy_3
higher_is_better: true higher_is_better: true
- metric: 4_macro_accuracy - metric: 4_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_4 aggregation: !function utils_mmlu_pro.per_prompt_accuracy_4
higher_is_better: true higher_is_better: true
- metric: 5_macro_accuracy - metric: 5_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_5 aggregation: !function utils_mmlu_pro.per_prompt_accuracy_5
higher_is_better: true higher_is_better: true
- metric: 6_macro_accuracy - metric: 6_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_6 aggregation: !function utils_mmlu_pro.per_prompt_accuracy_6
higher_is_better: true higher_is_better: true
- metric: 7_macro_accuracy - metric: 7_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_7 aggregation: !function utils_mmlu_pro.per_prompt_accuracy_7
higher_is_better: true higher_is_better: true
- metric: 8_macro_accuracy - metric: 8_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_8 aggregation: !function utils_mmlu_pro.per_prompt_accuracy_8
higher_is_better: true higher_is_better: true
- metric: 9_macro_accuracy - metric: 9_macro_accuracy
aggregation: !function utils_mmlu_pro.per_prompt_accuracy_9 aggregation: !function utils_mmlu_pro.per_prompt_accuracy_9
higher_is_better: true higher_is_better: true
- metric: consistency_rate - metric: consistency_rate
aggregation: !function utils_mmlu_pro.mmlu_pro_prompt_consistency_rate aggregation: !function utils_mmlu_pro.mmlu_pro_prompt_consistency_rate
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
...@@ -23,5 +23,3 @@ metric_list: ...@@ -23,5 +23,3 @@ metric_list:
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
...@@ -18,5 +18,3 @@ metric_list: ...@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation: false ignore_punctuation: false
metadata: metadata:
version: 2.0 version: 2.0
dataset_kwargs:
trust_remote_code: true
...@@ -18,5 +18,3 @@ metric_list: ...@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation: false ignore_punctuation: false
metadata: metadata:
version: 2.0 version: 2.0
dataset_kwargs:
trust_remote_code: true
...@@ -18,5 +18,3 @@ metric_list: ...@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation: false ignore_punctuation: false
metadata: metadata:
version: 2.0 version: 2.0
dataset_kwargs:
trust_remote_code: true
...@@ -18,5 +18,3 @@ metric_list: ...@@ -18,5 +18,3 @@ metric_list:
ignore_punctuation: false ignore_punctuation: false
metadata: metadata:
version: 2.0 version: 2.0
dataset_kwargs:
trust_remote_code: true
...@@ -16,5 +16,3 @@ metric_list: ...@@ -16,5 +16,3 @@ metric_list:
- metric: bits_per_byte - metric: bits_per_byte
metadata: metadata:
version: 2.0 version: 2.0
dataset_kwargs:
trust_remote_code: true
...@@ -15,5 +15,3 @@ metric_list: ...@@ -15,5 +15,3 @@ metric_list:
higher_is_better: true higher_is_better: true
metadata: metadata:
version: 1.0 version: 1.0
dataset_kwargs:
trust_remote_code: true
...@@ -29,12 +29,28 @@ HIGHER_IS_BETTER_SYMBOLS = { ...@@ -29,12 +29,28 @@ HIGHER_IS_BETTER_SYMBOLS = {
} }
def wrap_text(string: str, width: int = 140, **kwargs) -> str:
    """Clean up and re-flow ``string`` so no line exceeds ``width`` columns.

    The text is first passed through ``inspect.cleandoc`` (which removes
    common leading indentation and surrounding blank lines) and then wrapped
    with ``textwrap.fill``.

    Args:
        string: The text to wrap.
        width: Maximum line width handed to ``textwrap.fill``.
        **kwargs: Additional ``textwrap.fill`` options. These take precedence
            over the defaults used here (no initial indent, an 8-space
            continuation indent, and no breaking of long or hyphenated words).

    Returns:
        The wrapped text as a single newline-joined string.
    """
    import inspect
    import textwrap

    options = {
        "initial_indent": "",
        "subsequent_indent": " " * 8,
        "break_long_words": False,
        "break_on_hyphens": False,
    }
    # Merge rather than pass defaults and **kwargs side by side: supplying the
    # same keyword twice in one call raises TypeError, which would make the
    # defaults impossible to override.
    options.update(kwargs)
    return textwrap.fill(inspect.cleandoc(string), width=width, **options)
def setup_logging(verbosity=logging.INFO): def setup_logging(verbosity=logging.INFO):
# Configure the root logger # Configure the root logger
class CustomFormatter(logging.Formatter): class CustomFormatter(logging.Formatter):
def format(self, record): def format(self, record):
if record.name.startswith("lm_eval."): record.name = record.name.removeprefix("lm_eval.")
record.name = record.name[len("lm_eval.") :]
return super().format(record) return super().format(record)
formatter = CustomFormatter( formatter = CustomFormatter(
......
...@@ -14,7 +14,7 @@ classifiers = [ ...@@ -14,7 +14,7 @@ classifiers = [
] ]
dependencies = [ dependencies = [
"accelerate>=0.26.0", "accelerate>=0.26.0",
"datasets>=2.16.0", "datasets>=2.16.0,<4.0",
"evaluate>=0.4.0", "evaluate>=0.4.0",
"peft>=0.2.0", "peft>=0.2.0",
"pytablewriter", "pytablewriter",
...@@ -55,6 +55,7 @@ ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"] ...@@ -55,6 +55,7 @@ ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"]
ipex = ["optimum"] ipex = ["optimum"]
japanese_leaderboard = ["emoji==2.14.0", "neologdn==0.5.3", "fugashi[unidic-lite]", "rouge_score>=0.1.2"] japanese_leaderboard = ["emoji==2.14.0", "neologdn==0.5.3", "fugashi[unidic-lite]", "rouge_score>=0.1.2"]
longbench = ["jieba", "fuzzywuzzy", "rouge"] longbench = ["jieba", "fuzzywuzzy", "rouge"]
libra=["pymorphy2"]
mamba = ["mamba_ssm", "causal-conv1d==1.0.2", "torch"] mamba = ["mamba_ssm", "causal-conv1d==1.0.2", "torch"]
math = ["sympy>=1.12", "antlr4-python3-runtime==4.11", "math_verify[antlr4_11_0]"] math = ["sympy>=1.12", "antlr4-python3-runtime==4.11", "math_verify[antlr4_11_0]"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"] multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
...@@ -70,7 +71,9 @@ tasks = [ ...@@ -70,7 +71,9 @@ tasks = [
"lm_eval[ifeval]", "lm_eval[ifeval]",
"lm_eval[japanese_leaderboard]", "lm_eval[japanese_leaderboard]",
"lm_eval[longbench]", "lm_eval[longbench]",
"lm_eval[math]", "lm_eval[libra]",
"lm_eval[mamba]",
"lm_eval[math]",
"lm_eval[multilingual]", "lm_eval[multilingual]",
"lm_eval[ruler]" "lm_eval[ruler]"
] ]
...@@ -98,7 +101,7 @@ plugins.md034.enabled = false # no-bare-urls ...@@ -98,7 +101,7 @@ plugins.md034.enabled = false # no-bare-urls
[tool.ruff] [tool.ruff]
target-version = "py39" target-version = "py39"
lint.extend-select = ["I", "UP", "E", "C419", "F", "B", "SIM", "RUF034", "W605", "FURB"] lint.extend-select = ["I", "UP", "E", "C419", "F", "B", "SIM", "RUF034", "W605", "FURB", "W605"]
lint.fixable = ["I001", "F401", "UP"] lint.fixable = ["I001", "F401", "UP"]
lint.ignore = ["E402", "E731", "E501", "E111", "E114", "E117", "E741"] lint.ignore = ["E402", "E731", "E501", "E111", "E114", "E117", "E741"]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment