gaoqiong / lm-evaluation-harness / Commits

Commit b2e1bfc6
Authored Apr 22, 2025 by artemorloff

Merge remote-tracking branch 'origin' into feature/eval_from_config

Parents: b5d16d61, e4a7b69f
Changes: 48

Showing 8 changed files with 39 additions and 11 deletions (+39, -11)
lm_eval/tasks/longbench/trec_e.yaml                 +3  -2
lm_eval/tasks/longbench/triviaqa.yaml               +3  -2
lm_eval/tasks/longbench/triviaqa_e.yaml             +3  -2
lm_eval/tasks/longbench/vcsum.yaml                  +20 -0
lm_eval/tasks/mmlu/default/_default_template_yaml   +1  -1
pyproject.toml                                      +7  -2
tests/test_task_manager.py                          +1  -1
tests/test_tasks.py                                 +1  -1
lm_eval/tasks/longbench/trec_e.yaml
@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench
 test_split: test
 dataset_name: trec_e
 doc_to_text: 'Please determine the type of the question below. Here are some examples of questions.\n\n{{context}}\n{{input}}'
-doc_to_target: '{{answers}}'
+doc_to_target: '{{answers[0]}}'
 generation_kwargs:
   max_gen_toks: 64
   temperature: 1
   do_sample: True
   until: ['\n']
 metric_list:
   - metric: !function metrics.classification_score
     aggregation: mean
     higher_is_better: True
 metadata:
-  version: 1.0
+  version: 2.0
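The doc_to_target fields above are Jinja-style templates, so the change from {{answers}} to {{answers[0]}} makes the gold target the first answer string rather than the stringified answer list. A minimal sketch of the difference, rendering the templates directly with the jinja2 package against a made-up document (illustration only, not the harness's own rendering path):

# Illustration only: render the old and new doc_to_target templates against a
# fake LongBench-style document whose "answers" field is a list of strings.
from jinja2 import Template

doc = {"answers": ["location"]}

old_target = Template("{{answers}}").render(**doc)     # "['location']" -- whole list, stringified
new_target = Template("{{answers[0]}}").render(**doc)  # "location"    -- first gold answer only
print(old_target, new_target)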
lm_eval/tasks/longbench/triviaqa.yaml
@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench
 test_split: test
 dataset_name: triviaqa
 doc_to_text: 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{{context}}\n\n{{input}}'
-doc_to_target: '{{answers}}'
+doc_to_target: '{{answers[0]}}'
 generation_kwargs:
   max_gen_toks: 32
   temperature: 1
   do_sample: True
   until: ['\n']
 metric_list:
   - metric: !function metrics.qa_f1_score
     aggregation: mean
     higher_is_better: True
 metadata:
-  version: 1.0
+  version: 2.0
lm_eval/tasks/longbench/triviaqa_e.yaml
@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench
 test_split: test
 dataset_name: triviaqa_e
 doc_to_text: 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{{context}}\n\n{{input}}'
-doc_to_target: '{{answers}}'
+doc_to_target: '{{answers[0]}}'
 generation_kwargs:
   max_gen_toks: 32
   temperature: 1
   do_sample: True
   until: ['\n']
 metric_list:
   - metric: !function metrics.qa_f1_score
     aggregation: mean
     higher_is_better: True
 metadata:
-  version: 1.0
+  version: 2.0
lm_eval/tasks/longbench/vcsum.yaml (new file, mode 0 → 100644)
tag:
  - longbench
task: longbench_vcsum
dataset_path: THUDM/LongBench
test_split: test
dataset_name: vcsum
doc_to_text: '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{{context}}\n\n会议总结:'
doc_to_target: '{{answers[0]}}'
generation_kwargs:
  max_gen_toks: 512
  temperature: 1
  do_sample: True
  until: []
metric_list:
  - metric: !function metrics.rouge_zh_score
    aggregation: mean
    higher_is_better: True
metadata:
  version: 2.0
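vcsum is a Chinese meeting-summarization task: the prompt asks the model to read a meeting transcript ({{context}}) and write a summary of its content, and generations are scored with metrics.rouge_zh_score. As a rough, hypothetical sketch of how a Chinese ROUGE score can be computed with the jieba and rouge packages listed in the longbench extra of pyproject.toml (not the harness's actual implementation):

# Hypothetical sketch of a Chinese ROUGE-L F1 metric: segment with jieba so the
# word-oriented `rouge` package can score space-separated tokens.
import jieba
from rouge import Rouge

def rouge_zh_f1(prediction: str, reference: str) -> float:
    pred = " ".join(jieba.cut(prediction))
    ref = " ".join(jieba.cut(reference))
    if not pred.strip() or not ref.strip():
        return 0.0
    scores = Rouge().get_scores(pred, ref)
    return scores[0]["rouge-l"]["f"]

print(rouge_zh_f1("会议讨论了明年的预算", "会议主要讨论了明年的预算安排"))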
lm_eval/tasks/mmlu/default/_default_template_yaml
-dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split
+dataset_path: cais/mmlu
 test_split: test
 fewshot_split: dev
 fewshot_config:
 ...
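For reference, the cais/mmlu path that the template now points at can be spot-checked with the datasets library; this is a hypothetical sanity check, not part of the commit:

# Hypothetical spot check: confirm cais/mmlu resolves and exposes the splits the
# template uses (test for evaluation, dev for the few-shot examples).
from datasets import load_dataset

ds = load_dataset("cais/mmlu", "abstract_algebra")
print(ds["test"].num_rows, ds["dev"].num_rows)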
pyproject.toml
@@ -60,7 +60,7 @@ Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
 api = ["requests", "aiohttp", "tenacity", "tqdm", "tiktoken"]
 audiolm_qwen = ["librosa", "soundfile"]
 deepsparse = ["deepsparse-nightly[llm]>=1.8.0.20240404"]
-dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy", "unitxt"]
+dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy", "unitxt", "requests", "aiohttp", "tenacity", "tqdm", "tiktoken", "sentencepiece"]
 gptq = ["auto-gptq[triton]>=0.6.0"]
 gptqmodel = ["gptqmodel>=1.0.9"]
 hf_transfer = ["hf_transfer"]
@@ -69,7 +69,7 @@ ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"]
 ipex = ["optimum"]
 japanese_leaderboard = ["emoji==2.14.0", "neologdn==0.5.3", "fugashi[unidic-lite]", "rouge_score>=0.1.2"]
 longbench = ["jieba", "fuzzywuzzy", "rouge"]
-mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
+mamba = ["mamba_ssm", "causal-conv1d==1.0.2", "torch"]
 math = ["sympy>=1.12", "antlr4-python3-runtime==4.11", "math_verify[antlr4_11_0]"]
 multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
 neuronx = ["optimum[neuronx]"]
@@ -132,3 +132,8 @@ known-first-party = ["lm_eval"]
 [tool.ruff.lint.extend-per-file-ignores]
 "__init__.py" = ["F401","F402","F403"]
 "utils.py" = ["F401"]
+
+[dependency-groups]
+dev = ["api", "dev", "sentencepiece"]
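The longbench extra pairs with the LongBench task configs above; a hypothetical smoke test that its optional dependencies are importable after something like `pip install -e ".[longbench]"`:

# Hypothetical check that the longbench extra's modules import cleanly.
import importlib

for module in ("jieba", "fuzzywuzzy", "rouge"):
    importlib.import_module(module)
print("longbench optional dependencies OK")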
tests/test_task_manager.py
@@ -18,7 +18,7 @@ def custom_task_tag():
 @pytest.fixture(scope="module")
 def task_yaml(pytestconfig, custom_task_name, custom_task_tag):
-    yield f"""include: {pytestconfig.rootpath}/lm_eval/tasks/hellaswag/hellaswag.yaml
+    yield f"""include: {pytestconfig.rootpath}/lm_eval/tasks/arc/arc_easy.yaml
 task: {custom_task_name}
 class: !function {custom_task_name}.MockPythonTask
 tag:
 ...
tests/test_tasks.py
@@ -14,7 +14,7 @@ from .utils import new_tasks
 datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
 os.environ["TOKENIZERS_PARALLELISM"] = "false"

 # Default Task
-TASKS = ["include_base_44_dutch_few_shot_en_applied_science"]
+TASKS = ["arc_easy"]


 def get_new_tasks_else_default():
 ...
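With the default test task switched from the include_base_44_dutch_few_shot_en_applied_science fixture to arc_easy, the suite exercises an ordinary registry lookup. A rough sketch of resolving that task name outside pytest, assuming the TaskManager/get_task_dict interface (details may differ between harness versions):

# Hypothetical standalone check that "arc_easy" resolves through the task registry.
from lm_eval.tasks import TaskManager, get_task_dict

task_manager = TaskManager()
task_dict = get_task_dict(["arc_easy"], task_manager)
print(list(task_dict.keys()))  # expected: ['arc_easy']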