Commit b2e1bfc6 authored by artemorloff's avatar artemorloff
Browse files

Merge remote-tracking branch 'origin' into feature/eval_from_config

parents b5d16d61 e4a7b69f
......@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench
test_split: test
dataset_name: trec_e
doc_to_text: 'Please determine the type of the question below. Here are some examples of questions.\n\n{{context}}\n{{input}}'
doc_to_target: '{{answers}}'
doc_to_target: '{{answers[0]}}'
generation_kwargs:
max_gen_toks: 64
temperature: 1
do_sample: True
until: ['\n']
metric_list:
- metric: !function metrics.classification_score
aggregation: mean
higher_is_better: True
metadata:
version: 1.0
version: 2.0
......@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench
test_split: test
dataset_name: triviaqa
doc_to_text: 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{{context}}\n\n{{input}}'
doc_to_target: '{{answers}}'
doc_to_target: '{{answers[0]}}'
generation_kwargs:
max_gen_toks: 32
temperature: 1
do_sample: True
until: ['\n']
metric_list:
- metric: !function metrics.qa_f1_score
aggregation: mean
higher_is_better: True
metadata:
version: 1.0
version: 2.0
......@@ -6,14 +6,15 @@ dataset_path: THUDM/LongBench
test_split: test
dataset_name: triviaqa_e
doc_to_text: 'Answer the question based on the given passage. Only give me the answer and do not output any other words. The following are some examples.\n\n{{context}}\n\n{{input}}'
doc_to_target: '{{answers}}'
doc_to_target: '{{answers[0]}}'
generation_kwargs:
max_gen_toks: 32
temperature: 1
do_sample: True
until: ['\n']
metric_list:
- metric: !function metrics.qa_f1_score
aggregation: mean
higher_is_better: True
metadata:
version: 1.0
version: 2.0
tag:
- longbench
task: longbench_vcsum
dataset_path: THUDM/LongBench
test_split: test
dataset_name: vcsum
doc_to_text: '下面有一段会议记录,请你阅读后,写一段总结,总结会议的内容。\n会议记录:\n{{context}}\n\n会议总结:'
doc_to_target: '{{answers[0]}}'
generation_kwargs:
max_gen_toks: 512
temperature: 1
do_sample: True
until: []
metric_list:
- metric: !function metrics.rouge_zh_score
aggregation: mean
higher_is_better: True
metadata:
version: 2.0
dataset_path: hails/mmlu_no_train # a copy of `cais/mmlu` with no auxiliary_train split
dataset_path: cais/mmlu
test_split: test
fewshot_split: dev
fewshot_config:
......
......@@ -60,7 +60,7 @@ Repository = "https://github.com/EleutherAI/lm-evaluation-harness"
api = ["requests", "aiohttp", "tenacity", "tqdm", "tiktoken"]
audiolm_qwen = ["librosa", "soundfile"]
deepsparse = ["deepsparse-nightly[llm]>=1.8.0.20240404"]
dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy", "unitxt"]
dev = ["pytest", "pytest-cov", "pytest-xdist", "pre-commit", "mypy", "unitxt", "requests", "aiohttp", "tenacity", "tqdm", "tiktoken", "sentencepiece"]
gptq = ["auto-gptq[triton]>=0.6.0"]
gptqmodel = ["gptqmodel>=1.0.9"]
hf_transfer = ["hf_transfer"]
......@@ -69,7 +69,7 @@ ifeval = ["langdetect", "immutabledict", "nltk>=3.9.1"]
ipex = ["optimum"]
japanese_leaderboard = ["emoji==2.14.0", "neologdn==0.5.3", "fugashi[unidic-lite]", "rouge_score>=0.1.2"]
longbench=["jieba", "fuzzywuzzy", "rouge"]
mamba = ["mamba_ssm", "causal-conv1d==1.0.2"]
mamba = ["mamba_ssm", "causal-conv1d==1.0.2", "torch"]
math = ["sympy>=1.12", "antlr4-python3-runtime==4.11", "math_verify[antlr4_11_0]"]
multilingual = ["nagisa>=0.2.7", "jieba>=0.42.1", "pycountry"]
neuronx = ["optimum[neuronx]"]
......@@ -132,3 +132,8 @@ known-first-party = ["lm_eval"]
[tool.ruff.lint.extend-per-file-ignores]
"__init__.py" = ["F401","F402","F403"]
"utils.py" = ["F401"]
[dependency-groups]
dev = [
"api","dev","sentencepiece"
]
......@@ -18,7 +18,7 @@ def custom_task_tag():
@pytest.fixture(scope="module")
def task_yaml(pytestconfig, custom_task_name, custom_task_tag):
yield f"""include: {pytestconfig.rootpath}/lm_eval/tasks/hellaswag/hellaswag.yaml
yield f"""include: {pytestconfig.rootpath}/lm_eval/tasks/arc/arc_easy.yaml
task: {custom_task_name}
class: !function {custom_task_name}.MockPythonTask
tag:
......
......@@ -14,7 +14,7 @@ from .utils import new_tasks
datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Default Task
TASKS = ["include_base_44_dutch_few_shot_en_applied_science"]
TASKS = ["arc_easy"]
def get_new_tasks_else_default():
......
Markdown is supported
Attach a file by dragging & dropping, or 0% uploaded — try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment