Merge branch 'main' of https://github.com/EleutherAI/lm-evaluation-harness into t5v2-alt-plus

08218829 · lintangsutawika · 51afaca2 · a97fde23 · 08218829 · 08218829
Commit 08218829 authored Mar 25, 2024 by lintangsutawika
17 changed files
--- a/lm_eval/tasks/aclue/aclue_basic_ancient_chinese.yaml
+++ b/lm_eval/tasks/aclue/aclue_basic_ancient_chinese.yaml
+"dataset_name": "basic_ancient_chinese"
+"description": "以下是关于古汉语知识的单项选择题，请直接给出正确答案的选项。\n\n"
+"include": "_default_template_yaml"
+"task": "aclue_basic_ancient_chinese"
--- a/lm_eval/tasks/aclue/aclue_couplet_prediction.yaml
+++ b/lm_eval/tasks/aclue/aclue_couplet_prediction.yaml
+"dataset_name": "couplet_prediction"
+"description": "以下是关于对联的单项选择题，请直接给出正确答案的选项。\n\n"
+"include": "_default_template_yaml"
+"task": "aclue_couplet_prediction"
--- a/lm_eval/tasks/aclue/aclue_homographic_character_resolution.yaml
+++ b/lm_eval/tasks/aclue/aclue_homographic_character_resolution.yaml
+"dataset_name": "homographic_character_resolution"
+"description": "以下是关于通假字的单项选择题，请直接给出正确答案的选项。\n\n"
+"include": "_default_template_yaml"
+"task": "aclue_homographic_character_resolution"
--- a/lm_eval/tasks/aclue/aclue_named_entity_recognition.yaml
+++ b/lm_eval/tasks/aclue/aclue_named_entity_recognition.yaml
+"dataset_name": "named_entity_recognition"
+"description": "以下是关于古汉语命名体识别的单项选择题，请直接给出正确答案的选项。\n\n"
+"include": "_default_template_yaml"
+"task": "aclue_named_entity_recognition"
--- a/lm_eval/tasks/aclue/aclue_poetry_appreciate.yaml
+++ b/lm_eval/tasks/aclue/aclue_poetry_appreciate.yaml
+"dataset_name": "poetry_appreciate"
+"description": "以下是关于古诗词曲鉴赏的单项选择题，请直接给出正确答案的选项。\n\n"
+"include": "_default_template_yaml"
+"task": "aclue_poetry_appreciate"
--- a/lm_eval/tasks/aclue/aclue_poetry_context_prediction.yaml
+++ b/lm_eval/tasks/aclue/aclue_poetry_context_prediction.yaml
+"dataset_name": "poetry_context_prediction"
+"description": "以下是关于古诗词上下句预测的单项选择题，请直接给出正确答案的选项。\n\n"
+"include": "_default_template_yaml"
+"task": "aclue_poetry_context_prediction"
--- a/lm_eval/tasks/aclue/aclue_poetry_quality_assessment.yaml
+++ b/lm_eval/tasks/aclue/aclue_poetry_quality_assessment.yaml
+"dataset_name": "poetry_quality_assessment"
+"description": "以下是关于古诗词质量评估的单项选择题，请直接给出正确答案的选项。\n\n"
+"include": "_default_template_yaml"
+"task": "aclue_poetry_quality_assessment"
--- a/lm_eval/tasks/aclue/aclue_poetry_sentiment_analysis.yaml
+++ b/lm_eval/tasks/aclue/aclue_poetry_sentiment_analysis.yaml
+"dataset_name": "poetry_sentiment_analysis"
+"description": "以下是关于诗词情感分类的单项选择题，请直接给出正确答案的选项。\n\n"
+"include": "_default_template_yaml"
+"task": "aclue_poetry_sentiment_analysis"
--- a/lm_eval/tasks/aclue/aclue_polysemy_resolution.yaml
+++ b/lm_eval/tasks/aclue/aclue_polysemy_resolution.yaml
+"dataset_name": "polysemy_resolution"
+"description": "以下是关于古文单字多义的单项选择题，请直接给出正确答案的选项。\n\n"
+"include": "_default_template_yaml"
+"task": "aclue_polysemy_resolution"
--- a/lm_eval/tasks/aclue/aclue_reading_comprehension.yaml
+++ b/lm_eval/tasks/aclue/aclue_reading_comprehension.yaml
+"dataset_name": "reading_comprehension"
+"description": "以下是关于古文阅读理解的单项选择题，请直接给出正确答案的选项。\n\n"
+"include": "_default_template_yaml"
+"task": "aclue_reading_comprehension"
--- a/lm_eval/tasks/aclue/aclue_sentence_segmentation.yaml
+++ b/lm_eval/tasks/aclue/aclue_sentence_segmentation.yaml
+"dataset_name": "sentence_segmentation"
+"description": "以下是关于古文断句的单项选择题，请直接给出正确答案的选项。\n\n"
+"include": "_default_template_yaml"
+"task": "aclue_sentence_segmentation"
--- a/lm_eval/tasks/cmmlu/_generate_configs.py
+++ b/lm_eval/tasks/cmmlu/_generate_configs.py
@@ -7,7 +7,7 @@ import os
 import yaml
 from tqdm import tqdm

-from lm_eval.logger import eval_logger
+from lm_eval.utils import eval_logger


 SUBJECTS = {

--- a/lm_eval/tasks/mmlu/_generate_configs.py
+++ b/lm_eval/tasks/mmlu/_generate_configs.py
@@ -2,12 +2,14 @@
 Take in a YAML, and output all "other" splits with this YAML
 """
 import argparse
+import logging
 import os

 import yaml
 from tqdm import tqdm

-from lm_eval.logger import eval_logger
+
+eval_logger = logging.getLogger("lm-eval")


 SUBJECTS = {

--- a/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml
+++ b/lm_eval/tasks/model_written_evals/advanced_ai_risk/_template_yaml
@@ -3,7 +3,7 @@ dataset_path: EleutherAI/advanced_ai_risk
 output_type: multiple_choice
 validation_split: validation
 target_delimiter: ""
-doc_to_text: "Human: {question}\n\nAssistant:"
+doc_to_text: "Human: {{question}}\n\nAssistant:"
 doc_to_target: 0
 doc_to_choice: "{{[answer_matching_behavior, answer_not_matching_behavior]}}"
 should_decontaminate: true

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "lm_eval"
-version = "0.4.1"
+version = "0.4.2"
 authors = [
    {name="EleutherAI", email="contact@eleuther.ai"}
 ]

--- a/tests/models/test_vllm.py
+++ b/tests/models/test_vllm.py
@@ -25,8 +25,8 @@ class TEST_VLLM:
    multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1)
    MULTIPLE_CH: List[Instance] = multiple_choice_task.instances
    generate_until_task = task_list["gsm8k"]  # type: ignore
-    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    generate_until_task._config.generation_kwargs["max_gen_toks"] = 10
+    generate_until_task.build_all_requests(limit=10, rank=0, world_size=1)
    generate_until: List[Instance] = generate_until_task.instances
    rolling_task = task_list["wikitext"]  # type: ignore
    rolling_task.build_all_requests(limit=10, rank=0, world_size=1)

--- a/tests/test_cli.py
+++ b/tests/test_cli.py
+import argparse
+
+import pytest
+
+import lm_eval.__main__
+
+
+def test_cli_parse_error():
+    """
+    Assert error raised if cli args argument doesn't have type
+    """
+    with pytest.raises(ValueError):
+        parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+        parser.add_argument(
+            "--model", "-m", type=str, default="hf", help="Name of model e.g. `hf`"
+        )
+        parser.add_argument(
+            "--tasks",
+            "-t",
+            default=None,
+            metavar="task1,task2",
+            help="To get full list of tasks, use the command lm-eval --tasks list",
+        )
+        lm_eval.__main__.check_argument_types(parser)
+
+
+def test_cli_parse_no_error():
+    """
+    Assert typed arguments are parsed correctly
+    """
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument(
+        "--model", "-m", type=str, default="hf", help="Name of model e.g. `hf`"
+    )
+    parser.add_argument(
+        "--tasks",
+        "-t",
+        type=str,
+        default=None,
+        metavar="task1,task2",
+        help="To get full list of tasks, use the command lm-eval --tasks list",
+    )
+    lm_eval.__main__.check_argument_types(parser)