Merge branch 'main' of https://github.com/EleutherAI/lm-evaluation-harness into t5v2-alt-plus

08218829 · lintangsutawika · 51afaca2 · a97fde23 · 08218829 · 08218829
Commit 08218829 authored Mar 25, 2024 by lintangsutawika
20 changed files
--- a/README.md
+++ b/README.md
@@ -140,10 +140,16 @@ lm_eval --model vllm \
    --tasks lambada_openai \
    --batch_size auto
 ```
-For a full list of supported vLLM configurations, please reference our vLLM integration and the vLLM documentation.
+To use vLLM, run `pip install lm_eval[vllm]`. For a full list of supported vLLM configurations, please reference our [vLLM integration](https://github.com/EleutherAI/lm-evaluation-harness/blob/e74ec966556253fbe3d8ecba9de675c77c075bce/lm_eval/models/vllm_causallms.py) and the vLLM documentation.

 vLLM occasionally differs in output from Huggingface. We treat Huggingface as the reference implementation, and provide a [script](./scripts/model_comparator.py) for checking the validity of vllm results against HF.

+> [!Tip]
+> For fastest performance, we recommend using `--batch_size auto` for vLLM whenever possible, to leverage its continuous batching functionality!
+
+> [!Tip]
+> Passing `max_model_len=4096` or some other reasonable default to vLLM through model args may speed up evaluation or prevent out-of-memory errors when using auto batch size, for example with Mistral-7B-v0.1, which defaults to a maximum length of 32k.
+
 ### Model APIs and Inference Servers

 Our library also supports the evaluation of models served via several commercial APIs, and we hope to implement support for the most commonly used performant local/self-hosted inference servers.
@@ -240,9 +246,6 @@ Additionally, one can provide a directory with `--use_cache` to cache the result

 For a full list of supported arguments, check out the [interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md) guide in our documentation!

-> [!Tip]
-> Running lm-evaluation-harness as an external library and can't find (almost) any tasks available? Run `lm_eval.tasks.initialize_tasks()` to load the library's stock tasks before calling `lm_eval.evaluate()` or `lm_eval.simple_evaluate()` !
-
 ## Visualizing Results

 You can seamlessly visualize and analyze the results of your evaluation harness runs using both Weights & Biases (W&B) and Zeno.

--- a/docs/interface.md
+++ b/docs/interface.md
@@ -112,8 +112,8 @@ my_model = initialize_my_model()
 # - `Your_LM.generate_until()`
 lm_obj = Your_LM(model=my_model, batch_size=16)

-# The task_manager indexes tasks including ones
-# specified by the user through `include_path`
+# optional: the task_manager indexes tasks including ones
+# specified by the user through `include_path`.
 task_manager = lm_eval.tasks.TaskManager(
    include_path="/path/to/custom/yaml"
    )
@@ -138,9 +138,9 @@ task_dict = lm_eval.tasks.get_task_dict(
                 # custom paths is required.
    )

-def evaluate(
+results = evaluate(
    lm=lm_obj,
    task_dict=task_dict,
    ...
-):
+)
 ```
--- a/lm_eval/__main__.py
+++ b/lm_eval/__main__.py
@@ -13,7 +13,7 @@ import numpy as np
 from lm_eval import evaluator, utils
 from lm_eval.evaluator import request_caching_arg_to_dict
 from lm_eval.logging_utils import WandbLogger
-from lm_eval.tasks import TaskManager, include_path, initialize_tasks
+from lm_eval.tasks import TaskManager
 from lm_eval.utils import make_table, simple_parse_args_string


@@ -53,13 +53,30 @@ def _int_or_none_list_arg_type(max_len: int, value: str, split_char: str = ","):
    return items


-def parse_eval_args() -> argparse.Namespace:
+def check_argument_types(parser: argparse.ArgumentParser):
+    """
+    Check to make sure all CLI args are typed, raises error if not
+    """
+    for action in parser._actions:
+        if action.dest != "help" and not action.const:
+            if action.type is None:
+                raise ValueError(
+                    f"Argument '{action.dest}' doesn't have a type specified."
+                )
+            else:
+                continue
+
+
+def setup_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
-    parser.add_argument("--model", "-m", default="hf", help="Name of model e.g. `hf`")
+    parser.add_argument(
+        "--model", "-m", type=str, default="hf", help="Name of model e.g. `hf`"
+    )
    parser.add_argument(
        "--tasks",
        "-t",
        default=None,
+        type=str,
        metavar="task1,task2",
        help="To get full list of tasks, use the command lm-eval --tasks list",
    )
@@ -67,6 +84,7 @@ def parse_eval_args() -> argparse.Namespace:
        "--model_args",
        "-a",
        default="",
+        type=str,
        help="Comma separated string arguments for model, e.g. `pretrained=EleutherAI/pythia-160m,dtype=float32`",
    )
    parser.add_argument(
@@ -164,6 +182,7 @@ def parse_eval_args() -> argparse.Namespace:
    )
    parser.add_argument(
        "--gen_kwargs",
+        type=str,
        default=None,
        help=(
            "String arguments for model generation on greedy_until tasks,"
@@ -180,6 +199,7 @@ def parse_eval_args() -> argparse.Namespace:
    )
    parser.add_argument(
        "--wandb_args",
+        type=str,
        default="",
        help="Comma separated string arguments passed to wandb.init, e.g. `project=lm-eval,job_type=eval",
    )
@@ -209,13 +229,19 @@ def parse_eval_args() -> argparse.Namespace:
        help="Sets trust_remote_code to True to execute code to create HF Datasets from the Hub",
    )

+    return parser
+
+
+def parse_eval_args(parser: argparse.ArgumentParser) -> argparse.Namespace:
+    check_argument_types(parser)
    return parser.parse_args()


 def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
    if not args:
        # we allow for args to be passed externally, else we parse them ourselves
-        args = parse_eval_args()
+        parser = setup_parser()
+        args = parse_eval_args(parser)

    if args.wandb_args:
        wandb_logger = WandbLogger(**simple_parse_args_string(args.wandb_args))
@@ -232,7 +258,8 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
            "Specify --output_path if providing --log_samples or --predict_only"
        )

-    initialize_tasks(args.verbosity)
+    if args.include_path is not None:
+        eval_logger.info(f"Including path: {args.include_path}")
    task_manager = TaskManager(args.verbosity, include_path=args.include_path)

    if args.limit:
@@ -305,7 +332,6 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
        )

    eval_logger.info(f"Selected Tasks: {task_names}")
-    eval_logger.info("Loading selected tasks...")

    request_caching_args = request_caching_arg_to_dict(
        cache_requests=args.cache_requests

--- a/lm_eval/api/model.py
+++ b/lm_eval/api/model.py
@@ -66,11 +66,11 @@ class LM(abc.ABC):
          multiple chunks, the last input will still a full-sized context.
          Example:
            Input tokens: [ 0 1 2 3 4 5 6 7 8 9 ]
-            Prefix: EOT
+            Prefix: BOS/EOS
            Max context length: 4
            Resulting input/prediction pairs:

-                INPUT:  EOT   0   1   2
+                INPUT:  BOS   0   1   2
                PRED:     0   1   2   3

                INPUT:    3   4   5   6
@@ -90,7 +90,8 @@ class LM(abc.ABC):
        :return: list[tuple[float]]
            A list of tuples (logprob,)
            logprob: float
-                The log probability of `context` conditioned on the EOT token.
+                The log probability of `context` conditioned on the BOS/EOS token.
+                Can also be overridden for custom cases by `prefix_token_id`.
        """
        pass

@@ -283,6 +284,11 @@ class TemplateLM(LM):
    def eot_token_id(self):
        pass

+    @property
+    def prefix_token_id(self):
+        # it is used as prefix for loglikelihood
+        return self.eot_token_id
+
    @abc.abstractmethod
    def tok_encode(self, string: str, **kwargs):
        pass
@@ -317,9 +323,9 @@ class TemplateLM(LM):
        new_reqs = []
        for context, continuation in [req.args for req in requests]:
            if context == "":
-                # end of text as context
+                # BOS or EOS as context
                context_enc, continuation_enc = (
-                    [self.eot_token_id],
+                    [self.prefix_token_id],
                    self.tok_encode(continuation),
                )
            else:

--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -392,7 +392,7 @@ class Task(abc.ABC):
        # used with caching
        og_limit = limit

-        cache_key = f"requests-{self._config.task}"
+        cache_key = f"requests-{self._config.task}-{self.config.num_fewshot}shot-rank{rank}-world_size{world_size}"

        cached_instances = load_from_cache(file_name=cache_key)


--- a/lm_eval/evaluator.py
+++ b/lm_eval/evaluator.py
 import itertools
 import logging
 import random
+import time
 from collections import defaultdict
 from typing import TYPE_CHECKING, List, Optional, Union

@@ -106,6 +107,7 @@ def simple_evaluate(
        Dictionary of results
    """
    eval_logger.setLevel(getattr(logging, f"{verbosity}"))
+    start_date = time.time()

    if delete_requests_cache:
        eval_logger.info("Deleting requests cache...")
@@ -146,9 +148,22 @@ def simple_evaluate(

    if isinstance(model, str):
        if model_args is None:
+            eval_logger.warning("model_args not specified. Using defaults.")
            model_args = ""
+        if "pretrained" not in model_args and model in [
+            "hf-auto",
+            "hf",
+            "huggingface",
+            "vllm",
+        ]:
+            eval_logger.warning(
+                "pretrained not specified. Using default pretrained=gpt2."
+            )

        if isinstance(model_args, dict):
+            eval_logger.info(
+                f"Initializing {model} model, with arguments: {model_args}"
+            )
            lm = lm_eval.api.registry.get_model(model).create_from_arg_obj(
                model_args,
                {
@@ -159,6 +174,9 @@ def simple_evaluate(
            )

        else:
+            eval_logger.info(
+                f"Initializing {model} model, with arguments: {simple_parse_args_string(model_args)}"
+            )
            lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
                model_args,
                {
@@ -170,6 +188,7 @@ def simple_evaluate(
    else:
        if not isinstance(model, lm_eval.api.model.LM):
            raise TypeError
+        eval_logger.info("Using pre-initialized model")
        lm = model

    if use_cache is not None:
@@ -187,10 +206,6 @@ def simple_evaluate(
    if task_manager is None:
        task_manager = TaskManager(verbosity)

-    eval_logger.info(
-        "get_task_dict has been updated to accept an optional argument, `task_manager`"
-        "Read more here:https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
-    )
    task_dict = get_task_dict(tasks, task_manager)
    for task_name in task_dict.keys():
        task_obj = task_dict[task_name]
@@ -213,6 +228,8 @@ def simple_evaluate(
            # we have to change the class properties post-hoc. This is pretty hacky.
            task_obj.override_metric(metric_name="bypass")

+        # override tasks' fewshot values to the provided num_fewshot arg value
+        # except if tasks have it set to 0 manually in their configs--then we should never overwrite that
        if num_fewshot is not None:
            if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0:
                eval_logger.info(
@@ -223,6 +240,10 @@ def simple_evaluate(
                    f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}"
                )
                task_obj.set_config(key="num_fewshot", value=num_fewshot)
+        else:
+            # if num_fewshot not provided, and the task does not define a default one, default to 0
+            if (default_num_fewshot := task_obj.get_config("num_fewshot")) is None:
+                task_obj.set_config(key="num_fewshot", value=0)

    if check_integrity:
        run_task_tests(task_list=tasks)
@@ -262,6 +283,7 @@ def simple_evaluate(
            "gen_kwargs": gen_kwargs,
        }
        results["git_hash"] = get_git_commit_hash()
+        results["date"] = start_date
        add_env_info(results)  # additional environment info to results
        return results
    else:

--- a/lm_eval/logging_utils.py
+++ b/lm_eval/logging_utils.py
@@ -397,24 +397,30 @@ class WandbLogger:
            self.run.log({f"{group}_eval_results": grouped_df})


-def get_commit_from_path(repo_path: Path) -> Optional[str]:
-    git_folder = Path(repo_path, ".git")
-    if git_folder.is_file():
-        git_folder = Path(
-            git_folder.parent,
-            git_folder.read_text(encoding="utf-8").split("\n")[0].split(" ")[-1],
-        )
-    if Path(git_folder, "HEAD").exists():
-        head_name = (
-            Path(git_folder, "HEAD")
-            .read_text(encoding="utf-8")
-            .split("\n")[0]
-            .split(" ")[-1]
+def get_commit_from_path(repo_path: Union[Path, str]) -> Optional[str]:
+    try:
+        git_folder = Path(repo_path, ".git")
+        if git_folder.is_file():
+            git_folder = Path(
+                git_folder.parent,
+                git_folder.read_text(encoding="utf-8").split("\n")[0].split(" ")[-1],
+            )
+        if Path(git_folder, "HEAD").exists():
+            head_name = (
+                Path(git_folder, "HEAD")
+                .read_text(encoding="utf-8")
+                .split("\n")[0]
+                .split(" ")[-1]
+            )
+            head_ref = Path(git_folder, head_name)
+            git_hash = head_ref.read_text(encoding="utf-8").replace("\n", "")
+        else:
+            git_hash = None
+    except Exception as err:
+        logger.debug(
+            f"Failed to retrieve a Git commit hash from path: {str(repo_path)}. Error: {err}"
        )
-        head_ref = Path(git_folder, head_name)
-        git_hash = head_ref.read_text(encoding="utf-8").replace("\n", "")
-    else:
-        git_hash = None
+        return None
    return git_hash



--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
@@ -99,6 +99,7 @@ class HFLM(TemplateLM):
        trust_remote_code: Optional[bool] = False,
        use_fast_tokenizer: Optional[bool] = True,
        add_bos_token: Optional[bool] = False,
+        prefix_token_id: Optional[int] = None,
        # arguments used for splitting a model across GPUs naively.
        # only used if `parallelize=True`.
        parallelize: Optional[bool] = False,
@@ -340,6 +341,12 @@ class HFLM(TemplateLM):
            self._rank = 0
            self._world_size = 1

+        self.custom_prefix_token_id = prefix_token_id
+        if prefix_token_id is not None:
+            eval_logger.info(
+                f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}"
+            )
+
    @property
    def config(self):
        # return the associated transformers.AutoConfig for the given pretrained model.
@@ -358,6 +365,15 @@ class HFLM(TemplateLM):
        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
        return self.tokenizer.eos_token_id

+    @property
+    def prefix_token_id(self):
+        # it is used as prefix for loglikelihood
+        if self.custom_prefix_token_id is not None:
+            return self.custom_prefix_token_id
+        if self.tokenizer.bos_token_id is not None:
+            return self.tokenizer.bos_token_id
+        return self.tokenizer.eos_token_id
+
    @property
    def max_length(self):
        if self._max_length:  # if max length manually set, return it
@@ -812,7 +828,7 @@ class HFLM(TemplateLM):
                    utils.make_disjoint_window,
                    utils.get_rolling_token_windows(
                        token_list=self.tok_encode(string),
-                        prefix_token=self.eot_token_id,
+                        prefix_token=self.prefix_token_id,
                        max_seq_len=self.max_length,
                        context_len=1,
                    ),
@@ -1149,7 +1165,7 @@ class HFLM(TemplateLM):
                if "until" in kwargs.keys():
                    until = kwargs.pop("until")
                    if isinstance(until, str):
-                        until = [kwargs]
+                        until = [until]
                    elif not isinstance(until, list):
                        raise ValueError(
                            f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"

--- a/lm_eval/models/neuron_optimum.py
+++ b/lm_eval/models/neuron_optimum.py
@@ -305,6 +305,11 @@ class NEURON_HF(TemplateLM):
        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
        return self.tokenizer.eos_token_id

+    @property
+    def prefix_token_id(self):
+        # it is used as prefix for loglikelihood
+        return self.tokenizer.bos_token_id or self.tokenizer.eos_token_id
+
    @property
    def max_length(self):
        if self._max_length:  # if max length manually set, return it
@@ -460,7 +465,7 @@ class NEURON_HF(TemplateLM):
                    utils.make_disjoint_window,
                    utils.get_rolling_token_windows(
                        token_list=self.tok_encode(string),
-                        prefix_token=self.eot_token_id,
+                        prefix_token=self.prefix_token_id,
                        max_seq_len=self.max_length,
                        context_len=1,
                    ),
@@ -659,7 +664,7 @@ class NEURON_HF(TemplateLM):
                    if "until" in kwargs.keys():
                        until = kwargs.pop("until")
                        if isinstance(until, str):
-                            until = [kwargs]
+                            until = [until]
                        elif not isinstance(until, list):
                            raise ValueError(
                                f"Expected `kwargs['until']` to be of type Union[str,list] but got {until}"

--- a/lm_eval/models/openai_completions.py
+++ b/lm_eval/models/openai_completions.py
@@ -281,7 +281,7 @@ class OpenaiCompletionsLM(TemplateLM):
                **{
                    k: v
                    for k, v in request_args.items()
-                    if k not in ["do_sample", "max_gen_toks"]
+                    if k not in {"do_sample", "max_gen_toks", "until"}
                },
            )
            for resp, (context, args_) in zip(response.choices, chunk):

--- a/lm_eval/models/vllm_causallms.py
+++ b/lm_eval/models/vllm_causallms.py
@@ -42,6 +42,7 @@ class VLLM(TemplateLM):
        tokenizer_mode: Literal["auto", "slow"] = "auto",
        tokenizer_revision: Optional[str] = None,
        add_bos_token: Optional[bool] = False,
+        prefix_token_id: Optional[int] = None,
        tensor_parallel_size: int = 1,
        quantization: Optional[str] = None,
        max_gen_toks: int = 256,
@@ -118,6 +119,11 @@ class VLLM(TemplateLM):
            tokenizer_revision=tokenizer_revision,
        )
        self.add_bos_token = add_bos_token
+        self.custom_prefix_token_id = prefix_token_id
+        if prefix_token_id is not None:
+            eval_logger.info(
+                f"Loglikelihood prefix token id used in evaluation: {self.prefix_token_id}"
+            )

        self._max_gen_toks = max_gen_toks

@@ -126,6 +132,15 @@ class VLLM(TemplateLM):
        # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence*
        return self.tokenizer.eos_token_id

+    @property
+    def prefix_token_id(self):
+        # it is used as prefix for loglikelihood
+        if self.custom_prefix_token_id is not None:
+            return self.custom_prefix_token_id
+        if self.tokenizer.bos_token_id is not None:
+            return self.tokenizer.bos_token_id
+        return self.tokenizer.eos_token_id
+
    @property
    def max_length(self):
        if self._max_length:  # if max length manually set, return it

--- a/lm_eval/tasks/README.md
+++ b/lm_eval/tasks/README.md
-# v1.0 Tasks
-This list keeps track of which tasks' implementations have been ported to YAML / v2.0 of the Eval Harness.
-
-Boxes should be checked iff tasks are implemented in the refactor and tested for regression. Tasks should be struck through if checked *against original introducing paper* implementation or popularizing implementation. (WIP) Denotes that there exists a PR or person working on this task already.
-
- [x] Glue
- [x] SuperGlue
- [x] CoQA
- [x] DROP
- [x] ~~Lambada~~
- [x] Lambada (Cloze variants)
- [x] ~~Lambada (Multilingual)~~
- [x] Wikitext
- [x] PiQA
- [x] PROST
- [x] MCTACO
- [x] Pubmed QA
- [x] SciQ
- [x] QASPER
- [x] QA4MRE
- [x] TriviaQA
- [x] AI2 ARC
- [x] LogiQA
- [x] HellaSwag
- [x] SWAG
- [x] OpenBookQA
- [ ] SQuADv2 (Lintang)
- [x] RACE
- [x] HeadQA
- [x] MathQA
- [x] WebQs
- [x] WSC273
- [x] Winogrande
- [x] ANLI
- [x] Hendrycks Ethics (missing some tasks/metrics, see PR 660: <https://github.com/EleutherAI/lm-evaluation-harness/pull/660> for more info)
- [x] TruthfulQA (mc1)
- [x] TruthfulQA (mc2)
- [x] TruthfulQA (gen)
- [x] MuTual
- [ ] Hendrycks Math (Hailey)
- [x] Asdiv
- [ ] GSM8k
- [x] Arithmetic
- [ ] MMMLU (Hailey)
- [x] Translation (WMT) suite
- [x] Unscramble
- [x] ~~Pile (perplexity)~~
- [x] BLiMP
- [x] ToxiGen
- [x] StoryCloze
- [ ] NaturalQs (Hailey)
- [x] CrowS-Pairs
- [x] XCopa
- [ ] BIG-Bench (Hailey)
- [x] XStoryCloze
- [x] XWinograd
- [x] PAWS-X
- [x] XNLI
- [x] MGSM
- [ ] SCROLLS
- [x] Babi
- [x] Belebele
-
-# Novel Tasks
-Tasks added in the revamped harness that were not previously available. Again, a strikethrough denotes checking performed *against the original task's implementation or published results introducing the task*.
-
-# Task Wishlist
-
- [ ] TheoremQA
- [ ] Theorem Proving evaluations
- [ ] Chain of Thought
- [ ] Self-consistency ; Least-to-Most prompting, etc.
- [ ] Summarization Tasks
- [ ] Anthropic Model-Written Evals
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
@@ -356,28 +356,6 @@ class TaskManager:
        return tasks_and_groups


-def include_path(task_dir):
-    logger = utils.eval_logger
-    logger.setLevel(getattr(logging, "INFO"))
-    logger.info(
-        "To still use tasks loaded from args.include_path,"
-        "see an example of the new TaskManager API in "
-        "https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage"
-    )
-    return 0
-
-
-def initialize_tasks(verbosity="INFO"):
-    logger = utils.eval_logger
-    logger.setLevel(getattr(logging, f"{verbosity}"))
-    logger.info(
-        "lm_eval.tasks.initialize_tasks() is deprecated and no longer necessary. "
-        "It will be removed in v0.4.2 release. "
-        "TaskManager will instead be used."
-    )
-    return 0
-
-
 def get_task_name_from_config(task_config: Dict[str, str]) -> str:
    if "task" in task_config:
        return task_config["task"]
@@ -401,7 +379,7 @@ def get_task_name_from_object(task_object):


 def get_task_dict(
-    task_name_list: List[Union[str, Dict, Task]],
+    task_name_list: Union[str, List[Union[str, Dict, Task]]],
    task_manager: Optional[TaskManager] = None,
 ):
    """Creates a dictionary of task objects from either a name of task, config, or prepared Task object.
@@ -423,6 +401,15 @@ def get_task_dict(

    if isinstance(task_name_list, str):
        task_name_list = [task_name_list]
+    elif isinstance(task_name_list, list):
+        if not all([isinstance(task, (str, dict, Task)) for task in task_name_list]):
+            raise TypeError(
+                "Expected all list items to be of types 'str', 'dict', or 'Task', but at least one entry did not match."
+            )
+    else:
+        raise TypeError(
+            f"Expected a 'str' or 'list' but received {type(task_name_list)}."
+        )

    string_task_name_list = [task for task in task_name_list if isinstance(task, str)]
    others_task_name_list = [task for task in task_name_list if ~isinstance(task, str)]

--- a/lm_eval/tasks/aclue/README.md
+++ b/lm_eval/tasks/aclue/README.md
+# ACLUE
+
+### Paper
+
+Can Large Language Model Comprehend Ancient Chinese? A Preliminary Test on ACLUE
+https://arxiv.org/abs/2310.09550
+
+The Ancient Chinese Language Understanding Evaluation (ACLUE) is an evaluation benchmark focused on ancient Chinese language comprehension. It aims to assess the performance of large-scale language models on understanding ancient Chinese. The benchmark comprises 15 tasks spanning various domains, including lexical, syntactic, semantic, inference, and knowledge. ACLUE's tasks are derived from a combination of manually curated questions from publicly available resources, and automatically
+generated questions from classical Chinese language corpora. The questions span from the Xia dynasty (2070 BCE) to the Ming dynasty (1368 CE). ACLUE adopts a multiple-choice question format for all tasks.
+
+Homepage: https://github.com/isen-zhang/ACLUE
+
+### Citation
+
+```bibtex
+@inproceedings{zhang-li-2023-large,
+    title = "Can Large Langauge Model Comprehend {A}ncient {C}hinese? A Preliminary Test on {ACLUE}",
+    author = "Zhang, Yixuan  and Li, Haonan",
+    booktitle = "Proceedings of the Ancient Language Processing Workshop",
+    month = sep,
+    year = "2023",
+    address = "Varna, Bulgaria",
+    publisher = "INCOMA Ltd., Shoumen, Bulgaria",
+    url = "https://aclanthology.org/2023.alp-1.9",
+    pages = "80--87"
+}
+```
+
+### Groups and Tasks
+
+#### Groups
+
+- `aclue`: All 15 subjects of the ACLUE dataset, evaluated following the methodology in CMMLU's original implementation.
+
+#### Tasks
+
+The following tasks evaluate subjects in the ACLUE dataset using loglikelihood-based multiple-choice scoring:
+- `aclue_{subject_english}`
+
+### Checklist
+
+* [x] Is the task an existing benchmark in the literature?
+  * [x] Have you referenced the original paper that introduced the task?
+  * [x] If yes, does the original paper provide a reference implementation?
+    * [x] Yes, original implementation contributed by author of the benchmark
+
+If other tasks on this dataset are already supported:
+* [x] Is the "Main" variant of this task clearly denoted?
+* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [x] Have you noted which, if any, published evaluation setups are matched by this variant?
--- a/lm_eval/tasks/aclue/_default_template_yaml
+++ b/lm_eval/tasks/aclue/_default_template_yaml
+group: aclue
+dataset_path: tyouisen/aclue
+test_split: test
+fewshot_split: dev
+fewshot_config:
+  sampler: first_n
+output_type: multiple_choice
+doc_to_text: "{{Question.strip()}}\nA. {{A}}\nB. {{B}}\nC. {{C}}\nD. {{D}}\n答案："
+doc_to_choice: ["A", "B", "C", "D"]
+doc_to_target: "{{['A', 'B', 'C', 'D'].index(Answer)}}"
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
+  - metric: acc_norm
+    aggregation: mean
+    higher_is_better: true
+metadata:
+  version: 0.0
--- a/lm_eval/tasks/aclue/_generate_configs.py
+++ b/lm_eval/tasks/aclue/_generate_configs.py
+"""
+Take in a YAML, and output all other splits with this YAML
+"""
+import argparse
+import os
+
+import yaml
+from tqdm import tqdm
+
+from lm_eval.utils import eval_logger
+
+
+SUBJECTS = {
+    "古文单字多义": "polysemy_resolution",
+    "诗词情感分类": "poetry_sentiment_analysis",
+    "古汉语命名体识别": "named_entity_recognition",
+    "古汉语知识": "basic_ancient_chinese",
+    "古诗词上下句预测": "poetry_context_prediction",
+    "古文断句": "sentence_segmentation",
+    "对联": "couplet_prediction",
+    "古诗词曲鉴赏": "poetry_appreciate",
+    "国学常识": "ancient_chinese_culture",
+    "古音学": "ancient_phonetics",
+    "通假字": "homographic_character_resolution",
+    "古代文学知识": "ancient_literature",
+    "医古文": "ancient_medical",
+    "古诗词质量评估": "poetry_quality_assessment",
+    "古文阅读理解": "reading_comprehension",
+}
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--base_yaml_path", required=True)
+    parser.add_argument("--save_prefix_path", default="aclue")
+    parser.add_argument("--cot_prompt_path", default=None)
+    parser.add_argument("--task_prefix", default="")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+
+    # get filename of base_yaml so we can `"include": ` it in our other YAMLs.
+    base_yaml_name = os.path.split(args.base_yaml_path)[-1]
+    with open(args.base_yaml_path, encoding="utf-8") as f:
+        base_yaml = yaml.full_load(f)
+
+    if args.cot_prompt_path is not None:
+        import json
+
+        with open(args.cot_prompt_path, encoding="utf-8") as f:
+            cot_file = json.load(f)
+
+    for subject_zh, subject_eng in tqdm(SUBJECTS.items()):
+        if args.cot_prompt_path is not None:
+            description = cot_file[subject_eng]
+        else:
+            description = (
+                f"以下是关于{subject_zh}的单项选择题，请直接给出正确答案的选项。\n\n"
+            )
+
+        yaml_dict = {
+            "include": base_yaml_name,
+            "task": f"aclue_{args.task_prefix}_{subject_eng}"
+            if args.task_prefix != ""
+            else f"aclue_{subject_eng}",
+            "dataset_name": subject_eng,
+            "description": description,
+        }
+
+        file_save_path = args.save_prefix_path + f"_{subject_eng}.yaml"
+        eval_logger.info(f"Saving yaml for subset {subject_eng} to {file_save_path}")
+        with open(file_save_path, "w", encoding="utf-8") as yaml_file:
+            yaml.dump(
+                yaml_dict,
+                yaml_file,
+                width=float("inf"),
+                allow_unicode=True,
+                default_style='"',
+            )
--- a/lm_eval/tasks/aclue/aclue_ancient_chinese_culture.yaml
+++ b/lm_eval/tasks/aclue/aclue_ancient_chinese_culture.yaml
+"dataset_name": "ancient_chinese_culture"
+"description": "以下是关于国学常识的单项选择题，请直接给出正确答案的选项。\n\n"
+"include": "_default_template_yaml"
+"task": "aclue_ancient_chinese_culture"
--- a/lm_eval/tasks/aclue/aclue_ancient_literature.yaml
+++ b/lm_eval/tasks/aclue/aclue_ancient_literature.yaml
+"dataset_name": "ancient_literature"
+"description": "以下是关于古代文学知识的单项选择题，请直接给出正确答案的选项。\n\n"
+"include": "_default_template_yaml"
+"task": "aclue_ancient_literature"
--- a/lm_eval/tasks/aclue/aclue_ancient_medical.yaml
+++ b/lm_eval/tasks/aclue/aclue_ancient_medical.yaml
+"dataset_name": "ancient_medical"
+"description": "以下是关于医古文的单项选择题，请直接给出正确答案的选项。\n\n"
+"include": "_default_template_yaml"
+"task": "aclue_ancient_medical"
--- a/lm_eval/tasks/aclue/aclue_ancient_phonetics.yaml
+++ b/lm_eval/tasks/aclue/aclue_ancient_phonetics.yaml
+"dataset_name": "ancient_phonetics"
+"description": "以下是关于古音学的单项选择题，请直接给出正确答案的选项。\n\n"
+"include": "_default_template_yaml"
+"task": "aclue_ancient_phonetics"