gaoqiong / lm-evaluation-harness / Commits
Commit 62df55d1, authored May 08, 2024 by Konrad

initial chat template

Parent: 885f48d6
Showing 4 changed files with 143 additions and 16 deletions.
lm_eval/__main__.py      (+7, -0)
lm_eval/api/samplers.py  (+46, -1)
lm_eval/api/task.py      (+81, -15)
lm_eval/evaluator.py     (+9, -0)
lm_eval/__main__.py
@@ -162,6 +162,12 @@ def setup_parser() -> argparse.ArgumentParser:
        default=False,
        help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
    )
    parser.add_argument(
        "--apply_chat_template",
        action="store_true",
        default=False,
        help="If True, applies the chat template to the prompt",
    )
    parser.add_argument(
        "--show_config",
        action="store_true",
        ...

@@ -357,6 +363,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
        check_integrity=args.check_integrity,
        write_out=args.write_out,
        log_samples=args.log_samples,
        apply_chat_template=args.apply_chat_template,
        gen_kwargs=args.gen_kwargs,
        task_manager=task_manager,
        verbosity=args.verbosity,
        ...
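For reference, the flag added above is a standard argparse boolean switch. The following is a minimal sketch, not part of the commit, showing that action="store_true" with default=False leaves the option off unless it is passed explicitly.

# Minimal sketch of the new flag's behaviour; not part of the commit itself.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--apply_chat_template",
    action="store_true",
    default=False,
    help="If True, applies the chat template to the prompt",
)

print(parser.parse_args([]).apply_chat_template)  # False
print(parser.parse_args(["--apply_chat_template"]).apply_chat_template)  # True

With the wiring added to cli_evaluate above, a command along the lines of `lm_eval --model hf --model_args pretrained=... --tasks hellaswag --apply_chat_template` would forward the switch into simple_evaluate.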
lm_eval/api/samplers.py
@@ -63,9 +63,54 @@ class ContextSampler:
            )
            + self.fewshot_delimiter
        )

        return labeled_examples

    def get_chat_context(
        self,
        doc,
        num_fewshot,
        chat_history: list = [],
    ):
        # draw an extra fewshot sample if using same split as evaluating on
        n_samples = (
            num_fewshot + 1
            if self.config.fewshot_split == self.config.test_split
            else num_fewshot
        )
        # draw `n_samples` docs from fewshot_docs
        fewshotex = self.sample(n_samples)

        # get rid of the doc that's the one we're evaluating, if it's in the fewshot
        # TODO: should we just stop people from using fewshot from same split as evaluating?
        selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]

        for doc in selected_docs:
            chat_history.append(
                {
                    "role": "user",
                    "content": self.doc_to_text(doc)
                    if (
                        self.config.doc_to_choice is None
                        or isinstance(self.doc_to_text(doc), str)
                    )
                    else self.doc_to_choice(doc)[self.doc_to_text(doc)],
                }
            )
            chat_history.append(
                {
                    "role": "assistant",
                    "content": str(self.doc_to_target(doc)[0])
                    if isinstance(self.doc_to_target(doc), list)
                    else self.doc_to_target(doc)
                    if (
                        self.config.doc_to_choice is None
                        or isinstance(self.doc_to_target(doc), str)
                    )
                    else str(self.doc_to_choice(doc)[self.doc_to_target(doc)]),
                }
            )

        return chat_history

    def sample(self, n):
        """
        Draw `n` samples from our fewshot docs. This method should be overridden by subclasses.
        ...
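To make the new sampler method concrete, here is an illustrative sketch of the structure get_chat_context returns for a two-shot draw. The question and answer strings are invented placeholders, not real task documents; only the alternating user/assistant message shape is taken from the code above.

# Illustrative only: the message-list shape built by get_chat_context(doc, num_fewshot=2).
# The contents below are made-up placeholders.
chat_history = [
    {"role": "user", "content": "Question: What is 2 + 2?\nAnswer:"},
    {"role": "assistant", "content": "4"},
    {"role": "user", "content": "Question: What is 3 + 5?\nAnswer:"},
    {"role": "assistant", "content": "8"},
]
# fewshot_context() (see lm_eval/api/task.py below) later appends the evaluated
# document as a final {"role": "user", ...} turn before rendering to a string.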
lm_eval/api/task.py
@@ -373,6 +373,8 @@ class Task(abc.ABC):
        world_size=None,
        cache_requests=False,
        rewrite_requests_cache=False,
        apply_chat_template=False,
        tokenizer=None,
    ) -> None:
        """Build a set of Instances for a task, and store them in task.instances"""
        ...

@@ -421,6 +423,8 @@ class Task(abc.ABC):
            fewshot_ctx = self.fewshot_context(
                doc,
                0 if self.config.num_fewshot is None else self.config.num_fewshot,
                apply_chat_template,
                tokenizer,
            )

            # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
            ...
@@ -957,8 +961,32 @@ class ConfigurableTask(Task):
            )
            return super().fewshot_docs()

    def convert_chat_history_to_string(
        self, chat_history: list, tokenizer=None
    ) -> str:
        """Returns chat history tokenized or concatenated as a string.
        :param chat_history: list
            The chat history to convert to a string.
        :param tokenizer:
            Optional tokenizer to use for applying the chat template, if None, the sampler's fewshot_delimiter is used.
        """
        if tokenizer:
            return tokenizer.apply_chat_template(
                chat_history, tokenize=False, add_generation_prompt=True
            )
        else:
            return self.sampler.fewshot_delimiter + "".join(
                f"{s['role']}: {s['content']}" + self.sampler.fewshot_delimiter
                for s in chat_history
            )

    @utils.positional_deprecated
    def fewshot_context(
        self,
        doc: str,
        num_fewshot: int,
        apply_chat_template: bool = False,
        tokenizer=None,
    ) -> str:
        """Returns a fewshot context string that is made up of a prepended description
        (if provided), the `num_fewshot` number of examples, and an appended prompt example.
        ...
@@ -966,19 +994,57 @@ class ConfigurableTask(Task):
            The document as returned from training_docs, validation_docs, or test_docs.
        :param num_fewshot: int
            The number of fewshot examples to provide in the returned context string.
        :param apply_chat_template: bool
            Whether to apply the chat template to the fewshot context.
        :param tokenizer:
            The tokenizer to use for applying the chat template.
        :returns: str
            The fewshot context.
        """
        if description := self.config.description:
            description = utils.apply_template(self.config.description, doc)

        chat_history = []
        if num_fewshot == 0:
            # always prepend the (possibly empty) task description
            if apply_chat_template:
                chat_history.append({"role": "system", "content": description})
            else:
                labeled_examples = description
        else:
            if apply_chat_template:
                chat_history = self.sampler.get_chat_context(
                    doc, num_fewshot, chat_history
                )
            else:
                labeled_examples = description + self.sampler.get_context(
                    doc, num_fewshot
                )

        example = self.doc_to_text(doc)
        if apply_chat_template:
            if not self.multiple_input:
                if isinstance(example, str):
                    chat_history.append({"role": "user", "content": example})
                elif isinstance(example, list):
                    chat_histories_list = []
                    for ex in example:
                        chat = deepcopy(chat_history)
                        chat.append({"role": "user", "content": ex})
                        chat_histories_list.append(
                            self.convert_chat_history_to_string(chat, tokenizer)
                        )
                    return chat_histories_list
                elif isinstance(example, int):
                    if self.config.doc_to_choice is not None:
                        choices = self.doc_to_choice(doc)
                        chat_history.append(
                            {"role": "user", "content": choices[example]}
                        )
                    else:
                        chat_history.append(
                            {"role": "user", "content": str(example)}
                        )
            return self.convert_chat_history_to_string(chat_history, tokenizer)
        else:
            if self.multiple_input:
                return labeled_examples
            else:
                ...
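The two rendering paths in convert_chat_history_to_string correspond to the rough sketch below. It assumes a Hugging Face tokenizer whose apply_chat_template method is available (transformers 4.34 or newer); the checkpoint name and the delimiter value are examples only, not taken from the commit.

# Sketch of the two rendering paths; checkpoint and delimiter are example values.
from transformers import AutoTokenizer

messages = [
    {"role": "user", "content": "Question: What is 2 + 2?\nAnswer:"},
    {"role": "assistant", "content": "4"},
    {"role": "user", "content": "Question: What is 3 + 5?\nAnswer:"},
]

# Path 1 (tokenizer given): the model's chat template is applied and returned as
# plain text (tokenize=False), with a generation prompt appended for the final turn.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# Path 2 (no tokenizer): turns are joined with the sampler's fewshot_delimiter,
# producing e.g. "\n\nuser: ...\n\nassistant: ...\n\nuser: ...\n\n".
fewshot_delimiter = "\n\n"
fallback = fewshot_delimiter + "".join(
    f"{m['role']}: {m['content']}" + fewshot_delimiter for m in messages
)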
lm_eval/evaluator.py
@@ -55,6 +55,7 @@ def simple_evaluate(
    check_integrity: bool = False,
    write_out: bool = False,
    log_samples: bool = True,
    apply_chat_template: bool = False,
    gen_kwargs: Optional[str] = None,
    task_manager: Optional[TaskManager] = None,
    verbosity: str = "INFO",
    ...

@@ -99,6 +100,8 @@ def simple_evaluate(
        If True, write out an example document and model input for checking task integrity
    :param log_samples: bool
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :param apply_chat_template: bool
        If True, apply chat template to the prompt
    :param gen_kwargs: str
        String arguments for model generation
        Ignored for all tasks with loglikelihood output_type
        ...

@@ -262,6 +265,7 @@ def simple_evaluate(
        bootstrap_iters=bootstrap_iters,
        write_out=write_out,
        log_samples=log_samples,
        apply_chat_template=apply_chat_template,
        verbosity=verbosity,
    )
    ...

@@ -317,6 +321,7 @@ def evaluate(
    bootstrap_iters: Optional[int] = 100000,
    write_out: bool = False,
    log_samples: bool = True,
    apply_chat_template: bool = False,
    verbosity: str = "INFO",
):
    """Instantiate and evaluate a model on a list of tasks.
    ...

@@ -333,6 +338,8 @@ def evaluate(
        If True, write out an example document and model input for checking task integrity
    :param log_samples: bool
        If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
    :param apply_chat_template: bool
        If True, apply chat template to the prompt
    :return
        Dictionary of results
    """
    ...

@@ -362,6 +369,8 @@ def evaluate(
            world_size=lm.world_size,
            cache_requests=cache_requests,
            rewrite_requests_cache=rewrite_requests_cache,
            apply_chat_template=apply_chat_template,
            tokenizer=lm.tokenizer,
        )
        eval_logger.debug(
            f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}"
            ...
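For completeness, a hedged sketch of driving the new argument through the Python API as wired up in this commit. The model and task names are placeholders; the call otherwise follows the existing simple_evaluate signature.

# Sketch only: exercise the new keyword end to end. Model and task names are examples.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=HuggingFaceH4/zephyr-7b-beta",
    tasks=["hellaswag"],
    num_fewshot=2,
    apply_chat_template=True,  # new in this commit; defaults to False
)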