Commit 921c4d62 authored by Konrad

fewshot as multiturn

parent 3369f887
@@ -48,6 +48,8 @@ This mode supports a number of command-line arguments, the details of which can
- `--apply_chat_template` : If this flag is on, a chat template will be applied to the prompt. For Hugging Face models, the chat template is taken from the tokenizer; if the tokenizer does not have a chat template, a default one will be applied. For other models, a generic chat template is used.
- `--fewshot_as_multiturn` : If this flag is on, the fewshot examples are treated as a multi-turn conversation: questions are provided as user turns and answers as assistant responses. Requires `--num_fewshot` to be set to a value greater than 0 and `--apply_chat_template` to be on (see the example below).
- `--predict_only`: Generates the model outputs without computing metrics. Use with `--log_samples` to retrieve decoded results.
* `--seed`: Set seed for python's random, numpy and torch. Accepts a comma-separated list of 3 values for python's random, numpy, and torch seeds, respectively, or a single integer to set the same seed for all three. Each value is either an integer or 'None' to leave that library's seed unset. Default is `0,1234,1234` (for backward compatibility). E.g. `--seed 0,None,8` sets `random.seed(0)` and `torch.manual_seed(8)`; numpy's seed is not set since the second value is `None`. E.g. `--seed 42` sets all three seeds to 42.
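For illustration, a hypothetical invocation that combines the two flags to render five fewshot examples as alternating user/assistant turns might look like the following (the model checkpoint and task name are placeholders):

```bash
# Hypothetical example: the model checkpoint and task are placeholders.
lm_eval --model hf \
    --model_args pretrained=meta-llama/Llama-2-7b-chat-hf \
    --tasks gsm8k \
    --num_fewshot 5 \
    --apply_chat_template \
    --fewshot_as_multiturn
```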
...
@@ -174,6 +174,12 @@ def setup_parser() -> argparse.ArgumentParser:
        default=False,
        help="If True, applies the chat template to the prompt",
    )
    parser.add_argument(
        "--fewshot_as_multiturn",
        action="store_true",
        default=False,
        help="If True, uses the fewshot as a multi-turn conversation",
    )
    parser.add_argument(
        "--show_config",
        action="store_true",
@@ -282,6 +288,11 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
            "Specify --output_path if providing --log_samples or --predict_only"
        )
    if args.fewshot_as_multiturn and args.apply_chat_template is False:
        raise ValueError(
            "If fewshot_as_multiturn is set, apply_chat_template must be set to True."
        )
    if args.include_path is not None:
        eval_logger.info(f"Including path: {args.include_path}")
    task_manager = TaskManager(args.verbosity, include_path=args.include_path)
@@ -371,6 +382,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
            log_samples=args.log_samples,
            system_instruction=args.system_instruction,
            apply_chat_template=args.apply_chat_template,
            fewshot_as_multiturn=args.fewshot_as_multiturn,
            gen_kwargs=args.gen_kwargs,
            task_manager=task_manager,
            verbosity=args.verbosity,
...
@@ -35,40 +35,32 @@ class ContextSampler:
        # TODO: should we just stop people from using fewshot from same split as evaluating?
        selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]

        labeled_examples = ""
        for doc in selected_docs:
            doc_content = self.doc_to_text(doc)
            doc_target = self.doc_to_target(doc)
            labeled_examples += (
                doc_content
                if self.config.doc_to_choice is None or isinstance(doc_content, str)
                else self.doc_to_choice(doc)[doc_content]
            )
            labeled_examples += self.target_delimiter
            labeled_examples += (
                str(doc_target[0])
                if isinstance(doc_target, list)
                else doc_target
                if self.config.doc_to_choice is None or isinstance(doc_target, str)
                else str(self.doc_to_choice(doc)[doc_target])
            )
            labeled_examples += self.fewshot_delimiter

        return labeled_examples

    def get_chat_context(
        self,
        doc,
        num_fewshot,
        fewshot_as_multiturn: bool = False,
        chat_history: list = [],
    ):
        # draw an extra fewshot sample if using same split as evaluating on
@@ -84,31 +76,36 @@ class ContextSampler:
        # TODO: should we just stop people from using fewshot from same split as evaluating?
        selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]

        if fewshot_as_multiturn:
            for doc in selected_docs:
                doc_content = self.doc_to_text(doc)
                doc_target = self.doc_to_target(doc)
                chat_history.append(
                    {
                        "role": "user",
                        "content": doc_content
                        if self.config.doc_to_choice is None
                        or isinstance(doc_content, str)
                        else self.doc_to_choice(doc)[doc_content],
                    }
                )
                chat_history.append(
                    {
                        "role": "assistant",
                        "content": str(doc_target[0])
                        if isinstance(doc_target, list)
                        else doc_target
                        if self.config.doc_to_choice is None
                        or isinstance(doc_target, str)
                        else str(self.doc_to_choice(doc)[doc_target]),
                    }
                )
        else:
            # get fewshot context as one user turn
            chat_history.append(
                {"role": "user", "content": self.get_context(doc, num_fewshot)}
            )

        return chat_history

    def sample(self, n):
...
@@ -375,6 +375,7 @@ class Task(abc.ABC):
        rewrite_requests_cache=False,
        system_instruction=None,
        apply_chat_template=False,
        fewshot_as_multiturn=False,
        tokenizer=None,
    ) -> None:
        """Build a set of Instances for a task, and store them in task.instances"""
@@ -426,6 +427,7 @@ class Task(abc.ABC):
                0 if self.config.num_fewshot is None else self.config.num_fewshot,
                system_instruction,
                apply_chat_template,
                fewshot_as_multiturn,
                tokenizer,
            )
@@ -988,6 +990,7 @@ class ConfigurableTask(Task):
        num_fewshot: int,
        system_instruction: str = None,
        apply_chat_template: bool = False,
        fewshot_as_multiturn: bool = False,
        tokenizer=None,
    ) -> str:
        """Returns a fewshot context string that is made up of a prepended description
@@ -1001,6 +1004,8 @@ class ConfigurableTask(Task):
            System instruction to be applied to the prompt.
        :param apply_chat_template: bool
            Whether to apply the chat template to the fewshot context.
        :param fewshot_as_multiturn: bool
            Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
        :param tokenizer:
            The tokenizer to use for applying the chat template.
        :returns: str
@@ -1039,7 +1044,7 @@ class ConfigurableTask(Task):
        if num_fewshot > 0:
            if apply_chat_template:
                labeled_examples = self.sampler.get_chat_context(
                    doc, num_fewshot, fewshot_as_multiturn, labeled_examples
                )
            else:
                labeled_examples += self.sampler.get_context(doc, num_fewshot)
...
@@ -57,6 +57,7 @@ def simple_evaluate(
    log_samples: bool = True,
    system_instruction: str = None,
    apply_chat_template: bool = False,
    fewshot_as_multiturn: bool = False,
    gen_kwargs: Optional[str] = None,
    task_manager: Optional[TaskManager] = None,
    verbosity: str = "INFO",
@@ -105,6 +106,8 @@ def simple_evaluate(
        System instruction to be applied to the prompt
    :param apply_chat_template: bool
        If True, apply chat template to the prompt
    :param fewshot_as_multiturn: bool
        Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
    :param gen_kwargs: str
        String arguments for model generation
        Ignored for all tasks with loglikelihood output_type
@@ -270,6 +273,7 @@ def simple_evaluate(
        log_samples=log_samples,
        system_instruction=system_instruction,
        apply_chat_template=apply_chat_template,
        fewshot_as_multiturn=fewshot_as_multiturn,
        verbosity=verbosity,
    )
@@ -327,6 +331,7 @@ def evaluate(
    log_samples: bool = True,
    system_instruction: str = None,
    apply_chat_template: bool = False,
    fewshot_as_multiturn: bool = False,
    verbosity: str = "INFO",
):
    """Instantiate and evaluate a model on a list of tasks.
@@ -347,6 +352,8 @@ def evaluate(
        System instruction to be applied to the prompt
    :param apply_chat_template: bool
        If True, apply chat template to the prompt
    :param fewshot_as_multiturn: bool
        Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
    :return
        Dictionary of results
    """
@@ -378,6 +385,7 @@ def evaluate(
            rewrite_requests_cache=rewrite_requests_cache,
            system_instruction=system_instruction,
            apply_chat_template=apply_chat_template,
            fewshot_as_multiturn=fewshot_as_multiturn,
            tokenizer=lm.tokenizer if hasattr(lm, "tokenizer") else None,
        )
        eval_logger.debug(
...