Commit 9dfb58a3 authored by Konrad's avatar Konrad
Browse files

system instruction

parent cd9e4540
...@@ -44,6 +44,8 @@ This mode supports a number of command-line arguments, the details of which can ...@@ -44,6 +44,8 @@ This mode supports a number of command-line arguments, the details of which can
- `--include_path` : Accepts a path to a folder. If passed, then all YAML files containing `lm-eval` compatible task configurations will be added to the task registry as available tasks. Used for when one is writing config files for their own task in a folder other than `lm_eval/tasks/`
- `--system_instruction`: Specifies a system instruction string to prepend to the prompt.
- `--apply_chat_template` : If this flag is on, a chat template will be applied to the prompt. For Hugging Face models, the chat template is taken from the tokenizer, if the tokenizer does not have a chat template, a default one will be applied. For other models, a generic chat template is used. - `--apply_chat_template` : If this flag is on, a chat template will be applied to the prompt. For Hugging Face models, the chat template is taken from the tokenizer, if the tokenizer does not have a chat template, a default one will be applied. For other models, a generic chat template is used.
- `--predict_only`: Generates the model outputs without computing metrics. Use with `--log_samples` to retrieve decoded results. - `--predict_only`: Generates the model outputs without computing metrics. Use with `--log_samples` to retrieve decoded results.
......
...@@ -162,6 +162,12 @@ def setup_parser() -> argparse.ArgumentParser: ...@@ -162,6 +162,12 @@ def setup_parser() -> argparse.ArgumentParser:
default=False, default=False,
help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.", help="If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis. Use with --output_path.",
) )
parser.add_argument(
"--system_instruction",
type=str,
default=None,
help="System instruction to be used in the prompt",
)
parser.add_argument( parser.add_argument(
"--apply_chat_template", "--apply_chat_template",
action="store_true", action="store_true",
...@@ -363,6 +369,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: ...@@ -363,6 +369,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
check_integrity=args.check_integrity, check_integrity=args.check_integrity,
write_out=args.write_out, write_out=args.write_out,
log_samples=args.log_samples, log_samples=args.log_samples,
system_instruction=args.system_instruction,
apply_chat_template=args.apply_chat_template, apply_chat_template=args.apply_chat_template,
gen_kwargs=args.gen_kwargs, gen_kwargs=args.gen_kwargs,
task_manager=task_manager, task_manager=task_manager,
......
...@@ -373,6 +373,7 @@ class Task(abc.ABC): ...@@ -373,6 +373,7 @@ class Task(abc.ABC):
world_size=None, world_size=None,
cache_requests=False, cache_requests=False,
rewrite_requests_cache=False, rewrite_requests_cache=False,
system_instruction=None,
apply_chat_template=False, apply_chat_template=False,
tokenizer=None, tokenizer=None,
) -> None: ) -> None:
...@@ -423,6 +424,7 @@ class Task(abc.ABC): ...@@ -423,6 +424,7 @@ class Task(abc.ABC):
fewshot_ctx = self.fewshot_context( fewshot_ctx = self.fewshot_context(
doc, doc,
0 if self.config.num_fewshot is None else self.config.num_fewshot, 0 if self.config.num_fewshot is None else self.config.num_fewshot,
system_instruction,
apply_chat_template, apply_chat_template,
tokenizer, tokenizer,
) )
...@@ -984,6 +986,7 @@ class ConfigurableTask(Task): ...@@ -984,6 +986,7 @@ class ConfigurableTask(Task):
self, self,
doc: str, doc: str,
num_fewshot: int, num_fewshot: int,
system_instruction: str = "",
apply_chat_template: bool = False, apply_chat_template: bool = False,
tokenizer=None, tokenizer=None,
) -> str: ) -> str:
...@@ -994,6 +997,8 @@ class ConfigurableTask(Task): ...@@ -994,6 +997,8 @@ class ConfigurableTask(Task):
The document as returned from training_docs, validation_docs, or test_docs. The document as returned from training_docs, validation_docs, or test_docs.
:param num_fewshot: int :param num_fewshot: int
The number of fewshot examples to provide in the returned context string. The number of fewshot examples to provide in the returned context string.
:param system_instruction: str
System instruction to be applied to the prompt.
:param apply_chat_template: bool :param apply_chat_template: bool
Whether to apply the chat template to the fewshot context. Whether to apply the chat template to the fewshot context.
:param tokenizer: :param tokenizer:
...@@ -1001,25 +1006,43 @@ class ConfigurableTask(Task): ...@@ -1001,25 +1006,43 @@ class ConfigurableTask(Task):
:returns: str :returns: str
The fewshot context. The fewshot context.
""" """
if apply_chat_template:
labeled_examples = []
else:
labeled_examples = ""
# get task description
if description := self.config.description: if description := self.config.description:
description = utils.apply_template(self.config.description, doc) description = utils.apply_template(self.config.description, doc)
labeled_examples = [] # create system prompt based on the provided system instruction and description
if num_fewshot == 0: if system_instruction and description:
# always prepend the (possibly empty) task description system_prompt = (
if apply_chat_template: f"{system_instruction}{self.sampler.fewshot_delimiter}{description}"
labeled_examples.append({"role": "system", "content": description}) )
elif system_instruction:
system_prompt = system_instruction
elif description:
system_prompt = description
else: else:
labeled_examples = description system_prompt = ""
# add system prompt if specified
if system_prompt:
if apply_chat_template:
labeled_examples.append({"role": "system", "content": system_prompt})
else: else:
labeled_examples = system_prompt
# if few-shot - append examples after the system prompt
if num_fewshot > 0:
if apply_chat_template: if apply_chat_template:
labeled_examples = self.sampler.get_chat_context( labeled_examples = self.sampler.get_chat_context(
doc, num_fewshot, labeled_examples doc, num_fewshot, labeled_examples
) )
else: else:
labeled_examples = description + self.sampler.get_context( labeled_examples += self.sampler.get_context(doc, num_fewshot)
doc, num_fewshot
)
example = self.doc_to_text(doc) example = self.doc_to_text(doc)
if apply_chat_template: if apply_chat_template:
......
...@@ -55,6 +55,7 @@ def simple_evaluate( ...@@ -55,6 +55,7 @@ def simple_evaluate(
check_integrity: bool = False, check_integrity: bool = False,
write_out: bool = False, write_out: bool = False,
log_samples: bool = True, log_samples: bool = True,
system_instruction: str = "",
apply_chat_template: bool = False, apply_chat_template: bool = False,
gen_kwargs: Optional[str] = None, gen_kwargs: Optional[str] = None,
task_manager: Optional[TaskManager] = None, task_manager: Optional[TaskManager] = None,
...@@ -100,6 +101,8 @@ def simple_evaluate( ...@@ -100,6 +101,8 @@ def simple_evaluate(
If True, write out an example document and model input for checking task integrity If True, write out an example document and model input for checking task integrity
:param log_samples: bool :param log_samples: bool
If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
:param system_instruction: str
System instruction to be applied to the prompt
:param apply_chat_template: bool :param apply_chat_template: bool
If True, apply chat template to the prompt If True, apply chat template to the prompt
:param gen_kwargs: str :param gen_kwargs: str
...@@ -265,6 +268,7 @@ def simple_evaluate( ...@@ -265,6 +268,7 @@ def simple_evaluate(
bootstrap_iters=bootstrap_iters, bootstrap_iters=bootstrap_iters,
write_out=write_out, write_out=write_out,
log_samples=log_samples, log_samples=log_samples,
system_instruction=system_instruction,
apply_chat_template=apply_chat_template, apply_chat_template=apply_chat_template,
verbosity=verbosity, verbosity=verbosity,
) )
...@@ -321,6 +325,7 @@ def evaluate( ...@@ -321,6 +325,7 @@ def evaluate(
bootstrap_iters: Optional[int] = 100000, bootstrap_iters: Optional[int] = 100000,
write_out: bool = False, write_out: bool = False,
log_samples: bool = True, log_samples: bool = True,
system_instruction: str = "",
apply_chat_template: bool = False, apply_chat_template: bool = False,
verbosity: str = "INFO", verbosity: str = "INFO",
): ):
...@@ -338,6 +343,8 @@ def evaluate( ...@@ -338,6 +343,8 @@ def evaluate(
If True, write out an example document and model input for checking task integrity If True, write out an example document and model input for checking task integrity
:param log_samples: bool :param log_samples: bool
If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis
:param system_instruction: str
System instruction to be applied to the prompt
:param apply_chat_template: bool :param apply_chat_template: bool
If True, apply chat template to the prompt If True, apply chat template to the prompt
:return :return
...@@ -369,6 +376,7 @@ def evaluate( ...@@ -369,6 +376,7 @@ def evaluate(
world_size=lm.world_size, world_size=lm.world_size,
cache_requests=cache_requests, cache_requests=cache_requests,
rewrite_requests_cache=rewrite_requests_cache, rewrite_requests_cache=rewrite_requests_cache,
system_instruction=system_instruction,
apply_chat_template=apply_chat_template, apply_chat_template=apply_chat_template,
tokenizer=lm.tokenizer if hasattr(lm, "tokenizer") else None, tokenizer=lm.tokenizer if hasattr(lm, "tokenizer") else None,
) )
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment