Commit 921c4d62 authored by Konrad

fewshot as multiturn

parent 3369f887
@@ -48,6 +48,8 @@ This mode supports a number of command-line arguments, the details of which can
- `--apply_chat_template` : If this flag is on, a chat template will be applied to the prompt. For Hugging Face models, the chat template is taken from the tokenizer; if the tokenizer does not have a chat template, a default one will be applied. For other models, a generic chat template is used.
- `--fewshot_as_multiturn` : If this flag is on, the fewshot examples are treated as a multi-turn conversation: questions are provided as user turns and answers as assistant responses. Requires `--num_fewshot` to be set to a value greater than 0 and `--apply_chat_template` to be on (see the example below).
- `--predict_only`: Generates the model outputs without computing metrics. Use with `--log_samples` to retrieve decoded results.
* `--seed`: Set seed for python's random, numpy and torch. Accepts a comma-separated list of 3 values for python's random, numpy, and torch seeds, respectively, or a single integer to set the same seed for all three. Each value is either an integer or 'None' to leave that library's seed unset. Default is `0,1234,1234` (for backward compatibility). E.g. `--seed 0,None,8` sets `random.seed(0)` and `torch.manual_seed(8)`; numpy's seed is not set since the second value is `None`. E.g. `--seed 42` sets all three seeds to 42.
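For illustration, a hypothetical invocation that combines the two flags to render five fewshot examples as alternating user/assistant turns might look like the following (the model checkpoint and task name are placeholders):

```bash
# Hypothetical example: the model checkpoint and task are placeholders.
lm_eval --model hf \
    --model_args pretrained=meta-llama/Llama-2-7b-chat-hf \
    --tasks gsm8k \
    --num_fewshot 5 \
    --apply_chat_template \
    --fewshot_as_multiturn
```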
...
@@ -174,6 +174,12 @@ def setup_parser() -> argparse.ArgumentParser:
        default=False,
        help="If True, applies the chat template to the prompt",
    )
    parser.add_argument(
        "--fewshot_as_multiturn",
        action="store_true",
        default=False,
        help="If True, uses the fewshot as a multi-turn conversation",
    )
    parser.add_argument(
        "--show_config",
        action="store_true",
@@ -282,6 +288,11 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
            "Specify --output_path if providing --log_samples or --predict_only"
        )
    if args.fewshot_as_multiturn and args.apply_chat_template is False:
        raise ValueError(
            "If fewshot_as_multiturn is set, apply_chat_template must be set to True."
        )
    if args.include_path is not None:
        eval_logger.info(f"Including path: {args.include_path}")
    task_manager = TaskManager(args.verbosity, include_path=args.include_path)
@@ -371,6 +382,7 @@ def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None:
            log_samples=args.log_samples,
            system_instruction=args.system_instruction,
            apply_chat_template=args.apply_chat_template,
            fewshot_as_multiturn=args.fewshot_as_multiturn,
            gen_kwargs=args.gen_kwargs,
            task_manager=task_manager,
            verbosity=args.verbosity,
...
@@ -35,40 +35,32 @@ class ContextSampler:
        # TODO: should we just stop people from using fewshot from same split as evaluating?
        selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]

        labeled_examples = ""
        for doc in selected_docs:
            doc_content = self.doc_to_text(doc)
            doc_target = self.doc_to_target(doc)
            labeled_examples += (
                doc_content
                if self.config.doc_to_choice is None or isinstance(doc_content, str)
                else self.doc_to_choice(doc)[doc_content]
            )
            labeled_examples += self.target_delimiter
            labeled_examples += (
                str(doc_target[0])
                if isinstance(doc_target, list)
                else doc_target
                if self.config.doc_to_choice is None or isinstance(doc_target, str)
                else str(self.doc_to_choice(doc)[doc_target])
            )
            labeled_examples += self.fewshot_delimiter

        return labeled_examples

    def get_chat_context(
        self,
        doc,
        num_fewshot,
        fewshot_as_multiturn: bool = False,
        chat_history: list = [],
    ):
        # draw an extra fewshot sample if using same split as evaluating on
@@ -84,31 +76,36 @@ class ContextSampler:
        # TODO: should we just stop people from using fewshot from same split as evaluating?
        selected_docs = [x for x in fewshotex if x != doc][:num_fewshot]

        if fewshot_as_multiturn:
            for doc in selected_docs:
                doc_content = self.doc_to_text(doc)
                doc_target = self.doc_to_target(doc)
                chat_history.append(
                    {
                        "role": "user",
                        "content": doc_content
                        if self.config.doc_to_choice is None
                        or isinstance(doc_content, str)
                        else self.doc_to_choice(doc)[doc_content],
                    }
                )
                chat_history.append(
                    {
                        "role": "assistant",
                        "content": str(doc_target[0])
                        if isinstance(doc_target, list)
                        else doc_target
                        if self.config.doc_to_choice is None
                        or isinstance(doc_target, str)
                        else str(self.doc_to_choice(doc)[doc_target]),
                    }
                )
        else:
            # get fewshot context as one user turn
            chat_history.append(
                {"role": "user", "content": self.get_context(doc, num_fewshot)}
            )

        return chat_history

    def sample(self, n):
...
@@ -375,6 +375,7 @@ class Task(abc.ABC):
        rewrite_requests_cache=False,
        system_instruction=None,
        apply_chat_template=False,
        fewshot_as_multiturn=False,
        tokenizer=None,
    ) -> None:
        """Build a set of Instances for a task, and store them in task.instances"""
@@ -426,6 +427,7 @@ class Task(abc.ABC):
                0 if self.config.num_fewshot is None else self.config.num_fewshot,
                system_instruction,
                apply_chat_template,
                fewshot_as_multiturn,
                tokenizer,
            )
@@ -988,6 +990,7 @@ class ConfigurableTask(Task):
        num_fewshot: int,
        system_instruction: str = None,
        apply_chat_template: bool = False,
        fewshot_as_multiturn: bool = False,
        tokenizer=None,
    ) -> str:
        """Returns a fewshot context string that is made up of a prepended description
@@ -1001,6 +1004,8 @@ class ConfigurableTask(Task):
            System instruction to be applied to the prompt.
        :param apply_chat_template: bool
            Whether to apply the chat template to the fewshot context.
        :param fewshot_as_multiturn: bool
            Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
        :param tokenizer:
            The tokenizer to use for applying the chat template.
        :returns: str
@@ -1039,7 +1044,7 @@ class ConfigurableTask(Task):
        if num_fewshot > 0:
            if apply_chat_template:
                labeled_examples = self.sampler.get_chat_context(
                    doc, num_fewshot, fewshot_as_multiturn, labeled_examples
                )
            else:
                labeled_examples += self.sampler.get_context(doc, num_fewshot)
...
@@ -57,6 +57,7 @@ def simple_evaluate(
    log_samples: bool = True,
    system_instruction: str = None,
    apply_chat_template: bool = False,
    fewshot_as_multiturn: bool = False,
    gen_kwargs: Optional[str] = None,
    task_manager: Optional[TaskManager] = None,
    verbosity: str = "INFO",
@@ -105,6 +106,8 @@ def simple_evaluate(
        System instruction to be applied to the prompt
    :param apply_chat_template: bool
        If True, apply chat template to the prompt
    :param fewshot_as_multiturn: bool
        Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
    :param gen_kwargs: str
        String arguments for model generation
        Ignored for all tasks with loglikelihood output_type
@@ -270,6 +273,7 @@ def simple_evaluate(
        log_samples=log_samples,
        system_instruction=system_instruction,
        apply_chat_template=apply_chat_template,
        fewshot_as_multiturn=fewshot_as_multiturn,
        verbosity=verbosity,
    )
@@ -327,6 +331,7 @@ def evaluate(
    log_samples: bool = True,
    system_instruction: str = None,
    apply_chat_template: bool = False,
    fewshot_as_multiturn: bool = False,
    verbosity: str = "INFO",
):
    """Instantiate and evaluate a model on a list of tasks.
@@ -347,6 +352,8 @@ def evaluate(
        System instruction to be applied to the prompt
    :param apply_chat_template: bool
        If True, apply chat template to the prompt
    :param fewshot_as_multiturn: bool
        Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
    :return
        Dictionary of results
    """
@@ -378,6 +385,7 @@ def evaluate(
            rewrite_requests_cache=rewrite_requests_cache,
            system_instruction=system_instruction,
            apply_chat_template=apply_chat_template,
            fewshot_as_multiturn=fewshot_as_multiturn,
            tokenizer=lm.tokenizer if hasattr(lm, "tokenizer") else None,
        )
        eval_logger.debug(
...