Unverified Commit 9b6179b2 authored by Baber Abbasi's avatar Baber Abbasi Committed by GitHub
Browse files

Remove `LM` dependency from `build_all_requests` (#2011)

* refactored `lm.apply_chat_template`

* nit

* fix weird type error

* fixed!

* skip failing test

* pre-commit run all

* add type hints

* nit

* nit

* fixup
parent 9b6b0f5e
...@@ -368,15 +368,16 @@ class Task(abc.ABC): ...@@ -368,15 +368,16 @@ class Task(abc.ABC):
def build_all_requests( def build_all_requests(
self, self,
*, *,
limit=None, limit: Union[int, None] = None,
rank=None, rank: int = 0,
world_size=None, world_size: int = 1,
cache_requests=False, cache_requests: bool = False,
rewrite_requests_cache=False, rewrite_requests_cache: bool = False,
system_instruction=None, system_instruction: Optional[str] = None,
apply_chat_template=False, apply_chat_template: bool = False,
fewshot_as_multiturn=False, fewshot_as_multiturn: bool = False,
lm=None, chat_template: Optional[Callable] = None,
tokenizer_name: str = "",
) -> None: ) -> None:
"""Build a set of Instances for a task, and store them in task.instances""" """Build a set of Instances for a task, and store them in task.instances"""
...@@ -391,7 +392,7 @@ class Task(abc.ABC): ...@@ -391,7 +392,7 @@ class Task(abc.ABC):
if system_instruction is not None if system_instruction is not None
else "" else ""
) )
cache_key += f"-tokenizer{lm.tokenizer_name}" if apply_chat_template else "" cache_key += f"-tokenizer{tokenizer_name}"
cached_instances = load_from_cache(file_name=cache_key) cached_instances = load_from_cache(file_name=cache_key)
...@@ -436,7 +437,7 @@ class Task(abc.ABC): ...@@ -436,7 +437,7 @@ class Task(abc.ABC):
system_instruction, system_instruction,
apply_chat_template, apply_chat_template,
fewshot_as_multiturn, fewshot_as_multiturn,
lm, chat_template,
) )
# TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
...@@ -1014,7 +1015,7 @@ class ConfigurableTask(Task): ...@@ -1014,7 +1015,7 @@ class ConfigurableTask(Task):
system_instruction: Optional[str] = None, system_instruction: Optional[str] = None,
apply_chat_template: bool = False, apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False, fewshot_as_multiturn: bool = False,
lm=None, chat_template: Optional[Callable] = None,
) -> str: ) -> str:
"""Returns a fewshot context string that is made up of a prepended description """Returns a fewshot context string that is made up of a prepended description
(if provided), the `num_fewshot` number of examples, and an appended prompt example. (if provided), the `num_fewshot` number of examples, and an appended prompt example.
...@@ -1029,8 +1030,8 @@ class ConfigurableTask(Task): ...@@ -1029,8 +1030,8 @@ class ConfigurableTask(Task):
Whether to apply the chat template to the fewshot context. Whether to apply the chat template to the fewshot context.
:param fewshot_as_multiturn: bool :param fewshot_as_multiturn: bool
Whether to provide the fewshot examples as a multiturn conversation or a single user turn. Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
:param lm: :param chat_template: Callable
Language model with definition of the tokenizer/function to use for applying the chat template. Chat template to be applied to the fewshot context.
:returns: str :returns: str
The fewshot context. The fewshot context.
""" """
...@@ -1077,7 +1078,7 @@ class ConfigurableTask(Task): ...@@ -1077,7 +1078,7 @@ class ConfigurableTask(Task):
example = self.doc_to_text(doc) example = self.doc_to_text(doc)
if apply_chat_template: if apply_chat_template:
if self.multiple_input: if self.multiple_input:
return lm.apply_chat_template(labeled_examples) return chat_template(labeled_examples)
if isinstance(example, str): if isinstance(example, str):
self.append_target_question( self.append_target_question(
labeled_examples, example, fewshot_as_multiturn labeled_examples, example, fewshot_as_multiturn
...@@ -1089,7 +1090,7 @@ class ConfigurableTask(Task): ...@@ -1089,7 +1090,7 @@ class ConfigurableTask(Task):
for ex in example: for ex in example:
chat = deepcopy(labeled_examples) chat = deepcopy(labeled_examples)
self.append_target_question(chat, ex, fewshot_as_multiturn) self.append_target_question(chat, ex, fewshot_as_multiturn)
labeled_examples_list.append(lm.apply_chat_template(chat)) labeled_examples_list.append(chat_template(chat))
return labeled_examples_list return labeled_examples_list
# if example is an integer, append the choice or convert to string # if example is an integer, append the choice or convert to string
elif isinstance(example, int): elif isinstance(example, int):
...@@ -1103,7 +1104,7 @@ class ConfigurableTask(Task): ...@@ -1103,7 +1104,7 @@ class ConfigurableTask(Task):
labeled_examples, str(example), fewshot_as_multiturn labeled_examples, str(example), fewshot_as_multiturn
) )
# return lm.apply_chat_template(labeled_examples) # return lm.apply_chat_template(labeled_examples)
return lm.apply_chat_template(labeled_examples) return chat_template(labeled_examples)
else: else:
if self.multiple_input: if self.multiple_input:
return labeled_examples return labeled_examples
......
...@@ -399,7 +399,12 @@ def evaluate( ...@@ -399,7 +399,12 @@ def evaluate(
system_instruction=system_instruction, system_instruction=system_instruction,
apply_chat_template=apply_chat_template, apply_chat_template=apply_chat_template,
fewshot_as_multiturn=fewshot_as_multiturn, fewshot_as_multiturn=fewshot_as_multiturn,
lm=lm, chat_template=getattr(lm, "apply_chat_template")
if apply_chat_template
else None,
tokenizer_name=getattr(lm, "tokenizer_name", "")
if apply_chat_template
else "",
) )
eval_logger.debug( eval_logger.debug(
f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}" f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}"
...@@ -609,16 +614,16 @@ def evaluate( ...@@ -609,16 +614,16 @@ def evaluate(
] ]
# compute group's pooled metric and stderr # compute group's pooled metric and stderr
results[group][ results[group][metric] = (
metric lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes) )
# TODO: calculate grouped metric using aggregation fn # TODO: calculate grouped metric using aggregation fn
if "N/A" in stderrs: if "N/A" in stderrs:
results[group][stderr] = "N/A" results[group][stderr] = "N/A"
else: else:
results[group][ results[group][stderr] = (
stderr lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes) )
# TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
# To use the old (likely incorrect) variance formula, comment out the above and uncomment this line: # To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
# results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics) # results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
......
...@@ -275,9 +275,9 @@ def consolidate_results( ...@@ -275,9 +275,9 @@ def consolidate_results(
metric_key metric_key
] ]
results[task_output.task_name]["samples"] = task_output.sample_len results[task_output.task_name]["samples"] = task_output.sample_len
results[task_output.task_name][ results[task_output.task_name][f"{metric}_stderr,{filter_key}"] = (
f"{metric}_stderr,{filter_key}" task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
] = task_output.agg_metrics[f"{metric}_stderr,{filter_key}"] )
return results, samples, configs, versions, num_fewshot, higher_is_better return results, samples, configs, versions, num_fewshot, higher_is_better
......
...@@ -18,7 +18,7 @@ Homepage: https://github.com/mbzuai-nlp/ArabicMMLU ...@@ -18,7 +18,7 @@ Homepage: https://github.com/mbzuai-nlp/ArabicMMLU
``` ```
@misc{koto2024arabicmmlu, @misc{koto2024arabicmmlu,
title={ArabicMMLU: Assessing Massive Multitask Language Understanding in Arabic}, title={ArabicMMLU: Assessing Massive Multitask Language Understanding in Arabic},
author={Fajri Koto and Haonan Li and Sara Shatnawi and Jad Doughman and Abdelrahman Boda Sadallah and Aisha Alraeesi and Khalid Almubarak and Zaid Alyafeai and Neha Sengupta and Shady Shehata and Nizar Habash and Preslav Nakov and Timothy Baldwin}, author={Fajri Koto and Haonan Li and Sara Shatnawi and Jad Doughman and Abdelrahman Boda Sadallah and Aisha Alraeesi and Khalid Almubarak and Zaid Alyafeai and Neha Sengupta and Shady Shehata and Nizar Habash and Preslav Nakov and Timothy Baldwin},
year={2024}, year={2024},
eprint={2402.12840}, eprint={2402.12840},
...@@ -37,4 +37,4 @@ Homepage: https://github.com/mbzuai-nlp/ArabicMMLU ...@@ -37,4 +37,4 @@ Homepage: https://github.com/mbzuai-nlp/ArabicMMLU
* `arabicmmlu_stem_social_science`: evaluates social science ArabicMMLU tasks. * `arabicmmlu_stem_social_science`: evaluates social science ArabicMMLU tasks.
* `arabicmmlu_stem_humanities`: evaluates humanities ArabicMMLU tasks. * `arabicmmlu_stem_humanities`: evaluates humanities ArabicMMLU tasks.
* `arabicmmlu_stem_language`: evaluates Arabic language ArabicMMLU tasks. * `arabicmmlu_stem_language`: evaluates Arabic language ArabicMMLU tasks.
* `arabicmmlu_stem_other`: evaluates other ArabicMMLU tasks. * `arabicmmlu_stem_other`: evaluates other ArabicMMLU tasks.
\ No newline at end of file
""" """
Take in a YAML, and output all "other" splits with this YAML Take in a YAML, and output all "other" splits with this YAML
""" """
import argparse import argparse
import logging import logging
import os import os
...@@ -76,7 +77,6 @@ if __name__ == "__main__": ...@@ -76,7 +77,6 @@ if __name__ == "__main__":
if category not in ALL_CATEGORIES: if category not in ALL_CATEGORIES:
ALL_CATEGORIES.append(category) ALL_CATEGORIES.append(category)
# description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n" # description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n"
yaml_dict = { yaml_dict = {
...@@ -89,7 +89,10 @@ if __name__ == "__main__": ...@@ -89,7 +89,10 @@ if __name__ == "__main__":
# "description": description, # "description": description,
} }
file_save_path = args.save_prefix_path + f"_{subject.lower().replace(' ', '_').replace('(', '').replace(')', '')}.yaml" file_save_path = (
args.save_prefix_path
+ f"_{subject.lower().replace(' ', '_').replace('(', '').replace(')', '')}.yaml"
)
eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}") eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
with open(file_save_path, "w", encoding="utf-8") as yaml_file: with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump( yaml.dump(
......
PROMPT = 'This is a {}. Select the correct answer!\n\nQuestion: {}\n{}\n\nAnswer:' PROMPT = "This is a {}. Select the correct answer!\n\nQuestion: {}\n{}\n\nAnswer:"
level_en = { level_en = {
'Primary': 'primary school', "Primary": "primary school",
'Middle': 'middle school', "Middle": "middle school",
'High': 'high school', "High": "high school",
'Univ': 'university', "Univ": "university",
'Prof': 'professional', "Prof": "professional",
} }
alpa = ['A.', 'B.', 'C.', 'D.', 'E.'] alpa = ["A.", "B.", "C.", "D.", "E."]
def doc_to_text(doc): def doc_to_text(doc):
...@@ -17,22 +17,28 @@ def doc_to_text(doc): ...@@ -17,22 +17,28 @@ def doc_to_text(doc):
https://github.com/mbzuai-nlp/ArabicMMLU/blob/main/util_prompt.py https://github.com/mbzuai-nlp/ArabicMMLU/blob/main/util_prompt.py
""" """
level = "" if not doc['Level'] else " for " + level_en[doc['Level']] level = "" if not doc["Level"] else " for " + level_en[doc["Level"]]
country = "" if not doc['Country'] else " in " + doc['Country'] country = "" if not doc["Country"] else " in " + doc["Country"]
main_meta_data = f"{doc['Subject']} question{level}{country}" main_meta_data = f"{doc['Subject']} question{level}{country}"
question = doc['Question'] if doc['Context']=="" else f"{doc['Context']}\n\n{doc['Question']}" question = (
doc["Question"]
if doc["Context"] == ""
else f"{doc['Context']}\n\n{doc['Question']}"
)
options = [] options = []
for i, opt in enumerate(['Option 1', 'Option 2', 'Option 3', 'Option 4', 'Option 5']): for i, opt in enumerate(
["Option 1", "Option 2", "Option 3", "Option 4", "Option 5"]
):
if not doc[opt]: if not doc[opt]:
break break
options.append(f"{alpa[i]} {doc[opt]}") options.append(f"{alpa[i]} {doc[opt]}")
doc_text = PROMPT.format(main_meta_data, question, '\n'.join(options)) doc_text = PROMPT.format(main_meta_data, question, "\n".join(options))
return doc_text return doc_text
def doc_to_choice(doc): def doc_to_choice(doc):
return [alpa[i][0] for i in range(5) if doc[f'Option {i+1}']] return [alpa[i][0] for i in range(5) if doc[f"Option {i+1}"]]
\ No newline at end of file
...@@ -23,6 +23,7 @@ DEEPSPARSE_MODELS_TASKS = [ ...@@ -23,6 +23,7 @@ DEEPSPARSE_MODELS_TASKS = [
] ]
@pytest.mark.skip(reason="test failing")
@pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS) @pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS)
def test_sparseml_eval(model_id, task): def test_sparseml_eval(model_id, task):
lm = get_model("sparseml").create_from_arg_string( lm = get_model("sparseml").create_from_arg_string(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment