Unverified Commit 9b6179b2 authored by Baber Abbasi's avatar Baber Abbasi Committed by GitHub
Browse files

Remove `LM` dependency from `build_all_requests` (#2011)

* refactored `lm.apply_chat_template`

* nit

* fix weird type error

* fixed!

* skip failing test

* pre-commit run all

* add type hints

* nit

* nit

* fixup
parent 9b6b0f5e
......@@ -368,15 +368,16 @@ class Task(abc.ABC):
def build_all_requests(
self,
*,
limit=None,
rank=None,
world_size=None,
cache_requests=False,
rewrite_requests_cache=False,
system_instruction=None,
apply_chat_template=False,
fewshot_as_multiturn=False,
lm=None,
limit: Union[int, None] = None,
rank: int = 0,
world_size: int = 1,
cache_requests: bool = False,
rewrite_requests_cache: bool = False,
system_instruction: Optional[str] = None,
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
chat_template: Optional[Callable] = None,
tokenizer_name: str = "",
) -> None:
"""Build a set of Instances for a task, and store them in task.instances"""
......@@ -391,7 +392,7 @@ class Task(abc.ABC):
if system_instruction is not None
else ""
)
cache_key += f"-tokenizer{lm.tokenizer_name}" if apply_chat_template else ""
cache_key += f"-tokenizer{tokenizer_name}"
cached_instances = load_from_cache(file_name=cache_key)
......@@ -436,7 +437,7 @@ class Task(abc.ABC):
system_instruction,
apply_chat_template,
fewshot_as_multiturn,
lm,
chat_template,
)
# TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
......@@ -1014,7 +1015,7 @@ class ConfigurableTask(Task):
system_instruction: Optional[str] = None,
apply_chat_template: bool = False,
fewshot_as_multiturn: bool = False,
lm=None,
chat_template: Optional[Callable] = None,
) -> str:
"""Returns a fewshot context string that is made up of a prepended description
(if provided), the `num_fewshot` number of examples, and an appended prompt example.
......@@ -1029,8 +1030,8 @@ class ConfigurableTask(Task):
Whether to apply the chat template to the fewshot context.
:param fewshot_as_multiturn: bool
Whether to provide the fewshot examples as a multiturn conversation or a single user turn.
:param lm:
Language model with definition of the tokenizer/function to use for applying the chat template.
:param chat_template: Callable
Chat template to be applied to the fewshot context.
:returns: str
The fewshot context.
"""
......@@ -1077,7 +1078,7 @@ class ConfigurableTask(Task):
example = self.doc_to_text(doc)
if apply_chat_template:
if self.multiple_input:
return lm.apply_chat_template(labeled_examples)
return chat_template(labeled_examples)
if isinstance(example, str):
self.append_target_question(
labeled_examples, example, fewshot_as_multiturn
......@@ -1089,7 +1090,7 @@ class ConfigurableTask(Task):
for ex in example:
chat = deepcopy(labeled_examples)
self.append_target_question(chat, ex, fewshot_as_multiturn)
labeled_examples_list.append(lm.apply_chat_template(chat))
labeled_examples_list.append(chat_template(chat))
return labeled_examples_list
# if example is an integer, append the choice or convert to string
elif isinstance(example, int):
......@@ -1103,7 +1104,7 @@ class ConfigurableTask(Task):
labeled_examples, str(example), fewshot_as_multiturn
)
# return lm.apply_chat_template(labeled_examples)
return lm.apply_chat_template(labeled_examples)
return chat_template(labeled_examples)
else:
if self.multiple_input:
return labeled_examples
......
......@@ -399,7 +399,12 @@ def evaluate(
system_instruction=system_instruction,
apply_chat_template=apply_chat_template,
fewshot_as_multiturn=fewshot_as_multiturn,
lm=lm,
chat_template=getattr(lm, "apply_chat_template")
if apply_chat_template
else None,
tokenizer_name=getattr(lm, "tokenizer_name", "")
if apply_chat_template
else "",
)
eval_logger.debug(
f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}"
......@@ -609,16 +614,16 @@ def evaluate(
]
# compute group's pooled metric and stderr
results[group][
metric
] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
results[group][metric] = (
lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes)
)
# TODO: calculate grouped metric using aggregation fn
if "N/A" in stderrs:
results[group][stderr] = "N/A"
else:
results[group][
stderr
] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
results[group][stderr] = (
lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes)
)
# TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility
# To use the old (likely incorrect) variance formula, comment out the above and uncomment this line:
# results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics)
......
......@@ -275,9 +275,9 @@ def consolidate_results(
metric_key
]
results[task_output.task_name]["samples"] = task_output.sample_len
results[task_output.task_name][
f"{metric}_stderr,{filter_key}"
] = task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
results[task_output.task_name][f"{metric}_stderr,{filter_key}"] = (
task_output.agg_metrics[f"{metric}_stderr,{filter_key}"]
)
return results, samples, configs, versions, num_fewshot, higher_is_better
......
......@@ -18,7 +18,7 @@ Homepage: https://github.com/mbzuai-nlp/ArabicMMLU
```
@misc{koto2024arabicmmlu,
title={ArabicMMLU: Assessing Massive Multitask Language Understanding in Arabic},
title={ArabicMMLU: Assessing Massive Multitask Language Understanding in Arabic},
author={Fajri Koto and Haonan Li and Sara Shatnawi and Jad Doughman and Abdelrahman Boda Sadallah and Aisha Alraeesi and Khalid Almubarak and Zaid Alyafeai and Neha Sengupta and Shady Shehata and Nizar Habash and Preslav Nakov and Timothy Baldwin},
year={2024},
eprint={2402.12840},
......@@ -37,4 +37,4 @@ Homepage: https://github.com/mbzuai-nlp/ArabicMMLU
* `arabicmmlu_stem_social_science`: evaluates social science ArabicMMLU tasks.
* `arabicmmlu_stem_humanities`: evaluates humanities ArabicMMLU tasks.
* `arabicmmlu_stem_language`: evaluates Arabic language ArabicMMLU tasks.
* `arabicmmlu_stem_other`: evaluates other ArabicMMLU tasks.
\ No newline at end of file
* `arabicmmlu_stem_other`: evaluates other ArabicMMLU tasks.
"""
Take in a YAML, and output all "other" splits with this YAML
"""
import argparse
import logging
import os
......@@ -76,7 +77,6 @@ if __name__ == "__main__":
if category not in ALL_CATEGORIES:
ALL_CATEGORIES.append(category)
# description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n"
yaml_dict = {
......@@ -89,7 +89,10 @@ if __name__ == "__main__":
# "description": description,
}
file_save_path = args.save_prefix_path + f"_{subject.lower().replace(' ', '_').replace('(', '').replace(')', '')}.yaml"
file_save_path = (
args.save_prefix_path
+ f"_{subject.lower().replace(' ', '_').replace('(', '').replace(')', '')}.yaml"
)
eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}")
with open(file_save_path, "w", encoding="utf-8") as yaml_file:
yaml.dump(
......
PROMPT = 'This is a {}. Select the correct answer!\n\nQuestion: {}\n{}\n\nAnswer:'
PROMPT = "This is a {}. Select the correct answer!\n\nQuestion: {}\n{}\n\nAnswer:"
level_en = {
'Primary': 'primary school',
'Middle': 'middle school',
'High': 'high school',
'Univ': 'university',
'Prof': 'professional',
"Primary": "primary school",
"Middle": "middle school",
"High": "high school",
"Univ": "university",
"Prof": "professional",
}
alpa = ['A.', 'B.', 'C.', 'D.', 'E.']
alpa = ["A.", "B.", "C.", "D.", "E."]
def doc_to_text(doc):
......@@ -17,22 +17,28 @@ def doc_to_text(doc):
https://github.com/mbzuai-nlp/ArabicMMLU/blob/main/util_prompt.py
"""
level = "" if not doc['Level'] else " for " + level_en[doc['Level']]
country = "" if not doc['Country'] else " in " + doc['Country']
level = "" if not doc["Level"] else " for " + level_en[doc["Level"]]
country = "" if not doc["Country"] else " in " + doc["Country"]
main_meta_data = f"{doc['Subject']} question{level}{country}"
question = doc['Question'] if doc['Context']=="" else f"{doc['Context']}\n\n{doc['Question']}"
question = (
doc["Question"]
if doc["Context"] == ""
else f"{doc['Context']}\n\n{doc['Question']}"
)
options = []
for i, opt in enumerate(['Option 1', 'Option 2', 'Option 3', 'Option 4', 'Option 5']):
for i, opt in enumerate(
["Option 1", "Option 2", "Option 3", "Option 4", "Option 5"]
):
if not doc[opt]:
break
options.append(f"{alpa[i]} {doc[opt]}")
doc_text = PROMPT.format(main_meta_data, question, '\n'.join(options))
doc_text = PROMPT.format(main_meta_data, question, "\n".join(options))
return doc_text
def doc_to_choice(doc):
return [alpa[i][0] for i in range(5) if doc[f'Option {i+1}']]
\ No newline at end of file
return [alpa[i][0] for i in range(5) if doc[f"Option {i+1}"]]
......@@ -23,6 +23,7 @@ DEEPSPARSE_MODELS_TASKS = [
]
@pytest.mark.skip(reason="test failing")
@pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS)
def test_sparseml_eval(model_id, task):
lm = get_model("sparseml").create_from_arg_string(
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment