From 536691da2444bd35b76d3f9c9527126273a63251 Mon Sep 17 00:00:00 2001 From: achervyakov <77295913+artemorloff@users.noreply.github.com> Date: Mon, 24 Jun 2024 17:08:54 +0300 Subject: [PATCH 01/45] add tokenizer logs info (#1731) * add tokenizer logs info * add no tokenizer case * Update lm_eval/logging_utils.py Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> * Update lm_eval/logging_utils.py Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> * add updates * fix conflict --------- Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> --- lm_eval/evaluator.py | 3 ++- lm_eval/loggers/utils.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 2378b3ea..d3b15c8c 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -22,7 +22,7 @@ from lm_eval.evaluator_utils import ( run_task_tests, ) from lm_eval.loggers import EvaluationTracker -from lm_eval.loggers.utils import add_env_info, get_git_commit_hash +from lm_eval.logging.utils import add_env_info, add_tokenizer_info, get_git_commit_hash from lm_eval.tasks import TaskManager, get_task_dict from lm_eval.utils import ( eval_logger, @@ -326,6 +326,7 @@ def simple_evaluate( results["git_hash"] = get_git_commit_hash() results["date"] = start_date add_env_info(results) # additional environment info to results + add_tokenizer_info(results, lm) # additional info about tokenizer return results else: return None diff --git a/lm_eval/loggers/utils.py b/lm_eval/loggers/utils.py index fd47c9ab..670cdbff 100644 --- a/lm_eval/loggers/utils.py +++ b/lm_eval/loggers/utils.py @@ -110,3 +110,20 @@ def add_env_info(storage: Dict[str, Any]): "upper_git_hash": upper_dir_commit, # in case this repo is submodule } storage.update(added_info) + + +def add_tokenizer_info(storage: Dict[str, Any], lm): + if getattr(lm, "tokenizer", False): + tokenizer_info = { + "tokenizer_pad_token": [lm.tokenizer.pad_token, lm.tokenizer.pad_token_id], + "tokenizer_eos_token": [lm.tokenizer.eos_token, lm.tokenizer.eos_token_id], + "tokenizer_bos_token": [lm.tokenizer.bos_token, lm.tokenizer.bos_token_id], + "eot_token_id": getattr(lm, "eot_token_id", None), + "max_length": getattr(lm, "max_length", None), + } + storage.update(tokenizer_info) + # seems gguf and textsynth do not have tokenizer + else: + logger.debug( + "LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results." 
+ ) -- GitLab From 0ae3d3eb5c5bdd73fa1f3bfd979866b5110324dd Mon Sep 17 00:00:00 2001 From: Stella Biderman Date: Mon, 24 Jun 2024 11:13:47 -0400 Subject: [PATCH 02/45] Hotfix breaking import (#2015) --- lm_eval/evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index d3b15c8c..76ae5d15 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -22,7 +22,7 @@ from lm_eval.evaluator_utils import ( run_task_tests, ) from lm_eval.loggers import EvaluationTracker -from lm_eval.logging.utils import add_env_info, add_tokenizer_info, get_git_commit_hash +from lm_eval.loggers.utils import add_env_info, add_tokenizer_info, get_git_commit_hash from lm_eval.tasks import TaskManager, get_task_dict from lm_eval.utils import ( eval_logger, -- GitLab From 9b6b0f5e0b309563705376abe526887f65db7dd3 Mon Sep 17 00:00:00 2001 From: jonabur <135807120+jonabur@users.noreply.github.com> Date: Tue, 25 Jun 2024 17:52:25 +0300 Subject: [PATCH 03/45] add arc_challenge_mt (#1900) * add arc_challenge_mt * add README * add icelandic --- lm_eval/tasks/arc_mt/README.md | 12 ++++++++++ lm_eval/tasks/arc_mt/arc_challenge_mt_da.yaml | 3 +++ lm_eval/tasks/arc_mt/arc_challenge_mt_de.yaml | 3 +++ lm_eval/tasks/arc_mt/arc_challenge_mt_el.yaml | 3 +++ lm_eval/tasks/arc_mt/arc_challenge_mt_es.yaml | 3 +++ lm_eval/tasks/arc_mt/arc_challenge_mt_fi.yaml | 23 +++++++++++++++++++ lm_eval/tasks/arc_mt/arc_challenge_mt_hu.yaml | 3 +++ lm_eval/tasks/arc_mt/arc_challenge_mt_is.yaml | 22 ++++++++++++++++++ lm_eval/tasks/arc_mt/arc_challenge_mt_it.yaml | 3 +++ lm_eval/tasks/arc_mt/arc_challenge_mt_nb.yaml | 3 +++ lm_eval/tasks/arc_mt/arc_challenge_mt_pl.yaml | 3 +++ lm_eval/tasks/arc_mt/arc_challenge_mt_pt.yaml | 3 +++ lm_eval/tasks/arc_mt/arc_challenge_mt_sv.yaml | 3 +++ 13 files changed, 87 insertions(+) create mode 100644 lm_eval/tasks/arc_mt/README.md create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_da.yaml create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_de.yaml create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_el.yaml create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_es.yaml create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_fi.yaml create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_hu.yaml create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_is.yaml create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_it.yaml create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_nb.yaml create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_pl.yaml create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_pt.yaml create mode 100644 lm_eval/tasks/arc_mt/arc_challenge_mt_sv.yaml diff --git a/lm_eval/tasks/arc_mt/README.md b/lm_eval/tasks/arc_mt/README.md new file mode 100644 index 00000000..5e1c6e40 --- /dev/null +++ b/lm_eval/tasks/arc_mt/README.md @@ -0,0 +1,12 @@ +# arc mt + +arc mt is an implementation of tasks to support machine translated arc +challenge evals, to improve eval support across a number of additional +languages. + +The main page for the effort is +[here](https://huggingface.co/datasets/LumiOpen/arc_challenge_mt) and we will +include more data and analysis there. + +Initial datasets include a number of European languages, and we plan to expand +more in the future. 
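As a quick usage sketch (not part of the patch itself), any of the new `arc_challenge_mt_*` tasks can be run through the harness's Python API once the YAML configs below are in place; the model choice and few-shot setting here are illustrative assumptions, not values prescribed by this patch.

```python
import lm_eval

# Minimal sketch: evaluate the Finnish machine-translated ARC Challenge split.
# Any other arc_challenge_mt_* task added in this patch can be listed instead.
results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-70m",  # illustrative model, also used in this repo's tests
    tasks=["arc_challenge_mt_fi"],
    num_fewshot=0,
)
print(results["results"]["arc_challenge_mt_fi"])
```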
diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_da.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_da.yaml new file mode 100644 index 00000000..f3efdc4c --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_da.yaml @@ -0,0 +1,3 @@ +include: arc_challenge_mt_fi.yaml +task: arc_challenge_mt_da +dataset_name: da diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_de.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_de.yaml new file mode 100644 index 00000000..36fdf7be --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_de.yaml @@ -0,0 +1,3 @@ +include: arc_challenge_mt_fi.yaml +task: arc_challenge_mt_de +dataset_name: de diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_el.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_el.yaml new file mode 100644 index 00000000..d97580b0 --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_el.yaml @@ -0,0 +1,3 @@ +include: arc_challenge_mt_fi.yaml +task: arc_challenge_mt_el +dataset_name: el diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_es.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_es.yaml new file mode 100644 index 00000000..7dffc6c7 --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_es.yaml @@ -0,0 +1,3 @@ +include: arc_challenge_mt_fi.yaml +task: arc_challenge_mt_es +dataset_name: es diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_fi.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_fi.yaml new file mode 100644 index 00000000..d2032e34 --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_fi.yaml @@ -0,0 +1,23 @@ +group: + - arc_challenge_mt +task: arc_challenge_mt_fi +dataset_path: LumiOpen/arc_challenge_mt +dataset_name: fi +output_type: multiple_choice +training_split: train +validation_split: validation +test_split: test +doc_to_text: "Question: {{question}}\nAnswer:" +doc_to_target: "{{choices.label.index(answerKey)}}" +doc_to_choice: "{{choices.text}}" +should_decontaminate: true +doc_to_decontamination_query: "Question: {{question}}\nAnswer:" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_hu.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_hu.yaml new file mode 100644 index 00000000..03d5ac17 --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_hu.yaml @@ -0,0 +1,3 @@ +include: arc_challenge_mt_fi.yaml +task: arc_challenge_mt_hu +dataset_name: hu diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_is.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_is.yaml new file mode 100644 index 00000000..1591d7eb --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_is.yaml @@ -0,0 +1,22 @@ +group: + - arc_challenge_mt +task: arc_challenge_mt_is +dataset_path: mideind/icelandic-arc-challenge +output_type: multiple_choice +training_split: train +validation_split: validation +test_split: test +doc_to_text: "Question: {{question}}\nAnswer:" +doc_to_target: "{{choices.label.index(answerKey)}}" +doc_to_choice: "{{choices.text}}" +should_decontaminate: true +doc_to_decontamination_query: "Question: {{question}}\nAnswer:" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_it.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_it.yaml new file mode 100644 index 00000000..995f7a3d --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_it.yaml @@ -0,0 +1,3 @@ +include: arc_challenge_mt_fi.yaml +task: 
arc_challenge_mt_it +dataset_name: it diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_nb.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_nb.yaml new file mode 100644 index 00000000..aceaa14b --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_nb.yaml @@ -0,0 +1,3 @@ +include: arc_challenge_mt_fi.yaml +task: arc_challenge_mt_nb +dataset_name: nb diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_pl.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_pl.yaml new file mode 100644 index 00000000..3b9a332f --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_pl.yaml @@ -0,0 +1,3 @@ +include: arc_challenge_mt_fi.yaml +task: arc_challenge_mt_pl +dataset_name: pl diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_pt.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_pt.yaml new file mode 100644 index 00000000..748743fc --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_pt.yaml @@ -0,0 +1,3 @@ +include: arc_challenge_mt_fi.yaml +task: arc_challenge_mt_pt +dataset_name: pt diff --git a/lm_eval/tasks/arc_mt/arc_challenge_mt_sv.yaml b/lm_eval/tasks/arc_mt/arc_challenge_mt_sv.yaml new file mode 100644 index 00000000..09d97c51 --- /dev/null +++ b/lm_eval/tasks/arc_mt/arc_challenge_mt_sv.yaml @@ -0,0 +1,3 @@ +include: arc_challenge_mt_fi.yaml +task: arc_challenge_mt_sv +dataset_name: sv -- GitLab From 9b6179b2941d42e2540dbffc9c76fb56e596f644 Mon Sep 17 00:00:00 2001 From: Baber Abbasi <92168766+baberabb@users.noreply.github.com> Date: Tue, 25 Jun 2024 20:36:10 +0500 Subject: [PATCH 04/45] Remove `LM` dependency from `build_all_requests` (#2011) * refactored `lm.apply_chat_template` * nit * fix weird type error * fixed! * skip failing test * pre-commit run all * add type hints * nit * nit * fixup --- lm_eval/api/task.py | 35 ++++++++++--------- lm_eval/evaluator.py | 19 ++++++---- lm_eval/evaluator_utils.py | 6 ++-- lm_eval/tasks/arabicmmlu/README.md | 4 +-- lm_eval/tasks/arabicmmlu/_generate_configs.py | 7 ++-- lm_eval/tasks/arabicmmlu/utils.py | 34 ++++++++++-------- tests/models/test_neuralmagic.py | 1 + 7 files changed, 61 insertions(+), 45 deletions(-) diff --git a/lm_eval/api/task.py b/lm_eval/api/task.py index b5cc1a85..ccfda509 100644 --- a/lm_eval/api/task.py +++ b/lm_eval/api/task.py @@ -368,15 +368,16 @@ class Task(abc.ABC): def build_all_requests( self, *, - limit=None, - rank=None, - world_size=None, - cache_requests=False, - rewrite_requests_cache=False, - system_instruction=None, - apply_chat_template=False, - fewshot_as_multiturn=False, - lm=None, + limit: Union[int, None] = None, + rank: int = 0, + world_size: int = 1, + cache_requests: bool = False, + rewrite_requests_cache: bool = False, + system_instruction: Optional[str] = None, + apply_chat_template: bool = False, + fewshot_as_multiturn: bool = False, + chat_template: Optional[Callable] = None, + tokenizer_name: str = "", ) -> None: """Build a set of Instances for a task, and store them in task.instances""" @@ -391,7 +392,7 @@ class Task(abc.ABC): if system_instruction is not None else "" ) - cache_key += f"-tokenizer{lm.tokenizer_name}" if apply_chat_template else "" + cache_key += f"-tokenizer{tokenizer_name}" cached_instances = load_from_cache(file_name=cache_key) @@ -436,7 +437,7 @@ class Task(abc.ABC): system_instruction, apply_chat_template, fewshot_as_multiturn, - lm, + chat_template, ) # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute @@ -1014,7 +1015,7 @@ class ConfigurableTask(Task): system_instruction: Optional[str] = None, apply_chat_template: bool = False, 
fewshot_as_multiturn: bool = False, - lm=None, + chat_template: Optional[Callable] = None, ) -> str: """Returns a fewshot context string that is made up of a prepended description (if provided), the `num_fewshot` number of examples, and an appended prompt example. @@ -1029,8 +1030,8 @@ class ConfigurableTask(Task): Whether to apply the chat template to the fewshot context. :param fewshot_as_multiturn: bool Whether to provide the fewshot examples as a multiturn conversation or a single user turn. - :param lm: - Language model with definition of the tokenizer/function to use for applying the chat template. + :param chat_template: Callable + Chat template to be applied to the fewshot context. :returns: str The fewshot context. """ @@ -1077,7 +1078,7 @@ class ConfigurableTask(Task): example = self.doc_to_text(doc) if apply_chat_template: if self.multiple_input: - return lm.apply_chat_template(labeled_examples) + return chat_template(labeled_examples) if isinstance(example, str): self.append_target_question( labeled_examples, example, fewshot_as_multiturn @@ -1089,7 +1090,7 @@ class ConfigurableTask(Task): for ex in example: chat = deepcopy(labeled_examples) self.append_target_question(chat, ex, fewshot_as_multiturn) - labeled_examples_list.append(lm.apply_chat_template(chat)) + labeled_examples_list.append(chat_template(chat)) return labeled_examples_list # if example is an integer, append the choice or convert to string elif isinstance(example, int): @@ -1103,7 +1104,7 @@ class ConfigurableTask(Task): labeled_examples, str(example), fewshot_as_multiturn ) # return lm.apply_chat_template(labeled_examples) - return lm.apply_chat_template(labeled_examples) + return chat_template(labeled_examples) else: if self.multiple_input: return labeled_examples diff --git a/lm_eval/evaluator.py b/lm_eval/evaluator.py index 76ae5d15..73226b05 100644 --- a/lm_eval/evaluator.py +++ b/lm_eval/evaluator.py @@ -399,7 +399,12 @@ def evaluate( system_instruction=system_instruction, apply_chat_template=apply_chat_template, fewshot_as_multiturn=fewshot_as_multiturn, - lm=lm, + chat_template=getattr(lm, "apply_chat_template") + if apply_chat_template + else None, + tokenizer_name=getattr(lm, "tokenizer_name", "") + if apply_chat_template + else "", ) eval_logger.debug( f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}" @@ -609,16 +614,16 @@ def evaluate( ] # compute group's pooled metric and stderr - results[group][ - metric - ] = lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes) + results[group][metric] = ( + lm_eval.api.metrics.aggregate_subtask_metrics(metrics, sizes) + ) # TODO: calculate grouped metric using aggregation fn if "N/A" in stderrs: results[group][stderr] = "N/A" else: - results[group][ - stderr - ] = lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes) + results[group][stderr] = ( + lm_eval.api.metrics.pooled_sample_stderr(stderrs, sizes) + ) # TODO: allow GroupConfigs to choose which variance formula is used, for back-compatibility # To use the old (likely incorrect) variance formula, comment out the above and uncomment this line: # results[group][stderr] = lm_eval.api.metrics.combined_sample_stderr(stderrs, sizes, metrics=metrics) diff --git a/lm_eval/evaluator_utils.py b/lm_eval/evaluator_utils.py index ad6eb53a..a773c92e 100644 --- a/lm_eval/evaluator_utils.py +++ b/lm_eval/evaluator_utils.py @@ -275,9 +275,9 @@ def consolidate_results( metric_key ] results[task_output.task_name]["samples"] = task_output.sample_len - 
results[task_output.task_name][ - f"{metric}_stderr,{filter_key}" - ] = task_output.agg_metrics[f"{metric}_stderr,{filter_key}"] + results[task_output.task_name][f"{metric}_stderr,{filter_key}"] = ( + task_output.agg_metrics[f"{metric}_stderr,{filter_key}"] + ) return results, samples, configs, versions, num_fewshot, higher_is_better diff --git a/lm_eval/tasks/arabicmmlu/README.md b/lm_eval/tasks/arabicmmlu/README.md index 12fd1a97..90de14b7 100644 --- a/lm_eval/tasks/arabicmmlu/README.md +++ b/lm_eval/tasks/arabicmmlu/README.md @@ -18,7 +18,7 @@ Homepage: https://github.com/mbzuai-nlp/ArabicMMLU ``` @misc{koto2024arabicmmlu, - title={ArabicMMLU: Assessing Massive Multitask Language Understanding in Arabic}, + title={ArabicMMLU: Assessing Massive Multitask Language Understanding in Arabic}, author={Fajri Koto and Haonan Li and Sara Shatnawi and Jad Doughman and Abdelrahman Boda Sadallah and Aisha Alraeesi and Khalid Almubarak and Zaid Alyafeai and Neha Sengupta and Shady Shehata and Nizar Habash and Preslav Nakov and Timothy Baldwin}, year={2024}, eprint={2402.12840}, @@ -37,4 +37,4 @@ Homepage: https://github.com/mbzuai-nlp/ArabicMMLU * `arabicmmlu_stem_social_science`: evaluates social science ArabicMMLU tasks. * `arabicmmlu_stem_humanities`: evaluates humanities ArabicMMLU tasks. * `arabicmmlu_stem_language`: evaluates Arabic language ArabicMMLU tasks. -* `arabicmmlu_stem_other`: evaluates other ArabicMMLU tasks. \ No newline at end of file +* `arabicmmlu_stem_other`: evaluates other ArabicMMLU tasks. diff --git a/lm_eval/tasks/arabicmmlu/_generate_configs.py b/lm_eval/tasks/arabicmmlu/_generate_configs.py index 6b45495a..83e6594e 100644 --- a/lm_eval/tasks/arabicmmlu/_generate_configs.py +++ b/lm_eval/tasks/arabicmmlu/_generate_configs.py @@ -1,6 +1,7 @@ """ Take in a YAML, and output all "other" splits with this YAML """ + import argparse import logging import os @@ -76,7 +77,6 @@ if __name__ == "__main__": if category not in ALL_CATEGORIES: ALL_CATEGORIES.append(category) - # description = f"The following are multiple choice questions (with answers) about {' '.join(subject.split('_'))}.\n\n" yaml_dict = { @@ -89,7 +89,10 @@ if __name__ == "__main__": # "description": description, } - file_save_path = args.save_prefix_path + f"_{subject.lower().replace(' ', '_').replace('(', '').replace(')', '')}.yaml" + file_save_path = ( + args.save_prefix_path + + f"_{subject.lower().replace(' ', '_').replace('(', '').replace(')', '')}.yaml" + ) eval_logger.info(f"Saving yaml for subset {subject} to {file_save_path}") with open(file_save_path, "w", encoding="utf-8") as yaml_file: yaml.dump( diff --git a/lm_eval/tasks/arabicmmlu/utils.py b/lm_eval/tasks/arabicmmlu/utils.py index c8976b49..e1ed4b99 100644 --- a/lm_eval/tasks/arabicmmlu/utils.py +++ b/lm_eval/tasks/arabicmmlu/utils.py @@ -1,14 +1,14 @@ -PROMPT = 'This is a {}. Select the correct answer!\n\nQuestion: {}\n{}\n\nAnswer:' +PROMPT = "This is a {}. 
Select the correct answer!\n\nQuestion: {}\n{}\n\nAnswer:" level_en = { - 'Primary': 'primary school', - 'Middle': 'middle school', - 'High': 'high school', - 'Univ': 'university', - 'Prof': 'professional', + "Primary": "primary school", + "Middle": "middle school", + "High": "high school", + "Univ": "university", + "Prof": "professional", } -alpa = ['A.', 'B.', 'C.', 'D.', 'E.'] +alpa = ["A.", "B.", "C.", "D.", "E."] def doc_to_text(doc): @@ -17,22 +17,28 @@ def doc_to_text(doc): https://github.com/mbzuai-nlp/ArabicMMLU/blob/main/util_prompt.py """ - level = "" if not doc['Level'] else " for " + level_en[doc['Level']] - country = "" if not doc['Country'] else " in " + doc['Country'] + level = "" if not doc["Level"] else " for " + level_en[doc["Level"]] + country = "" if not doc["Country"] else " in " + doc["Country"] main_meta_data = f"{doc['Subject']} question{level}{country}" - question = doc['Question'] if doc['Context']=="" else f"{doc['Context']}\n\n{doc['Question']}" + question = ( + doc["Question"] + if doc["Context"] == "" + else f"{doc['Context']}\n\n{doc['Question']}" + ) options = [] - for i, opt in enumerate(['Option 1', 'Option 2', 'Option 3', 'Option 4', 'Option 5']): + for i, opt in enumerate( + ["Option 1", "Option 2", "Option 3", "Option 4", "Option 5"] + ): if not doc[opt]: break options.append(f"{alpa[i]} {doc[opt]}") - doc_text = PROMPT.format(main_meta_data, question, '\n'.join(options)) - + doc_text = PROMPT.format(main_meta_data, question, "\n".join(options)) + return doc_text def doc_to_choice(doc): - return [alpa[i][0] for i in range(5) if doc[f'Option {i+1}']] \ No newline at end of file + return [alpa[i][0] for i in range(5) if doc[f"Option {i+1}"]] diff --git a/tests/models/test_neuralmagic.py b/tests/models/test_neuralmagic.py index 5cc48ea9..e0a36cee 100644 --- a/tests/models/test_neuralmagic.py +++ b/tests/models/test_neuralmagic.py @@ -23,6 +23,7 @@ DEEPSPARSE_MODELS_TASKS = [ ] +@pytest.mark.skip(reason="test failing") @pytest.mark.parametrize("model_id,task", SPARSEML_MODELS_TASKS) def test_sparseml_eval(model_id, task): lm = get_model("sparseml").create_from_arg_string( -- GitLab From b62b9bd06db3049ef10827d3177cd9fa0c843bff Mon Sep 17 00:00:00 2001 From: Brendan Murphy Date: Tue, 25 Jun 2024 11:55:08 -0400 Subject: [PATCH 05/45] Added CommonsenseQA task (#1721) * Initial configuration * Using the validation set for the test set, because the test set on HF doesn't have labels * Probably just makes more sense to have validation be validation * fix format ; add docs to tasks/README.md * fix format --------- Co-authored-by: haileyschoelkopf --- lm_eval/tasks/README.md | 1 + lm_eval/tasks/commonsense_qa/README.md | 60 +++++++++++++++++++++++ lm_eval/tasks/commonsense_qa/default.yaml | 12 +++++ 3 files changed, 73 insertions(+) create mode 100644 lm_eval/tasks/commonsense_qa/README.md create mode 100644 lm_eval/tasks/commonsense_qa/default.yaml diff --git a/lm_eval/tasks/README.md b/lm_eval/tasks/README.md index 6a7847b1..bb04d4f2 100644 --- a/lm_eval/tasks/README.md +++ b/lm_eval/tasks/README.md @@ -26,6 +26,7 @@ | [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese | | [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese | | code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. 
| Go, Java, JS, PHP, Python, Ruby | +| [commonsense_qa](commmonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English | | [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian | | [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English | | [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French | diff --git a/lm_eval/tasks/commonsense_qa/README.md b/lm_eval/tasks/commonsense_qa/README.md new file mode 100644 index 00000000..94ef87a5 --- /dev/null +++ b/lm_eval/tasks/commonsense_qa/README.md @@ -0,0 +1,60 @@ +# Task-name + +### Paper + +Title: `COMMONSENSEQA: A Question Answering Challenge Targeting +Commonsense Knowledge` + +Abstract: https://arxiv.org/pdf/1811.00937.pdf + +CommonsenseQA is a multiple-choice question answering dataset that requires different types of commonsense knowledge to predict the correct answers. +It contains 12,102 questions with one correct answer and four distractor answers. + +Homepage: https://www.tau-nlp.org/commonsenseqa + + +### Citation + +``` +@inproceedings{talmor-etal-2019-commonsenseqa, + title = "{C}ommonsense{QA}: A Question Answering Challenge Targeting Commonsense Knowledge", + author = "Talmor, Alon and + Herzig, Jonathan and + Lourie, Nicholas and + Berant, Jonathan", + booktitle = "Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)", + month = jun, + year = "2019", + address = "Minneapolis, Minnesota", + publisher = "Association for Computational Linguistics", + url = "https://aclanthology.org/N19-1421", + doi = "10.18653/v1/N19-1421", + pages = "4149--4158", + archivePrefix = "arXiv", + eprint = "1811.00937", + primaryClass = "cs", +} +``` + +### Groups and Tasks + +#### Groups + +* Not part of a group yet. + +#### Tasks + +* `commonsense_qa`: Represents the "random" split from the paper. Uses an MMLU-style prompt, as (presumably) used by Llama evaluations. + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/lm_eval/tasks/commonsense_qa/default.yaml b/lm_eval/tasks/commonsense_qa/default.yaml new file mode 100644 index 00000000..31d31b01 --- /dev/null +++ b/lm_eval/tasks/commonsense_qa/default.yaml @@ -0,0 +1,12 @@ +task: commonsense_qa +dataset_path: tau/commonsense_qa +training_split: train +validation_split: validation +output_type: multiple_choice +doc_to_text: "Question: {{ question.strip() }}\nA. {{choices['text'][0]}}\nB. {{choices['text'][1]}}\nC. {{choices['text'][2]}}\nD. {{choices['text'][3]}}\nE. 
{{choices['text'][4]}}\nAnswer:" +doc_to_target: answerKey +doc_to_choice: ['A', 'B', 'C', 'D', 'E'] +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true -- GitLab From 0366c74f0db70640c307c3d9447710af5c3f5d35 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Tue, 25 Jun 2024 12:45:06 -0400 Subject: [PATCH 06/45] Factor out LM-specific tests (#1859) * separate out optimum/neuralmagic tests to separate job * fix vllm tests * fix bug in --trust_remote_code * use datasets.config instead intentionally * fix remote code issue? --- .github/workflows/unit_tests.yml | 30 ++++++++++++++++++++++++++++-- tests/models/test_vllm.py | 5 ++--- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index a3e25429..11570e1d 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -55,13 +55,39 @@ jobs: cache-dependency-path: pyproject.toml - name: Install dependencies run: | + export HF_DATASETS_TRUST_REMOTE_CODE=1 python -m pip install --upgrade pip - pip install -e '.[dev,anthropic,sentencepiece,optimum,deepsparse,sparseml]' --extra-index-url https://download.pytorch.org/whl/cpu + pip install -e '.[dev,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu # Install optional git dependencies # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Test with pytest - run: python -m pytest --showlocals -s -vv -n=auto + run: python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py + - name: Archive artifacts + uses: actions/upload-artifact@v3 + with: + name: output_results + path: | + test_logs/* + testmodels: + name: External LM Tests + runs-on: ubuntu-latest + timeout-minutes: 30 + steps: + - name: Checkout Code + uses: actions/checkout@v4 + - name: Set up Python 3.8 + uses: actions/setup-python@v5 + with: + python-version: 3.8 + cache: pip + cache-dependency-path: pyproject.toml + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -e '.[dev,optimum,deepsparse,sparseml]' --extra-index-url https://download.pytorch.org/whl/cpu + - name: Test with pytest + run: python -m pytest tests/models --showlocals -s -vv - name: Archive artifacts uses: actions/upload-artifact@v3 with: diff --git a/tests/models/test_vllm.py b/tests/models/test_vllm.py index b0440759..01363bc8 100644 --- a/tests/models/test_vllm.py +++ b/tests/models/test_vllm.py @@ -1,7 +1,6 @@ from typing import List import pytest -import torch from lm_eval import tasks from lm_eval.api.instance import Instance @@ -11,7 +10,7 @@ task_manager = tasks.TaskManager() @pytest.mark.skip(reason="requires CUDA") -class TEST_VLLM: +class Test_VLLM: vllm = pytest.importorskip("vllm") try: from lm_eval.models.vllm_causallms import VLLM @@ -19,7 +18,7 @@ class TEST_VLLM: LM = VLLM(pretrained="EleutherAI/pythia-70m") except ModuleNotFoundError: pass - torch.use_deterministic_algorithms(True) + # torch.use_deterministic_algorithms(True) task_list = task_manager.load_task_or_group(["arc_easy", "gsm8k", "wikitext"]) multiple_choice_task = task_list["arc_easy"] # type: ignore multiple_choice_task.build_all_requests(limit=10, rank=0, world_size=1) -- GitLab From 6e49b1f6910931882a4b3b105794c6faf96b74e5 Mon 
Sep 17 00:00:00 2001 From: johnwee1 <91670254+johnwee1@users.noreply.github.com> Date: Wed, 26 Jun 2024 01:49:08 +0800 Subject: [PATCH 07/45] Update interface.md (#1982) * Update interface.md update interface to remove link to really outdated commit of evaluator.py * switch to relative referencing? * Update interface.md --------- Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> --- docs/interface.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/interface.md b/docs/interface.md index 2add18f8..89e0397b 100644 --- a/docs/interface.md +++ b/docs/interface.md @@ -102,12 +102,10 @@ results = lm_eval.simple_evaluate( # call simple_evaluate ) ``` -See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L35 for a full description of all arguments available. All keyword arguments to simple_evaluate share the same role as the command-line flags described previously. +See the `simple_evaluate()` and `evaluate()` functions in [lm_eval/evaluator.py](../lm_eval/evaluator.py#:~:text=simple_evaluate) for a full description of all arguments available. All keyword arguments to simple_evaluate share the same role as the command-line flags described previously. Additionally, the `evaluate()` function offers the core evaluation functionality provided by the library, but without some of the special handling and simplification + abstraction provided by `simple_evaluate()`. -See https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/evaluator.py#L173 for more details. - As a brief example usage of `evaluate()`: ```python -- GitLab From e5e5ee0cb629c9c88165292d1b4bf34623392d33 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Wed, 26 Jun 2024 12:40:47 -0400 Subject: [PATCH 08/45] Fix `trust_remote_code`-related test failures (#2024) * make MMLU trust remote code to fix tests * remove trust remote code --- .github/workflows/unit_tests.yml | 1 - lm_eval/tasks/mmlu/continuation/_continuation_template_yaml | 2 ++ lm_eval/tasks/mmlu/default/_default_template_yaml | 2 ++ .../mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml | 2 ++ .../flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml | 2 ++ .../flan_n_shot/generative/_mmlu_flan_generative_template_yaml | 2 ++ .../loglikelihood/_mmlu_flan_loglikelihood_template_yaml | 2 ++ lm_eval/tasks/mmlu/generative/_default_template_yaml | 2 ++ 8 files changed, 14 insertions(+), 1 deletion(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 11570e1d..f9dcaf4c 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -55,7 +55,6 @@ jobs: cache-dependency-path: pyproject.toml - name: Install dependencies run: | - export HF_DATASETS_TRUST_REMOTE_CODE=1 python -m pip install --upgrade pip pip install -e '.[dev,anthropic,sentencepiece]' --extra-index-url https://download.pytorch.org/whl/cpu # Install optional git dependencies diff --git a/lm_eval/tasks/mmlu/continuation/_continuation_template_yaml b/lm_eval/tasks/mmlu/continuation/_continuation_template_yaml index b7d75e98..45e8928a 100644 --- a/lm_eval/tasks/mmlu/continuation/_continuation_template_yaml +++ b/lm_eval/tasks/mmlu/continuation/_continuation_template_yaml @@ -9,3 +9,5 @@ doc_to_choice: "{{choices}}" doc_to_target: "{{answer}}" metadata: version: 0.0 +dataset_kwargs: + trust_remote_code: true diff --git 
a/lm_eval/tasks/mmlu/default/_default_template_yaml b/lm_eval/tasks/mmlu/default/_default_template_yaml index 37e8bb16..0f22d02c 100644 --- a/lm_eval/tasks/mmlu/default/_default_template_yaml +++ b/lm_eval/tasks/mmlu/default/_default_template_yaml @@ -13,3 +13,5 @@ metric_list: higher_is_better: true metadata: version: 0.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml index ae5d41be..11850950 100644 --- a/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml +++ b/lm_eval/tasks/mmlu/flan_cot_fewshot/_mmlu_flan_cot_fewshot_template_yaml @@ -27,3 +27,5 @@ metric_list: ignore_punctuation: true metadata: version: 1.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml index ebee3b65..113d20b7 100644 --- a/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml +++ b/lm_eval/tasks/mmlu/flan_cot_zeroshot/_mmlu_flan_cot_zeroshot_template_yaml @@ -34,3 +34,5 @@ metric_list: ignore_punctuation: true metadata: version: 2.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml index d866b1d7..28be9cfc 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/generative/_mmlu_flan_generative_template_yaml @@ -31,3 +31,5 @@ metric_list: higher_is_better: true metadata: version: 2.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml index d26c69d4..8f223281 100644 --- a/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml +++ b/lm_eval/tasks/mmlu/flan_n_shot/loglikelihood/_mmlu_flan_loglikelihood_template_yaml @@ -13,3 +13,5 @@ metric_list: higher_is_better: true metadata: version: 1.0 +dataset_kwargs: + trust_remote_code: true diff --git a/lm_eval/tasks/mmlu/generative/_default_template_yaml b/lm_eval/tasks/mmlu/generative/_default_template_yaml index aa503942..bc1ad0db 100644 --- a/lm_eval/tasks/mmlu/generative/_default_template_yaml +++ b/lm_eval/tasks/mmlu/generative/_default_template_yaml @@ -16,3 +16,5 @@ metric_list: higher_is_better: true metadata: version: 1.0 +dataset_kwargs: + trust_remote_code: true -- GitLab From 801322e0515e44eef8b555020650fc0dd2ff9112 Mon Sep 17 00:00:00 2001 From: Steven Basart Date: Fri, 28 Jun 2024 09:14:48 -0400 Subject: [PATCH 09/45] Fixes scrolls task bug with few_shot examples (#2003) Bug: ``` python -m scripts.write_out --task scrolls_quality --output_base_path ~/workspace/ Traceback (most recent call last): File "", line 198, in _run_module_as_main File "", line 88, in _run_code File "/lm-evaluation-harness/scripts/write_out.py", line 92, in main() File "/lm-evaluation-harness/scripts/write_out.py", line 51, in main task_dict = tasks.get_task_dict(task_names, task_manager) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/lm-evaluation-harness/lm_eval/tasks/__init__.py", line 423, in get_task_dict task_name_from_string_dict = task_manager.load_task_or_group( 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/lm-evaluation-harness/lm_eval/tasks/__init__.py", line 271, in load_task_or_group collections.ChainMap(*map(self._load_individual_task_or_group, task_list)) File "/lm-evaluation-harness/lm_eval/tasks/__init__.py", line 162, in _load_individual_task_or_group return load_task(task_config, task=name_or_config, group=parent_name) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/lm-evaluation-harness/lm_eval/tasks/__init__.py", line 148, in load_task task_object = config["class"]() ^^^^^^^^^^^^^^^^^ File "/lm-evaluation-harness/lm_eval/tasks/scrolls/task.py", line 120, in __init__ super().__init__() File "/lm-evaluation-harness/lm_eval/api/task.py", line 703, in __init__ self._config = TaskConfig(**config) ^^^^^^^^^^^^^^^^^^^^ TypeError: lm_eval.api.task.TaskConfig() argument after ** must be a mapping, not NoneType ``` --- lm_eval/tasks/scrolls/task.py | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/lm_eval/tasks/scrolls/task.py b/lm_eval/tasks/scrolls/task.py index 5b604e15..bbf1a3c3 100644 --- a/lm_eval/tasks/scrolls/task.py +++ b/lm_eval/tasks/scrolls/task.py @@ -4,12 +4,12 @@ from functools import reduce import numpy as np import transformers.data.metrics.squad_metrics as squad_metrics -from datasets import load_metric +from datasets import Dataset, load_metric from transformers import AutoTokenizer from lm_eval.api.instance import Instance from lm_eval.api.metrics import mean -from lm_eval.api.task import Task +from lm_eval.api.task import ConfigurableTask _CITATION = """ @@ -108,7 +108,7 @@ def _num_cpu_cores(): return len(os.sched_getaffinity(0)) -class _SCROLLSTask(Task): +class _SCROLLSTask(ConfigurableTask): VERSION = 2 DATASET_PATH = "tau/scrolls" DATASET_NAME = None @@ -117,7 +117,7 @@ class _SCROLLSTask(Task): PRUNE_NUM_PROC = None def __init__(self): - super().__init__() + super().__init__(config={"metadata": {"version": self.VERSION}}) if self.DATASET_NAME is not None: self.metric = load_metric(_download_metric(), config_name=self.DATASET_NAME) @@ -131,12 +131,26 @@ class _SCROLLSTask(Task): return False def training_docs(self): - for doc in self.dataset["train"]: - yield from self._process_doc(doc) + processed_docs = list(map(self._process_doc, self.dataset["train"])) + + # Flatten the list of lists since _process_doc returns a list of one element. + processed_docs = [item for sublist in processed_docs for item in sublist] + processed_dict = { + key: [d[key] for d in processed_docs] for key in processed_docs[0] + } + + return Dataset.from_dict(processed_dict) def validation_docs(self): - for doc in self.dataset["validation"]: - yield from self._process_doc(doc) + processed_docs = list(map(self._process_doc, self.dataset["validation"])) + + # Flatten the list of lists since _process_doc returns a list of one element. 
+ processed_docs = [item for sublist in processed_docs for item in sublist] + processed_dict = { + key: [d[key] for d in processed_docs] for key in processed_docs[0] + } + + return Dataset.from_dict(processed_dict) def should_decontaminate(self): return True -- GitLab From e922cceb58bdcad6c2426ba237f8fcc3a8fc293d Mon Sep 17 00:00:00 2001 From: Baber Abbasi <92168766+baberabb@users.noreply.github.com> Date: Fri, 28 Jun 2024 22:15:39 +0500 Subject: [PATCH 10/45] fix cache (#2037) --- lm_eval/api/model.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lm_eval/api/model.py b/lm_eval/api/model.py index 7152441b..5eecc617 100644 --- a/lm_eval/api/model.py +++ b/lm_eval/api/model.py @@ -246,9 +246,10 @@ class CachingLM: # add hook to lm lm.set_cache_hook(self.get_cache_hook()) - def __getattr__(self, attr): + def __getattr__(self, attr: str): lm_attr = getattr(self.lm, attr) - if not callable(lm_attr): + if attr not in ["loglikelihood", "loglikelihood_rolling", "generate_until"]: + eval_logger.debug(f"Passing through attribute '{attr}' to underlying LM") return lm_attr def fn(requests): -- GitLab From cc2d3463c2d5aa28f2b26c40d0ff20c878cc56b8 Mon Sep 17 00:00:00 2001 From: Baber Abbasi <92168766+baberabb@users.noreply.github.com> Date: Fri, 28 Jun 2024 22:21:49 +0500 Subject: [PATCH 11/45] Add chat template to `vllm` (#2034) * add chat template * refactor token padding * nit * nit * check on failing test * check transformers version * remove transformers pin * add ids to test * nit * fixup * fix bos bug * nit * fixup! fix bos bug * increase tolerance for table test * don't detokenize vllm logprobs * Update lm_eval/models/utils.py Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> * pre-commit run --all-files --------- Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> --- lm_eval/models/huggingface.py | 24 ++------------ lm_eval/models/utils.py | 51 +++++++++++++++++++++++++++++ lm_eval/models/vllm_causallms.py | 56 ++++++++++++++++++++++++-------- lm_eval/tasks/scrolls/task.py | 6 ++-- tests/test_evaluator.py | 4 ++- 5 files changed, 101 insertions(+), 40 deletions(-) diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py index 46418d9a..8c125f71 100644 --- a/lm_eval/models/huggingface.py +++ b/lm_eval/models/huggingface.py @@ -30,6 +30,7 @@ from lm_eval.api.registry import register_model from lm_eval.models.utils import ( Collator, clear_torch_cache, + configure_pad_token, get_dtype, pad_and_concat, stop_sequences_criteria, @@ -253,28 +254,7 @@ class HFLM(TemplateLM): self.logits_cache = logits_cache self.vocab_size = self.tokenizer.vocab_size # select (or create) a pad token to use - if self.tokenizer.pad_token: - pass - elif self.tokenizer.unk_token: - self.tokenizer.pad_token_id = self.tokenizer.unk_token_id - elif self.tokenizer.eos_token: - self.tokenizer.pad_token_id = self.tokenizer.eos_token_id - else: - if getattr(self.config, "model_type", None) == "qwen": - # Qwen's trust_remote_code tokenizer does not allow for adding special tokens - self.tokenizer.pad_token = "<|endoftext|>" - elif ( - self.tokenizer.__class__.__name__ == "RWKVWorldTokenizer" - or self.tokenizer.__class__.__name__ == "Rwkv5Tokenizer" - ): - # The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0) - # The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer - # --- - # Note that the world tokenizer class 
name, might change in the future for the final huggingface merge - # https://github.com/huggingface/transformers/pull/26963 - assert self.tokenizer.pad_token_id == 0 - else: - self.tokenizer.add_special_tokens({"pad_token": "<|pad|>"}) + self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config) # TODO: override this for Gemma self.add_bos_token = add_bos_token diff --git a/lm_eval/models/utils.py b/lm_eval/models/utils.py index 09818f4e..8a81e5de 100644 --- a/lm_eval/models/utils.py +++ b/lm_eval/models/utils.py @@ -5,6 +5,7 @@ import itertools import time from functools import wraps from typing import ( + TYPE_CHECKING, Any, Callable, Dict, @@ -24,6 +25,11 @@ import transformers from lm_eval.utils import eval_logger +if TYPE_CHECKING: + from transformers import PreTrainedTokenizerBase + from transformers.configuration_utils import PretrainedConfig + + def chunks(iter, n: int = 0, fn=None): """ Divides an iterable into chunks of specified size or based on a given function. @@ -613,3 +619,48 @@ class Collator: if arr: yield arr + + +def configure_pad_token( + tokenizer: "PreTrainedTokenizerBase", + model_config: Optional["PretrainedConfig"] = None, +) -> "PreTrainedTokenizerBase": + """ + This function checks if the (Hugging Face) tokenizer has a padding token and sets it if not present. + Some tokenizers require special handling. + + Args: + tokenizer: The tokenizer for which the padding token is to be handled. + model_config: The configuration of the model. Default is None. + + Returns: + The tokenizer after the padding token has been handled. + + Raises: + AssertionError: If the tokenizer is of type RWKVWorldTokenizer or Rwkv5Tokenizer and the padding token id is not 0. + """ + if tokenizer.pad_token: + pass + elif tokenizer.unk_token: + tokenizer.pad_token_id = tokenizer.unk_token_id + elif tokenizer.eos_token: + tokenizer.pad_token_id = tokenizer.eos_token_id + else: + # handle special cases + if model_config and getattr(model_config, "model_type", None) == "qwen": + # Qwen's trust_remote_code tokenizer does not allow for adding special tokens + tokenizer.pad_token = "<|endoftext|>" + elif ( + tokenizer.__class__.__name__ == "RWKVWorldTokenizer" + or tokenizer.__class__.__name__ == "Rwkv5Tokenizer" + ): + # The RWKV world tokenizer, does not allow for adding special tokens / setting the pad token (which is set as 0) + # The additional tokenizer name check is needed, as there exists rwkv4 models with neox tokenizer + # --- + # Note that the world tokenizer class name, might change in the future for the final huggingface merge + # https://github.com/huggingface/transformers/pull/26963 + assert tokenizer.pad_token_id == 0 + else: + tokenizer.add_special_tokens({"pad_token": "<|pad|>"}) + + return tokenizer diff --git a/lm_eval/models/vllm_causallms.py b/lm_eval/models/vllm_causallms.py index 34a63bf2..c8819d7f 100644 --- a/lm_eval/models/vllm_causallms.py +++ b/lm_eval/models/vllm_causallms.py @@ -1,7 +1,7 @@ import copy from importlib.metadata import version from importlib.util import find_spec -from typing import List, Literal, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union from more_itertools import distribute from packaging.version import parse as parse_version @@ -10,7 +10,7 @@ from tqdm import tqdm from lm_eval.api.instance import Instance from lm_eval.api.model import TemplateLM from lm_eval.api.registry import register_model -from lm_eval.models.utils import Collator, undistribute +from 
lm_eval.models.utils import Collator, configure_pad_token, undistribute from lm_eval.utils import ( eval_logger, get_rolling_token_windows, @@ -26,6 +26,8 @@ try: except ModuleNotFoundError: pass +if TYPE_CHECKING: + pass eval_logger = eval_logger @@ -118,6 +120,7 @@ class VLLM(TemplateLM): trust_remote_code=trust_remote_code, tokenizer_revision=tokenizer_revision, ) + self.tokenizer = configure_pad_token(self.tokenizer) self.add_bos_token = add_bos_token if "gemma" in pretrained.lower(): self.add_bos_token = True @@ -176,23 +179,46 @@ class VLLM(TemplateLM): def max_gen_toks(self): return self._max_gen_toks + def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str: + """ + Method to apply a chat template to a list of chat history between user and model. + """ + return self.tokenizer.apply_chat_template( + chat_history, tokenize=False, add_generation_prompt=True + ) + + @property + def chat_template(self) -> str: + if self.tokenizer.chat_template is not None: + return self.tokenizer.chat_template + return self.tokenizer.default_chat_template + + @property + def tokenizer_name(self) -> str: + return self.tokenizer.name_or_path.replace("/", "__") + def tok_encode( self, - string: str, - left_truncate_len=None, - add_special_tokens=None, - truncation=False, - ): - """ """ + string: Union[str, List[str]], + left_truncate_len: int = None, + add_special_tokens: bool = False, + truncation: bool = False, + ) -> Union[List[int], List[List[int]]]: if not add_special_tokens: add_special_tokens = False or self.add_bos_token - encoding = self.tokenizer.encode( - string, add_special_tokens=add_special_tokens, truncation=truncation - ) + encoding: Union[List[List[int]], List[int]] = self.tokenizer( + string, + add_special_tokens=add_special_tokens, + truncation=truncation, + return_attention_mask=False, + ).input_ids # left-truncate the encoded context to be at most `left_truncate_len` tokens long if left_truncate_len: - encoding = encoding[-left_truncate_len:] + if not isinstance(string, str): + encoding = [enc[-left_truncate_len:] for enc in encoding] + else: + encoding = encoding[-left_truncate_len:] return encoding @@ -209,7 +235,7 @@ class VLLM(TemplateLM): sampling_params = SamplingParams(max_tokens=max_tokens, stop=stop, **kwargs) else: sampling_params = SamplingParams( - temperature=0, prompt_logprobs=1, max_tokens=1 + temperature=0, prompt_logprobs=1, max_tokens=1, detokenize=False ) if self.data_parallel_size > 1: # vLLM hangs if tensor_parallel > 1 and resources are set in ray.remote @@ -290,7 +316,9 @@ class VLLM(TemplateLM): # batch tokenize contexts context, all_gen_kwargs = zip(*(req.args for req in requests)) - context_encoding = self.tokenizer(context, add_special_tokens=False).input_ids + context_encoding: List[List[int]] = self.tok_encode( + context, add_special_tokens=self.add_bos_token + ) requests = [ ((a, b), c) for a, b, c in zip(context, context_encoding, all_gen_kwargs) ] diff --git a/lm_eval/tasks/scrolls/task.py b/lm_eval/tasks/scrolls/task.py index bbf1a3c3..2940eb2e 100644 --- a/lm_eval/tasks/scrolls/task.py +++ b/lm_eval/tasks/scrolls/task.py @@ -132,7 +132,7 @@ class _SCROLLSTask(ConfigurableTask): def training_docs(self): processed_docs = list(map(self._process_doc, self.dataset["train"])) - + # Flatten the list of lists since _process_doc returns a list of one element. 
processed_docs = [item for sublist in processed_docs for item in sublist] processed_dict = { @@ -143,13 +143,13 @@ class _SCROLLSTask(ConfigurableTask): def validation_docs(self): processed_docs = list(map(self._process_doc, self.dataset["validation"])) - + # Flatten the list of lists since _process_doc returns a list of one element. processed_docs = [item for sublist in processed_docs for item in sublist] processed_dict = { key: [d[key] for d in processed_docs] for key in processed_docs[0] } - + return Dataset.from_dict(processed_dict) def should_decontaminate(self): diff --git a/tests/test_evaluator.py b/tests/test_evaluator.py index d5edf9ae..71e25555 100644 --- a/tests/test_evaluator.py +++ b/tests/test_evaluator.py @@ -33,6 +33,7 @@ os.environ["TOKENIZERS_PARALLELISM"] = "false" 10000, ), ], + ids=lambda d: f"{d}", ) def test_evaluator( task_name: List[str], limit: int, model: str, model_args: str, bootstrap_iters: int @@ -107,6 +108,7 @@ def test_evaluator( "pretrained=EleutherAI/pythia-14m,dtype=float32,device=cpu", ), ], + ids=lambda d: f"{d}", ) def test_printed_results(task_name: List[str], limit: int, model: str, model_args: str): results = evaluator.simple_evaluate( @@ -144,6 +146,6 @@ def test_printed_results(task_name: List[str], limit: int, model: str, model_arg try: t1_item = float(t1_item) t2_item = float(t2_item) - assert abs(t1_item - t2_item) < 0.1 + assert abs(t1_item - t2_item) < 0.3 except ValueError: assert t1_item == t2_item -- GitLab From 2a6acc88a0c31be7734aec85b17555323b70c049 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Sat, 29 Jun 2024 08:53:58 -0400 Subject: [PATCH 12/45] fail gracefully upon tokenizer logging failure (#2038) --- lm_eval/loggers/utils.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/lm_eval/loggers/utils.py b/lm_eval/loggers/utils.py index 670cdbff..348175a9 100644 --- a/lm_eval/loggers/utils.py +++ b/lm_eval/loggers/utils.py @@ -114,15 +114,29 @@ def add_env_info(storage: Dict[str, Any]): def add_tokenizer_info(storage: Dict[str, Any], lm): if getattr(lm, "tokenizer", False): - tokenizer_info = { - "tokenizer_pad_token": [lm.tokenizer.pad_token, lm.tokenizer.pad_token_id], - "tokenizer_eos_token": [lm.tokenizer.eos_token, lm.tokenizer.eos_token_id], - "tokenizer_bos_token": [lm.tokenizer.bos_token, lm.tokenizer.bos_token_id], - "eot_token_id": getattr(lm, "eot_token_id", None), - "max_length": getattr(lm, "max_length", None), - } - storage.update(tokenizer_info) - # seems gguf and textsynth do not have tokenizer + try: + tokenizer_info = { + "tokenizer_pad_token": [ + lm.tokenizer.pad_token, + lm.tokenizer.pad_token_id, + ], + "tokenizer_eos_token": [ + lm.tokenizer.eos_token, + lm.tokenizer.eos_token_id, + ], + "tokenizer_bos_token": [ + lm.tokenizer.bos_token, + lm.tokenizer.bos_token_id, + ], + "eot_token_id": getattr(lm, "eot_token_id", None), + "max_length": getattr(lm, "max_length", None), + } + storage.update(tokenizer_info) + except Exception as err: + logger.debug( + f"Logging detailed tokenizer info failed with {err}, skipping..." + ) + # seems gguf and textsynth do not have tokenizer else: logger.debug( "LM does not have a 'tokenizer' attribute, not logging tokenizer metadata to results." 
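To illustrate the fallback path above (a hypothetical sketch, not code from this patch): a backend that exposes no `tokenizer` attribute, such as the gguf or textsynth models mentioned in the comment, leaves the results dict untouched and only emits a debug message.

```python
from lm_eval.loggers.utils import add_tokenizer_info


class _NoTokenizerLM:
    """Hypothetical stand-in for a backend (e.g. gguf/textsynth) without a tokenizer."""


results = {}
add_tokenizer_info(results, _NoTokenizerLM())
assert results == {}  # nothing is added; a debug log is emitted instead
```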
-- GitLab From a8ac0446b854970d7eca5e5c2b612e59c0af707a Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Mon, 1 Jul 2024 09:36:15 -0400 Subject: [PATCH 13/45] ship with exact_match function already used ; don't call evaluate.load() on import (#2045) --- lm_eval/api/metrics.py | 60 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 57 insertions(+), 3 deletions(-) diff --git a/lm_eval/api/metrics.py b/lm_eval/api/metrics.py index 27b60ed1..1daf3847 100644 --- a/lm_eval/api/metrics.py +++ b/lm_eval/api/metrics.py @@ -1,10 +1,11 @@ import logging import math import random +import re +import string from collections.abc import Iterable from typing import List -import evaluate as hf_evaluate import numpy as np import sacrebleu import sklearn.metrics @@ -166,7 +167,60 @@ def acc_mutual_info_fn(items): # This is a passthrough function return items -exact_match = hf_evaluate.load("exact_match") +### the code used in the `exact_match_hf_evaluate` function is ported from +### https://github.com/huggingface/evaluate/blob/main/metrics/exact_match/exact_match.py +### which is under the apache license. + +# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+def exact_match_hf_evaluate( + predictions, + references, + regexes_to_ignore=None, + ignore_case=False, + ignore_punctuation=False, + ignore_numbers=False, +): + if regexes_to_ignore is not None: + for s in regexes_to_ignore: + predictions = np.array([re.sub(s, "", x) for x in predictions]) + references = np.array([re.sub(s, "", x) for x in references]) + else: + predictions = np.asarray(predictions) + references = np.asarray(references) + + if ignore_case: + predictions = np.char.lower(predictions) + references = np.char.lower(references) + + if ignore_punctuation: + repl_table = string.punctuation.maketrans("", "", string.punctuation) + predictions = np.char.translate(predictions, table=repl_table) + references = np.char.translate(references, table=repl_table) + + if ignore_numbers: + repl_table = string.digits.maketrans("", "", string.digits) + predictions = np.char.translate(predictions, table=repl_table) + references = np.char.translate(references, table=repl_table) + + score_list = predictions == references + + return {"exact_match": np.mean(score_list)} + + +### @register_metric( @@ -176,7 +230,7 @@ exact_match = hf_evaluate.load("exact_match") aggregation="mean", ) def exact_match_fn(**kwargs): - return exact_match.compute(**kwargs) + return exact_match_hf_evaluate(**kwargs) @register_metric( -- GitLab From 3fa4fd725c8a428710109f1d6c14eda37e95baea Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Mon, 1 Jul 2024 09:50:22 -0400 Subject: [PATCH 14/45] update to v0.4.3 (#2046) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3818a1a8..040a9b7c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "lm_eval" -version = "0.4.2" +version = "0.4.3" authors = [ {name="EleutherAI", email="contact@eleuther.ai"} ] -- GitLab From e83e891d186f0797640c656501b023104852831d Mon Sep 17 00:00:00 2001 From: Ogundepo Odunayo Date: Mon, 1 Jul 2024 13:16:09 -0400 Subject: [PATCH 15/45] fix wandb logger module import in example (#2041) --- examples/visualize-wandb.ipynb | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/examples/visualize-wandb.ipynb b/examples/visualize-wandb.ipynb index b6ac47ce..d0113d55 100644 --- a/examples/visualize-wandb.ipynb +++ b/examples/visualize-wandb.ipynb @@ -110,13 +110,15 @@ "cell_type": "markdown", "id": "e974cabdbe70b667", "metadata": {}, - "source": "" + "source": [] }, { "cell_type": "markdown", "id": "5178ca9445b844e4", "metadata": {}, - "source": "W&B can also be initialized programmatically for use outside the CLI to parse and log the results." + "source": [ + "W&B can also be initialized programmatically for use outside the CLI to parse and log the results." + ] }, { "cell_type": "code", @@ -126,7 +128,7 @@ "outputs": [], "source": [ "import lm_eval\n", - "from lm_eval.logging_utils import WandbLogger\n", + "from lm_eval.loggers import WandbLogger\n", "\n", "results = lm_eval.simple_evaluate(\n", " model=\"hf\",\n", -- GitLab From 9088a68fac0d94ab028a23de47f9d499db70cc62 Mon Sep 17 00:00:00 2001 From: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Date: Mon, 1 Jul 2024 19:50:26 +0200 Subject: [PATCH 16/45] Fix strip whitespace filter (#2048) * batch commit * :Revert "batch commit" This reverts commit d859d1ca3bc2d6e45979ec51379df119f2a9633d. 
* batch commit * checkout from main * checkout from main * checkout from main * checkout from main * checkout from main * cleanup * cleanup * cleanup * cleanup * cleanup * cleanup --- lm_eval/filters/extraction.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lm_eval/filters/extraction.py b/lm_eval/filters/extraction.py index 9230808a..41dc6208 100644 --- a/lm_eval/filters/extraction.py +++ b/lm_eval/filters/extraction.py @@ -62,11 +62,8 @@ class WhitespaceFilter(Filter): def filter_set(inst): filtered_resp = [] for resp in inst: - if resp.startswith(" "): - resp = resp[1:] - + resp = resp.lstrip() filtered_resp.append(resp) - return filtered_resp filtered_resps = [filter_set(resp) for resp in resps] -- GitLab From 67a990e7345d4ba940e8281ac7c9113ccef2a446 Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Mon, 1 Jul 2024 22:26:25 -0400 Subject: [PATCH 17/45] update gemma-2 default BOS behavior (#2049) --- lm_eval/models/huggingface.py | 3 +-- lm_eval/models/vllm_causallms.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/lm_eval/models/huggingface.py b/lm_eval/models/huggingface.py index 8c125f71..331684f5 100644 --- a/lm_eval/models/huggingface.py +++ b/lm_eval/models/huggingface.py @@ -256,9 +256,8 @@ class HFLM(TemplateLM): # select (or create) a pad token to use self.tokenizer = configure_pad_token(self.tokenizer, model_config=self.config) - # TODO: override this for Gemma self.add_bos_token = add_bos_token - if getattr(self.config, "model_type", None) == "gemma": + if getattr(self.config, "model_type", None) in ["gemma", "gemma2"]: self.add_bos_token = True eval_logger.info( f"Model type is '{self.config.model_type}', a BOS token will be used as Gemma underperforms without it." diff --git a/lm_eval/models/vllm_causallms.py b/lm_eval/models/vllm_causallms.py index c8819d7f..1a1067cc 100644 --- a/lm_eval/models/vllm_causallms.py +++ b/lm_eval/models/vllm_causallms.py @@ -125,7 +125,7 @@ class VLLM(TemplateLM): if "gemma" in pretrained.lower(): self.add_bos_token = True eval_logger.info( - "Found 'gemma' in model name, a BOS token will be used as Gemma underperforms without it." + "Found 'gemma' in model name, a BOS token will be used as Gemma series models underperform without it." 
) self.custom_prefix_token_id = prefix_token_id -- GitLab From 1870ee4e5352ce4c29de23469c2932c356a1b87f Mon Sep 17 00:00:00 2001 From: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Date: Wed, 3 Jul 2024 04:54:03 -0400 Subject: [PATCH 18/45] Update hellaswag.yaml (#2029) --- lm_eval/tasks/hellaswag/hellaswag.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lm_eval/tasks/hellaswag/hellaswag.yaml b/lm_eval/tasks/hellaswag/hellaswag.yaml index ec627da7..577ca69e 100644 --- a/lm_eval/tasks/hellaswag/hellaswag.yaml +++ b/lm_eval/tasks/hellaswag/hellaswag.yaml @@ -20,3 +20,5 @@ metric_list: higher_is_better: true metadata: version: 1.0 +dataset_kwargs: + trust_remote_code: true -- GitLab From 3c8db1bb7be5662e4fd5b48a26b6214f758e483f Mon Sep 17 00:00:00 2001 From: Nathan Habib <30601243+NathanHB@users.noreply.github.com> Date: Wed, 3 Jul 2024 14:15:49 +0200 Subject: [PATCH 19/45] Adds Open LLM Leaderboard Taks (#2047) * adds leaderboard tasks * Delete lm_eval/tasks/leaderboard/leaderboard_chat_template.yaml * add readme * Delete lm_eval/tasks/leaderboard/mmlu_pro/mmlu_pro_chat_template.yaml * modify readme * fix bbh task * fix bbh salient task * modify the readme * Delete lm_eval/tasks/leaderboard/ifeval/README.md * Delete lm_eval/tasks/leaderboard/math/README.md * add leaderboard to the tasks repertory * add anouncment about new leaderbaord tasks * linting * Update README.md Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> * installs ifeval dependency in new_task github workflow --------- Co-authored-by: Nathan Habib Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> --- .github/workflows/new_tasks.yml | 2 +- README.md | 1 + lm_eval/tasks/README.md | 1 + lm_eval/tasks/leaderboard/README.md | 319 ++++ .../leaderboard/bbh_mc/_fewshot_template_yaml | 17 + .../bbh_mc/boolean_expressions.yaml | 16 + .../leaderboard/bbh_mc/causal_judgement.yaml | 64 + .../bbh_mc/date_understanding.yaml | 57 + .../leaderboard/bbh_mc/disambiguation_qa.yaml | 50 + .../leaderboard/bbh_mc/formal_fallacies.yaml | 57 + .../leaderboard/bbh_mc/geometric_shapes.yaml | 86 + .../tasks/leaderboard/bbh_mc/hyperbaton.yaml | 34 + .../logical_deduction_five_objects.yaml | 52 + .../logical_deduction_seven_objects.yaml | 52 + .../logical_deduction_three_objects.yaml | 52 + .../bbh_mc/movie_recommendation.yaml | 54 + .../tasks/leaderboard/bbh_mc/navigate.yaml | 40 + .../leaderboard/bbh_mc/object_counting.yaml | 20 + .../bbh_mc/penguins_in_a_table.yaml | 68 + .../reasoning_about_colored_objects.yaml | 116 ++ .../tasks/leaderboard/bbh_mc/ruin_names.yaml | 50 + .../salient_translation_error_detection.yaml | 103 + lm_eval/tasks/leaderboard/bbh_mc/snarks.yaml | 45 + .../bbh_mc/sports_understanding.yaml | 19 + .../bbh_mc/temporal_sequences.yaml | 101 + ...racking_shuffled_objects_five_objects.yaml | 59 + ...acking_shuffled_objects_seven_objects.yaml | 59 + ...acking_shuffled_objects_three_objects.yaml | 59 + .../tasks/leaderboard/bbh_mc/web_of_lies.yaml | 22 + lm_eval/tasks/leaderboard/gpqa/_template_yaml | 20 + .../gpqa/gpqa_diamond_zeroshot.yaml | 4 + .../gpqa/gpqa_extended_zeroshot.yaml | 4 + .../leaderboard/gpqa/gpqa_main_zeroshot.yaml | 4 + lm_eval/tasks/leaderboard/gpqa/utils.py | 38 + lm_eval/tasks/leaderboard/ifeval/ifeval.yaml | 32 + .../tasks/leaderboard/ifeval/instructions.py | 1612 ++++++++++++++++ .../ifeval/instructions_registry.py | 168 ++ .../leaderboard/ifeval/instructions_util.py | 1682 +++++++++++++++++ 
lm_eval/tasks/leaderboard/ifeval/utils.py | 134 ++ lm_eval/tasks/leaderboard/leaderboard.yaml | 8 + lm_eval/tasks/leaderboard/math/_template_yaml | 28 + .../tasks/leaderboard/math/math_algebra.yaml | 3 + .../math/math_counting_and_prob.yaml | 3 + .../tasks/leaderboard/math/math_geometry.yaml | 3 + .../math/math_intermediate_algebra.yaml | 3 + .../leaderboard/math/math_num_theory.yaml | 3 + .../leaderboard/math/math_prealgebra.yaml | 3 + .../leaderboard/math/math_precalculus.yaml | 3 + lm_eval/tasks/leaderboard/math/utils.py | 294 +++ .../tasks/leaderboard/mmlu_pro/mmlu_pro.yaml | 17 + lm_eval/tasks/leaderboard/mmlu_pro/utils.py | 15 + lm_eval/tasks/leaderboard/musr/_template_yaml | 13 + .../musr/musr_murder_mysteries.yaml | 3 + .../musr/musr_object_placements.yaml | 3 + .../musr/musr_team_allocation.yaml | 3 + lm_eval/tasks/leaderboard/musr/utils.py | 26 + 56 files changed, 5803 insertions(+), 1 deletion(-) create mode 100644 lm_eval/tasks/leaderboard/README.md create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/_fewshot_template_yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/boolean_expressions.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/causal_judgement.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/date_understanding.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/disambiguation_qa.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/formal_fallacies.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/geometric_shapes.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/hyperbaton.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_five_objects.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_seven_objects.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_three_objects.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/movie_recommendation.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/navigate.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/object_counting.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/penguins_in_a_table.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/reasoning_about_colored_objects.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/ruin_names.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/salient_translation_error_detection.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/snarks.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/sports_understanding.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/temporal_sequences.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_five_objects.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_seven_objects.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_three_objects.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/web_of_lies.yaml create mode 100644 lm_eval/tasks/leaderboard/gpqa/_template_yaml create mode 100644 lm_eval/tasks/leaderboard/gpqa/gpqa_diamond_zeroshot.yaml create mode 100644 lm_eval/tasks/leaderboard/gpqa/gpqa_extended_zeroshot.yaml create mode 100644 lm_eval/tasks/leaderboard/gpqa/gpqa_main_zeroshot.yaml create mode 100644 lm_eval/tasks/leaderboard/gpqa/utils.py create mode 100644 lm_eval/tasks/leaderboard/ifeval/ifeval.yaml create mode 100644 lm_eval/tasks/leaderboard/ifeval/instructions.py create mode 100644 lm_eval/tasks/leaderboard/ifeval/instructions_registry.py create mode 100644 
lm_eval/tasks/leaderboard/ifeval/instructions_util.py create mode 100644 lm_eval/tasks/leaderboard/ifeval/utils.py create mode 100644 lm_eval/tasks/leaderboard/leaderboard.yaml create mode 100644 lm_eval/tasks/leaderboard/math/_template_yaml create mode 100644 lm_eval/tasks/leaderboard/math/math_algebra.yaml create mode 100644 lm_eval/tasks/leaderboard/math/math_counting_and_prob.yaml create mode 100644 lm_eval/tasks/leaderboard/math/math_geometry.yaml create mode 100644 lm_eval/tasks/leaderboard/math/math_intermediate_algebra.yaml create mode 100644 lm_eval/tasks/leaderboard/math/math_num_theory.yaml create mode 100644 lm_eval/tasks/leaderboard/math/math_prealgebra.yaml create mode 100644 lm_eval/tasks/leaderboard/math/math_precalculus.yaml create mode 100644 lm_eval/tasks/leaderboard/math/utils.py create mode 100644 lm_eval/tasks/leaderboard/mmlu_pro/mmlu_pro.yaml create mode 100644 lm_eval/tasks/leaderboard/mmlu_pro/utils.py create mode 100644 lm_eval/tasks/leaderboard/musr/_template_yaml create mode 100644 lm_eval/tasks/leaderboard/musr/musr_murder_mysteries.yaml create mode 100644 lm_eval/tasks/leaderboard/musr/musr_object_placements.yaml create mode 100644 lm_eval/tasks/leaderboard/musr/musr_team_allocation.yaml create mode 100644 lm_eval/tasks/leaderboard/musr/utils.py diff --git a/.github/workflows/new_tasks.yml b/.github/workflows/new_tasks.yml index 0df7111c..b748aab5 100644 --- a/.github/workflows/new_tasks.yml +++ b/.github/workflows/new_tasks.yml @@ -56,7 +56,7 @@ jobs: if: steps.changed-tasks.outputs.tasks_any_modified == 'true' || steps.changed-tasks.outputs.api_any_modified == 'true' run: | python -m pip install --upgrade pip - pip install -e '.[dev]' --extra-index-url https://download.pytorch.org/whl/cpu + pip install -e '.[dev,ifeval]' --extra-index-url https://download.pytorch.org/whl/cpu # Install optional git dependencies # pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt # if [ -f requirements.txt ]; then pip install -r requirements.txt; fi diff --git a/README.md b/README.md index ef00b8a6..ed50e5e5 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ New updates and features include: +- **New Open LLM Leaderboard tasks have been added ! You can find them under the [leaderboard](lm_eval/tasks/leaderboard/README.md) task group.** - Internal refactoring - Config-based task creation and configuration - Easier import and sharing of externally-defined task config YAMLs diff --git a/lm_eval/tasks/README.md b/lm_eval/tasks/README.md index bb04d4f2..5d6393bb 100644 --- a/lm_eval/tasks/README.md +++ b/lm_eval/tasks/README.md @@ -56,6 +56,7 @@ | [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. | English | | [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian | | [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese | +| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). 
Those tasks are static and will not change through time | English | | [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese | | [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese | | [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English | diff --git a/lm_eval/tasks/leaderboard/README.md b/lm_eval/tasks/leaderboard/README.md new file mode 100644 index 00000000..70c8a3a1 --- /dev/null +++ b/lm_eval/tasks/leaderboard/README.md @@ -0,0 +1,319 @@ +# Leaderboard evaluations +Our goal with this group is to create a version of these evaluations that will +not change over time and that will power the Open LLM Leaderboard on HuggingFace. + +As we want to evaluate models across capabilities, the list currently contains: +- BBH (3-shots, multichoice) +- GPQA (0-shot, multichoice) +- mmlu-pro (5-shots, multichoice) +- Musr (0-shot, multichoice) +- ifeval (0-shot, generative) +- Math-lvl-5 (4-shots, generative, minerva version) + + +Details on the choice of those evals can be found [here](https://huggingface.co/spaces/open-llm-leaderboard/blog)! + +## BigBenchHard (BBH) + +A suite of 23 challenging BIG-Bench tasks which we call BIG-Bench Hard (BBH). +These are the tasks for which prior language model evaluations did not +outperform the average human-rater. + +### Paper + +Title: Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them + +BIG-Bench (Srivastava et al., 2022) is a diverse evaluation suite that focuses on tasks believed to be beyond the capabilities of current language models. Language models have already made good progress on this benchmark, with the best model in the BIG-Bench paper outperforming average reported human-rater results on 65% of the BIG-Bench tasks via few-shot prompting. But on what tasks do language models fall short of average human-rater performance, and are those tasks actually unsolvable by current language models? +In this work, we focus on a suite of 23 challenging BIG-Bench tasks which we call BIG-Bench Hard (BBH). These are the tasks for which prior language model evaluations did not outperform the average human-rater. We find that applying chain-of-thought (CoT) prompting to BBH tasks enables PaLM to surpass the average human-rater performance on 10 of the 23 tasks, and Codex (code-davinci-002) to surpass the average human-rater performance on 17 of the 23 tasks. Since many tasks in BBH require multi-step reasoning, few-shot prompting without CoT, as done in the BIG-Bench evaluations (Srivastava et al., 2022), substantially underestimates the best performance and capabilities of language models, which is better captured via CoT prompting. As further analysis, we explore the interaction between CoT and model scale on BBH, finding that CoT enables emergent task performance on several BBH tasks with otherwise flat scaling curves.
+ + +- paper: https://huggingface.co/papers/2210.09261 +- Homepage: https://github.com/suzgunmirac/BIG-Bench-Hard + +### Citation + +``` +@article{suzgun2022challenging, + title={Challenging BIG-Bench Tasks and Whether Chain-of-Thought Can Solve Them}, + author={Suzgun, Mirac and Scales, Nathan and Sch{\"a}rli, Nathanael and Gehrmann, Sebastian and Tay, Yi and Chung, Hyung Won and Chowdhery, Aakanksha and Le, Quoc V and Chi, Ed H and Zhou, Denny and and Wei, Jason}, + journal={arXiv preprint arXiv:2210.09261}, + year={2022} +} +``` + +### Groups + +- `leaderboard_bbh` + +### Tasks + +- `leaderboard_bbh_boolean_expressions` +- `leaderboard_bbh_causal_judgement` +- `leaderboard_bbh_date_understanding` +- `leaderboard_bbh_disambiguation_qa` +- `leaderboard_bbh_dyck_languages` +- `leaderboard_bbh_formal_fallacies` +- `leaderboard_bbh_geometric_shapes` +- `leaderboard_bbh_hyperbaton` +- `leaderboard_bbh_logical_deduction_five_objects` +- `leaderboard_bbh_logical_deduction_seven_objects` +- `leaderboard_bbh_logical_deduction_three_objects` +- `leaderboard_bbh_movie_recommendation` +- `leaderboard_bbh_multistep_arithmetic_two` +- `leaderboard_bbh_navigate` +- `leaderboard_bbh_object_counting` +- `leaderboard_bbh_penguins_in_a_table` +- `leaderboard_bbh_reasoning_about_colored_objects` +- `leaderboard_bbh_ruin_names` +- `leaderboard_bbh_salient_translation_error_detection` +- `leaderboard_bbh_snarks` +- `leaderboard_bbh_sports_understanding` +- `leaderboard_bbh_temporal_sequences` +- `leaderboard_bbh_tracking_shuffled_objects_five_objects` +- `leaderboard_bbh_tracking_shuffled_objects_seven_objects` +- `leaderboard_bbh_tracking_shuffled_objects_three_objects` +- `leaderboard_bbh_web_of_lies` +- `leaderboard_bbh_word_sorting` + +## GPQA + +### Paper + +Title: GPQA: A Graduate-Level Google-Proof Q&A Benchmark + +We present GPQA, a challenging dataset of 448 multiple-choice questions written +by domain experts in biology, physics, and chemistry. We ensure that the +questions are high-quality and extremely difficult: experts who have or are +pursuing PhDs in the corresponding domains reach 65% accuracy (74% when +discounting clear mistakes the experts identified in retrospect), while highly +skilled non-expert validators only reach 34% accuracy, despite spending on +average over 30 minutes with unrestricted access to the web (i.e., the +questions are “Google-proof”). The questions are also difficult for +state-of-the-art AI systems, with our strongest GPT-4–based baseline achieving +39% accuracy. If we are to use future AI systems to help us answer very hard +questions—for example, when developing new scientific knowledge—we need to +develop scalable oversight methods that enable humans to supervise their +outputs, which may be difficult even if the supervisors are themselves skilled +and knowledgeable. The difficulty of GPQA both for skilled non-experts and +frontier AI systems should enable realistic scalable oversight experiments, +which we hope can help devise ways for human experts to reliably get truthful +information from AI systems that surpass human capabilities. + +- Paper: https://huggingface.co/papers/2311.12022 +- Homepage: https://github.com/idavidrein/gpqa/tree/main + +### Citation + +``` +@misc{rein2023gpqa, + title={GPQA: A Graduate-Level Google-Proof Q&A Benchmark}, + author={David Rein and Betty Li Hou and Asa Cooper Stickland and Jackson Petty and Richard Yuanzhe Pang and Julien Dirani and Julian Michael and Samuel R. 
Bowman}, + year={2023}, + eprint={2311.12022}, + archivePrefix={arXiv}, + primaryClass={cs.AI} +} +``` + +### Groups + +- `leaderboard_gpqa` + +### Tasks + +- `leaderboard_gpqa_extended` +- `leaderboard_gpqa_diamond` +- `leaderboard_gpqa_main` + +## IFEval + +### Paper + +Title: Instruction-Following Evaluation for Large Language Models + +One core capability of Large Language Models (LLMs) is to follow natural +language instructions. However, the evaluation of such abilities is not +standardized: Human evaluations are expensive, slow, and not objectively +reproducible, while LLM-based auto-evaluation is potentially biased or limited +by the ability of the evaluator LLM. To overcome these issues, we introduce +Instruction-Following Eval (IFEval) for large language models. IFEval is a +straightforward and easy-to-reproduce evaluation benchmark. It focuses on a set +of "verifiable instructions" such as "write in more than 400 words" and +"mention the keyword of AI at least 3 times". We identified 25 types of those +verifiable instructions and constructed around 500 prompts, with each prompt +containing one or more verifiable instructions. We show evaluation results of +two widely available LLMs on the market. + +- Paper: https://huggingface.co/papers/2311.07911 +- Homepage: https://github.com/google-research/google-research/tree/master/instruction_following_eval + +### Citation + +``` +@article{zhou2023instructionfollowing, + title={Instruction-Following Evaluation for Large Language Models}, + author={Jeffrey Zhou and Tianjian Lu and Swaroop Mishra and Siddhartha Brahma and Sujoy Basu and Yi Luan and Denny Zhou and Le Hou}, + journal={arXiv preprint arXiv:2311.07911}, + year={2023}, +} +``` + +### Tasks + +- `leaderboard_ifeval` + +## MATH-hard + +This is the 4-shot variant of Minerva MATH, keeping only the Level 5 questions. + +### Paper + +Title: Measuring Mathematical Problem Solving With the MATH Dataset + +Many intellectual endeavors require mathematical problem solving, but this +skill remains beyond the capabilities of computers. To measure this ability in +machine learning models, we introduce MATH, a new dataset of 12,500 challenging +competition mathematics problems. Each problem in MATH has a full step-by-step +solution which can be used to teach models to generate answer derivations and +explanations. + +NOTE: The few-shot prompting and the generated-answer extraction are based on +[Minerva](https://arxiv.org/abs/2206.14858), and exact match equivalence is +calculated using the `sympy` library. This requires additional dependencies, +which can be installed via the `lm-eval[math]` extra.
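+Once the extra is installed, a minimal sketch of running these subtasks through
+the Python API could look like the following (the model below is only a
+placeholder; substitute any Hugging Face model you want to score):
+
+```python
+import lm_eval
+
+# Run the Level 5 MATH subtasks via the "leaderboard_math_hard" group.
+results = lm_eval.simple_evaluate(
+    model="hf",
+    model_args="pretrained=EleutherAI/pythia-160m",  # placeholder model
+    tasks=["leaderboard_math_hard"],
+)
+print(results["results"])
+```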
+ +- Paper: https://huggingface.co/papers/2103.03874 +- Homepage: https://github.com/hendrycks/math + + +### Citation + +``` +@article{hendrycksmath2021, + title={Measuring Mathematical Problem Solving With the MATH Dataset}, + author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt}, + journal={NeurIPS}, + year={2021} +} +@misc{2206.14858, +Author = {Aitor Lewkowycz and Anders Andreassen and David Dohan and Ethan Dye and Henryk Michalewski and Vinay Ramasesh and Ambrose Slone and Cem Anil and Imanol Schlag and Theo Gutman-Solo and Yuhuai Wu and Behnam Neyshabur and Guy Gur-Ari and Vedant Misra}, +Title = {Solving Quantitative Reasoning Problems with Language Models}, +Year = {2022}, +Eprint = {arXiv:2206.14858}, +} +``` + +### Groups + +- `leaderboard_math_hard` + +### Tasks + +- `leaderboard_math_algebra_hard` +- `leaderboard_math_counting_and_prob_hard` +- `leaderboard_math_geometry_hard` +- `leaderboard_math_intermediate_algebra_hard` +- `leaderboard_math_num_theory_hard` +- `leaderboard_math_prealgebra_hard` +- `leaderboard_math_precalc_hard` + + +## MMLU-Pro + +### Paper + +Title: MMLU-Pro: A More Robust and Challenging Multi-Task Language +Understanding Benchmark + +In the age of large-scale language models, benchmarks like the Massive +Multitask Language Understanding (MMLU) have been pivotal in pushing the +boundaries of what AI can achieve in language comprehension and reasoning +across diverse domains. However, as models continue to improve, their +performance on these benchmarks has begun to plateau, making it increasingly +difficult to discern differences in model capabilities. This paper introduces +MMLU-Pro, an enhanced dataset designed to extend the mostly knowledge-driven +MMLU benchmark by integrating more challenging, reasoning-focused questions and +expanding the choice set from four to ten options. Additionally, MMLU-Pro +eliminates the trivial and noisy questions in MMLU. Our experimental results +show that MMLU-Pro not only raises the challenge, causing a significant drop in +accuracy by 16% to 33% compared to MMLU but also demonstrates greater stability +under varying prompts. With 24 different prompt styles tested, the sensitivity +of model scores to prompt variations decreased from 4-5% in MMLU to just 2% in +MMLU-Pro. Additionally, we found that models utilizing Chain of Thought (CoT) +reasoning achieved better performance on MMLU-Pro compared to direct answering, +which is in stark contrast to the findings on the original MMLU, indicating +that MMLU-Pro includes more complex reasoning questions. Our assessments +confirm that MMLU-Pro is a more discriminative benchmark to better track +progress in the field. 
+ +- Paper: https://huggingface.co/papers/2406.01574 +- Homepage: https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro + +### Citation + +``` +@misc{wang2024mmluprorobustchallengingmultitask, + title={MMLU-Pro: A More Robust and Challenging Multi-Task Language + Understanding Benchmark}, + author={Yubo Wang and Xueguang Ma and Ge Zhang and Yuansheng Ni and Abhranil Chandra and Shiguang Guo and Weiming Ren and Aaran Arulraj and Xuan He and Ziyan Jiang and Tianle Li and Max Ku and Kai Wang and Alex Zhuang and Rongqi Fan and Xiang Yue and Wenhu Chen}, + year={2024}, + eprint={2406.01574}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2406.01574}, +} +``` + +## Musr + +### Paper + +Title: MuSR: Testing the Limits of Chain-of-thought with Multistep Soft +Reasoning + +While large language models (LLMs) equipped with techniques like +chain-of-thought prompting have demonstrated impressive capabilities, they +still fall short in their ability to reason robustly in complex settings. +However, evaluating LLM reasoning is challenging because system capabilities +continue to grow while benchmark datasets for tasks like logical deduction have +remained static. We introduce MuSR, a dataset for evaluating language models on +multistep soft reasoning tasks specified in a natural language narrative. This +dataset has two crucial features. First, it is created through a novel +neurosymbolic synthetic-to-natural generation algorithm, enabling the +construction of complex reasoning instances that challenge GPT-4 (e.g., murder +mysteries roughly 1000 words in length) and which can be scaled further as more +capable LLMs are released. Second, our dataset instances are free text +narratives corresponding to real-world domains of reasoning; this makes it +simultaneously much more challenging than other synthetically-crafted +benchmarks while remaining realistic and tractable for human annotators to +solve with high accuracy. We evaluate a range of LLMs and prompting techniques +on this dataset and characterize the gaps that remain for techniques like +chain-of-thought to perform robust reasoning. 
+ +- Paper: https://huggingface.co/papers/2310.16049 +- Homepage: https://zayne-sprague.github.io/MuSR/ + +### Citation + +``` +@misc{sprague2024musrtestinglimitschainofthought, + title={MuSR: Testing the Limits of Chain-of-thought with Multistep Soft + Reasoning}, + author={Zayne Sprague and Xi Ye and Kaj Bostrom and Swarat Chaudhuri and Greg Durrett}, + year={2024}, + eprint={2310.16049}, + archivePrefix={arXiv}, + primaryClass={cs.CL}, + url={https://arxiv.org/abs/2310.16049}, +} +``` + +### Groups + +- `leaderboard_musr` + +### Tasks + +- `leaderboard_musr_murder_mysteries` +- `leaderboard_musr_object_placements` +- `leaderboard_musr_team_allocation` diff --git a/lm_eval/tasks/leaderboard/bbh_mc/_fewshot_template_yaml b/lm_eval/tasks/leaderboard/bbh_mc/_fewshot_template_yaml new file mode 100644 index 00000000..c461f75f --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/_fewshot_template_yaml @@ -0,0 +1,17 @@ +group: leaderboard_bbh +dataset_path: SaylorTwift/bbh +output_type: multiple_choice +test_split: test +doc_to_text: 'Q: {{input}} + + A:' +doc_to_target: "{{target}}" +metric_list: + - metric: acc_norm + aggregation: mean + higher_is_better: true +num_fewshot: 3 +fewshot_config: + sampler: first_n +metadata: + version: 0.0 diff --git a/lm_eval/tasks/leaderboard/bbh_mc/boolean_expressions.yaml b/lm_eval/tasks/leaderboard/bbh_mc/boolean_expressions.yaml new file mode 100644 index 00000000..970aa6c1 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/boolean_expressions.yaml @@ -0,0 +1,16 @@ +dataset_name: boolean_expressions +description: 'Evaluate the result of a random Boolean expression. + + ' +doc_to_choice: ["False", "True"] +fewshot_config: + sampler: first_n + samples: + - input: not ( ( not not True ) ) is + target: 'False' + - input: True and False and not True and True is + target: 'False' + - input: not not ( not ( False ) ) is + target: 'True' +include: _fewshot_template_yaml +task: leaderboard_bbh_boolean_expressions diff --git a/lm_eval/tasks/leaderboard/bbh_mc/causal_judgement.yaml b/lm_eval/tasks/leaderboard/bbh_mc/causal_judgement.yaml new file mode 100644 index 00000000..236bc568 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/causal_judgement.yaml @@ -0,0 +1,64 @@ +dataset_name: causal_judgement +description: 'Answer questions about causal attribution. + + ' +doc_to_choice: ["Yes", "No"] +fewshot_config: + sampler: first_n + samples: + - input: 'How would a typical person answer each of the following questions about + causation? + + Frank T., had an ongoing dispute with his neighbor over a stretch of land and + one day decided to shoot his neighbor in the body. Frank T. had no experience + with guns, his hand slipped on the barrel of the gun, and the shot went wild. + Nonetheless, the bullet bounced off a large boulder several feet away and hit + the neighbor''s body, causing significant injury. Did Frank T. intentionally + shoot his neighbor in the body? + + Options: + + - Yes + + - No' + target: 'No' + - input: 'How would a typical person answer each of the following questions about + causation? + + Suzy and Billy are working on a project that is very important for our nation''s + security. The boss tells them both: "Be sure that you are here at exactly 9 + am. It is absolutely essential that you arrive at that time." Both Billy and + Suzy arrive at 9 am. As it happens, there was a motion detector installed in + the room where they arrived. The motion detector was set up to be triggered + if at least one person appeared in the room at the same time. 
So the motion + detector went off. Did Billy cause the motion detector to go off? + + Options: + + - Yes + + - No' + target: 'Yes' + - input: 'How would a typical person answer each of the following questions about + causation? + + George and his sister Lena reunite at their parents'' house for Thanksgiving. + Whereas George just got into medical school, Lena is unhappy in her marriage + and recently lost her job. Over the course of the day, George and Lena get into + a number of heated arguments. Later in the afternoon they play a game of darts. + They split the first two games, and the third game is close until the end. Who + will win comes down to George''s last shot. If he hits a high point region, + he wins; if he hits a low point region, Lena wins. George thinks of the difficult + time Lena is having, and he really wants to let her win. He aims the dart at + the low point region. He sets up his shot and the dart lands in the low point + region. After his shot, Lena wins the game and is very happy. Did George hit + the low point region intentionally? + + Options: + + - Yes + + - No' + target: 'Yes' +include: _fewshot_template_yaml +task: leaderboard_bbh_causal_judgement diff --git a/lm_eval/tasks/leaderboard/bbh_mc/date_understanding.yaml b/lm_eval/tasks/leaderboard/bbh_mc/date_understanding.yaml new file mode 100644 index 00000000..bbdf8df6 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/date_understanding.yaml @@ -0,0 +1,57 @@ +dataset_name: date_understanding +description: 'Infer the date from context. + + ' +doc_to_choice: ["(A)", "(B)", "(C)", "(D)", "(E)", "(F)"] +fewshot_config: + sampler: first_n + samples: + - input: 'Today is Christmas Eve of 1937. What is the date 10 days ago in MM/DD/YYYY? + + Options: + + (A) 12/14/2026 + + (B) 12/14/1950 + + (C) 12/14/2007 + + (D) 12/14/1937 + + (E) 07/14/1938 + + (F) 12/14/1988' + target: (D) + - input: 'Tomorrow is 11/12/2019. What is the date one year ago from today in MM/DD/YYYY? + + Options: + + (A) 09/04/2018 + + (B) 11/11/2018 + + (C) 08/25/2018 + + (D) 11/02/2018 + + (E) 11/04/2018' + target: (B) + - input: 'Jane and John married on Jan 2, 1958. It is their 5-year anniversary today. + What is the date tomorrow in MM/DD/YYYY? + + Options: + + (A) 01/11/1961 + + (B) 01/03/1963 + + (C) 01/18/1961 + + (D) 10/14/1960 + + (E) 01/03/1982 + + (F) 12/03/1960' + target: (B) +include: _fewshot_template_yaml +task: leaderboard_bbh_date_understanding diff --git a/lm_eval/tasks/leaderboard/bbh_mc/disambiguation_qa.yaml b/lm_eval/tasks/leaderboard/bbh_mc/disambiguation_qa.yaml new file mode 100644 index 00000000..34efa6f6 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/disambiguation_qa.yaml @@ -0,0 +1,50 @@ +dataset_name: disambiguation_qa +description: 'Clarify the meaning of sentences with ambiguous pronouns. + + ' +doc_to_choice: ["(A)", "(B)", "(C)"] +fewshot_config: + sampler: first_n + samples: + - input: 'In the following sentences, explain the antecedent of the pronoun (which + thing the pronoun refers to), or state that it is ambiguous. + + Sentence: The chief told the counselor that they took the day off. + + Options: + + (A) The chief took the day off + + (B) The counselor took the day off + + (C) Ambiguous' + target: (A) + - input: 'In the following sentences, explain the antecedent of the pronoun (which + thing the pronoun refers to), or state that it is ambiguous. + + Sentence: The manager sent a message to the secretary, but he didn''t reply + yet. 
+ + Options: + + (A) The secretary didn''t reply yet + + (B) The manager didn''t reply yet + + (C) Ambiguous' + target: (A) + - input: 'In the following sentences, explain the antecedent of the pronoun (which + thing the pronoun refers to), or state that it is ambiguous. + + Sentence: Bailey will plan to meet the director at his office + + Options: + + (A) It will be Bailey''s office + + (B) It will be the director''s office + + (C) Ambiguous' + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_disambiguation_qa diff --git a/lm_eval/tasks/leaderboard/bbh_mc/formal_fallacies.yaml b/lm_eval/tasks/leaderboard/bbh_mc/formal_fallacies.yaml new file mode 100644 index 00000000..7fb642e5 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/formal_fallacies.yaml @@ -0,0 +1,57 @@ +dataset_name: formal_fallacies +description: 'Distinguish deductively valid arguments from formal fallacies. + + ' +doc_to_choice: ["valid", "invalid"] +fewshot_config: + sampler: first_n + samples: + - input: '"It is not always easy to see who is related to whom -- and in which ways. + The following argument pertains to this question: To begin with, Lesley is a + close friend of Fernando. Moreover, being a close friend of Fernando or a schoolmate + of Lowell is sufficient for being a great-grandfather of Leroy. It follows that + Lesley is a great-grandfather of Leroy." + + Is the argument, given the explicitly stated premises, deductively valid or + invalid? + + Options: + + - valid + + - invalid' + target: valid + - input: '"It is not always easy to see who is related to whom -- and in which ways. + The following argument pertains to this question: Whoever is not a great-grandfather + of Clyde is a stepbrother of Brian. Being an ancestor of Dana is sufficient + for not being a great-grandfather of Clyde. We may conclude: Everyone who is + an ancestor of Dana is a stepbrother of Brian, too." + + Is the argument, given the explicitly stated premises, deductively valid or + invalid? + + Options: + + - valid + + - invalid' + target: valid + - input: '"It is not always easy to grasp who is consuming which products. The following + argument pertains to this question: Every infrequent user of Paul Mitchell shampoo + is either a rare consumer of Nioxin shampoo or a loyal buyer of Caress soap, + or both. No regular consumer of Lush soap is a rare consumer of Nioxin shampoo + and, in the same time, a loyal buyer of Caress soap. It follows that whoever + is an infrequent user of Paul Mitchell shampoo is not a regular consumer of + Lush soap." + + Is the argument, given the explicitly stated premises, deductively valid or + invalid? + + Options: + + - valid + + - invalid' + target: invalid +include: _fewshot_template_yaml +task: leaderboard_bbh_formal_fallacies diff --git a/lm_eval/tasks/leaderboard/bbh_mc/geometric_shapes.yaml b/lm_eval/tasks/leaderboard/bbh_mc/geometric_shapes.yaml new file mode 100644 index 00000000..936caece --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/geometric_shapes.yaml @@ -0,0 +1,86 @@ +dataset_name: geometric_shapes +description: 'Name geometric shapes from their SVG paths. 
+ + ' +doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)","(H)","(I)","(J)","(K)"] +fewshot_config: + sampler: first_n + samples: + - input: 'This SVG path element + draws a + + Options: + + (A) circle + + (B) heptagon + + (C) hexagon + + (D) kite + + (E) line + + (F) octagon + + (G) pentagon + + (H) rectangle + + (I) sector + + (J) triangle' + target: (F) + - input: 'This SVG path element draws a + + Options: + + (A) circle + + (B) heptagon + + (C) hexagon + + (D) kite + + (E) line + + (F) octagon + + (G) pentagon + + (H) rectangle + + (I) sector + + (J) triangle' + target: (G) + - input: 'This SVG path element draws a + + Options: + + (A) circle + + (B) heptagon + + (C) hexagon + + (D) kite + + (E) line + + (F) octagon + + (G) pentagon + + (H) rectangle + + (I) sector + + (J) triangle' + target: (D) +include: _fewshot_template_yaml +task: leaderboard_bbh_geometric_shapes diff --git a/lm_eval/tasks/leaderboard/bbh_mc/hyperbaton.yaml b/lm_eval/tasks/leaderboard/bbh_mc/hyperbaton.yaml new file mode 100644 index 00000000..24ddaa73 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/hyperbaton.yaml @@ -0,0 +1,34 @@ +dataset_name: hyperbaton +description: 'Order adjectives correctly in English sentences. + + ' +doc_to_choice: ["(A)", "(B)"] +fewshot_config: + sampler: first_n + samples: + - input: 'Which sentence has the correct adjective order: + + Options: + + (A) rubber terrible ship + + (B) terrible rubber ship' + target: (B) + - input: 'Which sentence has the correct adjective order: + + Options: + + (A) repulsive small Brazilian exercise ship + + (B) Brazilian repulsive exercise small ship' + target: (A) + - input: 'Which sentence has the correct adjective order: + + Options: + + (A) blue gold wonderful square shoe + + (B) wonderful square blue gold shoe' + target: (B) +include: _fewshot_template_yaml +task: leaderboard_bbh_hyperbaton diff --git a/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_five_objects.yaml b/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_five_objects.yaml new file mode 100644 index 00000000..21280253 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_five_objects.yaml @@ -0,0 +1,52 @@ +dataset_name: logical_deduction_five_objects +description: 'A logical deduction task which requires deducing the order of a sequence + of objects. + + ' +doc_to_choice: ["(A)","(B)","(C)","(D)","(E)"] +fewshot_config: + sampler: first_n + samples: + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished + above Amy. Eli finished below Amy. + + Options: + + (A) Amy finished last + + (B) Eli finished last + + (C) Eve finished last' + target: (B) + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + On a shelf, there are three books: a white book, a green book, and an orange + book. The green book is to the right of the white book. The orange book is the + rightmost. + + Options: + + (A) The white book is the leftmost + + (B) The green book is the leftmost + + (C) The orange book is the leftmost' + target: (A) + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + On a shelf, there are three books: a red book, a gray book, and a white book. 
+ The white book is to the left of the gray book. The red book is the second from + the left. + + Options: + + (A) The red book is the leftmost + + (B) The gray book is the leftmost + + (C) The white book is the leftmost' + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_logical_deduction_five_objects diff --git a/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_seven_objects.yaml b/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_seven_objects.yaml new file mode 100644 index 00000000..11198d6f --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_seven_objects.yaml @@ -0,0 +1,52 @@ +dataset_name: logical_deduction_seven_objects +description: 'A logical deduction task which requires deducing the order of a sequence + of objects. + + ' +doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)"] +fewshot_config: + sampler: first_n + samples: + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished + above Amy. Eli finished below Amy. + + Options: + + (A) Amy finished last + + (B) Eli finished last + + (C) Eve finished last' + target: (B) + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + On a shelf, there are three books: a white book, a green book, and an orange + book. The green book is to the right of the white book. The orange book is the + rightmost. + + Options: + + (A) The white book is the leftmost + + (B) The green book is the leftmost + + (C) The orange book is the leftmost' + target: (A) + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + On a shelf, there are three books: a red book, a gray book, and a white book. + The white book is to the left of the gray book. The red book is the second from + the left. + + Options: + + (A) The red book is the leftmost + + (B) The gray book is the leftmost + + (C) The white book is the leftmost' + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_logical_deduction_seven_objects diff --git a/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_three_objects.yaml b/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_three_objects.yaml new file mode 100644 index 00000000..f2522f3c --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/logical_deduction_three_objects.yaml @@ -0,0 +1,52 @@ +dataset_name: logical_deduction_three_objects +description: 'A logical deduction task which requires deducing the order of a sequence + of objects. + + ' +doc_to_choice: ["(A)","(B)","(C)"] +fewshot_config: + sampler: first_n + samples: + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + In a golf tournament, there were three golfers: Amy, Eli, and Eve. Eve finished + above Amy. Eli finished below Amy. + + Options: + + (A) Amy finished last + + (B) Eli finished last + + (C) Eve finished last' + target: (B) + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + On a shelf, there are three books: a white book, a green book, and an orange + book. 
The green book is to the right of the white book. The orange book is the + rightmost. + + Options: + + (A) The white book is the leftmost + + (B) The green book is the leftmost + + (C) The orange book is the leftmost' + target: (A) + - input: 'The following paragraphs each describe a set of three objects arranged + in a fixed order. The statements are logically consistent within each paragraph. + On a shelf, there are three books: a red book, a gray book, and a white book. + The white book is to the left of the gray book. The red book is the second from + the left. + + Options: + + (A) The red book is the leftmost + + (B) The gray book is the leftmost + + (C) The white book is the leftmost' + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_logical_deduction_three_objects diff --git a/lm_eval/tasks/leaderboard/bbh_mc/movie_recommendation.yaml b/lm_eval/tasks/leaderboard/bbh_mc/movie_recommendation.yaml new file mode 100644 index 00000000..908272af --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/movie_recommendation.yaml @@ -0,0 +1,54 @@ +dataset_name: movie_recommendation +description: 'Recommend movies similar to the given list of movies. + + ' +doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)"] +fewshot_config: + sampler: first_n + samples: + - input: 'Find a movie similar to Star Wars Episode IV - A New Hope, Indiana Jones + and the Last Crusade, Star Wars Episode V - The Empire Strikes Back, The Big + Lebowski: + + Options: + + (A) Tetsuo + + (B) the Ironman + + (C) The Princess Bride + + (D) The Barkley Marathons The Race That Eats Its Young + + (E) Bug' + target: (C) + - input: 'Find a movie similar to Twister, The Silence of the Lambs, Independence + Day, Braveheart: + + Options: + + (A) They Shoot Horses + + (B) Don''t They + + (C) Forrest Gump + + (D) The Salton Sea + + (E) Extreme Days' + target: (C) + - input: 'Find a movie similar to Minority Report, Total Recall, Inside Out, Forrest + Gump: + + Options: + + (A) Phenomena + + (B) Lilting + + (C) Catwoman + + (D) Edge of Tomorrow' + target: (D) +include: _fewshot_template_yaml +task: leaderboard_bbh_movie_recommendation diff --git a/lm_eval/tasks/leaderboard/bbh_mc/navigate.yaml b/lm_eval/tasks/leaderboard/bbh_mc/navigate.yaml new file mode 100644 index 00000000..22d0ddf3 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/navigate.yaml @@ -0,0 +1,40 @@ +dataset_name: navigate +description: 'Given a series of navigation instructions, determine whether one would + end up back at the starting point. + + ' +doc_to_choice: ["Yes","No"] +fewshot_config: + sampler: first_n + samples: + - input: 'If you follow these instructions, do you return to the starting point? + Turn left. Turn around. Turn left. Take 7 steps. Take 2 steps. Take 4 steps. + Take 8 steps. + + Options: + + - Yes + + - No' + target: 'No' + - input: 'If you follow these instructions, do you return to the starting point? + Turn around. Take 1 step. Take 6 steps. Turn around. Take 6 steps. Take 9 steps. + Take 1 step. + + Options: + + - Yes + + - No' + target: 'No' + - input: 'If you follow these instructions, do you return to the starting point? + Always face forward. Take 2 steps right. Take 9 steps left. Take 7 steps right. 
+ + Options: + + - Yes + + - No' + target: 'Yes' +include: _fewshot_template_yaml +task: leaderboard_bbh_navigate diff --git a/lm_eval/tasks/leaderboard/bbh_mc/object_counting.yaml b/lm_eval/tasks/leaderboard/bbh_mc/object_counting.yaml new file mode 100644 index 00000000..9a98c508 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/object_counting.yaml @@ -0,0 +1,20 @@ +dataset_name: object_counting +description: 'Questions that involve enumerating objects and asking the model to count + them. + + ' +doc_to_choice: ["0","1","2","3","4","5","6","7","8","9","10", "11", "12", "13", "14", "15", "16", "17", "18"] +fewshot_config: + sampler: first_n + samples: + - input: I have a blackberry, a clarinet, a nectarine, a plum, a strawberry, a banana, + a flute, an orange, and a violin. How many fruits do I have? + target: '6' + - input: I have an orange, a raspberry, two peaches, a blackberry, an apple, a grape, + a nectarine, and three plums. How many fruits do I have? + target: '11' + - input: I have a lettuce head, a head of broccoli, an onion, a stalk of celery, + two carrots, a garlic, and a yam. How many vegetables do I have? + target: '8' +include: _fewshot_template_yaml +task: leaderboard_bbh_object_counting diff --git a/lm_eval/tasks/leaderboard/bbh_mc/penguins_in_a_table.yaml b/lm_eval/tasks/leaderboard/bbh_mc/penguins_in_a_table.yaml new file mode 100644 index 00000000..27687867 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/penguins_in_a_table.yaml @@ -0,0 +1,68 @@ +dataset_name: penguins_in_a_table +description: 'Answer questions about a table of penguins and their attributes. + + ' +doc_to_choice: ["(A)","(B)","(C)","(D)","(E)"] +fewshot_config: + sampler: first_n + samples: + - input: 'Here is a table where the first line is a header and each subsequent line + is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, + 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis + is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. We now add + a penguin to the table: + + James, 12, 90, 12 + + How many penguins are less than 8 years old? + + Options: + + (A) 1 + + (B) 2 + + (C) 3 + + (D) 4 + + (E) 5' + target: (B) + - input: 'Here is a table where the first line is a header and each subsequent line + is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, + 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis + is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. Which is + the youngest penguin? + + Options: + + (A) Louis + + (B) Bernard + + (C) Vincent + + (D) Gwen + + (E) James' + target: (B) + - input: 'Here is a table where the first line is a header and each subsequent line + is a penguin: name, age, height (cm), weight (kg) Louis, 7, 50, 11 Bernard, + 5, 80, 13 Vincent, 9, 60, 11 Gwen, 8, 70, 15 For example: the age of Louis + is 7, the weight of Gwen is 15 kg, the height of Bernard is 80 cm. What is + the name of the second penguin sorted by alphabetic order? 
+ + Options: + + (A) Louis + + (B) Bernard + + (C) Vincent + + (D) Gwen + + (E) James' + target: (D) +include: _fewshot_template_yaml +task: leaderboard_bbh_penguins_in_a_table diff --git a/lm_eval/tasks/leaderboard/bbh_mc/reasoning_about_colored_objects.yaml b/lm_eval/tasks/leaderboard/bbh_mc/reasoning_about_colored_objects.yaml new file mode 100644 index 00000000..d4bb5c38 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/reasoning_about_colored_objects.yaml @@ -0,0 +1,116 @@ +dataset_name: reasoning_about_colored_objects +description: 'Answer extremely simple questions about the colors of objects on a surface. + + ' +doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)","(H)","(I)","(J)","(K)","(L)","(M)","(N)","(O)","(P)","(Q)","(R)"] +fewshot_config: + sampler: first_n + samples: + - input: 'On the nightstand, there is a red pencil, a purple mug, a burgundy keychain, + a fuchsia teddy bear, a black plate, and a blue stress ball. What color is the + stress ball? + + Options: + + (A) red + + (B) orange + + (C) yellow + + (D) green + + (E) blue + + (F) brown + + (G) magenta + + (H) fuchsia + + (I) mauve + + (J) teal + + (K) turquoise + + (L) burgundy + + (M) silver + + (N) gold + + (O) black + + (P) grey + + (Q) purple + + (R) pink' + target: (E) + - input: 'On the table, you see a bunch of objects arranged in a row: a purple paperclip, + a pink stress ball, a brown keychain, a green scrunchiephone charger, a mauve + fidget spinner, and a burgundy pen. What is the color of the object directly + to the right of the stress ball? + + Options: + + (A) red + + (B) orange + + (C) yellow + + (D) green + + (E) blue + + (F) brown + + (G) magenta + + (H) fuchsia + + (I) mauve + + (J) teal + + (K) turquoise + + (L) burgundy + + (M) silver + + (N) gold + + (O) black + + (P) grey + + (Q) purple + + (R) pink' + target: (F) + - input: 'On the nightstand, you see the following items arranged in a row: a teal + plate, a burgundy keychain, a yellow scrunchiephone charger, an orange mug, + a pink notebook, and a grey cup. How many non-orange items do you see to the + left of the teal item? + + Options: + + (A) zero + + (B) one + + (C) two + + (D) three + + (E) four + + (F) five + + (G) six' + target: (A) +include: _fewshot_template_yaml +task: leaderboard_bbh_reasoning_about_colored_objects diff --git a/lm_eval/tasks/leaderboard/bbh_mc/ruin_names.yaml b/lm_eval/tasks/leaderboard/bbh_mc/ruin_names.yaml new file mode 100644 index 00000000..1d865267 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/ruin_names.yaml @@ -0,0 +1,50 @@ +dataset_name: ruin_names +description: 'Select the humorous edit that ''ruins'' the input movie or musical artist + name. + + ' +doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)"] +fewshot_config: + sampler: first_n + samples: + - input: 'Which of the following is a humorous edit of this artist or movie name: + ''whitesnake''? + + Options: + + (A) whitesnape + + (B) whitesnapke + + (C) whitesnuake + + (D) mwhitesnake' + target: (A) + - input: 'Which of the following is a humorous edit of this artist or movie name: + ''one of our dinosaurs is missing''? + + Options: + + (A) ofne of our dinosaurs is missing + + (B) one af our dinosaurs is missing + + (C) one of our dinosaurs is pissing + + (D) one of our dinosaur is missing' + target: (C) + - input: 'Which of the following is a humorous edit of this artist or movie name: + ''counting crows''? 
+ + Options: + + (A) countingy crows + + (B) counting cows + + (C) courting crows + + (D) coutnting crows' + target: (B) +include: _fewshot_template_yaml +task: leaderboard_bbh_ruin_names diff --git a/lm_eval/tasks/leaderboard/bbh_mc/salient_translation_error_detection.yaml b/lm_eval/tasks/leaderboard/bbh_mc/salient_translation_error_detection.yaml new file mode 100644 index 00000000..dd9c9142 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/salient_translation_error_detection.yaml @@ -0,0 +1,103 @@ +dataset_name: salient_translation_error_detection +description: 'Detect the type of error in an English translation of a German source + sentence. + + ' +doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)"] +fewshot_config: + sampler: first_n + samples: + - input: "The following translations from German to English contain a particular + error. That error will be one of the following types: Named Entities: An entity + (names, places, locations, etc.) is changed to a different entity. Numerical + Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. + Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are + changed. Negation or Antonyms: Introduce or remove a negation or change comparatives + to their antonyms. Facts: Trivial factual errors not pertaining to the above + classes are introduced in the translations. Dropped Content: A significant clause + in the translation is removed. Please identify that error. Source: In der Liste + der Baudenkmale in Lenzen (Elbe) sind alle Baudenkmale der brandenburgischen + Stadt Lenzen (Elbe) und ihrer Ortsteile aufgelistet. + + Translation: In the list of architectural monuments in Lenzen all architectural + monuments of the Brandenburg city of Lenzen and its districts are listed. + + The translation contains an error pertaining to + + Options: + + (A) Modifiers or Adjectives + + (B) Numerical Values + + (C) Negation or Antonyms + + (D) Named Entities + + (E) Dropped Content + + (F) Facts" + target: (D) + - input: "The following translations from German to English contain a particular + error. That error will be one of the following types: Named Entities: An entity + (names, places, locations, etc.) is changed to a different entity. Numerical + Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. + Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are + changed. Negation or Antonyms: Introduce or remove a negation or change comparatives + to their antonyms. Facts: Trivial factual errors not pertaining to the above + classes are introduced in the translations. Dropped Content: A significant clause + in the translation is removed. Please identify that error. Source: Auf dieser + Seite sind die Baudenkm\xE4ler der oberbayerischen Gro\xDFen Kreisstadt Landsberg + am Lech zusammengestellt. + + Translation: On this page are compiled the architectural monuments of the town of + Landsberg am Lech. + + The translation contains an error pertaining to + + Options: + + (A) Modifiers or Adjectives + + (B) Numerical Values + + (C) Negation or Antonyms + + (D) Named Entities + + (E) Dropped Content + + (F) Facts" + target: (E) + - input: "The following translations from German to English contain a particular + error. That error will be one of the following types: Named Entities: An entity + (names, places, locations, etc.) is changed to a different entity. Numerical + Values: Numerical values (ordinals or cardinals), dates, and/or units are changed. 
+ Modifiers or Adjectives: The modifiers and adjectives pertaining to a noun are + changed. Negation or Antonyms: Introduce or remove a negation or change comparatives + to their antonyms. Facts: Trivial factual errors not pertaining to the above + classes are introduced in the translations. Dropped Content: A significant clause + in the translation is removed. Please identify that error. Source: \u0141eba ist + eine Kleinstadt und ein Badeort im Powiat L\u0119borski der polnischen Woiwodschaft + Pommern. + + Translation: Eba is not a small town and seaside resort in the Powiat L\xE9borski + county of the Pomeranian Voivodeship of Poland.\nThe translation contains an error + pertaining to + + Options: + + (A) Modifiers or Adjectives + + (B) Numerical Values + + (C) Negation or Antonyms + + (D) Named Entities + + (E) Dropped Content + + (F) Facts" + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_salient_translation_error_detection diff --git a/lm_eval/tasks/leaderboard/bbh_mc/snarks.yaml b/lm_eval/tasks/leaderboard/bbh_mc/snarks.yaml new file mode 100644 index 00000000..465e39b1 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/snarks.yaml @@ -0,0 +1,45 @@ +dataset_name: snarks +description: 'Determine which of two sentences is sarcastic. + + + According to Cambridge University Dictionary, sarcasm is "the use of remarks that + clearly mean the opposite of what they say, made in order to hurt someone''s feelings + or to criticize something in a humorous way." Sarcastic sentences often contain + satirical or ironic utterances, hyperboles, ambivalent or witty remarks. + + + ' +doc_to_choice: ["(A)","(B)"] +fewshot_config: + sampler: first_n + samples: + - input: 'Which statement is sarcastic? + + Options: + + (A) Yes, because having interests and actively researching them is a huge waste + + (B) Yes, because having interests and actively researching them is a huge deal' + target: (A) + - input: 'Which statement is sarcastic? + + Options: + + (A) No one is going to disagree with you on this. Avoiding ad hominem attacks + really help your case + + (B) No one is going to disagree with you on this. Ad hominem attacks really + help your case' + target: (B) + - input: 'Which statement is sarcastic? + + Options: + + (A) Consistency in the league''s punishments? What do you think this is supposed + to be, politics? + + (B) Consistency in the league''s punishments? What do you think this is supposed + to be, moral?' + target: (A) +include: _fewshot_template_yaml +task: leaderboard_bbh_snarks diff --git a/lm_eval/tasks/leaderboard/bbh_mc/sports_understanding.yaml b/lm_eval/tasks/leaderboard/bbh_mc/sports_understanding.yaml new file mode 100644 index 00000000..f8c1c626 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/sports_understanding.yaml @@ -0,0 +1,19 @@ +dataset_name: sports_understanding +description: 'Determine whether an artificially constructed sentence relating to sports + is plausible or not. + + ' +doc_to_choice: ["yes","no"] +fewshot_config: + sampler: first_n + samples: + - input: Is the following sentence plausible? "Bam Adebayo scored a reverse layup + in the Western Conference Finals." + target: 'yes' + - input: Is the following sentence plausible? "Santi Cazorla scored a touchdown." + target: 'no' + - input: Is the following sentence plausible? "DeMar DeRozan was called for the + goal tend." 
+ target: 'yes' +include: _fewshot_template_yaml +task: leaderboard_bbh_sports_understanding diff --git a/lm_eval/tasks/leaderboard/bbh_mc/temporal_sequences.yaml b/lm_eval/tasks/leaderboard/bbh_mc/temporal_sequences.yaml new file mode 100644 index 00000000..bafedd9c --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/temporal_sequences.yaml @@ -0,0 +1,101 @@ +dataset_name: temporal_sequences +description: 'Task description: Answer questions about which times certain events + could have occurred. + + ' +doc_to_choice: ["(A)","(B)","(C)","(D)"] +fewshot_config: + sampler: first_n + samples: + - input: 'Today, Emily went to the museum. Between what times could they have gone? + + We know that: + + Emily woke up at 1pm. + + Elizabeth saw Emily reading at the library from 2pm to 4pm. + + Jessica saw Emily watching a movie at the theater from 4pm to 5pm. + + Leslie saw Emily waiting at the airport from 5pm to 6pm. + + William saw Emily buying clothes at the mall from 6pm to 7pm. + + The museum was closed after 7pm. + + Between what times could Emily have gone to the museum? + + Options: + + (A) 1pm to 2pm + + (B) 6pm to 7pm + + (C) 5pm to 6pm + + (D) 2pm to 4pm' + target: (A) + - input: 'Today, Elizabeth went to the amusement park. Between what times could + they have gone? + + We know that: + + Elizabeth woke up at 7am. + + David saw Elizabeth fixing their computer at the electronic store from 1pm to + 2pm. + + Sarah saw Elizabeth playing tennis at the tennis court from 2pm to 3pm. + + Susan saw Elizabeth walking towards the Statue of Liberty from 3pm to 6pm. + + Andrew saw Elizabeth taking photos near the Eiffel Tower from 6pm to 9pm. + + Emily saw Elizabeth getting a coffee at the cafe from 9pm to 10pm. + + The amusement park was closed after 10pm. + + Between what times could Elizabeth have gone to the amusement park? + + Options: + + (A) 7am to 1pm + + (B) 9pm to 10pm + + (C) 1pm to 2pm + + (D) 3pm to 6pm' + target: (A) + - input: 'Today, Tiffany went to the beach. Between what times could they have gone? + + We know that: + + Tiffany woke up at 5am. + + Betty saw Tiffany getting a coffee at the cafe from 5am to 6am. + + Jessica saw Tiffany working at the office from 6am to 9am. + + John saw Tiffany stretching at a yoga studio from 9am to 12pm. + + Sean saw Tiffany sitting on a rooftop from 12pm to 2pm. + + Sarah saw Tiffany playing tennis at the tennis court from 2pm to 3pm. + + The beach was closed after 4pm. + + Between what times could Tiffany have gone to the beach? + + Options: + + (A) 9am to 12pm + + (B) 12pm to 2pm + + (C) 5am to 6am + + (D) 3pm to 4pm' + target: (D) +include: _fewshot_template_yaml +task: leaderboard_bbh_temporal_sequences diff --git a/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_five_objects.yaml b/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_five_objects.yaml new file mode 100644 index 00000000..59667929 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_five_objects.yaml @@ -0,0 +1,59 @@ +dataset_name: tracking_shuffled_objects_five_objects +description: 'A task requiring determining the final positions of a set of objects + given their initial positions and a description of a sequence of swaps. + + ' +doc_to_choice: ["(A)","(B)","(C)","(D)","(E)"] +fewshot_config: + sampler: first_n + samples: + - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they + are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire + has a pink ball. 
+ + As the game progresses, pairs of players trade balls. First, Claire and Alice + swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. + At the end of the game, Bob has the + + Options: + + (A) yellow ball + + (B) blue ball + + (C) pink ball' + target: (A) + - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they + are each holding a ball: Alice has a white ball, Bob has a purple ball, and + Claire has a pink ball. + + As the game progresses, pairs of players trade balls. First, Bob and Alice swap + balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At + the end of the game, Alice has the + + Options: + + (A) white ball + + (B) purple ball + + (C) pink ball' + target: (C) + - input: 'Alice, Bob, and Claire are dancers at a square dance. At the start of + a song, they each have a partner: Alice is dancing with Lola, Bob is dancing + with Rodrigo, and Claire is dancing with Patrick. + + Throughout the song, the dancers often trade partners. First, Alice and Bob + switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice + switch partners. At the end of the dance, Alice is dancing with + + Options: + + (A) Lola + + (B) Rodrigo + + (C) Patrick' + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_tracking_shuffled_objects_five_objects diff --git a/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_seven_objects.yaml b/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_seven_objects.yaml new file mode 100644 index 00000000..1492eb61 --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_seven_objects.yaml @@ -0,0 +1,59 @@ +dataset_name: tracking_shuffled_objects_seven_objects +description: 'A task requiring determining the final positions of a set of objects + given their initial positions and a description of a sequence of swaps. + + ' +doc_to_choice: ["(A)","(B)","(C)","(D)","(E)","(F)","(G)"] +fewshot_config: + sampler: first_n + samples: + - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they + are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire + has a pink ball. + + As the game progresses, pairs of players trade balls. First, Claire and Alice + swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. + At the end of the game, Bob has the + + Options: + + (A) yellow ball + + (B) blue ball + + (C) pink ball' + target: (A) + - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they + are each holding a ball: Alice has a white ball, Bob has a purple ball, and + Claire has a pink ball. + + As the game progresses, pairs of players trade balls. First, Bob and Alice swap + balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At + the end of the game, Alice has the + + Options: + + (A) white ball + + (B) purple ball + + (C) pink ball' + target: (C) + - input: 'Alice, Bob, and Claire are dancers at a square dance. At the start of + a song, they each have a partner: Alice is dancing with Lola, Bob is dancing + with Rodrigo, and Claire is dancing with Patrick. + + Throughout the song, the dancers often trade partners. First, Alice and Bob + switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice + switch partners. 
At the end of the dance, Alice is dancing with + + Options: + + (A) Lola + + (B) Rodrigo + + (C) Patrick' + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_tracking_shuffled_objects_seven_objects diff --git a/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_three_objects.yaml b/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_three_objects.yaml new file mode 100644 index 00000000..134a991b --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/tracking_shuffled_objects_three_objects.yaml @@ -0,0 +1,59 @@ +dataset_name: tracking_shuffled_objects_three_objects +description: 'A task requiring determining the final positions of a set of objects + given their initial positions and a description of a sequence of swaps. + + ' +doc_to_choice: ["(A)","(B)","(C)"] +fewshot_config: + sampler: first_n + samples: + - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they + are each holding a ball: Alice has a yellow ball, Bob has a blue ball, and Claire + has a pink ball. + + As the game progresses, pairs of players trade balls. First, Claire and Alice + swap balls. Then, Alice and Bob swap balls. Finally, Claire and Bob swap balls. + At the end of the game, Bob has the + + Options: + + (A) yellow ball + + (B) blue ball + + (C) pink ball' + target: (A) + - input: 'Alice, Bob, and Claire are playing a game. At the start of the game, they + are each holding a ball: Alice has a white ball, Bob has a purple ball, and + Claire has a pink ball. + + As the game progresses, pairs of players trade balls. First, Bob and Alice swap + balls. Then, Bob and Claire swap balls. Finally, Bob and Alice swap balls. At + the end of the game, Alice has the + + Options: + + (A) white ball + + (B) purple ball + + (C) pink ball' + target: (C) + - input: 'Alice, Bob, and Claire are dancers at a square dance. At the start of + a song, they each have a partner: Alice is dancing with Lola, Bob is dancing + with Rodrigo, and Claire is dancing with Patrick. + + Throughout the song, the dancers often trade partners. First, Alice and Bob + switch partners. Then, Claire and Bob switch partners. Finally, Bob and Alice + switch partners. At the end of the dance, Alice is dancing with + + Options: + + (A) Lola + + (B) Rodrigo + + (C) Patrick' + target: (C) +include: _fewshot_template_yaml +task: leaderboard_bbh_tracking_shuffled_objects_three_objects diff --git a/lm_eval/tasks/leaderboard/bbh_mc/web_of_lies.yaml b/lm_eval/tasks/leaderboard/bbh_mc/web_of_lies.yaml new file mode 100644 index 00000000..a201d8ff --- /dev/null +++ b/lm_eval/tasks/leaderboard/bbh_mc/web_of_lies.yaml @@ -0,0 +1,22 @@ +dataset_name: web_of_lies +description: 'Evaluate a random boolean function expressed as a word problem. + + ' +doc_to_choice: ["Yes","No"] +fewshot_config: + sampler: first_n + samples: + - input: 'Question: Fidel tells the truth. Jerry says Fidel tells the truth. Vina + says Jerry tells the truth. Millicent says Vina lies. Raymond says Millicent + lies. Does Raymond tell the truth?' + target: 'Yes' + - input: 'Question: Kristian lies. Millie says Kristian lies. Maybelle says Millie + tells the truth. Fidel says Maybelle lies. Leda says Fidel lies. Does Leda tell + the truth?' + target: 'Yes' + - input: 'Question: Kristian tells the truth. Michaela says Kristian lies. Raymond + says Michaela tells the truth. Osvaldo says Raymond tells the truth. Jamey says + Osvaldo tells the truth. Does Jamey tell the truth?' 
+ target: 'No' +include: _fewshot_template_yaml +task: leaderboard_bbh_web_of_lies diff --git a/lm_eval/tasks/leaderboard/gpqa/_template_yaml b/lm_eval/tasks/leaderboard/gpqa/_template_yaml new file mode 100644 index 00000000..83cb1872 --- /dev/null +++ b/lm_eval/tasks/leaderboard/gpqa/_template_yaml @@ -0,0 +1,20 @@ +dataset_path: Idavidrein/gpqa +group: leaderboard_gpqa +output_type: multiple_choice +process_docs: !function utils.process_docs +training_split: train +# Because huggingface dataset only has train split +validation_split: train +test_split: null +doc_to_text: "What is the correct answer to this question:{{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer: " +doc_to_target: answer +doc_to_choice: ["(A)", "(B)", "(C)", "(D)"] +num_fewshot: 0 +metric_list: + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 +fewshot_config: + sampler: first_n diff --git a/lm_eval/tasks/leaderboard/gpqa/gpqa_diamond_zeroshot.yaml b/lm_eval/tasks/leaderboard/gpqa/gpqa_diamond_zeroshot.yaml new file mode 100644 index 00000000..a3892165 --- /dev/null +++ b/lm_eval/tasks/leaderboard/gpqa/gpqa_diamond_zeroshot.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: gpqa_diamond +include: _template_yaml +task: leaderboard_gpqa_diamond diff --git a/lm_eval/tasks/leaderboard/gpqa/gpqa_extended_zeroshot.yaml b/lm_eval/tasks/leaderboard/gpqa/gpqa_extended_zeroshot.yaml new file mode 100644 index 00000000..b481d960 --- /dev/null +++ b/lm_eval/tasks/leaderboard/gpqa/gpqa_extended_zeroshot.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: gpqa_extended +include: _template_yaml +task: leaderboard_gpqa_extended diff --git a/lm_eval/tasks/leaderboard/gpqa/gpqa_main_zeroshot.yaml b/lm_eval/tasks/leaderboard/gpqa/gpqa_main_zeroshot.yaml new file mode 100644 index 00000000..a15e45b6 --- /dev/null +++ b/lm_eval/tasks/leaderboard/gpqa/gpqa_main_zeroshot.yaml @@ -0,0 +1,4 @@ +# Generated by _generate_configs.py +dataset_name: gpqa_main +include: _template_yaml +task: leaderboard_gpqa_main diff --git a/lm_eval/tasks/leaderboard/gpqa/utils.py b/lm_eval/tasks/leaderboard/gpqa/utils.py new file mode 100644 index 00000000..c2317e02 --- /dev/null +++ b/lm_eval/tasks/leaderboard/gpqa/utils.py @@ -0,0 +1,38 @@ +import random +import re + +import datasets + + +def preprocess(text): + if text is None: + return " " + text = text.strip() + text = text.replace(" [title]", ". 
") + text = re.sub("\\[.*?\\]", "", text) + text = text.replace(" ", " ") + return text + + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _process_doc(doc): + choices = [ + preprocess(doc["Incorrect Answer 1"]), + preprocess(doc["Incorrect Answer 2"]), + preprocess(doc["Incorrect Answer 3"]), + preprocess(doc["Correct Answer"]), + ] + + random.shuffle(choices) + correct_answer_index = choices.index(preprocess(doc["Correct Answer"])) + + out_doc = { + "choice1": choices[0], + "choice2": choices[1], + "choice3": choices[2], + "choice4": choices[3], + "answer": f"({chr(65 + correct_answer_index)})", + } + return out_doc + + return dataset.map(_process_doc) diff --git a/lm_eval/tasks/leaderboard/ifeval/ifeval.yaml b/lm_eval/tasks/leaderboard/ifeval/ifeval.yaml new file mode 100644 index 00000000..da394d9c --- /dev/null +++ b/lm_eval/tasks/leaderboard/ifeval/ifeval.yaml @@ -0,0 +1,32 @@ +task: leaderboard_ifeval +group: leaderboard_instruction_following +dataset_path: wis-k/instruction-following-eval +dataset_name: null +output_type: generate_until +test_split: train +num_fewshot: 0 +doc_to_text: prompt +doc_to_target: 0 +generation_kwargs: + until: [] + do_sample: false + temperature: 0.0 + max_gen_toks: 1280 +process_results: !function utils.process_results +metric_list: + - metric: prompt_level_strict_acc + aggregation: mean + higher_is_better: true + - metric: inst_level_strict_acc + aggregation: !function utils.agg_inst_level_acc + higher_is_better: true + - metric: prompt_level_loose_acc + aggregation: mean + higher_is_better: true + - metric: inst_level_loose_acc + aggregation: !function utils.agg_inst_level_acc + higher_is_better: true +metadata: + version: 2.0 +fewshot_config: + sampler: first_n diff --git a/lm_eval/tasks/leaderboard/ifeval/instructions.py b/lm_eval/tasks/leaderboard/ifeval/instructions.py new file mode 100644 index 00000000..5c352af3 --- /dev/null +++ b/lm_eval/tasks/leaderboard/ifeval/instructions.py @@ -0,0 +1,1612 @@ +# Copyright 2023 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Library of instructions.""" + +import collections +import json +import logging +import random +import re +import string +from typing import Dict, Optional, Sequence, Union + +import langdetect + +from lm_eval.tasks.ifeval import instructions_util + + +logger = logging.getLogger(__name__) + +_InstructionArgsDtype = Optional[Dict[str, Union[int, str, Sequence[str]]]] + +_LANGUAGES = instructions_util.LANGUAGE_CODES + +# The relational operation for comparison. +_COMPARISON_RELATION = ("less than", "at least") + +# The maximum number of sentences. +_MAX_NUM_SENTENCES = 20 + +# The number of placeholders. +_NUM_PLACEHOLDERS = 4 + +# The number of bullet lists. +_NUM_BULLETS = 5 + +# The options of constrained response. +_CONSTRAINED_RESPONSE_OPTIONS = ( + "My answer is yes.", + "My answer is no.", + "My answer is maybe.", +) + +# The options of starter keywords. 
+_STARTER_OPTIONS = ( + "I would say", + "My answer is", + "I believe", + "In my opinion", + "I think", + "I reckon", + "I feel", + "From my perspective", + "As I see it", + "According to me", + "As far as I'm concerned", + "To my understanding", + "In my view", + "My take on it is", + "As per my perception", +) + +# The options of ending keywords. +# TODO(jeffreyzhou) add more ending options +_ENDING_OPTIONS = ("Any other questions?", "Is there anything else I can help with?") + +# The number of highlighted sections. +_NUM_HIGHLIGHTED_SECTIONS = 4 + +# The section spliter. +_SECTION_SPLITER = ("Section", "SECTION") + +# The number of sections. +_NUM_SECTIONS = 5 + +# The number of paragraphs. +_NUM_PARAGRAPHS = 5 + +# The postscript marker. +_POSTSCRIPT_MARKER = ("P.S.", "P.P.S") + +# The number of keywords. +_NUM_KEYWORDS = 2 + +# The occurrences of a single keyword. +_KEYWORD_FREQUENCY = 3 + +# The occurrences of a single letter. +_LETTER_FREQUENCY = 10 + +# The occurrences of words with all capital letters. +_ALL_CAPITAL_WORD_FREQUENCY = 20 + +# The number of words in the response. +_NUM_WORDS_LOWER_LIMIT = 100 +_NUM_WORDS_UPPER_LIMIT = 500 + + +class Instruction: + """An instruction template.""" + + def __init__(self, instruction_id): + self.id = instruction_id + + def build_description(self, **kwargs): + raise NotImplementedError("`build_description` not implemented.") + + def get_instruction_args(self): + raise NotImplementedError("`get_instruction_args` not implemented.") + + def get_instruction_args_keys(self): + raise NotImplementedError("`get_instruction_args_keys` not implemented.") + + def check_following(self, value): + raise NotImplementedError("`check_following` not implemented.") + + +class ResponseLanguageChecker(Instruction): + """Check the language of the entire response.""" + + def build_description(self, *, language=None): + """Build the instruction description. + + Args: + language: A string representing the expected language of the response. The + language has to comply to the 97 types defined in + `langid.py` (https://pypi.org/project/langid/1.1.5/), which follows + ISO 639-1 codes (https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes); + for example, `en` for English, `zh` for Chinese, `fr` for French. + + Returns: + A string representing the instruction description. + """ + self._language = language + if self._language is None: + self._language = random.choice(list(_LANGUAGES.keys())) + # TODO(tianjianlu): opens the description generation to more choices. + self._description_pattern = ( + "Your ENTIRE response should be in {language} language, no other " + + "language is allowed." + ) + return self._description_pattern.format(language=_LANGUAGES[self._language]) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"language": self._language} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["language"] + + def check_following(self, value): + """Check if the language of the entire response follows the instruction. + + Args: + value: A string representing the response. + + Returns: + True if the language of `value` follows instruction; otherwise False. + """ + assert isinstance(value, str) + + try: + return langdetect.detect(value) == self._language + except langdetect.LangDetectException as e: + # Count as instruction is followed. 
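+            # (langdetect raises LangDetectException when the text has no detectable features, e.g. empty strings.)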
+ logging.error( + "Unable to detect language for text %s due to %s", value, e + ) # refex: disable=pytotw.037 + return True + + +class NumberOfSentences(Instruction): + """Check the number of sentences.""" + + def build_description(self, *, num_sentences=None, relation=None): + """Build the instruction description. + + Args: + num_sentences: An integer specifying the number of sentences as a + threshold. + relation: A string in (`less than`, `at least`), defining the relational + operator for comparison. + Two relational comparisons are supported for now: + if 'less than', the actual number of sentences < the threshold; + if 'at least', the actual number of sentences >= the threshold. + + Returns: + A string representing the instruction description. + """ + # The number of sentences as a threshold for comparison. + self._num_sentences_threshold = num_sentences + if self._num_sentences_threshold is None or self._num_sentences_threshold < 0: + self._num_sentences_threshold = random.randint(1, _MAX_NUM_SENTENCES) + + if relation is None: + self._comparison_relation = random.choice(_COMPARISON_RELATION) + elif relation not in _COMPARISON_RELATION: + raise ValueError( + "The supported relation for comparison must be in " + f"{_COMPARISON_RELATION}, but {relation} is given." + ) + else: + self._comparison_relation = relation + + self._description_pattern = ( + "Your response should contain {relation} {num_sentences} sentences." + ) + return self._description_pattern.format( + relation=self._comparison_relation, + num_sentences=self._num_sentences_threshold, + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "num_sentences": self._num_sentences_threshold, + "relation": self._comparison_relation, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_sentences", "relation"] + + def check_following(self, value): + """Check if the number of sentences follows the instruction. + + Args: + value: A string representing the response. + + Returns: + True if the response follows the instruction. + + Raise: + ValueError if the string in `instruction_args` is not in + [`less_than`, `at_least`]. + """ + num_sentences = instructions_util.count_sentences(value) + if self._comparison_relation == _COMPARISON_RELATION[0]: + return num_sentences < self._num_sentences_threshold + elif self._comparison_relation == _COMPARISON_RELATION[1]: + return num_sentences >= self._num_sentences_threshold + + +class PlaceholderChecker(Instruction): + """Check the placeholders in template writing.""" + + def build_description(self, *, num_placeholders=None): + """Build the instruction description. + + Args: + num_placeholders: An integer denoting the minimum number of + placeholders required in the response. + + Returns: + A string representing the instruction description. + """ + self._num_placeholders = num_placeholders + if self._num_placeholders is None or self._num_placeholders < 0: + self._num_placeholders = random.randint(1, _NUM_PLACEHOLDERS) + self._description_pattern = ( + "The response must contain at least {num_placeholders} placeholders " + + "represented by square brackets, such as [address]." 
+ ) + return self._description_pattern.format(num_placeholders=self._num_placeholders) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_placeholders": self._num_placeholders} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_placeholders"] + + def check_following(self, value): + """Check if the number of placeholders follows the instruction. + + Args: + value: A string representing the response. + + Returns: + True if the actual number of placeholders in the response is greater than + or equal to `num_placeholders`; otherwise, False. + """ + placeholders = re.findall(r"\[.*?\]", value) + num_placeholders = len(placeholders) + return num_placeholders >= self._num_placeholders + + +class BulletListChecker(Instruction): + """Checks the bullet list in the prompt.""" + + def build_description(self, *, num_bullets=None): + """Build the instruction description. + + Args: + num_bullets: An integer specifying the exact number of bullet lists + that is required to appear in the response. + + Returns: + A string representing the instruction description. + """ + self._num_bullets = num_bullets + if self._num_bullets is None or self._num_bullets < 0: + self._num_bullets = random.randint(1, _NUM_BULLETS) + self._description_pattern = ( + "Your answer must contain exactly {num_bullets} bullet points. " + + "Use the markdown bullet points such as:\n" + + "* This is point 1. \n" + + "* This is point 2" + ) + return self._description_pattern.format(num_bullets=self._num_bullets) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_bullets": self._num_bullets} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_bullets"] + + def check_following(self, value): + r"""Check if the number of bullet lists meets the requirement. + + Args: + value: A string representing the response. The response is expected to + contain some bullet lists that start with `\*`. + + Returns: + True if the actual number of bullet lists in the response meets the + requirement. + """ + bullet_lists = re.findall(r"^\s*\*[^\*].*$", value, flags=re.MULTILINE) + bullet_lists_2 = re.findall(r"^\s*-.*$", value, flags=re.MULTILINE) + num_bullet_lists = len(bullet_lists) + len(bullet_lists_2) + return num_bullet_lists == self._num_bullets + + +class ConstrainedResponseChecker(Instruction): + """Checks the constrained response.""" + + def build_description(self): + """Build the instruction description.""" + # A sequence of string(s) representing the options of the expected response. + self._constrained_responses = _CONSTRAINED_RESPONSE_OPTIONS + self._description_pattern = ( + "Answer with one of the following options: {response_options}" + ) + return self._description_pattern.format( + response_options=self._constrained_responses + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks if the response matches the constrained options. + + Args: + value: A string representing the response. + + Returns: + True if the actual response contains one of the options in the constrained + responses; otherwise False. 
+ """ + value = value.strip() + for constrained_response in self._constrained_responses: + if constrained_response in value: + return True + return False + + +class ConstrainedStartChecker(Instruction): + """Checks the response start.""" + + def build_description(self, *, starter=None): + """Build the instruction description. + + Args: + starter: A string representing the keyward that the response should start + with. + + Returns: + A string representing the instruction description. + """ + self._starter = starter.strip() if isinstance(starter, str) else starter + if self._starter is None: + self._starter = random.choice(_STARTER_OPTIONS) + self._description_pattern = ( + "During the conversation, when it is your turn, " + + "please always start with {starter}" + ) + return self._description_pattern.format(starter=self._starter) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"starter": self._starter} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["starter"] + + def check_following(self, value): + """Checks if the response starts with the constrained keyword or phrase. + + Args: + value: A string representing the response. + + Returns: + True if the response starts with the given phrase or keyword that is + contained in `instruction_args`; otherwise, False. + """ + response_pattern = r"^\s*" + self._starter + r".*$" + response_with_constrained_start = re.search( + response_pattern, value, flags=re.MULTILINE + ) + return True if response_with_constrained_start else False + + +class HighlightSectionChecker(Instruction): + """Checks the highlighted section.""" + + def build_description(self, *, num_highlights=None): + """Build the instruction description. + + Args: + num_highlights: An integer specifying the minimum number of highlighted + sections. + + Returns: + A string representing the instruction description. + """ + self._num_highlights = num_highlights + if self._num_highlights is None or self._num_highlights < 0: + self._num_highlights = random.randint(1, _NUM_HIGHLIGHTED_SECTIONS) + + self._description_pattern = ( + "Highlight at least {num_highlights} sections in your answer with " + + "markdown, i.e. *highlighted section*." + ) + + return self._description_pattern.format(num_highlights=self._num_highlights) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_highlights": self._num_highlights} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_highlights"] + + def check_following(self, value): + """Checks if the number of highlighted sections meets the requirement. + + Args: + value: a string repesenting the response. The response is expected to + contain highlighted sections in the format of *highlighted*. + + Returns: + True if the actual number of highlighted sections in the format of + *highlighed sections* meets the minimum requirement; otherwise False. 
+ """ + num_highlights = 0 + highlights = re.findall(r"\*[^\n\*]*\*", value) + double_highlights = re.findall(r"\*\*[^\n\*]*\*\*", value) + for highlight in highlights: + if highlight.strip("*").strip(): + num_highlights += 1 + for highlight in double_highlights: + if highlight.removeprefix("**").removesuffix("**").strip(): + num_highlights += 1 + + return num_highlights >= self._num_highlights + + +class SectionChecker(Instruction): + """Checks the sections.""" + + def build_description(self, *, section_spliter=None, num_sections=None): + """Build the instruction description. + + Args: + section_spliter: A string represents the section spliter keyword that + marks a new section, i.e., `Section` or `SECTION`. + num_sections: An integer specifying the number of sections. + + Returns: + A string representing the instruction description. + """ + self._section_spliter = ( + section_spliter.strip() + if isinstance(section_spliter, str) + else section_spliter + ) + if self._section_spliter is None: + self._section_spliter = random.choice(_SECTION_SPLITER) + + self._num_sections = num_sections + if self._num_sections is None or self._num_sections < 0: + self._num_sections = random.randint(1, _NUM_SECTIONS) + + self._description_pattern = ( + "Your response must have {num_sections} sections. Mark the beginning " + + "of each section with {section_spliter} X, such as:\n" + + "{section_spliter} 1\n" + + "[content of section 1]\n" + + "{section_spliter} 2\n" + + "[content of section 2]" + ) + + return self._description_pattern.format( + num_sections=self._num_sections, section_spliter=self._section_spliter + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "section_spliter": self._section_spliter, + "num_sections": self._num_sections, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["section_spliter", "num_sections"] + + def check_following(self, value): + """Checks the response contains multiple sections. + + Args: + value: A string representing the response. The response is expected + to contain multiple sections (number of sections is greater than 1). + A new section starts with `Section 1`, where the number denotes the + section index. + + Returns: + True if the number of sections in the response is greater than or equal to + the minimum number of sections; otherwise, False. + """ + section_splitter_patten = r"\s?" + self._section_spliter + r"\s?\d+\s?" + sections = re.split(section_splitter_patten, value) + num_sections = len(sections) - 1 + return num_sections >= self._num_sections + + +class ParagraphChecker(Instruction): + """Checks the paragraphs.""" + + def build_description(self, *, num_paragraphs=None): + """Build the instruction description. + + Args: + num_paragraphs: An integer specifying the number of paragraphs. + + Returns: + A string representing the instruction description. + """ + self._num_paragraphs = num_paragraphs + if self._num_paragraphs is None or self._num_paragraphs < 0: + self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS) + + self._description_pattern = ( + "There should be {num_paragraphs} paragraphs. 
" + + "Paragraphs are separated with the markdown divider: ***" + ) + + return self._description_pattern.format(num_paragraphs=self._num_paragraphs) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_paragraphs": self._num_paragraphs} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_paragraphs"] + + def check_following(self, value): + """Checks the response contains required number of paragraphs. + + Args: + value: A string representing the response. The response may contain + paragraphs that are separated by the markdown divider: `***`. + + Returns: + True if the actual number of paragraphs is the same as required; + otherwise, False. + """ + paragraphs = re.split(r"\s?\*\*\*\s?", value) + num_paragraphs = len(paragraphs) + + for index, paragraph in enumerate(paragraphs): + if not paragraph.strip(): + if index == 0 or index == len(paragraphs) - 1: + num_paragraphs -= 1 + else: + return False + + return num_paragraphs == self._num_paragraphs + + +class PostscriptChecker(Instruction): + """Checks the postscript.""" + + def build_description(self, *, postscript_marker=None): + """Build the instruction description. + + Args: + postscript_marker: A string containing the keyword that marks the start + of the postscript section. + + Returns: + A string representing the instruction description. + """ + self._postscript_marker = ( + postscript_marker.strip() + if isinstance(postscript_marker, str) + else postscript_marker + ) + if self._postscript_marker is None: + self._postscript_marker = random.choice(_POSTSCRIPT_MARKER) + + self._description_pattern = ( + "At the end of your response, please explicitly add a postscript " + + "starting with {postscript}" + ) + + return self._description_pattern.format(postscript=self._postscript_marker) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"postscript_marker": self._postscript_marker} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["postscript_marker"] + + def check_following(self, value): + """Checks if the response follows the postscript format. + + Args: + value: a string representing the response. The response is expected to + contain a postscript section. + + Returns: + True if the response contains a postscript section starting with + the keyword containing in the `instruction_args`; otherwise False. + """ + value = value.lower() + if self._postscript_marker == "P.P.S": + postscript_pattern = r"\s*p\.\s?p\.\s?s.*$" + elif self._postscript_marker == "P.S.": + postscript_pattern = r"\s*p\.\s?s\..*$" + else: + postscript_pattern = r"\s*" + self._postscript_marker.lower() + r".*$" + postscript = re.findall(postscript_pattern, value, flags=re.MULTILINE) + return True if postscript else False + + +class RephraseChecker(Instruction): + """Checks the repharse.""" + + def build_description(self, *, original_message): + """Build the instruction description. + + Args: + original_message: A string representing the original message. The + rephrased response should only change its words/sentences in between + its two asterisks, for example, *change me*. Both original and rephrased + messages should contain the changes in the form of *change me*. + + Returns: + A string representing the instruction description. 
+ """ + if not self.is_change(original_message): + raise ValueError( + f"Message {original_message} does not contain changes " + "in the form of *change me*." + ) + + self._reference_without_change = original_message + self._description = ( + "Rephrasing: Your rephrased response should only" + + "change the words/sentences in between two asterisks" + + "such as *change me*." + ) + return self._description + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"original_message": self._reference_without_change} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["original_message"] + + def check_following(self, value): + r"""Checks if the rephrasing follows the instruction. + + Args: + value: A string representing the response, which is expected to rephras + the string of `instruction_args`. + + Returns: + True if `value` and `instruction_args` only differ by the words/sentences + in between two asterisks such as *change me*; otherwise, False. + """ + + if not self.is_change(value): + raise ValueError( + f"value {value} does not contain " "changes in the form of *change me*." + ) + + response_without_changes = self.strip_changes(value) + reference_without_changes = self.strip_changes(self._reference_without_change) + + return response_without_changes == reference_without_changes + + def is_change(self, response): + """Check if there is change in the response in the form of *change me*.""" + return re.search(r"\*.*\*", response) + + def strip_changes(self, response): + """Strips off the changes.""" + return re.sub(r"\*.*\*", "", response) + + +class KeywordChecker(Instruction): + """Check the exisitence of certain keywords.""" + + def build_description(self, *, keywords=None): + """Build the instruction description. + + Args: + keywords: A sequence of strings representing the keywords that are + expected in the response. + + Returns: + A string representing the instruction description. + """ + + if not keywords: + self._keywords = instructions_util.generate_keywords( + num_keywords=_NUM_KEYWORDS + ) + else: + self._keywords = keywords + self._keywords = sorted(self._keywords) + + self._description_pattern = "Include keywords {keywords} in the response." + + return self._description_pattern.format(keywords=self._keywords) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"keywords": self._keywords} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["keywords"] + + def check_following(self, value): + """Check if the response contain the expected keywords.""" + for keyword in self._keywords: + if not re.search(keyword, value, flags=re.IGNORECASE): + return False + return True + + +class KeywordFrequencyChecker(Instruction): + """Check the keyword frequency.""" + + def build_description(self, *, keyword=None, frequency=None, relation=None): + """Build the instruction description. + + Args: + keyword: A string representing a keyword that is expected in the response. + frequency: An integer specifying the number of times `keyword` is expected + to appear in the response. + relation: A string in (`less than`, `at least`), defining the relational + operator for comparison. + Two relational comparisons are supported for now: + if 'less than', the actual number of occurrences < frequency; + if 'at least', the actual number of occurrences >= frequency. 
+ + Returns: + A string representing the instruction description. + """ + if not keyword: + self._keyword = instructions_util.generate_keywords(num_keywords=1)[0] + else: + self._keyword = keyword.strip() + + self._frequency = frequency + if self._frequency is None or self._frequency < 0: + self._frequency = random.randint(1, _KEYWORD_FREQUENCY) + + if relation is None: + self._comparison_relation = random.choice(_COMPARISON_RELATION) + elif relation not in _COMPARISON_RELATION: + raise ValueError( + "The supported relation for comparison must be in " + f"{_COMPARISON_RELATION}, but {relation} is given." + ) + else: + self._comparison_relation = relation + + self._description_pattern = ( + "In your response, the word {keyword} should appear {relation} " + + "{frequency} times." + ) + + return self._description_pattern.format( + keyword=self._keyword, + relation=self._comparison_relation, + frequency=self._frequency, + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "keyword": self._keyword, + "frequency": self._frequency, + "relation": self._comparison_relation, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["keyword", "frequency", "relation"] + + def check_following(self, value): + """Checks if the response contain the keyword with required frequency.""" + actual_occurrences = len(re.findall(self._keyword, value, flags=re.IGNORECASE)) + + if self._comparison_relation == _COMPARISON_RELATION[0]: + return actual_occurrences < self._frequency + elif self._comparison_relation == _COMPARISON_RELATION[1]: + return actual_occurrences >= self._frequency + + +class NumberOfWords(Instruction): + """Checks the number of words.""" + + def build_description(self, *, num_words=None, relation=None): + """Build the instruction description. + + Args: + num_words: An integer specifying the number of words contained in the + response. + relation: A string in (`less than`, `at least`), defining the relational + operator for comparison. + Two relational comparisons are supported for now: + if 'less than', the actual number of words < num_words; + if 'at least', the actual number of words >= num_words. + + Returns: + A string representing the instruction description. + """ + + self._num_words = num_words + if self._num_words is None or self._num_words < 0: + self._num_words = random.randint( + _NUM_WORDS_LOWER_LIMIT, _NUM_WORDS_UPPER_LIMIT + ) + + if relation is None: + self._comparison_relation = random.choice(_COMPARISON_RELATION) + elif relation not in _COMPARISON_RELATION: + raise ValueError( + "The supported relation for comparison must be in " + f"{_COMPARISON_RELATION}, but {relation} is given." + ) + else: + self._comparison_relation = relation + + self._description_pattern = "Answer with {relation} {num_words} words." 
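+        # e.g. relation="at least", num_words=300 renders as "Answer with at least 300 words."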
+ + return self._description_pattern.format( + relation=self._comparison_relation, num_words=self._num_words + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"num_words": self._num_words, "relation": self._comparison_relation} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_words", "relation"] + + def check_following(self, value): + """Checks if the response contains the expected number of words.""" + num_words = instructions_util.count_words(value) + + if self._comparison_relation == _COMPARISON_RELATION[0]: + return num_words < self._num_words + elif self._comparison_relation == _COMPARISON_RELATION[1]: + return num_words >= self._num_words + + +class JsonFormat(Instruction): + """Check the Json format.""" + + def build_description(self): + self._description_pattern = ( + "Entire output should be wrapped in JSON format. You can use markdown" + " ticks such as ```." + ) + return self._description_pattern + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + value = ( + value.strip() + .removeprefix("```json") + .removeprefix("```Json") + .removeprefix("```JSON") + .removeprefix("```") + .removesuffix("```") + .strip() + ) + try: + json.loads(value) + except ValueError: + return False + return True + + +class ParagraphFirstWordCheck(Instruction): + """Check the paragraph and the first word of the nth paragraph.""" + + def build_description( + self, num_paragraphs=None, nth_paragraph=None, first_word=None + ): + r"""Build the instruction description. + + Args: + num_paragraphs: An integer indicating the number of paragraphs expected + in the response. A paragraph is a subset of the string that is + expected to be separated by '\n\n'. + nth_paragraph: An integer indicating the paragraph number that we look at. + Note that n starts from 1. + first_word: A string that represent the first word of the bth paragraph. + + Returns: + A string representing the instruction description. + """ + self._num_paragraphs = num_paragraphs + if self._num_paragraphs is None or self._num_paragraphs < 0: + self._num_paragraphs = random.randint(1, _NUM_PARAGRAPHS) + + self._nth_paragraph = nth_paragraph + if ( + self._nth_paragraph is None + or self._nth_paragraph <= 0 + or self._nth_paragraph > self._num_paragraphs + ): + self._nth_paragraph = random.randint(1, self._num_paragraphs + 1) + + self._first_word = first_word + if self._first_word is None: + self._first_word = instructions_util.generate_keywords(num_keywords=1)[0] + self._first_word = self._first_word.lower() + + self._description_pattern = ( + "There should be {num_paragraphs} paragraphs. " + + "Paragraphs and only paragraphs are separated with each other by two " + + "new lines as if it was '\\n\\n' in python. " + + "Paragraph {nth_paragraph} must start with word {first_word}." 
+ ) + + return self._description_pattern.format( + num_paragraphs=self._num_paragraphs, + nth_paragraph=self._nth_paragraph, + first_word=self._first_word, + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "num_paragraphs": self._num_paragraphs, + "nth_paragraph": self._nth_paragraph, + "first_word": self._first_word, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_paragraphs", "nth_paragraph", "first_word"] + + def check_following(self, value): + """Checks for required number of paragraphs and correct first word. + + Args: + value: a string representing the response. The response may contain + paragraphs that are separated by two new lines and the first word of + the nth paragraph will have to match a specified word. + + Returns: + True if the number of paragraphs is the same as required and the first + word of the specified paragraph is the same as required. Otherwise, false. + """ + + paragraphs = re.split(r"\n\n", value) + num_paragraphs = len(paragraphs) + + for paragraph in paragraphs: + if not paragraph.strip(): + num_paragraphs -= 1 + + # check that index doesn't go out of bounds + if self._nth_paragraph <= num_paragraphs: + paragraph = paragraphs[self._nth_paragraph - 1].strip() + if not paragraph: + return False + else: + return False + + first_word = "" + punctuation = {".", ",", "?", "!", "'", '"'} + + # get first word and remove punctuation + word = paragraph.split()[0].strip() + # TODO(jeffrey): make more complex? + word = word.lstrip("'") + word = word.lstrip('"') + + for letter in word: + if letter in punctuation: + break + first_word += letter.lower() + + return num_paragraphs == self._num_paragraphs and first_word == self._first_word + + +# TODO(jeffrey) add relation - at least/at most? +class KeySentenceChecker(Instruction): + """Check the existence of certain key sentences.""" + + def build_description(self, key_sentences=None, num_sentences=None): + """Build the instruction description. + + Args: + key_sentences: A sequences of strings representing the key sentences that + are expected in the response. + num_sentences: The number of key sentences that are expected to be seen in + the response. + + Returns: + A string representing the instruction description. + """ + + if not key_sentences: + # TODO(jeffrey) make a generate sentences function? 
wonderwords package + self._key_sentences = set(["For now, this is fine."]) + else: + self._key_sentences = key_sentences + + if not num_sentences: + self._num_sentences = random.randint(1, len(self._key_sentences)) + else: + self._num_sentences = num_sentences + + self._description_pattern = ( + "Include {num_sentences} of the following sentences {key_sentences}" + ) + + return self._description_pattern.format( + num_sentences=self._num_sentences, key_sentences=self._key_sentences + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "num_sentences": self._num_sentences, + "key_sentences": list(self._key_sentences), + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["num_sentences", "key_sentences"] + + def check_following(self, value): + """Checks if the response contains the expected key sentences.""" + count = 0 + sentences = instructions_util.split_into_sentences(value) + for sentence in self._key_sentences: + if sentence in sentences: + count += 1 + + return count == self._num_sentences + + +class ForbiddenWords(Instruction): + """Checks that specified words are not used in response.""" + + def build_description(self, forbidden_words=None): + """Build the instruction description. + + Args: + forbidden_words: A sequences of strings respresenting words that are not + allowed in the response. + + Returns: + A string representing the instruction description. + """ + + if not forbidden_words: + self._forbidden_words = instructions_util.generate_keywords( + num_keywords=_NUM_KEYWORDS + ) + else: + self._forbidden_words = list(set(forbidden_words)) + self._forbidden_words = sorted(self._forbidden_words) + self._description_pattern = ( + "Do not include keywords {forbidden_words} in the response." + ) + + return self._description_pattern.format(forbidden_words=self._forbidden_words) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return {"forbidden_words": self._forbidden_words} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["forbidden_words"] + + def check_following(self, value): + """Check if the response does not contain the expected keywords.""" + for word in self._forbidden_words: + if re.search(r"\b" + word + r"\b", value, flags=re.IGNORECASE): + return False + return True + + +class RephraseParagraph(Instruction): + """Checks that the paragraph is rephrased.""" + + def build_description(self, *, original_paragraph, low, high): + """Builds the instruction description. + + Args: + original_paragraph: A string presenting the original paragraph. The + rephrases response should have betweeb low-high words in common. + low: An integer presenting the lower bound of similar words. + high: An integer representing the upper bound of similar words. + + Returns: + A string representing the instruction description. + """ + # TODO(jeffrey) make more encompassing + self._original_paragraph = original_paragraph + self._low = low + self._high = high + + self._description = ( + "Rephrase the following paragraph: " + + "{original_paragraph}\nYour response should have " + + "between {low} and {high} of the same words. " + + "Words are the same if and only if all of the " + + "letters, ignoring cases, are the same. For " + + "example, 'run' is the same as 'Run' but different " + + "to 'ran'." 
+ ) + + return self._description.format( + original_paragraph=original_paragraph, low=self._low, high=self._high + ) + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return { + "original_paragraph": self._original_paragraph, + "low": self._low, + "high": self._high, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["original_paragraph", "low", "high"] + + def check_following(self, value): + val_words = re.findall(r"\w+", value.lower()) + original_words = re.findall(r"\w+", self._original_paragraph.lower()) + similar_words = 0 + + dict_val = collections.Counter(val_words) + dict_original = collections.Counter(original_words) + + for word in dict_original: + similar_words += min(dict_original[word], dict_val[word]) + + return similar_words >= self._low and similar_words <= self._high + + +class TwoResponsesChecker(Instruction): + """Check that two responses were given.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = ( + "Give two different responses. Responses and only responses should" + " be separated by 6 asterisk symbols: ******." + ) + return self._description_pattern + + def get_instruction_args(self): + """Returns the keyward args of `build_description`.""" + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks if the response has two different answers. + + Args: + value: A string representing the response. + + Returns: + True if two responses are detected and false otherwise. + """ + valid_responses = list() + responses = value.split("******") + for index, response in enumerate(responses): + if not response.strip(): + if index != 0 and index != len(responses) - 1: + return False + else: + valid_responses.append(response) + return ( + len(valid_responses) == 2 + and valid_responses[0].strip() != valid_responses[1].strip() + ) + + +class RepeatPromptThenAnswer(Instruction): + """Checks that Prompt is first repeated then answered.""" + + def build_description(self, *, prompt_to_repeat=None): + """Build the instruction description. + + Args: + prompt_to_repeat: The prompt that is meant to be repeated. + + Returns: + A string representing the instruction description. + """ + if not prompt_to_repeat: + raise ValueError("prompt_to_repeat must be set.") + else: + self._prompt_to_repeat = prompt_to_repeat + self._description_pattern = ( + "First repeat the request word for word without change," + " then give your answer (1. do not say any words or characters" + " before repeating the request; 2. the request you need to repeat" + " does not include this sentence)" + ) + return self._description_pattern + + def get_instruction_args(self): + return {"prompt_to_repeat": self._prompt_to_repeat} + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["prompt_to_repeat"] + + def check_following(self, value): + if value.strip().lower().startswith(self._prompt_to_repeat.strip().lower()): + return True + return False + + +class EndChecker(Instruction): + """Checks that the prompt ends with a given phrase.""" + + def build_description(self, *, end_phrase=None): + """Build the instruction description. + + Args: + end_phrase: A string representing the phrase the response should end with. + + Returns: + A string representing the instruction description. 
+        """
+        self._end_phrase = (
+            end_phrase.strip() if isinstance(end_phrase, str) else end_phrase
+        )
+        if self._end_phrase is None:
+            self._end_phrase = random.choice(_ENDING_OPTIONS)
+        self._description_pattern = (
+            "Finish your response with this exact phrase {ender}. "
+            "No other words should follow this phrase."
+        )
+        return self._description_pattern.format(ender=self._end_phrase)
+
+    def get_instruction_args(self):
+        return {"end_phrase": self._end_phrase}
+
+    def get_instruction_args_keys(self):
+        """Returns the args keys of `build_description`."""
+        return ["end_phrase"]
+
+    def check_following(self, value):
+        """Checks if the response ends with the expected phrase."""
+        value = value.strip().strip('"').lower()
+        self._end_phrase = self._end_phrase.strip().lower()
+        return value.endswith(self._end_phrase)
+
+
+class TitleChecker(Instruction):
+    """Checks the response for a title."""
+
+    def build_description(self):
+        """Build the instruction description."""
+        self._description_pattern = (
+            "Your answer must contain a title, wrapped in double angular brackets,"
+            " such as <<poem of joy>>."
+        )
+        return self._description_pattern
+
+    def get_instruction_args(self):
+        return None
+
+    def get_instruction_args_keys(self):
+        """Returns the args keys of `build_description`."""
+        return []
+
+    def check_following(self, value):
+        """Checks if the response contains a title."""
+        pattern = r"<<[^\n]+>>"
+        re_pattern = re.compile(pattern)
+        titles = re.findall(re_pattern, value)
+
+        for title in titles:
+            if title.lstrip("<").rstrip(">").strip():
+                return True
+        return False
+
+
+class LetterFrequencyChecker(Instruction):
+    """Checks letter frequency."""
+
+    def build_description(self, *, letter=None, let_frequency=None, let_relation=None):
+        """Build the instruction description.
+
+        Args:
+          letter: A string representing a letter that is expected in the response.
+          let_frequency: An integer specifying the number of times `letter` is
+            expected to appear in the response.
+          let_relation: A string in (`less than`, `at least`), defining the
+            relational operator for comparison. Two relational comparisons are
+            supported for now; if 'less than', the actual number of
+            occurrences < frequency; if 'at least', the actual number of
+            occurrences >= frequency.
+
+        Returns:
+          A string representing the instruction description.
+        """
+        if (
+            not letter
+            or len(letter) > 1
+            or ord(letter.lower()) < 97
+            or ord(letter.lower()) > 122
+        ):
+            self._letter = random.choice(list(string.ascii_letters))
+        else:
+            self._letter = letter.strip()
+        self._letter = self._letter.lower()
+
+        self._frequency = let_frequency
+        if self._frequency is None or self._frequency < 0:
+            self._frequency = random.randint(1, _LETTER_FREQUENCY)
+
+        if let_relation is None:
+            self._comparison_relation = random.choice(_COMPARISON_RELATION)
+        elif let_relation not in _COMPARISON_RELATION:
+            raise ValueError(
+                "The supported relation for comparison must be in "
+                f"{_COMPARISON_RELATION}, but {let_relation} is given."
+            )
+        else:
+            self._comparison_relation = let_relation
+
+        self._description_pattern = (
+            "In your response, the letter {letter} should appear {let_relation}"
+            " {let_frequency} times."
+ ) + + return self._description_pattern.format( + letter=self._letter, + let_frequency=self._frequency, + let_relation=self._comparison_relation, + ) + + def get_instruction_args(self): + """Returns the keyword args of build description.""" + return { + "letter": self._letter, + "let_frequency": self._frequency, + "let_relation": self._comparison_relation, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["letter", "let_frequency", "let_relation"] + + def check_following(self, value): + """Checks that the response contains the letter at the right frequency.""" + value = value.lower() + letters = collections.Counter(value) + + if self._comparison_relation == _COMPARISON_RELATION[0]: + return letters[self._letter] < self._frequency + else: + return letters[self._letter] >= self._frequency + + +class CapitalLettersEnglishChecker(Instruction): + """Checks that the response is in english and is in all capital letters.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = ( + "Your entire response should be in English, and in all capital letters." + ) + return self._description_pattern + + def get_instruction_args(self): + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks that the response is in English and in all capital letters.""" + assert isinstance(value, str) + + try: + return value.isupper() and langdetect.detect(value) == "en" + except langdetect.LangDetectException as e: + # Count as instruction is followed. + logging.error( + "Unable to detect language for text %s due to %s", value, e + ) # refex: disable=pytotw.037 + return True + + +class LowercaseLettersEnglishChecker(Instruction): + """Checks that the response is in english and is in all lowercase letters.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = ( + "Your entire response should be in English, and in all lowercase" + " letters. No capital letters are allowed." + ) + return self._description_pattern + + def get_instruction_args(self): + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks that the response is in English and in all lowercase letters.""" + assert isinstance(value, str) + + try: + return value.islower() and langdetect.detect(value) == "en" + except langdetect.LangDetectException as e: + # Count as instruction is followed. + logging.error( + "Unable to detect language for text %s due to %s", value, e + ) # refex: disable=pytotw.037 + return True + + +class CommaChecker(Instruction): + """Checks the response for no commas.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = ( + "In your entire response, refrain from the use of any commas." 
+ ) + return self._description_pattern + + def get_instruction_args(self): + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks that the response does not contain commas.""" + return not re.search(r"\,", value) + + +class CapitalWordFrequencyChecker(Instruction): + """Checks frequency of words with all capital letters.""" + + def build_description( + self, + capital_frequency=None, + capital_relation=None, + ): + """Build the instruction description. + + Args: + capital_frequency: An integer that represents the number of words that + should be in all capital letters. + capital_relation: A string that is 'at least' or 'at most' that refers to + the frequency. + + Returns: + A string representing the instruction description. + """ + self._frequency = capital_frequency + if self._frequency is None: + self._frequency = random.randint(1, _ALL_CAPITAL_WORD_FREQUENCY) + + self._comparison_relation = capital_relation + if capital_relation is None: + self._comparison_relation = random.choice(_COMPARISON_RELATION) + elif capital_relation not in _COMPARISON_RELATION: + raise ValueError( + "The supported relation for comparison must be in " + f"{_COMPARISON_RELATION}, but {capital_relation} is given." + ) + + self._description_pattern = ( + "In your response, words with all capital letters should appear" + " {relation} {frequency} times." + ) + + return self._description_pattern.format( + frequency=self._frequency, relation=self._comparison_relation + ) + + def get_instruction_args(self): + """Returns the keyword args of build description.""" + return { + "capital_frequency": self._frequency, + "capital_relation": self._comparison_relation, + } + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return ["capital_frequency", "capital_relation"] + + def check_following(self, value): + """Checks the frequency of words with all capital letters.""" + # Hyphenated words will count as one word + words = instructions_util.nltk.word_tokenize(value) + capital_words = [word for word in words if word.isupper()] + + capital_words = len(capital_words) + + if self._comparison_relation == _COMPARISON_RELATION[0]: + return capital_words < self._frequency + else: + return capital_words >= self._frequency + + +class QuotationChecker(Instruction): + """Checks response is wrapped with double quotation marks.""" + + def build_description(self): + """Build the instruction description.""" + self._description_pattern = ( + "Wrap your entire response with double quotation marks." + ) + return self._description_pattern + + def get_instruction_args(self): + """Returns the keyword args of build description.""" + return None + + def get_instruction_args_keys(self): + """Returns the args keys of `build_description`.""" + return [] + + def check_following(self, value): + """Checks if the response is wrapped with double quotation marks.""" + value = value.strip() + return len(value) > 1 and value[0] == '"' and value[-1] == '"' diff --git a/lm_eval/tasks/leaderboard/ifeval/instructions_registry.py b/lm_eval/tasks/leaderboard/ifeval/instructions_registry.py new file mode 100644 index 00000000..00d9a1de --- /dev/null +++ b/lm_eval/tasks/leaderboard/ifeval/instructions_registry.py @@ -0,0 +1,168 @@ +# Copyright 2023 The Google Research Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Registry of all instructions.""" + +from lm_eval.tasks.ifeval import instructions + + +_KEYWORD = "keywords:" + +_LANGUAGE = "language:" + +_LENGTH = "length_constraints:" + +_CONTENT = "detectable_content:" + +_FORMAT = "detectable_format:" + +_MULTITURN = "multi-turn:" + +_COMBINATION = "combination:" + +_STARTEND = "startend:" + +_CHANGE_CASES = "change_case:" + +_PUNCTUATION = "punctuation:" + +INSTRUCTION_DICT = { + _KEYWORD + "existence": instructions.KeywordChecker, + _KEYWORD + "frequency": instructions.KeywordFrequencyChecker, + # TODO(jeffreyzhou): make a proper set of sentences to choose from + # _KEYWORD + "key_sentences": instructions.KeySentenceChecker, + _KEYWORD + "forbidden_words": instructions.ForbiddenWords, + _KEYWORD + "letter_frequency": instructions.LetterFrequencyChecker, + _LANGUAGE + "response_language": instructions.ResponseLanguageChecker, + _LENGTH + "number_sentences": instructions.NumberOfSentences, + _LENGTH + "number_paragraphs": instructions.ParagraphChecker, + _LENGTH + "number_words": instructions.NumberOfWords, + _LENGTH + "nth_paragraph_first_word": instructions.ParagraphFirstWordCheck, + _CONTENT + "number_placeholders": instructions.PlaceholderChecker, + _CONTENT + "postscript": instructions.PostscriptChecker, + _FORMAT + "number_bullet_lists": instructions.BulletListChecker, + # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace + # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph, + _FORMAT + "constrained_response": instructions.ConstrainedResponseChecker, + _FORMAT + "number_highlighted_sections": (instructions.HighlightSectionChecker), + _FORMAT + "multiple_sections": instructions.SectionChecker, + # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message. + # _FORMAT + "rephrase": instructions.RephraseChecker, + _FORMAT + "json_format": instructions.JsonFormat, + _FORMAT + "title": instructions.TitleChecker, + # TODO(tianjianlu): Re-enable with specific prompts. 
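+    # NOTE: keys are the instruction ids (e.g. "keywords:existence") stored in the
+    # dataset's instruction_id_list field; utils.py resolves each id to its checker
+    # class through this dict.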
+ # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker, + _COMBINATION + "two_responses": instructions.TwoResponsesChecker, + _COMBINATION + "repeat_prompt": instructions.RepeatPromptThenAnswer, + _STARTEND + "end_checker": instructions.EndChecker, + _CHANGE_CASES + "capital_word_frequency": instructions.CapitalWordFrequencyChecker, + _CHANGE_CASES + "english_capital": instructions.CapitalLettersEnglishChecker, + _CHANGE_CASES + "english_lowercase": instructions.LowercaseLettersEnglishChecker, + _PUNCTUATION + "no_comma": instructions.CommaChecker, + _STARTEND + "quotation": instructions.QuotationChecker, +} + +INSTRUCTION_CONFLICTS = { + _KEYWORD + "existence": {_KEYWORD + "existence"}, + _KEYWORD + "frequency": {_KEYWORD + "frequency"}, + # TODO(jeffreyzhou): make a proper set of sentences to choose from + # _KEYWORD + "key_sentences": instructions.KeySentenceChecker, + _KEYWORD + "forbidden_words": {_KEYWORD + "forbidden_words"}, + _KEYWORD + "letter_frequency": {_KEYWORD + "letter_frequency"}, + _LANGUAGE + "response_language": { + _LANGUAGE + "response_language", + _FORMAT + "multiple_sections", + _KEYWORD + "existence", + _KEYWORD + "frequency", + _KEYWORD + "forbidden_words", + _STARTEND + "end_checker", + _CHANGE_CASES + "english_capital", + _CHANGE_CASES + "english_lowercase", + }, + _LENGTH + "number_sentences": {_LENGTH + "number_sentences"}, + _LENGTH + "number_paragraphs": { + _LENGTH + "number_paragraphs", + _LENGTH + "nth_paragraph_first_word", + _LENGTH + "number_sentences", + _LENGTH + "nth_paragraph_first_word", + }, + _LENGTH + "number_words": {_LENGTH + "number_words"}, + _LENGTH + "nth_paragraph_first_word": { + _LENGTH + "nth_paragraph_first_word", + _LENGTH + "number_paragraphs", + }, + _CONTENT + "number_placeholders": {_CONTENT + "number_placeholders"}, + _CONTENT + "postscript": {_CONTENT + "postscript"}, + _FORMAT + "number_bullet_lists": {_FORMAT + "number_bullet_lists"}, + # TODO(jeffreyzhou): Pre-create paragraph or use prompt to replace + # _CONTENT + "rephrase_paragraph": instructions.RephraseParagraph, + _FORMAT + "constrained_response": set(INSTRUCTION_DICT.keys()), + _FORMAT + "number_highlighted_sections": {_FORMAT + "number_highlighted_sections"}, + _FORMAT + "multiple_sections": { + _FORMAT + "multiple_sections", + _LANGUAGE + "response_language", + _FORMAT + "number_highlighted_sections", + }, + # TODO(tianjianlu): Re-enable rephrasing with preprocessing the message. + # _FORMAT + "rephrase": instructions.RephraseChecker, + _FORMAT + "json_format": set(INSTRUCTION_DICT.keys()).difference( + {_KEYWORD + "forbidden_words", _KEYWORD + "existence"} + ), + _FORMAT + "title": {_FORMAT + "title"}, + # TODO(tianjianlu): Re-enable with specific prompts. 
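+    # NOTE: each value is the set of instruction ids that should not be combined
+    # with the key in a single prompt; conflict_make() below makes these relations
+    # symmetric and self-inclusive.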
+ # _MULTITURN + "constrained_start": instructions.ConstrainedStartChecker, + _COMBINATION + "two_responses": set(INSTRUCTION_DICT.keys()).difference( + { + _KEYWORD + "forbidden_words", + _KEYWORD + "existence", + _LANGUAGE + "response_language", + _FORMAT + "title", + _PUNCTUATION + "no_comma", + } + ), + _COMBINATION + "repeat_prompt": set(INSTRUCTION_DICT.keys()).difference( + {_KEYWORD + "existence", _FORMAT + "title", _PUNCTUATION + "no_comma"} + ), + _STARTEND + "end_checker": {_STARTEND + "end_checker"}, + _CHANGE_CASES + "capital_word_frequency": { + _CHANGE_CASES + "capital_word_frequency", + _CHANGE_CASES + "english_lowercase", + _CHANGE_CASES + "english_capital", + }, + _CHANGE_CASES + "english_capital": {_CHANGE_CASES + "english_capital"}, + _CHANGE_CASES + "english_lowercase": { + _CHANGE_CASES + "english_lowercase", + _CHANGE_CASES + "english_capital", + }, + _PUNCTUATION + "no_comma": {_PUNCTUATION + "no_comma"}, + _STARTEND + "quotation": {_STARTEND + "quotation", _FORMAT + "title"}, +} + + +def conflict_make(conflicts): + """Makes sure if A conflicts with B, B will conflict with A. + + Args: + conflicts: Dictionary of potential conflicts where key is instruction id + and value is set of instruction ids that it conflicts with. + + Returns: + Revised version of the dictionary. All instructions conflict with + themselves. If A conflicts with B, B will conflict with A. + """ + for key in conflicts: + for k in conflicts[key]: + conflicts[k].add(key) + conflicts[key].add(key) + return conflicts diff --git a/lm_eval/tasks/leaderboard/ifeval/instructions_util.py b/lm_eval/tasks/leaderboard/ifeval/instructions_util.py new file mode 100644 index 00000000..ccb531f9 --- /dev/null +++ b/lm_eval/tasks/leaderboard/ifeval/instructions_util.py @@ -0,0 +1,1682 @@ +# Copyright 2023 The Google Research Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Utility library of instructions.""" + +import functools +import random +import re + +import immutabledict +import nltk + + +def download_nltk_resources(): + """Download 'punkt' if not already installed""" + try: + nltk.data.find("tokenizers/punkt") + except LookupError: + nltk.download("punkt") + + +download_nltk_resources() + +WORD_LIST = [ + "western", + "sentence", + "signal", + "dump", + "spot", + "opposite", + "bottom", + "potato", + "administration", + "working", + "welcome", + "morning", + "good", + "agency", + "primary", + "wish", + "responsibility", + "press", + "problem", + "president", + "steal", + "brush", + "read", + "type", + "beat", + "trainer", + "growth", + "lock", + "bone", + "case", + "equal", + "comfortable", + "region", + "replacement", + "performance", + "mate", + "walk", + "medicine", + "film", + "thing", + "rock", + "tap", + "total", + "competition", + "ease", + "south", + "establishment", + "gather", + "parking", + "world", + "plenty", + "breath", + "claim", + "alcohol", + "trade", + "dear", + "highlight", + "street", + "matter", + "decision", + "mess", + "agreement", + "studio", + "coach", + "assist", + "brain", + "wing", + "style", + "private", + "top", + "brown", + "leg", + "buy", + "procedure", + "method", + "speed", + "high", + "company", + "valuable", + "pie", + "analyst", + "session", + "pattern", + "district", + "pleasure", + "dinner", + "swimming", + "joke", + "order", + "plate", + "department", + "motor", + "cell", + "spend", + "cabinet", + "difference", + "power", + "examination", + "engine", + "horse", + "dimension", + "pay", + "toe", + "curve", + "literature", + "bother", + "fire", + "possibility", + "debate", + "activity", + "passage", + "hello", + "cycle", + "background", + "quiet", + "author", + "effect", + "actor", + "page", + "bicycle", + "error", + "throat", + "attack", + "character", + "phone", + "tea", + "increase", + "outcome", + "file", + "specific", + "inspector", + "internal", + "potential", + "staff", + "building", + "employer", + "shoe", + "hand", + "direction", + "garden", + "purchase", + "interview", + "study", + "recognition", + "member", + "spiritual", + "oven", + "sandwich", + "weird", + "passenger", + "particular", + "response", + "reaction", + "size", + "variation", + "a", + "cancel", + "candy", + "exit", + "guest", + "condition", + "fly", + "price", + "weakness", + "convert", + "hotel", + "great", + "mouth", + "mind", + "song", + "sugar", + "suspect", + "telephone", + "ear", + "roof", + "paint", + "refrigerator", + "organization", + "jury", + "reward", + "engineering", + "day", + "possession", + "crew", + "bar", + "road", + "description", + "celebration", + "score", + "mark", + "letter", + "shower", + "suggestion", + "sir", + "luck", + "national", + "progress", + "hall", + "stroke", + "theory", + "offer", + "story", + "tax", + "definition", + "history", + "ride", + "medium", + "opening", + "glass", + "elevator", + "stomach", + "question", + "ability", + "leading", + "village", + "computer", + "city", + "grand", + "confidence", + "candle", + "priest", + "recommendation", + "point", + "necessary", + "body", + "desk", + "secret", + "horror", + "noise", + "culture", + "warning", + "water", + "round", + "diet", + "flower", + "bus", + "tough", + "permission", + "week", + "prompt", + "connection", + "abuse", + "height", + "save", + "corner", + "border", + "stress", + "drive", + "stop", + "rip", + "meal", + "listen", + "confusion", + "girlfriend", + "living", + "relation", + "significance", + "plan", + "creative", + "atmosphere", + 
"blame", + "invite", + "housing", + "paper", + "drink", + "roll", + "silver", + "drunk", + "age", + "damage", + "smoke", + "environment", + "pack", + "savings", + "influence", + "tourist", + "rain", + "post", + "sign", + "grandmother", + "run", + "profit", + "push", + "clerk", + "final", + "wine", + "swim", + "pause", + "stuff", + "singer", + "funeral", + "average", + "source", + "scene", + "tradition", + "personal", + "snow", + "nobody", + "distance", + "sort", + "sensitive", + "animal", + "major", + "negotiation", + "click", + "mood", + "period", + "arrival", + "expression", + "holiday", + "repeat", + "dust", + "closet", + "gold", + "bad", + "sail", + "combination", + "clothes", + "emphasis", + "duty", + "black", + "step", + "school", + "jump", + "document", + "professional", + "lip", + "chemical", + "front", + "wake", + "while", + "inside", + "watch", + "row", + "subject", + "penalty", + "balance", + "possible", + "adult", + "aside", + "sample", + "appeal", + "wedding", + "depth", + "king", + "award", + "wife", + "blow", + "site", + "camp", + "music", + "safe", + "gift", + "fault", + "guess", + "act", + "shame", + "drama", + "capital", + "exam", + "stupid", + "record", + "sound", + "swing", + "novel", + "minimum", + "ratio", + "machine", + "shape", + "lead", + "operation", + "salary", + "cloud", + "affair", + "hit", + "chapter", + "stage", + "quantity", + "access", + "army", + "chain", + "traffic", + "kick", + "analysis", + "airport", + "time", + "vacation", + "philosophy", + "ball", + "chest", + "thanks", + "place", + "mountain", + "advertising", + "red", + "past", + "rent", + "return", + "tour", + "house", + "construction", + "net", + "native", + "war", + "figure", + "fee", + "spray", + "user", + "dirt", + "shot", + "task", + "stick", + "friend", + "software", + "promotion", + "interaction", + "surround", + "block", + "purpose", + "practice", + "conflict", + "routine", + "requirement", + "bonus", + "hole", + "state", + "junior", + "sweet", + "catch", + "tear", + "fold", + "wall", + "editor", + "life", + "position", + "pound", + "respect", + "bathroom", + "coat", + "script", + "job", + "teach", + "birth", + "view", + "resolve", + "theme", + "employee", + "doubt", + "market", + "education", + "serve", + "recover", + "tone", + "harm", + "miss", + "union", + "understanding", + "cow", + "river", + "association", + "concept", + "training", + "recipe", + "relationship", + "reserve", + "depression", + "proof", + "hair", + "revenue", + "independent", + "lift", + "assignment", + "temporary", + "amount", + "loss", + "edge", + "track", + "check", + "rope", + "estimate", + "pollution", + "stable", + "message", + "delivery", + "perspective", + "mirror", + "assistant", + "representative", + "witness", + "nature", + "judge", + "fruit", + "tip", + "devil", + "town", + "emergency", + "upper", + "drop", + "stay", + "human", + "neck", + "speaker", + "network", + "sing", + "resist", + "league", + "trip", + "signature", + "lawyer", + "importance", + "gas", + "choice", + "engineer", + "success", + "part", + "external", + "worker", + "simple", + "quarter", + "student", + "heart", + "pass", + "spite", + "shift", + "rough", + "lady", + "grass", + "community", + "garage", + "youth", + "standard", + "skirt", + "promise", + "blind", + "television", + "disease", + "commission", + "positive", + "energy", + "calm", + "presence", + "tune", + "basis", + "preference", + "head", + "common", + "cut", + "somewhere", + "presentation", + "current", + "thought", + "revolution", + "effort", + "master", + "implement", + 
"republic", + "floor", + "principle", + "stranger", + "shoulder", + "grade", + "button", + "tennis", + "police", + "collection", + "account", + "register", + "glove", + "divide", + "professor", + "chair", + "priority", + "combine", + "peace", + "extension", + "maybe", + "evening", + "frame", + "sister", + "wave", + "code", + "application", + "mouse", + "match", + "counter", + "bottle", + "half", + "cheek", + "resolution", + "back", + "knowledge", + "make", + "discussion", + "screw", + "length", + "accident", + "battle", + "dress", + "knee", + "log", + "package", + "it", + "turn", + "hearing", + "newspaper", + "layer", + "wealth", + "profile", + "imagination", + "answer", + "weekend", + "teacher", + "appearance", + "meet", + "bike", + "rise", + "belt", + "crash", + "bowl", + "equivalent", + "support", + "image", + "poem", + "risk", + "excitement", + "remote", + "secretary", + "public", + "produce", + "plane", + "display", + "money", + "sand", + "situation", + "punch", + "customer", + "title", + "shake", + "mortgage", + "option", + "number", + "pop", + "window", + "extent", + "nothing", + "experience", + "opinion", + "departure", + "dance", + "indication", + "boy", + "material", + "band", + "leader", + "sun", + "beautiful", + "muscle", + "farmer", + "variety", + "fat", + "handle", + "director", + "opportunity", + "calendar", + "outside", + "pace", + "bath", + "fish", + "consequence", + "put", + "owner", + "go", + "doctor", + "information", + "share", + "hurt", + "protection", + "career", + "finance", + "force", + "golf", + "garbage", + "aspect", + "kid", + "food", + "boot", + "milk", + "respond", + "objective", + "reality", + "raw", + "ring", + "mall", + "one", + "impact", + "area", + "news", + "international", + "series", + "impress", + "mother", + "shelter", + "strike", + "loan", + "month", + "seat", + "anything", + "entertainment", + "familiar", + "clue", + "year", + "glad", + "supermarket", + "natural", + "god", + "cost", + "conversation", + "tie", + "ruin", + "comfort", + "earth", + "storm", + "percentage", + "assistance", + "budget", + "strength", + "beginning", + "sleep", + "other", + "young", + "unit", + "fill", + "store", + "desire", + "hide", + "value", + "cup", + "maintenance", + "nurse", + "function", + "tower", + "role", + "class", + "camera", + "database", + "panic", + "nation", + "basket", + "ice", + "art", + "spirit", + "chart", + "exchange", + "feedback", + "statement", + "reputation", + "search", + "hunt", + "exercise", + "nasty", + "notice", + "male", + "yard", + "annual", + "collar", + "date", + "platform", + "plant", + "fortune", + "passion", + "friendship", + "spread", + "cancer", + "ticket", + "attitude", + "island", + "active", + "object", + "service", + "buyer", + "bite", + "card", + "face", + "steak", + "proposal", + "patient", + "heat", + "rule", + "resident", + "broad", + "politics", + "west", + "knife", + "expert", + "girl", + "design", + "salt", + "baseball", + "grab", + "inspection", + "cousin", + "couple", + "magazine", + "cook", + "dependent", + "security", + "chicken", + "version", + "currency", + "ladder", + "scheme", + "kitchen", + "employment", + "local", + "attention", + "manager", + "fact", + "cover", + "sad", + "guard", + "relative", + "county", + "rate", + "lunch", + "program", + "initiative", + "gear", + "bridge", + "breast", + "talk", + "dish", + "guarantee", + "beer", + "vehicle", + "reception", + "woman", + "substance", + "copy", + "lecture", + "advantage", + "park", + "cold", + "death", + "mix", + "hold", + "scale", + "tomorrow", + "blood", + 
"request", + "green", + "cookie", + "church", + "strip", + "forever", + "beyond", + "debt", + "tackle", + "wash", + "following", + "feel", + "maximum", + "sector", + "sea", + "property", + "economics", + "menu", + "bench", + "try", + "language", + "start", + "call", + "solid", + "address", + "income", + "foot", + "senior", + "honey", + "few", + "mixture", + "cash", + "grocery", + "link", + "map", + "form", + "factor", + "pot", + "model", + "writer", + "farm", + "winter", + "skill", + "anywhere", + "birthday", + "policy", + "release", + "husband", + "lab", + "hurry", + "mail", + "equipment", + "sink", + "pair", + "driver", + "consideration", + "leather", + "skin", + "blue", + "boat", + "sale", + "brick", + "two", + "feed", + "square", + "dot", + "rush", + "dream", + "location", + "afternoon", + "manufacturer", + "control", + "occasion", + "trouble", + "introduction", + "advice", + "bet", + "eat", + "kill", + "category", + "manner", + "office", + "estate", + "pride", + "awareness", + "slip", + "crack", + "client", + "nail", + "shoot", + "membership", + "soft", + "anybody", + "web", + "official", + "individual", + "pizza", + "interest", + "bag", + "spell", + "profession", + "queen", + "deal", + "resource", + "ship", + "guy", + "chocolate", + "joint", + "formal", + "upstairs", + "car", + "resort", + "abroad", + "dealer", + "associate", + "finger", + "surgery", + "comment", + "team", + "detail", + "crazy", + "path", + "tale", + "initial", + "arm", + "radio", + "demand", + "single", + "draw", + "yellow", + "contest", + "piece", + "quote", + "pull", + "commercial", + "shirt", + "contribution", + "cream", + "channel", + "suit", + "discipline", + "instruction", + "concert", + "speech", + "low", + "effective", + "hang", + "scratch", + "industry", + "breakfast", + "lay", + "join", + "metal", + "bedroom", + "minute", + "product", + "rest", + "temperature", + "many", + "give", + "argument", + "print", + "purple", + "laugh", + "health", + "credit", + "investment", + "sell", + "setting", + "lesson", + "egg", + "middle", + "marriage", + "level", + "evidence", + "phrase", + "love", + "self", + "benefit", + "guidance", + "affect", + "you", + "dad", + "anxiety", + "special", + "boyfriend", + "test", + "blank", + "payment", + "soup", + "obligation", + "reply", + "smile", + "deep", + "complaint", + "addition", + "review", + "box", + "towel", + "minor", + "fun", + "soil", + "issue", + "cigarette", + "internet", + "gain", + "tell", + "entry", + "spare", + "incident", + "family", + "refuse", + "branch", + "can", + "pen", + "grandfather", + "constant", + "tank", + "uncle", + "climate", + "ground", + "volume", + "communication", + "kind", + "poet", + "child", + "screen", + "mine", + "quit", + "gene", + "lack", + "charity", + "memory", + "tooth", + "fear", + "mention", + "marketing", + "reveal", + "reason", + "court", + "season", + "freedom", + "land", + "sport", + "audience", + "classroom", + "law", + "hook", + "win", + "carry", + "eye", + "smell", + "distribution", + "research", + "country", + "dare", + "hope", + "whereas", + "stretch", + "library", + "if", + "delay", + "college", + "plastic", + "book", + "present", + "use", + "worry", + "champion", + "goal", + "economy", + "march", + "election", + "reflection", + "midnight", + "slide", + "inflation", + "action", + "challenge", + "guitar", + "coast", + "apple", + "campaign", + "field", + "jacket", + "sense", + "way", + "visual", + "remove", + "weather", + "trash", + "cable", + "regret", + "buddy", + "beach", + "historian", + "courage", + "sympathy", + "truck", + 
"tension", + "permit", + "nose", + "bed", + "son", + "person", + "base", + "meat", + "usual", + "air", + "meeting", + "worth", + "game", + "independence", + "physical", + "brief", + "play", + "raise", + "board", + "she", + "key", + "writing", + "pick", + "command", + "party", + "yesterday", + "spring", + "candidate", + "physics", + "university", + "concern", + "development", + "change", + "string", + "target", + "instance", + "room", + "bitter", + "bird", + "football", + "normal", + "split", + "impression", + "wood", + "long", + "meaning", + "stock", + "cap", + "leadership", + "media", + "ambition", + "fishing", + "essay", + "salad", + "repair", + "today", + "designer", + "night", + "bank", + "drawing", + "inevitable", + "phase", + "vast", + "chip", + "anger", + "switch", + "cry", + "twist", + "personality", + "attempt", + "storage", + "being", + "preparation", + "bat", + "selection", + "white", + "technology", + "contract", + "side", + "section", + "station", + "till", + "structure", + "tongue", + "taste", + "truth", + "difficulty", + "group", + "limit", + "main", + "move", + "feeling", + "light", + "example", + "mission", + "might", + "wait", + "wheel", + "shop", + "host", + "classic", + "alternative", + "cause", + "agent", + "consist", + "table", + "airline", + "text", + "pool", + "craft", + "range", + "fuel", + "tool", + "partner", + "load", + "entrance", + "deposit", + "hate", + "article", + "video", + "summer", + "feature", + "extreme", + "mobile", + "hospital", + "flight", + "fall", + "pension", + "piano", + "fail", + "result", + "rub", + "gap", + "system", + "report", + "suck", + "ordinary", + "wind", + "nerve", + "ask", + "shine", + "note", + "line", + "mom", + "perception", + "brother", + "reference", + "bend", + "charge", + "treat", + "trick", + "term", + "homework", + "bake", + "bid", + "status", + "project", + "strategy", + "orange", + "let", + "enthusiasm", + "parent", + "concentrate", + "device", + "travel", + "poetry", + "business", + "society", + "kiss", + "end", + "vegetable", + "employ", + "schedule", + "hour", + "brave", + "focus", + "process", + "movie", + "illegal", + "general", + "coffee", + "ad", + "highway", + "chemistry", + "psychology", + "hire", + "bell", + "conference", + "relief", + "show", + "neat", + "funny", + "weight", + "quality", + "club", + "daughter", + "zone", + "touch", + "tonight", + "shock", + "burn", + "excuse", + "name", + "survey", + "landscape", + "advance", + "satisfaction", + "bread", + "disaster", + "item", + "hat", + "prior", + "shopping", + "visit", + "east", + "photo", + "home", + "idea", + "father", + "comparison", + "cat", + "pipe", + "winner", + "count", + "lake", + "fight", + "prize", + "foundation", + "dog", + "keep", + "ideal", + "fan", + "struggle", + "peak", + "safety", + "solution", + "hell", + "conclusion", + "population", + "strain", + "alarm", + "measurement", + "second", + "train", + "race", + "due", + "insurance", + "boss", + "tree", + "monitor", + "sick", + "course", + "drag", + "appointment", + "slice", + "still", + "care", + "patience", + "rich", + "escape", + "emotion", + "royal", + "female", + "childhood", + "government", + "picture", + "will", + "sock", + "big", + "gate", + "oil", + "cross", + "pin", + "improvement", + "championship", + "silly", + "help", + "sky", + "pitch", + "man", + "diamond", + "most", + "transition", + "work", + "science", + "committee", + "moment", + "fix", + "teaching", + "dig", + "specialist", + "complex", + "guide", + "people", + "dead", + "voice", + "original", + "break", + "topic", + "data", 
+    "degree",
+    "reading",
+    "recording",
+    "bunch",
+    "reach",
+    "judgment",
+    "lie",
+    "regular",
+    "set",
+    "painting",
+    "mode",
+    "list",
+    "player",
+    "bear",
+    "north",
+    "wonder",
+    "carpet",
+    "heavy",
+    "officer",
+    "negative",
+    "clock",
+    "unique",
+    "baby",
+    "pain",
+    "assumption",
+    "disk",
+    "iron",
+    "bill",
+    "drawer",
+    "look",
+    "double",
+    "mistake",
+    "finish",
+    "future",
+    "brilliant",
+    "contact",
+    "math",
+    "rice",
+    "leave",
+    "restaurant",
+    "discount",
+    "sex",
+    "virus",
+    "bit",
+    "trust",
+    "event",
+    "wear",
+    "juice",
+    "failure",
+    "bug",
+    "context",
+    "mud",
+    "whole",
+    "wrap",
+    "intention",
+    "draft",
+    "pressure",
+    "cake",
+    "dark",
+    "explanation",
+    "space",
+    "angle",
+    "word",
+    "efficiency",
+    "management",
+    "habit",
+    "star",
+    "chance",
+    "finding",
+    "transportation",
+    "stand",
+    "criticism",
+    "flow",
+    "door",
+    "injury",
+    "insect",
+    "surprise",
+    "apartment",
+] # pylint: disable=line-too-long
+
+# ISO 639-1 codes to language names.
+LANGUAGE_CODES = immutabledict.immutabledict(
+    {
+        "en": "English",
+        "es": "Spanish",
+        "pt": "Portuguese",
+        "ar": "Arabic",
+        "hi": "Hindi",
+        "fr": "French",
+        "ru": "Russian",
+        "de": "German",
+        "ja": "Japanese",
+        "it": "Italian",
+        "bn": "Bengali",
+        "uk": "Ukrainian",
+        "th": "Thai",
+        "ur": "Urdu",
+        "ta": "Tamil",
+        "te": "Telugu",
+        "bg": "Bulgarian",
+        "ko": "Korean",
+        "pl": "Polish",
+        "he": "Hebrew",
+        "fa": "Persian",
+        "vi": "Vietnamese",
+        "ne": "Nepali",
+        "sw": "Swahili",
+        "kn": "Kannada",
+        "mr": "Marathi",
+        "gu": "Gujarati",
+        "pa": "Punjabi",
+        "ml": "Malayalam",
+        "fi": "Finnish",
+    }
+)
+
+_ALPHABETS = "([A-Za-z])"
+_PREFIXES = "(Mr|St|Mrs|Ms|Dr)[.]"
+_SUFFIXES = "(Inc|Ltd|Jr|Sr|Co)"
+_STARTERS = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
+_ACRONYMS = "([A-Z][.][A-Z][.](?:[A-Z][.])?)"
+_WEBSITES = "[.](com|net|org|io|gov|edu|me)"
+_DIGITS = "([0-9])"
+_MULTIPLE_DOTS = r"\.{2,}"
+
+
+def split_into_sentences(text):
+    """Split the text into sentences.
+
+    Args:
+      text: A string that consists of at least one sentence.
+
+    Returns:
+      A list of strings where each string is a sentence.
+    """
+    # "<prd>" and "<stop>" are internal placeholders: "<prd>" marks periods that
+    # do not end a sentence and "<stop>" marks sentence boundaries; both are
+    # resolved back at the end of the function.
+    text = " " + text + " "
+    text = text.replace("\n", " ")
+    text = re.sub(_PREFIXES, "\\1<prd>", text)
+    text = re.sub(_WEBSITES, "<prd>\\1", text)
+    text = re.sub(_DIGITS + "[.]" + _DIGITS, "\\1<prd>\\2", text)
+    text = re.sub(
+        _MULTIPLE_DOTS,
+        lambda match: "<prd>" * len(match.group(0)) + "<stop>",
+        text,
+    )
+    if "Ph.D" in text:
+        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
+    text = re.sub(r"\s" + _ALPHABETS + "[.] ", " \\1<prd> ", text)
+    text = re.sub(_ACRONYMS + " " + _STARTERS, "\\1<stop> \\2", text)
+    text = re.sub(
+        _ALPHABETS + "[.]" + _ALPHABETS + "[.]" + _ALPHABETS + "[.]",
+        "\\1<prd>\\2<prd>\\3<prd>",
+        text,
+    )
+    text = re.sub(_ALPHABETS + "[.]" + _ALPHABETS + "[.]", "\\1<prd>\\2<prd>", text)
+    text = re.sub(" " + _SUFFIXES + "[.] " + _STARTERS, " \\1<stop> \\2", text)
+    text = re.sub(" " + _SUFFIXES + "[.]", " \\1<prd>", text)
+    text = re.sub(" " + _ALPHABETS + "[.]", " \\1<prd>", text)
+    if "”" in text:
+        text = text.replace(".”", "”.")
+    if '"' in text:
+        text = text.replace('."', '".')
+    if "!" in text:
+        text = text.replace('!"', '"!')
+    if "?" in text:
+        text = text.replace('?"', '"?')
+    text = text.replace(".", ".<stop>")
+    text = text.replace("?", "?<stop>")
+    text = text.replace("!", "!<stop>")
+    text = text.replace("<prd>", ".")
+    sentences = text.split("<stop>")
+    sentences = [s.strip() for s in sentences]
+    if sentences and not sentences[-1]:
+        sentences = sentences[:-1]
+    return sentences
+
+
+def count_words(text):
+    """Counts the number of words."""
+    tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")
+    tokens = tokenizer.tokenize(text)
+    num_words = len(tokens)
+    return num_words
+
+
+@functools.lru_cache(maxsize=None)
+def _get_sentence_tokenizer():
+    return nltk.data.load("nltk:tokenizers/punkt/english.pickle")
+
+
+def count_sentences(text):
+    """Count the number of sentences."""
+    tokenizer = _get_sentence_tokenizer()
+    tokenized_sentences = tokenizer.tokenize(text)
+    return len(tokenized_sentences)
+
+
+def generate_keywords(num_keywords):
+    """Randomly generates a few keywords."""
+    return random.sample(WORD_LIST, k=num_keywords)
diff --git a/lm_eval/tasks/leaderboard/ifeval/utils.py b/lm_eval/tasks/leaderboard/ifeval/utils.py
new file mode 100644
index 00000000..985e8d5a
--- /dev/null
+++ b/lm_eval/tasks/leaderboard/ifeval/utils.py
@@ -0,0 +1,134 @@
+import dataclasses
+from typing import Dict, Optional, Union
+
+from lm_eval.tasks.ifeval import instructions_registry
+
+
+@dataclasses.dataclass
+class InputExample:
+    key: int
+    instruction_id_list: list[str]
+    prompt: str
+    kwargs: list[Dict[str, Optional[Union[str, int]]]]
+
+
+@dataclasses.dataclass
+class OutputExample:
+    instruction_id_list: list[str]
+    prompt: str
+    response: str
+    follow_all_instructions: bool
+    follow_instruction_list: list[bool]
+
+
+def test_instruction_following_strict(
+    inp,
+    response,
+):
+    """Tests response to see if instructions are followed."""
+    instruction_list = inp.instruction_id_list
+    is_following_list = []
+
+    for index, instruction_id in enumerate(instruction_list):
+        instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id]
+        instruction = instruction_cls(instruction_id)
+
+        # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method.
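+        # NOTE: `if v` drops every falsy value, not just None, so kwargs equal to
+        # 0 or "" are filtered out as well; the IFEval data does not appear to rely
+        # on such values.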
+ kwargs = {k: v for k, v in inp.kwargs[index].items() if v} + instruction.build_description(**kwargs) + args = instruction.get_instruction_args() + if args and "prompt" in args: + instruction.build_description(prompt=inp.prompt) + + if response.strip() and instruction.check_following(response): + is_following_list.append(True) + else: + is_following_list.append(False) + + return OutputExample( + instruction_id_list=inp.instruction_id_list, + prompt=inp.prompt, + response=response, + follow_all_instructions=all(is_following_list), + follow_instruction_list=is_following_list, + ) + + +def test_instruction_following_loose( + inp, + response, +): + """Tests response for an upper bound for following instructions.""" + r = response.split("\n") + response_remove_first = "\n".join(r[1:]).strip() + response_remove_last = "\n".join(r[:-1]).strip() + response_remove_both = "\n".join(r[1:-1]).strip() + revised_response = response.replace("*", "") + revised_response_remove_first = response_remove_first.replace("*", "") + revised_response_remove_last = response_remove_last.replace("*", "") + revised_response_remove_both = response_remove_both.replace("*", "") + all_responses = [ + response, + revised_response, + response_remove_first, + response_remove_last, + response_remove_both, + revised_response_remove_first, + revised_response_remove_last, + revised_response_remove_both, + ] + instruction_list = inp.instruction_id_list + is_following_list = [] + + for index, instruction_id in enumerate(instruction_list): + instruction_cls = instructions_registry.INSTRUCTION_DICT[instruction_id] + instruction = instruction_cls(instruction_id) + + # Remove None values from kwargs to avoid unexpected keyword argument errors in build_description method. + kwargs = {k: v for k, v in inp.kwargs[index].items() if v} + instruction.build_description(**kwargs) + args = instruction.get_instruction_args() + if args and "prompt" in args: + instruction.build_description(prompt=inp.prompt) + + is_following = False + for r in all_responses: + if r.strip() and instruction.check_following(r): + is_following = True + break + + is_following_list.append(is_following) + + return OutputExample( + instruction_id_list=inp.instruction_id_list, + prompt=inp.prompt, + response=response, + follow_all_instructions=all(is_following_list), + follow_instruction_list=is_following_list, + ) + + +def process_results(doc, results): + inp = InputExample( + key=doc["key"], + instruction_id_list=doc["instruction_id_list"], + prompt=doc["prompt"], + kwargs=doc["kwargs"], + ) + response = results[0] + + out_strict = test_instruction_following_strict(inp, response) + out_loose = test_instruction_following_loose(inp, response) + + return { + "prompt_level_strict_acc": out_strict.follow_all_instructions, + "inst_level_strict_acc": out_strict.follow_instruction_list, + "prompt_level_loose_acc": out_loose.follow_all_instructions, + "inst_level_loose_acc": out_loose.follow_instruction_list, + } + + +def agg_inst_level_acc(items): + flat_items = [item for sublist in items for item in sublist] + inst_level_acc = sum(flat_items) / len(flat_items) + return inst_level_acc diff --git a/lm_eval/tasks/leaderboard/leaderboard.yaml b/lm_eval/tasks/leaderboard/leaderboard.yaml new file mode 100644 index 00000000..d9c5aaac --- /dev/null +++ b/lm_eval/tasks/leaderboard/leaderboard.yaml @@ -0,0 +1,8 @@ +group: leaderboard +task: + - leaderboard_mmlu_pro + - leaderboard_bbh + - leaderboard_gpqa + - leaderboard_math_hard + - leaderboard_ifeval + - leaderboard_musr diff --git 
a/lm_eval/tasks/leaderboard/math/_template_yaml b/lm_eval/tasks/leaderboard/math/_template_yaml new file mode 100644 index 00000000..a121074b --- /dev/null +++ b/lm_eval/tasks/leaderboard/math/_template_yaml @@ -0,0 +1,28 @@ +group: + - leaderboard_math_hard +dataset_path: lighteval/MATH-Hard +process_docs: !function utils.process_docs +output_type: generate_until +training_split: train +test_split: test +doc_to_text: !function utils.doc_to_text +process_results: !function utils.process_results +doc_to_target: "{{answer if few_shot is undefined else solution}}" +generation_kwargs: + until: + - "Problem:" + do_sample: false + temperature: 0 + max_gen_toks: 1024 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true +num_fewshot: 4 +metadata: + version: 1.0 +dataset_kwargs: + trust_remote_code: true +fewshot_config: + sampler: first_n + samples: !function utils.list_fewshot_samples diff --git a/lm_eval/tasks/leaderboard/math/math_algebra.yaml b/lm_eval/tasks/leaderboard/math/math_algebra.yaml new file mode 100644 index 00000000..9e1d4753 --- /dev/null +++ b/lm_eval/tasks/leaderboard/math/math_algebra.yaml @@ -0,0 +1,3 @@ +include: _template_yaml +dataset_name: algebra +task: leaderboard_math_algebra_hard diff --git a/lm_eval/tasks/leaderboard/math/math_counting_and_prob.yaml b/lm_eval/tasks/leaderboard/math/math_counting_and_prob.yaml new file mode 100644 index 00000000..41767099 --- /dev/null +++ b/lm_eval/tasks/leaderboard/math/math_counting_and_prob.yaml @@ -0,0 +1,3 @@ +include: _template_yaml +dataset_name: counting_and_probability +task: leaderboard_math_counting_and_prob_hard diff --git a/lm_eval/tasks/leaderboard/math/math_geometry.yaml b/lm_eval/tasks/leaderboard/math/math_geometry.yaml new file mode 100644 index 00000000..189633c3 --- /dev/null +++ b/lm_eval/tasks/leaderboard/math/math_geometry.yaml @@ -0,0 +1,3 @@ +include: _template_yaml +dataset_name: geometry +task: leaderboard_math_geometry_hard diff --git a/lm_eval/tasks/leaderboard/math/math_intermediate_algebra.yaml b/lm_eval/tasks/leaderboard/math/math_intermediate_algebra.yaml new file mode 100644 index 00000000..9959bd18 --- /dev/null +++ b/lm_eval/tasks/leaderboard/math/math_intermediate_algebra.yaml @@ -0,0 +1,3 @@ +include: _template_yaml +dataset_name: intermediate_algebra +task: leaderboard_math_intermediate_algebra_hard diff --git a/lm_eval/tasks/leaderboard/math/math_num_theory.yaml b/lm_eval/tasks/leaderboard/math/math_num_theory.yaml new file mode 100644 index 00000000..a055d94d --- /dev/null +++ b/lm_eval/tasks/leaderboard/math/math_num_theory.yaml @@ -0,0 +1,3 @@ +include: _template_yaml +dataset_name: number_theory +task: leaderboard_math_num_theory_hard diff --git a/lm_eval/tasks/leaderboard/math/math_prealgebra.yaml b/lm_eval/tasks/leaderboard/math/math_prealgebra.yaml new file mode 100644 index 00000000..474d7258 --- /dev/null +++ b/lm_eval/tasks/leaderboard/math/math_prealgebra.yaml @@ -0,0 +1,3 @@ +include: _template_yaml +dataset_name: prealgebra +task: leaderboard_math_prealgebra_hard diff --git a/lm_eval/tasks/leaderboard/math/math_precalculus.yaml b/lm_eval/tasks/leaderboard/math/math_precalculus.yaml new file mode 100644 index 00000000..4d63541e --- /dev/null +++ b/lm_eval/tasks/leaderboard/math/math_precalculus.yaml @@ -0,0 +1,3 @@ +include: _template_yaml +dataset_name: precalculus +task: leaderboard_math_precalculus_hard diff --git a/lm_eval/tasks/leaderboard/math/utils.py b/lm_eval/tasks/leaderboard/math/utils.py new file mode 100644 index 00000000..e3ebcf99 --- 
/dev/null +++ b/lm_eval/tasks/leaderboard/math/utils.py @@ -0,0 +1,294 @@ +import re +import signal +from typing import Dict, List, Optional + +import datasets + +from lm_eval.utils import eval_logger + + +try: + import sympy + from sympy.parsing.latex import parse_latex +except ModuleNotFoundError: + raise ModuleNotFoundError( + "`sympy` is required for generating translation task prompt templates. \ +please install sympy via pip install lm-eval[math] or pip install -e .[math]", + ) + + +# taken from +# https://github.com/wellecks/lm-evaluation-harness/blob/master/lm_eval/tasks/minerva_math.py +def doc_to_text(doc: dict) -> str: + return "Problem:" + "\n" + doc["problem"] + "\n\n" + "Solution:" + + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _process_doc(doc: dict) -> dict: + out_doc = { + "problem": doc["problem"], + "solution": doc["solution"], + "answer": normalize_final_answer( + remove_boxed(last_boxed_only_string(doc["solution"])) + ), + } + if getattr(doc, "few_shot", None) is not None: + out_doc["few_shot"] = True + return out_doc + + return dataset.map(_process_doc) + + +def list_fewshot_samples() -> list[dict]: + return [ + { + "problem": "Find the domain of the expression $\\frac{\\sqrt{x-2}}{\\sqrt{5-x}}$.}", + "solution": "The expressions inside each square root must be non-negative. Therefore, $x-2 \\ge 0$, so $x\\ge2$, and $5 - x \\ge 0$, so $x \\le 5$. Also, the denominator cannot be equal to zero, so $5-x>0$, which gives $x<5$. Therefore, the domain of the expression is $\\boxed{[2,5)}$.\nFinal Answer: The final answer is $[2,5)$. I hope it is correct.", + "few_shot": "1", + }, + { + "problem": "If $\\det \\mathbf{A} = 2$ and $\\det \\mathbf{B} = 12,$ then find $\\det (\\mathbf{A} \\mathbf{B}).$", + "solution": "We have that $\\det (\\mathbf{A} \\mathbf{B}) = (\\det \\mathbf{A})(\\det \\mathbf{B}) = (2)(12) = \\boxed{24}.$\nFinal Answer: The final answer is $24$. I hope it is correct.", + "few_shot": "1", + }, + { + "problem": "Terrell usually lifts two 20-pound weights 12 times. If he uses two 15-pound weights instead, how many times must Terrell lift them in order to lift the same total weight?", + "solution": "If Terrell lifts two 20-pound weights 12 times, he lifts a total of $2\\cdot 12\\cdot20=480$ pounds of weight. If he lifts two 15-pound weights instead for $n$ times, he will lift a total of $2\\cdot15\\cdot n=30n$ pounds of weight. Equating this to 480 pounds, we can solve for $n$:\n\\begin{align*}\n30n&=480\\\n\\Rightarrow\\qquad n&=480/30=\\boxed{16}\n\\end{align*}\nFinal Answer: The final answer is $16$. I hope it is correct.", + "few_shot": "1", + }, + { + "problem": "If the system of equations\n\n\\begin{align*}\n6x-4y&=a,\\\n6y-9x &=b.\n\\end{align*}has a solution $(x, y)$ where $x$ and $y$ are both nonzero,\nfind $\\frac{a}{b},$ assuming $b$ is nonzero.", + "solution": "If we multiply the first equation by $-\\frac{3}{2}$, we obtain\n\n$$6y-9x=-\\frac{3}{2}a.$$Since we also know that $6y-9x=b$, we have\n\n$$-\\frac{3}{2}a=b\\Rightarrow\\frac{a}{b}=\\boxed{-\\frac{2}{3}}.$$\nFinal Answer: The final answer is $-\\frac{2}{3}$. 
I hope it is correct.", + "few_shot": "1", + }, + ] + + +def process_results(doc: dict, results: List[str]) -> Dict[str, int]: + candidates = results[0] + + unnormalized_answer = get_unnormalized_answer(candidates) + answer = normalize_final_answer(unnormalized_answer) + + if is_equiv(answer, doc["answer"]): + retval = 1 + else: + retval = 0 + + results = { + "exact_match": retval, + } + return results + + +def last_boxed_only_string(string: str) -> Optional[str]: + idx = string.rfind("\\boxed") + if "\\boxed " in string: + return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0] + if idx < 0: + idx = string.rfind("\\fbox") + if idx < 0: + return None + + i = idx + right_brace_idx = None + num_left_braces_open = 0 + while i < len(string): + if string[i] == "{": + num_left_braces_open += 1 + if string[i] == "}": + num_left_braces_open -= 1 + if num_left_braces_open == 0: + right_brace_idx = i + break + i += 1 + + if right_brace_idx is None: + retval = None + else: + retval = string[idx : right_brace_idx + 1] + + return retval + + +def remove_boxed(s: str) -> str: + if "\\boxed " in s: + left = "\\boxed " + assert s[: len(left)] == left + return s[len(left) :] + + left = "\\boxed{" + + assert s[: len(left)] == left + assert s[-1] == "}" + + return s[len(left) : -1] + + +class timeout: + def __init__(self, seconds=1, error_message="Timeout"): + self.seconds = seconds + self.error_message = error_message + + def handle_timeout(self, signum, frame): + raise TimeoutError(self.error_message) + + def __enter__(self): + signal.signal(signal.SIGALRM, self.handle_timeout) + signal.alarm(self.seconds) + + def __exit__(self, type, value, traceback): + signal.alarm(0) + + +def is_equiv(x1: str, x2: str) -> bool: + """ + x1 and x2 are normalized latex string + """ + try: + with timeout(seconds=5): + try: + parsed_x1 = parse_latex(x1) + parsed_x2 = parse_latex(x2) + except ( + sympy.parsing.latex.errors.LaTeXParsingError, + sympy.SympifyError, + TypeError, + ): + eval_logger.debug(f"couldn't parse one of {x1} or {x2}") + return False + + try: + diff = parsed_x1 - parsed_x2 + except TypeError: + eval_logger.debug(f"couldn't subtract {x1} and {x2}") + return False + + try: + if sympy.simplify(diff) == 0: + return True + else: + return False + except ValueError: + eval_logger.debug( + f"Had some trouble simplifying when comparing {x1} and {x2}" + ) + except TimeoutError: + eval_logger.debug(f"Timed out comparing {x1} and {x2}") + return False + except ImportError as e: + eval_logger.error(e) + raise + except Exception as e: + eval_logger.debug(f"Failed comparing {x1} and {x2} with {e}") + return False + + +def get_unnormalized_answer(text: str) -> str: + INVALID_ANSWER = "[invalidanswer]" + end_seq = "I hope it is correct." + text += end_seq + match = re.search( + r"Final Answer: The final answer is(.*?). 
I hope it is correct.", + text, + ) + if match: + return match.group(1).strip() + else: + return INVALID_ANSWER + + +SUBSTITUTIONS = [ + ("an ", ""), + ("a ", ""), + (".$", "$"), + ("\\$", ""), + (r"\ ", ""), + (" ", ""), + ("mbox", "text"), + (",\\text{and}", ","), + ("\\text{and}", ","), + ("\\text{m}", "\\text{}"), +] +REMOVED_EXPRESSIONS = [ + "square", + "ways", + "integers", + "dollars", + "mph", + "inches", + "ft", + "hours", + "km", + "units", + "\\ldots", + "sue", + "points", + "feet", + "minutes", + "digits", + "cents", + "degrees", + "cm", + "gm", + "pounds", + "meters", + "meals", + "edges", + "students", + "childrentickets", + "multiples", + "\\text{s}", + "\\text{.}", + "\\text{\ns}", + "\\text{}^2", + "\\text{}^3", + "\\text{\n}", + "\\text{}", + r"\mathrm{th}", + r"^\circ", + r"^{\circ}", + r"\;", + r",\!", + "{,}", + '"', + "\\dots", +] + + +def normalize_final_answer(final_answer: str) -> str: + """ + Normalize a final answer to a quantitative reasoning question. + + Copied character for character from appendix D of Lewkowycz et al. (2022) + """ + final_answer = final_answer.split("=")[-1] + + for before, after in SUBSTITUTIONS: + final_answer = final_answer.replace(before, after) + for expr in REMOVED_EXPRESSIONS: + final_answer = final_answer.replace(expr, "") + + # Extract answer that is in LaTeX math, is bold, + # is surrounded by a box, etc. + final_answer = re.sub(r"(.*?)(\$)(.*?)(\$)(.*)", "$\\3$", final_answer) + final_answer = re.sub(r"(\\text\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\textbf\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\overline\{)(.*?)(\})", "\\2", final_answer) + final_answer = re.sub(r"(\\boxed\{)(.*)(\})", "\\2", final_answer) + + # Normalize shorthand TeX: + # \fracab -> \frac{a}{b} + # \frac{abc}{bef} -> \frac{abc}{bef} + # \fracabc -> \frac{a}{b}c + # \sqrta -> \sqrt{a} + # \sqrtab -> sqrt{a}b + final_answer = re.sub(r"(frac)([^{])(.)", "frac{\\2}{\\3}", final_answer) + final_answer = re.sub(r"(sqrt)([^{])", "sqrt{\\2}", final_answer) + final_answer = final_answer.replace("$", "") + + # Normalize 100,000 -> 100000 + if final_answer.replace(",", "").isdigit(): + final_answer = final_answer.replace(",", "") + + return final_answer diff --git a/lm_eval/tasks/leaderboard/mmlu_pro/mmlu_pro.yaml b/lm_eval/tasks/leaderboard/mmlu_pro/mmlu_pro.yaml new file mode 100644 index 00000000..8ddabf14 --- /dev/null +++ b/lm_eval/tasks/leaderboard/mmlu_pro/mmlu_pro.yaml @@ -0,0 +1,17 @@ +dataset_path: TIGER-Lab/MMLU-Pro # a copy of `cais/leaderboard_mmlu` with no auxiliary_train split +task: leaderboard_mmlu_pro +test_split: test +fewshot_split: validation +fewshot_config: + sampler: first_n +output_type: multiple_choice +doc_to_text: !function utils.doc_to_text +doc_to_choice: !function utils.doc_to_choice +doc_to_target: answer +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +num_fewshot: 5 +metadata: + version: 0.1 diff --git a/lm_eval/tasks/leaderboard/mmlu_pro/utils.py b/lm_eval/tasks/leaderboard/mmlu_pro/utils.py new file mode 100644 index 00000000..bb398262 --- /dev/null +++ b/lm_eval/tasks/leaderboard/mmlu_pro/utils.py @@ -0,0 +1,15 @@ +import string + + +def doc_to_text(doc): + doc_to_text = f"{doc['question']}\n" + + for i in range(len(doc["options"])): + doc_to_text += f"{string.ascii_uppercase[i]}. 
{doc['options'][i]}\n" + + doc_to_text += "Answer:" + return doc_to_text + + +def doc_to_choice(doc): + return [string.ascii_uppercase[i] for i in range(len(doc["options"]))] diff --git a/lm_eval/tasks/leaderboard/musr/_template_yaml b/lm_eval/tasks/leaderboard/musr/_template_yaml new file mode 100644 index 00000000..2076cb9f --- /dev/null +++ b/lm_eval/tasks/leaderboard/musr/_template_yaml @@ -0,0 +1,13 @@ +group: + - leaderboard_musr +dataset_path: TAUR-Lab/MuSR +output_type: multiple_choice +doc_to_text: !function utils.doc_to_text +doc_to_target: "{{answer_choice}}" +doc_to_choice: "{{choices}}" +metric_list: + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/lm_eval/tasks/leaderboard/musr/musr_murder_mysteries.yaml b/lm_eval/tasks/leaderboard/musr/musr_murder_mysteries.yaml new file mode 100644 index 00000000..88aac33e --- /dev/null +++ b/lm_eval/tasks/leaderboard/musr/musr_murder_mysteries.yaml @@ -0,0 +1,3 @@ +include: "_template_yaml" +task: leaderboard_musr_murder_mysteries +test_split: murder_mysteries diff --git a/lm_eval/tasks/leaderboard/musr/musr_object_placements.yaml b/lm_eval/tasks/leaderboard/musr/musr_object_placements.yaml new file mode 100644 index 00000000..89b29fb7 --- /dev/null +++ b/lm_eval/tasks/leaderboard/musr/musr_object_placements.yaml @@ -0,0 +1,3 @@ +include: "_template_yaml" +task: leaderboard_musr_object_placements +test_split: object_placements diff --git a/lm_eval/tasks/leaderboard/musr/musr_team_allocation.yaml b/lm_eval/tasks/leaderboard/musr/musr_team_allocation.yaml new file mode 100644 index 00000000..8ede2ba9 --- /dev/null +++ b/lm_eval/tasks/leaderboard/musr/musr_team_allocation.yaml @@ -0,0 +1,3 @@ +include: "_template_yaml" +task: leaderboard_musr_team_allocation +test_split: team_allocation diff --git a/lm_eval/tasks/leaderboard/musr/utils.py b/lm_eval/tasks/leaderboard/musr/utils.py new file mode 100644 index 00000000..1d0a7d1c --- /dev/null +++ b/lm_eval/tasks/leaderboard/musr/utils.py @@ -0,0 +1,26 @@ +import ast + + +def doc_to_choice(doc): + """ + Convert a doc to a choice. + """ + return ast.literal_eval(doc["choices"]) + + +DOC_TO_TEXT = "{narrative}\n\n" "{question}\n\n" "{choices}\n" "Answer:" + + +def doc_to_text(doc): + """ + Convert a doc to text. 
+ """ + choices = "" + for i, choice in enumerate(ast.literal_eval(doc["choices"])): + choices += f"{i+1} - {choice}\n" + + text = DOC_TO_TEXT.format( + narrative=doc["narrative"], question=doc["question"], choices=choices + ) + + return text -- GitLab From d855d0baf8576296e790d0c9477b40a710d28e67 Mon Sep 17 00:00:00 2001 From: Hanwool Albert Lee <88315152+h-albert-lee@users.noreply.github.com> Date: Wed, 3 Jul 2024 21:17:15 +0900 Subject: [PATCH 20/45] #1442 inverse scaling tasks implementation (#1589) * initial_implementation (test has to be proceeded) * minor fix * revised task name and implemented new task * minor fixes * new tasks implement * minor fix * added 'prompt injection' task * delete prompt injection task (will be implemented at next PR) * trust remote code * Update lm_eval/tasks/inverse_scaling/README.md Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> * added readme * Update lm_eval/tasks/README.md * Update lm_eval/tasks/inverse_scaling/_inverse_scaling_mc_yaml * Update lm_eval/tasks/inverse_scaling/README.md Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> * Update lm_eval/tasks/inverse_scaling/_inverse_scaling_mc_yaml Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> * Update README.md * precommit? * run precommit on readme --------- Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Co-authored-by: haileyschoelkopf --- lm_eval/tasks/README.md | 1 + lm_eval/tasks/inverse_scaling/README.md | 55 +++++++++++++++++++ .../inverse_scaling/_inverse_scaling_mc_yaml | 17 ++++++ .../inverse_scaling_hindsight_neglect.yaml | 3 + .../inverse_scaling_into_the_unknown.yaml | 3 + .../inverse_scaling_memo_trap.yaml | 3 + .../inverse_scaling_modus_tollens.yaml | 3 + .../inverse_scaling/inverse_scaling_neqa.yaml | 3 + ..._scaling_pattern_matching_suppression.yaml | 3 + .../inverse_scaling_quote_repetition.yaml | 3 + .../inverse_scaling_redefine_math.yaml | 3 + .../inverse_scaling_repetitive_algebra.yaml | 3 + .../inverse_scaling_sig_figs.yaml | 3 + ...verse_scaling_winobias_antistereotype.yaml | 20 +++++++ 14 files changed, 123 insertions(+) create mode 100644 lm_eval/tasks/inverse_scaling/README.md create mode 100644 lm_eval/tasks/inverse_scaling/_inverse_scaling_mc_yaml create mode 100644 lm_eval/tasks/inverse_scaling/inverse_scaling_hindsight_neglect.yaml create mode 100644 lm_eval/tasks/inverse_scaling/inverse_scaling_into_the_unknown.yaml create mode 100644 lm_eval/tasks/inverse_scaling/inverse_scaling_memo_trap.yaml create mode 100644 lm_eval/tasks/inverse_scaling/inverse_scaling_modus_tollens.yaml create mode 100644 lm_eval/tasks/inverse_scaling/inverse_scaling_neqa.yaml create mode 100644 lm_eval/tasks/inverse_scaling/inverse_scaling_pattern_matching_suppression.yaml create mode 100644 lm_eval/tasks/inverse_scaling/inverse_scaling_quote_repetition.yaml create mode 100644 lm_eval/tasks/inverse_scaling/inverse_scaling_redefine_math.yaml create mode 100644 lm_eval/tasks/inverse_scaling/inverse_scaling_repetitive_algebra.yaml create mode 100644 lm_eval/tasks/inverse_scaling/inverse_scaling_sig_figs.yaml create mode 100644 lm_eval/tasks/inverse_scaling/inverse_scaling_winobias_antistereotype.yaml diff --git a/lm_eval/tasks/README.md b/lm_eval/tasks/README.md index 5d6393bb..1eb7fdc5 100644 --- a/lm_eval/tasks/README.md +++ b/lm_eval/tasks/README.md @@ -49,6 +49,7 @@ | [hendrycks_ethics](hendrycks_ethics/README.md) | Tasks designed to 
evaluate the ethical reasoning capabilities of models. | English | | [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English | | [ifeval](ifeval/README.md) | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English | +| [inverse_scaling](inverse_scaling/README.md) | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English | | [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean | | [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean | | [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean | diff --git a/lm_eval/tasks/inverse_scaling/README.md b/lm_eval/tasks/inverse_scaling/README.md new file mode 100644 index 00000000..10364d8e --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/README.md @@ -0,0 +1,55 @@ +# inverse_scaling + +### Paper + +Title: `Inverse Scaling: When Bigger Isn't Better` + +Abstract: `Work on scaling laws has found that large language models (LMs) show predictable improvements to overall loss with increased scale (model size, training data, and compute). Here, we present evidence for the claim that LMs may show inverse scaling, or worse task performance with increased scale, e.g., due to flaws in the training objective and data. We present empirical evidence of inverse scaling on 11 datasets collected by running a public contest, the Inverse Scaling Prize, with a substantial prize pool. Through analysis of the datasets, along with other examples found in the literature, we identify four potential causes of inverse scaling: (i) preference to repeat memorized sequences over following in-context instructions, (ii) imitation of undesirable patterns in the training data, (iii) tasks containing an easy distractor task which LMs could focus on, rather than the harder real task, and (iv) correct but misleading few-shot demonstrations of the task. We release the winning datasets at this https URL to allow for further investigation of inverse scaling. Our tasks have helped drive the discovery of U-shaped and inverted-U scaling trends, where an initial trend reverses, suggesting that scaling trends are less reliable at predicting the behavior of larger-scale models than previously understood. Overall, our results suggest that there are tasks for which increased model scale alone may not lead to progress, and that more careful thought needs to go into the data and objectives for training language models.` + +Note: This is not official implementation of inverse scaling prize. Implemented by h-albert-lee with permission from the authors of the paper. + +Homepage: https://github.com/inverse-scaling/prize + +### Citation + +@article{mckenzie2023inverse, + title={Inverse Scaling: When Bigger Isn't Better}, + author={Ian R. McKenzie and Alexander Lyzhov and Michael Pieler and Alicia Parrish and Aaron Mueller and Ameya Prabhu and Euan McLean and Aaron Kirtland and Alexis Ross and Alisa Liu and Andrew Gritsevskiy and Daniel Wurgaft and Derik Kauffman and Gabriel Recchia and Jiacheng Liu and Joe Cavanagh and Max Weiss and Sicong Huang and The Floating Droid and Tom Tseng and Tomasz Korbak and Xudong Shen and Yuhui Zhang and Zhengping Zhou and Najoung Kim and Samuel R. 
Bowman and Ethan Perez},
+    journal={arXiv preprint arXiv:2306.09479},
+    year={2023}
+}
+
+### Groups and Tasks
+
+#### Groups
+
+* `inverse_scaling_mc`: all tasks of the Inverse Scaling Prize (currently aside from Prompt Injection), matching their implementations on OPT for multiple-choice classification tasks. **These match the published dataset versions from the prize, which may differ slightly from the numbers in the paper (but have been tested for equivalence to the OPT numbers reported at https://huggingface.co/inverse-scaling/opt-1.3b_eval for multiple sizes).**
+
+
+#### Tasks
+
+- `inverse_scaling_hindsight_neglect_10shot`
+- `inverse_scaling_redefine_math`
+- `inverse_scaling_quote_repetition`
+- `inverse_scaling_neqa`
+- `inverse_scaling_winobias_antistereotype`: not an official Inverse Scaling Prize winner, but eval results for it are reported at https://huggingface.co/inverse-scaling/opt-1.3b_eval .
+- `inverse_scaling_into_the_unknown`
+- `inverse_scaling_memo_trap`
+- `inverse_scaling_modus_tollens`
+- `inverse_scaling_pattern_matching_suppression`
+- `inverse_scaling_repetitive_algebra`
+- `inverse_scaling_sig_figs`
+
+
+### Checklist
+
+For adding novel benchmarks/datasets to the library:
+* [x] Is the task an existing benchmark in the literature?
+  * [x] Have you referenced the original paper that introduced the task?
+  * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+
+If other tasks on this dataset are already supported:
+* [ ] Is the "Main" variant of this task clearly denoted?
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
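For illustration, a group registered this way, such as `inverse_scaling_mc`, can be evaluated through the harness's Python API as well as the CLI. The sketch below is not part of the diff above; it assumes only that `lm_eval` is installed, and the model name and batch size are placeholder choices rather than values taken from this patch.

```python
# Minimal sketch (not from this patch): evaluate the inverse_scaling_mc group.
# "EleutherAI/pythia-1.4b" and batch_size=8 are placeholder choices for illustration.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",  # Hugging Face transformers backend
    model_args="pretrained=EleutherAI/pythia-1.4b",
    tasks=["inverse_scaling_mc"],  # a group name expands to all of its member tasks
    batch_size=8,
)

# Per-task acc / acc_norm scores are collected under results["results"].
print(results["results"])
```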
diff --git a/lm_eval/tasks/inverse_scaling/_inverse_scaling_mc_yaml b/lm_eval/tasks/inverse_scaling/_inverse_scaling_mc_yaml new file mode 100644 index 00000000..4dc3df7a --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/_inverse_scaling_mc_yaml @@ -0,0 +1,17 @@ +group: + - inverse_scaling_mc +output_type: multiple_choice +test_split: train +doc_to_text: prompt +doc_to_choice: classes +doc_to_target: answer_index +target_delimiter: "" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +metadata: + version: 0 diff --git a/lm_eval/tasks/inverse_scaling/inverse_scaling_hindsight_neglect.yaml b/lm_eval/tasks/inverse_scaling/inverse_scaling_hindsight_neglect.yaml new file mode 100644 index 00000000..b21baee2 --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/inverse_scaling_hindsight_neglect.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_hindsight_neglect_10shot +dataset_path: inverse-scaling/hindsight-neglect-10shot diff --git a/lm_eval/tasks/inverse_scaling/inverse_scaling_into_the_unknown.yaml b/lm_eval/tasks/inverse_scaling/inverse_scaling_into_the_unknown.yaml new file mode 100644 index 00000000..0c7ccd3b --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/inverse_scaling_into_the_unknown.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_into_the_unknown +dataset_path: Albertmade/into-the-unknown diff --git a/lm_eval/tasks/inverse_scaling/inverse_scaling_memo_trap.yaml b/lm_eval/tasks/inverse_scaling/inverse_scaling_memo_trap.yaml new file mode 100644 index 00000000..5c77ffc2 --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/inverse_scaling_memo_trap.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_memo_trap +dataset_path: Albertmade/memo-trap diff --git a/lm_eval/tasks/inverse_scaling/inverse_scaling_modus_tollens.yaml b/lm_eval/tasks/inverse_scaling/inverse_scaling_modus_tollens.yaml new file mode 100644 index 00000000..89e1ebbc --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/inverse_scaling_modus_tollens.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_modus_tollens +dataset_path: Albertmade/modus-tollens diff --git a/lm_eval/tasks/inverse_scaling/inverse_scaling_neqa.yaml b/lm_eval/tasks/inverse_scaling/inverse_scaling_neqa.yaml new file mode 100644 index 00000000..11a057d8 --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/inverse_scaling_neqa.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_neqa +dataset_path: inverse-scaling/NeQA diff --git a/lm_eval/tasks/inverse_scaling/inverse_scaling_pattern_matching_suppression.yaml b/lm_eval/tasks/inverse_scaling/inverse_scaling_pattern_matching_suppression.yaml new file mode 100644 index 00000000..b415bba1 --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/inverse_scaling_pattern_matching_suppression.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_pattern_matching_suppression +dataset_path: Albertmade/pattern-matching-suppression diff --git a/lm_eval/tasks/inverse_scaling/inverse_scaling_quote_repetition.yaml b/lm_eval/tasks/inverse_scaling/inverse_scaling_quote_repetition.yaml new file mode 100644 index 00000000..6750953c --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/inverse_scaling_quote_repetition.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_quote_repetition +dataset_path: inverse-scaling/quote-repetition diff --git 
a/lm_eval/tasks/inverse_scaling/inverse_scaling_redefine_math.yaml b/lm_eval/tasks/inverse_scaling/inverse_scaling_redefine_math.yaml new file mode 100644 index 00000000..2f14ba28 --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/inverse_scaling_redefine_math.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_redefine_math +dataset_path: inverse-scaling/redefine-math diff --git a/lm_eval/tasks/inverse_scaling/inverse_scaling_repetitive_algebra.yaml b/lm_eval/tasks/inverse_scaling/inverse_scaling_repetitive_algebra.yaml new file mode 100644 index 00000000..048ea271 --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/inverse_scaling_repetitive_algebra.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_repetitive_algebra +dataset_path: Albertmade/repetitive-algebra diff --git a/lm_eval/tasks/inverse_scaling/inverse_scaling_sig_figs.yaml b/lm_eval/tasks/inverse_scaling/inverse_scaling_sig_figs.yaml new file mode 100644 index 00000000..25d86e33 --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/inverse_scaling_sig_figs.yaml @@ -0,0 +1,3 @@ +include: _inverse_scaling_mc_yaml +task: inverse_scaling_sig_figs +dataset_path: Albertmade/sig-figs diff --git a/lm_eval/tasks/inverse_scaling/inverse_scaling_winobias_antistereotype.yaml b/lm_eval/tasks/inverse_scaling/inverse_scaling_winobias_antistereotype.yaml new file mode 100644 index 00000000..4c5ff300 --- /dev/null +++ b/lm_eval/tasks/inverse_scaling/inverse_scaling_winobias_antistereotype.yaml @@ -0,0 +1,20 @@ +group: + - inverse_scaling_mc +task: inverse_scaling_winobias_antistereotype +dataset_path: mathemakitten/winobias_antistereotype_test_v5 +output_type: multiple_choice +test_split: test +doc_to_text: text +doc_to_choice: classes +doc_to_target: target +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true + - metric: acc_norm + aggregation: mean + higher_is_better: true +dataset_kwargs: + trust_remote_code: true +metadata: + version: 0 -- GitLab From 5a7ed3eee00eca378f157abbe9baf4d406f063f2 Mon Sep 17 00:00:00 2001 From: Choyunhui Date: Mon, 8 Jul 2024 16:34:01 +0900 Subject: [PATCH 21/45] Fix TypeError in samplers.py by converting int to str (#2074) Co-authored-by: yhjo --- lm_eval/api/samplers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lm_eval/api/samplers.py b/lm_eval/api/samplers.py index 74513bbc..ac75c722 100644 --- a/lm_eval/api/samplers.py +++ b/lm_eval/api/samplers.py @@ -55,7 +55,7 @@ class ContextSampler: labeled_examples += ( str(doc_target[0]) if isinstance(doc_target, list) - else doc_target + else str(doc_target) if self.config.doc_to_choice is None or isinstance(doc_target, str) else str(self.doc_to_choice(doc)[doc_target]) ) -- GitLab From 517aadc4d05507e2fea7b7753e91e6ad7dd76847 Mon Sep 17 00:00:00 2001 From: Lintang Sutawika Date: Mon, 8 Jul 2024 22:12:35 +0700 Subject: [PATCH 22/45] Group agg rework (#1741) * add greoup_config arg * add a group config that allows disabling table for group score and group aggregate in general * fixed size configuration * adjust config * add group config * adjust mmlu to use group_config * fixed args input in aggregate_subtask_metrics * fixed issues related to printing alias of group and updated yaml * update all mmlu variants to include group_config * edit format * modify mmlu tasks * adjust group to also be a configurable group * add configurable group * simplify get_task_list * adjust group scoring with using ConfigurableGroup * adjust args * update mmlu * update mmlu * update to work with 
new group and task configuration * readd group_agg * readd files * move prepare_print_tasks to evaluator_utils * sort set to False by default, fix predict_only arg * add version for groups * reversed task list * update additional condition when loading a group in a group yaml * update truthfulqa * add description regarding tags replacing group * replace group to tag * fixed conditional statement * remove warning * update loading of task group and newly added tags * reformat with pre-commit * fixed info log * update * fix bug * fix bug * use task id to differentiate tasks * convert all groups to configurable groups * use task_id * reformat * add task_id for python tasks as well * add task_id for python tasks as well * add task_id for python tasks as well * revert truthfulqa * revert mmlu tasks * new mmlu config * new group config parameter `tag_to_task` * Update truthfulqa_mc2.yaml * reformate * add _process_group_config * adjust task_id * add get_subtask_list function to get proper subtask list * group config to_dict update * remove tag check * update mmlu * fix config passing issues * add test yaml * format fix * add documentation * corner case for single tag being called * fix indentation * formatting * update all mmlu variants * Update docs/task_guide.md Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> * remove group_alias * Update docs/task_guide.md Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> * remove version for metadata * Update docs/task_guide.md Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> * update mmlu/ * removed " " in make_table * change how aggregate_metric is loaded * change how aggregate_metric is loaded * update aggregate_metric arg * update format * update format * some docs fixes * add groups for agieval, aexams, aclue * add more explicit aggregation groups * add more groupings / tags distinctions * add more groupings * more groupings * add many explicit group configs * add many explicit group configs * add more explicit group configs * add more explicit group configs * add more error msgs, agg_metric -> agg_metric_list * some docs updates * update task_id to be updateable and uses group:task format * make KMMLU a tag for now * update docs * don't duplicate task names * fix merge conflicts? * giving this a try * clean up diff * switch mmlu variants over to using * don't use to-be-deprecated group: config field in overview notebook * Python tasks which subclass ConfigurableTask now run * update mmlu * pre-commit format * fixed sorting for multi-level printing * move group api to separate file * fix bbh aggregation filter usage * track api/group.py * adjust group and tags loading * make explicit group configs for leaderboard and other newer tasks * fix arabicmmlu * update * change arabicmmlu template name??? 
* update group alias * fix printing bugs * check table printing is correct ; update tests * use mmlu_stem to have a group included in print tests --------- Co-authored-by: Hailey Schoelkopf <65563625+haileyschoelkopf@users.noreply.github.com> Co-authored-by: haileyschoelkopf --- docs/new_task_guide.md | 92 +++-- docs/task_guide.md | 114 +----- examples/lm-eval-overview.ipynb | 5 +- lm_eval/api/group.py | 117 ++++++ lm_eval/api/task.py | 21 +- lm_eval/evaluator.py | 226 +++++------ lm_eval/evaluator_utils.py | 350 ++++++++++++++---- lm_eval/tasks/__init__.py | 245 ++++++++---- lm_eval/tasks/aclue/README.md | 2 +- lm_eval/tasks/aclue/_aclue.yaml | 26 ++ lm_eval/tasks/aclue/_default_template_yaml | 1 - lm_eval/tasks/aexams/README.md | 4 +- lm_eval/tasks/aexams/_aexams.yaml | 16 + lm_eval/tasks/aexams/_default_template_yaml | 1 - lm_eval/tasks/agieval/README.md | 6 +- lm_eval/tasks/agieval/agieval.yaml | 29 ++ lm_eval/tasks/agieval/agieval_cn.yaml | 19 + lm_eval/tasks/agieval/agieval_en.yaml | 18 + lm_eval/tasks/agieval/agieval_nous.yaml | 16 + lm_eval/tasks/agieval/aqua-rat.yaml | 4 - lm_eval/tasks/agieval/gaokao-biology.yaml | 3 - lm_eval/tasks/agieval/gaokao-chemistry.yaml | 3 - lm_eval/tasks/agieval/gaokao-chinese.yaml | 3 - lm_eval/tasks/agieval/gaokao-english.yaml | 3 - lm_eval/tasks/agieval/gaokao-geography.yaml | 3 - lm_eval/tasks/agieval/gaokao-history.yaml | 3 - lm_eval/tasks/agieval/gaokao-mathcloze.yaml | 3 - lm_eval/tasks/agieval/gaokao-mathqa.yaml | 3 - lm_eval/tasks/agieval/gaokao-physics.yaml | 3 - lm_eval/tasks/agieval/jec-qa-ca.yaml | 3 - lm_eval/tasks/agieval/jec-qa-kd.yaml | 3 - lm_eval/tasks/agieval/logiqa-en.yaml | 4 - lm_eval/tasks/agieval/logiqa-zh.yaml | 3 - lm_eval/tasks/agieval/lsat-ar.yaml | 4 - lm_eval/tasks/agieval/lsat-lr.yaml | 4 - lm_eval/tasks/agieval/lsat-rc.yaml | 4 - lm_eval/tasks/agieval/math.yaml | 3 - .../tasks/agieval/sat-en-without-passage.yaml | 4 - lm_eval/tasks/agieval/sat-en.yaml | 4 - lm_eval/tasks/agieval/sat-math.yaml | 4 - lm_eval/tasks/anli/anli_r1.yaml | 2 +- .../{arabicmmlu.yaml => _arabicmmlu.yaml} | 5 + .../arabicmmlu/_arabicmmlu_humanities.yaml | 9 + .../arabicmmlu/_arabicmmlu_language.yaml | 9 + .../tasks/arabicmmlu/_arabicmmlu_other.yaml | 9 + .../_arabicmmlu_social_science.yaml | 9 + .../tasks/arabicmmlu/_arabicmmlu_stem.yaml | 9 + ...yaml => _default_arabicmmlu_template_yaml} | 2 + lm_eval/tasks/arabicmmlu/_generate_configs.py | 5 +- .../arabicmmlu_arabic_language_general.yaml | 5 +- .../arabicmmlu_arabic_language_grammar.yaml | 5 +- .../arabicmmlu/arabicmmlu_driving_test.yaml | 5 +- .../arabicmmlu_general_knowledge.yaml | 5 +- .../arabicmmlu_high_arabic_language.yaml | 5 +- .../arabicmmlu/arabicmmlu_high_biology.yaml | 5 +- .../arabicmmlu/arabicmmlu_high_civics.yaml | 5 +- .../arabicmmlu_high_computer_science.yaml | 5 +- .../arabicmmlu/arabicmmlu_high_economics.yaml | 5 +- .../arabicmmlu/arabicmmlu_high_geography.yaml | 5 +- .../arabicmmlu/arabicmmlu_high_history.yaml | 5 +- .../arabicmmlu_high_islamic_studies.yaml | 5 +- .../arabicmmlu_high_philosophy.yaml | 5 +- .../arabicmmlu/arabicmmlu_high_physics.yaml | 5 +- .../arabicmmlu_islamic_studies.yaml | 5 +- .../arabicmmlu_middle_arabic_language.yaml | 5 +- .../arabicmmlu/arabicmmlu_middle_civics.yaml | 5 +- .../arabicmmlu_middle_computer_science.yaml | 5 +- .../arabicmmlu_middle_economics.yaml | 5 +- .../arabicmmlu_middle_general_knowledge.yaml | 5 +- .../arabicmmlu_middle_geography.yaml | 5 +- .../arabicmmlu/arabicmmlu_middle_history.yaml | 5 +- 
.../arabicmmlu_middle_islamic_studies.yaml | 5 +- .../arabicmmlu_middle_natural_science.yaml | 5 +- .../arabicmmlu_middle_social_science.yaml | 5 +- .../arabicmmlu_primary_arabic_language.yaml | 5 +- .../arabicmmlu_primary_computer_science.yaml | 5 +- .../arabicmmlu_primary_general_knowledge.yaml | 5 +- .../arabicmmlu_primary_geography.yaml | 5 +- .../arabicmmlu_primary_history.yaml | 5 +- .../arabicmmlu_primary_islamic_studies.yaml | 5 +- .../arabicmmlu/arabicmmlu_primary_math.yaml | 5 +- .../arabicmmlu_primary_natural_science.yaml | 5 +- .../arabicmmlu_primary_social_science.yaml | 5 +- .../tasks/arabicmmlu/arabicmmlu_prof_law.yaml | 5 +- .../arabicmmlu_univ_accounting.yaml | 5 +- .../arabicmmlu_univ_computer_science.yaml | 5 +- .../arabicmmlu/arabicmmlu_univ_economics.yaml | 5 +- .../arabicmmlu_univ_management.yaml | 5 +- .../arabicmmlu_univ_political_science.yaml | 5 +- lm_eval/tasks/arc/README.md | 6 +- lm_eval/tasks/arc/arc_easy.yaml | 2 +- lm_eval/tasks/arc_mt/arc_challenge_mt_fi.yaml | 2 +- lm_eval/tasks/arithmetic/README.md | 4 +- lm_eval/tasks/arithmetic/arithmetic_1dc.yaml | 2 +- lm_eval/tasks/asdiv/README.md | 2 +- lm_eval/tasks/babi/README.md | 6 +- lm_eval/tasks/basqueglue/README.md | 8 +- lm_eval/tasks/basqueglue/bec.yaml | 2 +- lm_eval/tasks/basqueglue/bhtc.yaml | 2 +- lm_eval/tasks/basqueglue/coref.yaml | 2 +- lm_eval/tasks/basqueglue/qnli.yaml | 2 +- lm_eval/tasks/basqueglue/vaxx.yaml | 2 +- lm_eval/tasks/basqueglue/wic.yaml | 2 +- lm_eval/tasks/bbh/README.md | 6 +- lm_eval/tasks/bbh/cot_fewshot/_bbh.yaml | 36 ++ .../bbh/cot_fewshot/_bbh_cot_fewshot.yaml | 36 ++ .../cot_fewshot/_cot_fewshot_template_yaml | 3 - .../bbh/cot_zeroshot/_bbh_cot_zeroshot.yaml | 36 ++ .../cot_zeroshot/_cot_zeroshot_template_yaml | 1 - lm_eval/tasks/bbh/fewshot/_bbh_fewshot.yaml | 35 ++ .../tasks/bbh/fewshot/_fewshot_template_yaml | 1 - lm_eval/tasks/bbh/zeroshot/_bbh_zeroshot.yaml | 36 ++ .../bbh/zeroshot/_zeroshot_template_yaml | 1 - lm_eval/tasks/belebele/_belebele.yaml | 133 +++++++ lm_eval/tasks/belebele/_default_template_yaml | 1 - lm_eval/tasks/belebele/_generate_configs.py | 33 ++ .../tasks/benchmarks/flan/flan_held_in.yaml | 141 ++++--- lm_eval/tasks/benchmarks/minerva_math.yaml | 6 + .../benchmarks/multimedqa/multimedqa.yaml | 4 + lm_eval/tasks/bertaqa/_bertaqa_template | 2 +- lm_eval/tasks/blimp/_blimp.yaml | 75 ++++ lm_eval/tasks/blimp/_template_yaml | 1 - lm_eval/tasks/ceval/_ceval-valid.yaml | 63 ++++ lm_eval/tasks/ceval/_default_ceval_yaml | 1 - lm_eval/tasks/ceval/_generate_configs.py | 25 +- lm_eval/tasks/cmmlu/_cmmlu.yaml | 78 ++++ lm_eval/tasks/cmmlu/_default_template_yaml | 1 - lm_eval/tasks/cmmlu/_generate_configs.py | 30 ++ lm_eval/tasks/cmmlu/cmmlu_agronomy.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_anatomy.yaml | 4 + .../tasks/cmmlu/cmmlu_ancient_chinese.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_arts.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_astronomy.yaml | 4 + .../tasks/cmmlu/cmmlu_business_ethics.yaml | 4 + .../cmmlu_chinese_civil_service_exam.yaml | 4 + .../cmmlu/cmmlu_chinese_driving_rule.yaml | 4 + .../cmmlu/cmmlu_chinese_food_culture.yaml | 4 + .../cmmlu/cmmlu_chinese_foreign_policy.yaml | 4 + .../tasks/cmmlu/cmmlu_chinese_history.yaml | 4 + .../tasks/cmmlu/cmmlu_chinese_literature.yaml | 4 + .../cmmlu_chinese_teacher_qualification.yaml | 4 + .../tasks/cmmlu/cmmlu_clinical_knowledge.yaml | 4 + .../cmmlu_college_actuarial_science.yaml | 4 + .../tasks/cmmlu/cmmlu_college_education.yaml | 4 + .../cmmlu_college_engineering_hydrology.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_college_law.yaml 
| 4 + .../cmmlu/cmmlu_college_mathematics.yaml | 4 + .../cmmlu_college_medical_statistics.yaml | 4 + .../tasks/cmmlu/cmmlu_college_medicine.yaml | 4 + .../tasks/cmmlu/cmmlu_computer_science.yaml | 4 + .../tasks/cmmlu/cmmlu_computer_security.yaml | 4 + .../tasks/cmmlu/cmmlu_conceptual_physics.yaml | 4 + ...cmmlu_construction_project_management.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_economics.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_education.yaml | 4 + .../cmmlu/cmmlu_electrical_engineering.yaml | 4 + .../tasks/cmmlu/cmmlu_elementary_chinese.yaml | 4 + .../cmmlu/cmmlu_elementary_commonsense.yaml | 4 + ...elementary_information_and_technology.yaml | 4 + .../cmmlu/cmmlu_elementary_mathematics.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_ethnology.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_food_science.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_genetics.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_global_facts.yaml | 4 + .../cmmlu/cmmlu_high_school_biology.yaml | 4 + .../cmmlu/cmmlu_high_school_chemistry.yaml | 4 + .../cmmlu/cmmlu_high_school_geography.yaml | 4 + .../cmmlu/cmmlu_high_school_mathematics.yaml | 4 + .../cmmlu/cmmlu_high_school_physics.yaml | 4 + .../cmmlu/cmmlu_high_school_politics.yaml | 4 + .../tasks/cmmlu/cmmlu_human_sexuality.yaml | 4 + .../tasks/cmmlu/cmmlu_international_law.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_journalism.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_jurisprudence.yaml | 4 + .../cmmlu/cmmlu_legal_and_moral_basis.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_logical.yaml | 4 + .../tasks/cmmlu/cmmlu_machine_learning.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_management.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_marketing.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_marxist_theory.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_modern_chinese.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_nutrition.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_philosophy.yaml | 4 + .../cmmlu/cmmlu_professional_accounting.yaml | 4 + .../tasks/cmmlu/cmmlu_professional_law.yaml | 4 + .../cmmlu/cmmlu_professional_medicine.yaml | 4 + .../cmmlu/cmmlu_professional_psychology.yaml | 4 + .../tasks/cmmlu/cmmlu_public_relations.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_security_study.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_sociology.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_sports_science.yaml | 4 + .../cmmlu_traditional_chinese_medicine.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_virology.yaml | 4 + lm_eval/tasks/cmmlu/cmmlu_world_history.yaml | 4 + .../tasks/cmmlu/cmmlu_world_religions.yaml | 4 + lm_eval/tasks/copal_id/standard.yaml | 2 +- .../crows_pairs/crows_pairs_english.yaml | 4 +- lm_eval/tasks/csatqa/_csatqa.yaml | 17 + lm_eval/tasks/csatqa/_default_csatqa_yaml | 1 - lm_eval/tasks/fda/task.py | 2 +- lm_eval/tasks/fld/fld_default.yaml | 2 - lm_eval/tasks/french_bench/README.md | 4 +- .../french_bench_arc_challenge.yaml | 2 +- .../french_bench/french_bench_boolqa.yaml | 2 +- .../french_bench/french_bench_fquadv2.yaml | 2 +- .../french_bench_fquadv2_bool.yaml | 2 +- .../french_bench_fquadv2_genq.yaml | 2 +- .../french_bench_fquadv2_hasAns.yaml | 2 +- .../french_bench/french_bench_grammar.yaml | 2 +- .../french_bench/french_bench_hellaswag.yaml | 2 +- .../french_bench/french_bench_multifquad.yaml | 2 +- .../french_bench_opus_perplexity.yaml | 2 +- .../french_bench_orangesum_abstract.yaml | 2 +- .../french_bench_orangesum_title.yaml | 2 +- .../french_bench_reading_comp.yaml | 2 +- .../french_bench_topic_based_nli.yaml | 2 +- .../french_bench/french_bench_trivia.yaml | 2 +- .../french_bench/french_bench_vocab.yaml | 2 +- .../french_bench_wikitext_fr.yaml | 2 +- 
.../tasks/french_bench/french_bench_xnli.yaml | 2 +- lm_eval/tasks/glue/README.md | 6 +- lm_eval/tasks/glue/cola/default.yaml | 2 +- lm_eval/tasks/glue/mnli/default.yaml | 2 +- lm_eval/tasks/glue/mrpc/default.yaml | 2 +- lm_eval/tasks/glue/qnli/default.yaml | 2 +- lm_eval/tasks/glue/qqp/default.yaml | 2 +- lm_eval/tasks/glue/rte/default.yaml | 2 +- lm_eval/tasks/glue/sst2/default.yaml | 2 +- lm_eval/tasks/glue/wnli/default.yaml | 2 +- lm_eval/tasks/gpqa/README.md | 8 +- .../gpqa/cot_n_shot/_gpqa_cot_n_shot_yaml | 2 +- .../gpqa/cot_zeroshot/_gpqa_cot_zeroshot_yaml | 2 +- .../generative/_gpqa_generative_n_shot_yaml | 2 +- lm_eval/tasks/gpqa/n_shot/_gpqa_n_shot_yaml | 2 +- .../tasks/gpqa/zeroshot/_gpqa_zeroshot_yaml | 2 +- .../gsm8k/gsm8k-cot-self-consistency.yaml | 2 +- lm_eval/tasks/gsm8k/gsm8k-cot-zeroshot.yaml | 2 +- lm_eval/tasks/gsm8k/gsm8k-cot.yaml | 2 +- lm_eval/tasks/gsm8k/gsm8k.yaml | 2 +- lm_eval/tasks/haerae/_default_haerae_yaml | 1 - lm_eval/tasks/haerae/_haerae.yaml | 16 + lm_eval/tasks/headqa/headqa_en.yaml | 3 +- lm_eval/tasks/hellaswag/hellaswag.yaml | 2 +- .../tasks/hendrycks_ethics/commonsense.yaml | 2 +- lm_eval/tasks/hendrycks_ethics/justice.yaml | 2 +- .../hendrycks_ethics/utilitarianism.yaml | 2 +- lm_eval/tasks/hendrycks_ethics/virtue.yaml | 2 +- .../tasks/hendrycks_math/hendrycks_math.yaml | 6 + .../hendrycks_math_algebra.yaml | 2 +- .../inverse_scaling/_inverse_scaling_mc_yaml | 2 +- lm_eval/tasks/inverse_scaling/_some_results | 39 ++ lm_eval/tasks/kmmlu/cot_hard/_cot_kmmlu_yaml | 2 +- lm_eval/tasks/kmmlu/direct/_direct_kmmlu_yaml | 2 +- .../kmmlu/direct_hard/_direct_hard_kmmlu_yaml | 2 +- lm_eval/tasks/kmmlu/hard/_hard_kmmlu_yaml | 2 +- lm_eval/tasks/kobest/kobest_boolq.yaml | 2 +- lm_eval/tasks/kobest/kobest_copa.yaml | 2 +- lm_eval/tasks/kobest/kobest_hellaswag.yaml | 2 +- lm_eval/tasks/kobest/kobest_sentineg.yaml | 2 +- lm_eval/tasks/kobest/kobest_wic.yaml | 2 +- lm_eval/tasks/kormedmcqa/_kormedmcqa.yaml | 11 + .../tasks/kormedmcqa/kormedmcqa_doctor.yaml | 1 - .../tasks/kormedmcqa/kormedmcqa_nurse.yaml | 1 - .../tasks/kormedmcqa/kormedmcqa_pharm.yaml | 1 - lm_eval/tasks/lambada/lambada_openai.yaml | 2 +- lm_eval/tasks/lambada/lambada_standard.yaml | 2 +- .../lambada_cloze/lambada_openai_cloze.yaml | 2 +- .../lambada_cloze/lambada_standard_cloze.yaml | 2 +- .../lambada_multilingual/lambada_mt_en.yaml | 2 +- .../leaderboard/bbh_mc/_fewshot_template_yaml | 1 - .../leaderboard/bbh_mc/_leaderboard_bbh.yaml | 26 ++ .../leaderboard/gpqa/_leaderboard_gpqa.yaml | 5 + lm_eval/tasks/leaderboard/gpqa/_template_yaml | 1 - .../_leaderboard_instruction_following.yaml | 3 + lm_eval/tasks/leaderboard/ifeval/ifeval.yaml | 1 - .../leaderboard/math/_leaderboard_math.yaml | 9 + lm_eval/tasks/leaderboard/math/_template_yaml | 2 - lm_eval/tasks/leaderboard/musr/_musr.yaml | 5 + lm_eval/tasks/leaderboard/musr/_template_yaml | 2 - lm_eval/tasks/mathqa/mathqa.yaml | 2 +- lm_eval/tasks/mmlu/_generate_configs.py | 3 +- lm_eval/tasks/mmlu/continuation/_mmlu.yaml | 34 +- .../continuation/mmlu_abstract_algebra.yaml | 2 +- .../tasks/mmlu/continuation/mmlu_anatomy.yaml | 2 +- .../mmlu/continuation/mmlu_astronomy.yaml | 2 +- .../continuation/mmlu_business_ethics.yaml | 2 +- .../continuation/mmlu_clinical_knowledge.yaml | 2 +- .../continuation/mmlu_college_biology.yaml | 2 +- .../continuation/mmlu_college_chemistry.yaml | 2 +- .../mmlu_college_computer_science.yaml | 2 +- .../mmlu_college_mathematics.yaml | 2 +- .../continuation/mmlu_college_medicine.yaml | 2 +- 
.../continuation/mmlu_college_physics.yaml | 2 +- .../continuation/mmlu_computer_security.yaml | 2 +- .../continuation/mmlu_conceptual_physics.yaml | 2 +- .../mmlu/continuation/mmlu_econometrics.yaml | 2 +- .../mmlu_electrical_engineering.yaml | 2 +- .../mmlu_elementary_mathematics.yaml | 2 +- .../mmlu/continuation/mmlu_formal_logic.yaml | 2 +- .../mmlu/continuation/mmlu_global_facts.yaml | 2 +- .../mmlu_high_school_biology.yaml | 2 +- .../mmlu_high_school_chemistry.yaml | 2 +- .../mmlu_high_school_computer_science.yaml | 2 +- .../mmlu_high_school_european_history.yaml | 2 +- .../mmlu_high_school_geography.yaml | 2 +- ...u_high_school_government_and_politics.yaml | 2 +- .../mmlu_high_school_macroeconomics.yaml | 2 +- .../mmlu_high_school_mathematics.yaml | 2 +- .../mmlu_high_school_microeconomics.yaml | 2 +- .../mmlu_high_school_physics.yaml | 2 +- .../mmlu_high_school_psychology.yaml | 2 +- .../mmlu_high_school_statistics.yaml | 2 +- .../mmlu_high_school_us_history.yaml | 2 +- .../mmlu_high_school_world_history.yaml | 2 +- .../mmlu/continuation/mmlu_human_aging.yaml | 2 +- .../continuation/mmlu_human_sexuality.yaml | 2 +- .../continuation/mmlu_international_law.yaml | 2 +- .../mmlu/continuation/mmlu_jurisprudence.yaml | 2 +- .../continuation/mmlu_logical_fallacies.yaml | 2 +- .../continuation/mmlu_machine_learning.yaml | 2 +- .../mmlu/continuation/mmlu_management.yaml | 2 +- .../mmlu/continuation/mmlu_marketing.yaml | 2 +- .../continuation/mmlu_medical_genetics.yaml | 2 +- .../mmlu/continuation/mmlu_miscellaneous.yaml | 2 +- .../continuation/mmlu_moral_disputes.yaml | 2 +- .../continuation/mmlu_moral_scenarios.yaml | 2 +- .../mmlu/continuation/mmlu_nutrition.yaml | 2 +- .../mmlu/continuation/mmlu_philosophy.yaml | 2 +- .../mmlu/continuation/mmlu_prehistory.yaml | 2 +- .../mmlu_professional_accounting.yaml | 2 +- .../continuation/mmlu_professional_law.yaml | 2 +- .../mmlu_professional_medicine.yaml | 2 +- .../mmlu_professional_psychology.yaml | 2 +- .../continuation/mmlu_public_relations.yaml | 2 +- .../continuation/mmlu_security_studies.yaml | 2 +- .../mmlu/continuation/mmlu_sociology.yaml | 2 +- .../continuation/mmlu_us_foreign_policy.yaml | 2 +- .../mmlu/continuation/mmlu_virology.yaml | 2 +- .../continuation/mmlu_world_religions.yaml | 2 +- lm_eval/tasks/mmlu/default/_mmlu.yaml | 5 + .../tasks/mmlu/default/_mmlu_humanities.yaml | 9 + lm_eval/tasks/mmlu/default/_mmlu_other.yaml | 9 + .../mmlu/default/_mmlu_social_sciences.yaml | 9 + lm_eval/tasks/mmlu/default/_mmlu_stem.yaml | 9 + .../mmlu/default/mmlu_abstract_algebra.yaml | 3 +- lm_eval/tasks/mmlu/default/mmlu_anatomy.yaml | 3 +- .../tasks/mmlu/default/mmlu_astronomy.yaml | 3 +- .../mmlu/default/mmlu_business_ethics.yaml | 3 +- .../mmlu/default/mmlu_clinical_knowledge.yaml | 3 +- .../mmlu/default/mmlu_college_biology.yaml | 3 +- .../mmlu/default/mmlu_college_chemistry.yaml | 3 +- .../mmlu_college_computer_science.yaml | 3 +- .../default/mmlu_college_mathematics.yaml | 3 +- .../mmlu/default/mmlu_college_medicine.yaml | 3 +- .../mmlu/default/mmlu_college_physics.yaml | 3 +- .../mmlu/default/mmlu_computer_security.yaml | 3 +- .../mmlu/default/mmlu_conceptual_physics.yaml | 3 +- .../tasks/mmlu/default/mmlu_econometrics.yaml | 3 +- .../default/mmlu_electrical_engineering.yaml | 3 +- .../default/mmlu_elementary_mathematics.yaml | 3 +- .../tasks/mmlu/default/mmlu_formal_logic.yaml | 3 +- .../tasks/mmlu/default/mmlu_global_facts.yaml | 3 +- .../default/mmlu_high_school_biology.yaml | 3 +- .../default/mmlu_high_school_chemistry.yaml | 3 +- 
.../mmlu_high_school_computer_science.yaml | 3 +- .../mmlu_high_school_european_history.yaml | 3 +- .../default/mmlu_high_school_geography.yaml | 3 +- ...u_high_school_government_and_politics.yaml | 3 +- .../mmlu_high_school_macroeconomics.yaml | 3 +- .../default/mmlu_high_school_mathematics.yaml | 3 +- .../mmlu_high_school_microeconomics.yaml | 3 +- .../default/mmlu_high_school_physics.yaml | 3 +- .../default/mmlu_high_school_psychology.yaml | 3 +- .../default/mmlu_high_school_statistics.yaml | 3 +- .../default/mmlu_high_school_us_history.yaml | 3 +- .../mmlu_high_school_world_history.yaml | 3 +- .../tasks/mmlu/default/mmlu_human_aging.yaml | 3 +- .../mmlu/default/mmlu_human_sexuality.yaml | 3 +- .../mmlu/default/mmlu_international_law.yaml | 3 +- .../mmlu/default/mmlu_jurisprudence.yaml | 3 +- .../mmlu/default/mmlu_logical_fallacies.yaml | 3 +- .../mmlu/default/mmlu_machine_learning.yaml | 3 +- .../tasks/mmlu/default/mmlu_management.yaml | 3 +- .../tasks/mmlu/default/mmlu_marketing.yaml | 3 +- .../mmlu/default/mmlu_medical_genetics.yaml | 3 +- .../mmlu/default/mmlu_miscellaneous.yaml | 3 +- .../mmlu/default/mmlu_moral_disputes.yaml | 3 +- .../mmlu/default/mmlu_moral_scenarios.yaml | 3 +- .../tasks/mmlu/default/mmlu_nutrition.yaml | 3 +- .../tasks/mmlu/default/mmlu_philosophy.yaml | 3 +- .../tasks/mmlu/default/mmlu_prehistory.yaml | 3 +- .../default/mmlu_professional_accounting.yaml | 3 +- .../mmlu/default/mmlu_professional_law.yaml | 3 +- .../default/mmlu_professional_medicine.yaml | 3 +- .../default/mmlu_professional_psychology.yaml | 3 +- .../mmlu/default/mmlu_public_relations.yaml | 3 +- .../mmlu/default/mmlu_security_studies.yaml | 3 +- .../tasks/mmlu/default/mmlu_sociology.yaml | 3 +- .../mmlu/default/mmlu_us_foreign_policy.yaml | 3 +- lm_eval/tasks/mmlu/default/mmlu_virology.yaml | 3 +- .../mmlu/default/mmlu_world_religions.yaml | 3 +- .../tasks/mmlu/flan_cot_fewshot/_mmlu.yaml | 34 +- .../mmlu_abstract_algebra.yaml | 2 +- .../mmlu/flan_cot_fewshot/mmlu_anatomy.yaml | 2 +- .../mmlu/flan_cot_fewshot/mmlu_astronomy.yaml | 2 +- .../mmlu_business_ethics.yaml | 2 +- .../mmlu_clinical_knowledge.yaml | 2 +- .../mmlu_college_biology.yaml | 2 +- .../mmlu_college_chemistry.yaml | 2 +- .../mmlu_college_computer_science.yaml | 2 +- .../mmlu_college_mathematics.yaml | 2 +- .../mmlu_college_medicine.yaml | 2 +- .../mmlu_college_physics.yaml | 2 +- .../mmlu_computer_security.yaml | 2 +- .../mmlu_conceptual_physics.yaml | 2 +- .../flan_cot_fewshot/mmlu_econometrics.yaml | 2 +- .../mmlu_electrical_engineering.yaml | 2 +- .../mmlu_elementary_mathematics.yaml | 2 +- .../flan_cot_fewshot/mmlu_formal_logic.yaml | 2 +- .../flan_cot_fewshot/mmlu_global_facts.yaml | 2 +- .../mmlu_high_school_biology.yaml | 2 +- .../mmlu_high_school_chemistry.yaml | 2 +- .../mmlu_high_school_computer_science.yaml | 2 +- .../mmlu_high_school_european_history.yaml | 2 +- .../mmlu_high_school_geography.yaml | 2 +- ...u_high_school_government_and_politics.yaml | 2 +- .../mmlu_high_school_macroeconomics.yaml | 2 +- .../mmlu_high_school_mathematics.yaml | 2 +- .../mmlu_high_school_microeconomics.yaml | 2 +- .../mmlu_high_school_physics.yaml | 2 +- .../mmlu_high_school_psychology.yaml | 2 +- .../mmlu_high_school_statistics.yaml | 2 +- .../mmlu_high_school_us_history.yaml | 2 +- .../mmlu_high_school_world_history.yaml | 2 +- .../flan_cot_fewshot/mmlu_human_aging.yaml | 2 +- .../mmlu_human_sexuality.yaml | 2 +- .../mmlu_international_law.yaml | 2 +- .../flan_cot_fewshot/mmlu_jurisprudence.yaml | 2 +- .../mmlu_logical_fallacies.yaml 
| 2 +- .../mmlu_machine_learning.yaml | 2 +- .../flan_cot_fewshot/mmlu_management.yaml | 2 +- .../mmlu/flan_cot_fewshot/mmlu_marketing.yaml | 2 +- .../mmlu_medical_genetics.yaml | 2 +- .../flan_cot_fewshot/mmlu_miscellaneous.yaml | 2 +- .../flan_cot_fewshot/mmlu_moral_disputes.yaml | 2 +- .../mmlu_moral_scenarios.yaml | 2 +- .../mmlu/flan_cot_fewshot/mmlu_nutrition.yaml | 2 +- .../flan_cot_fewshot/mmlu_philosophy.yaml | 2 +- .../flan_cot_fewshot/mmlu_prehistory.yaml | 2 +- .../mmlu_professional_accounting.yaml | 2 +- .../mmlu_professional_law.yaml | 2 +- .../mmlu_professional_medicine.yaml | 2 +- .../mmlu_professional_psychology.yaml | 2 +- .../mmlu_public_relations.yaml | 2 +- .../mmlu_security_studies.yaml | 2 +- .../mmlu/flan_cot_fewshot/mmlu_sociology.yaml | 2 +- .../mmlu_us_foreign_policy.yaml | 2 +- .../mmlu/flan_cot_fewshot/mmlu_virology.yaml | 2 +- .../mmlu_world_religions.yaml | 2 +- .../tasks/mmlu/flan_cot_zeroshot/_mmlu.yaml | 34 +- .../mmlu_abstract_algebra.yaml | 2 +- .../mmlu/flan_cot_zeroshot/mmlu_anatomy.yaml | 2 +- .../flan_cot_zeroshot/mmlu_astronomy.yaml | 2 +- .../mmlu_business_ethics.yaml | 2 +- .../mmlu_clinical_knowledge.yaml | 2 +- .../mmlu_college_biology.yaml | 2 +- .../mmlu_college_chemistry.yaml | 2 +- .../mmlu_college_computer_science.yaml | 2 +- .../mmlu_college_mathematics.yaml | 2 +- .../mmlu_college_medicine.yaml | 2 +- .../mmlu_college_physics.yaml | 2 +- .../mmlu_computer_security.yaml | 2 +- .../mmlu_conceptual_physics.yaml | 2 +- .../flan_cot_zeroshot/mmlu_econometrics.yaml | 2 +- .../mmlu_electrical_engineering.yaml | 2 +- .../mmlu_elementary_mathematics.yaml | 2 +- .../flan_cot_zeroshot/mmlu_formal_logic.yaml | 2 +- .../flan_cot_zeroshot/mmlu_global_facts.yaml | 2 +- .../mmlu_high_school_biology.yaml | 2 +- .../mmlu_high_school_chemistry.yaml | 2 +- .../mmlu_high_school_computer_science.yaml | 2 +- .../mmlu_high_school_european_history.yaml | 2 +- .../mmlu_high_school_geography.yaml | 2 +- ...u_high_school_government_and_politics.yaml | 2 +- .../mmlu_high_school_macroeconomics.yaml | 2 +- .../mmlu_high_school_mathematics.yaml | 2 +- .../mmlu_high_school_microeconomics.yaml | 2 +- .../mmlu_high_school_physics.yaml | 2 +- .../mmlu_high_school_psychology.yaml | 2 +- .../mmlu_high_school_statistics.yaml | 2 +- .../mmlu_high_school_us_history.yaml | 2 +- .../mmlu_high_school_world_history.yaml | 2 +- .../flan_cot_zeroshot/mmlu_human_aging.yaml | 2 +- .../mmlu_human_sexuality.yaml | 2 +- .../mmlu_international_law.yaml | 2 +- .../flan_cot_zeroshot/mmlu_jurisprudence.yaml | 2 +- .../mmlu_logical_fallacies.yaml | 2 +- .../mmlu_machine_learning.yaml | 2 +- .../flan_cot_zeroshot/mmlu_management.yaml | 2 +- .../flan_cot_zeroshot/mmlu_marketing.yaml | 2 +- .../mmlu_medical_genetics.yaml | 2 +- .../flan_cot_zeroshot/mmlu_miscellaneous.yaml | 2 +- .../mmlu_moral_disputes.yaml | 2 +- .../mmlu_moral_scenarios.yaml | 2 +- .../flan_cot_zeroshot/mmlu_nutrition.yaml | 2 +- .../flan_cot_zeroshot/mmlu_philosophy.yaml | 2 +- .../flan_cot_zeroshot/mmlu_prehistory.yaml | 2 +- .../mmlu_professional_accounting.yaml | 2 +- .../mmlu_professional_law.yaml | 2 +- .../mmlu_professional_medicine.yaml | 2 +- .../mmlu_professional_psychology.yaml | 2 +- .../mmlu_public_relations.yaml | 2 +- .../mmlu_security_studies.yaml | 2 +- .../flan_cot_zeroshot/mmlu_sociology.yaml | 2 +- .../mmlu_us_foreign_policy.yaml | 2 +- .../mmlu/flan_cot_zeroshot/mmlu_virology.yaml | 2 +- .../mmlu_world_religions.yaml | 2 +- .../mmlu/flan_n_shot/generative/_mmlu.yaml | 34 +- 
.../_mmlu_flan_generative_template_yaml | 1 - .../generative/mmlu_abstract_algebra.yaml | 2 +- .../flan_n_shot/generative/mmlu_anatomy.yaml | 2 +- .../generative/mmlu_astronomy.yaml | 2 +- .../generative/mmlu_business_ethics.yaml | 2 +- .../generative/mmlu_clinical_knowledge.yaml | 2 +- .../generative/mmlu_college_biology.yaml | 2 +- .../generative/mmlu_college_chemistry.yaml | 2 +- .../mmlu_college_computer_science.yaml | 2 +- .../generative/mmlu_college_mathematics.yaml | 2 +- .../generative/mmlu_college_medicine.yaml | 2 +- .../generative/mmlu_college_physics.yaml | 2 +- .../generative/mmlu_computer_security.yaml | 2 +- .../generative/mmlu_conceptual_physics.yaml | 2 +- .../generative/mmlu_econometrics.yaml | 2 +- .../mmlu_electrical_engineering.yaml | 2 +- .../mmlu_elementary_mathematics.yaml | 2 +- .../generative/mmlu_formal_logic.yaml | 2 +- .../generative/mmlu_global_facts.yaml | 2 +- .../generative/mmlu_high_school_biology.yaml | 2 +- .../mmlu_high_school_chemistry.yaml | 2 +- .../mmlu_high_school_computer_science.yaml | 2 +- .../mmlu_high_school_european_history.yaml | 2 +- .../mmlu_high_school_geography.yaml | 2 +- ...u_high_school_government_and_politics.yaml | 2 +- .../mmlu_high_school_macroeconomics.yaml | 2 +- .../mmlu_high_school_mathematics.yaml | 2 +- .../mmlu_high_school_microeconomics.yaml | 2 +- .../generative/mmlu_high_school_physics.yaml | 2 +- .../mmlu_high_school_psychology.yaml | 2 +- .../mmlu_high_school_statistics.yaml | 2 +- .../mmlu_high_school_us_history.yaml | 2 +- .../mmlu_high_school_world_history.yaml | 2 +- .../generative/mmlu_human_aging.yaml | 2 +- .../generative/mmlu_human_sexuality.yaml | 2 +- .../generative/mmlu_international_law.yaml | 2 +- .../generative/mmlu_jurisprudence.yaml | 2 +- .../generative/mmlu_logical_fallacies.yaml | 2 +- .../generative/mmlu_machine_learning.yaml | 2 +- .../generative/mmlu_management.yaml | 2 +- .../generative/mmlu_marketing.yaml | 2 +- .../generative/mmlu_medical_genetics.yaml | 2 +- .../generative/mmlu_miscellaneous.yaml | 2 +- .../generative/mmlu_moral_disputes.yaml | 2 +- .../generative/mmlu_moral_scenarios.yaml | 2 +- .../generative/mmlu_nutrition.yaml | 2 +- .../generative/mmlu_philosophy.yaml | 2 +- .../generative/mmlu_prehistory.yaml | 2 +- .../mmlu_professional_accounting.yaml | 2 +- .../generative/mmlu_professional_law.yaml | 2 +- .../mmlu_professional_medicine.yaml | 2 +- .../mmlu_professional_psychology.yaml | 2 +- .../generative/mmlu_public_relations.yaml | 2 +- .../generative/mmlu_security_studies.yaml | 2 +- .../generative/mmlu_sociology.yaml | 2 +- .../generative/mmlu_us_foreign_policy.yaml | 2 +- .../flan_n_shot/generative/mmlu_virology.yaml | 2 +- .../generative/mmlu_world_religions.yaml | 2 +- .../mmlu/flan_n_shot/loglikelihood/_mmlu.yaml | 34 +- .../loglikelihood/mmlu_abstract_algebra.yaml | 2 +- .../loglikelihood/mmlu_anatomy.yaml | 2 +- .../loglikelihood/mmlu_astronomy.yaml | 2 +- .../loglikelihood/mmlu_business_ethics.yaml | 2 +- .../mmlu_clinical_knowledge.yaml | 2 +- .../loglikelihood/mmlu_college_biology.yaml | 2 +- .../loglikelihood/mmlu_college_chemistry.yaml | 2 +- .../mmlu_college_computer_science.yaml | 2 +- .../mmlu_college_mathematics.yaml | 2 +- .../loglikelihood/mmlu_college_medicine.yaml | 2 +- .../loglikelihood/mmlu_college_physics.yaml | 2 +- .../loglikelihood/mmlu_computer_security.yaml | 2 +- .../mmlu_conceptual_physics.yaml | 2 +- .../loglikelihood/mmlu_econometrics.yaml | 2 +- .../mmlu_electrical_engineering.yaml | 2 +- .../mmlu_elementary_mathematics.yaml | 2 +- 
.../loglikelihood/mmlu_formal_logic.yaml | 2 +- .../loglikelihood/mmlu_global_facts.yaml | 2 +- .../mmlu_high_school_biology.yaml | 2 +- .../mmlu_high_school_chemistry.yaml | 2 +- .../mmlu_high_school_computer_science.yaml | 2 +- .../mmlu_high_school_european_history.yaml | 2 +- .../mmlu_high_school_geography.yaml | 2 +- ...u_high_school_government_and_politics.yaml | 2 +- .../mmlu_high_school_macroeconomics.yaml | 2 +- .../mmlu_high_school_mathematics.yaml | 2 +- .../mmlu_high_school_microeconomics.yaml | 2 +- .../mmlu_high_school_physics.yaml | 2 +- .../mmlu_high_school_psychology.yaml | 2 +- .../mmlu_high_school_statistics.yaml | 2 +- .../mmlu_high_school_us_history.yaml | 2 +- .../mmlu_high_school_world_history.yaml | 2 +- .../loglikelihood/mmlu_human_aging.yaml | 2 +- .../loglikelihood/mmlu_human_sexuality.yaml | 2 +- .../loglikelihood/mmlu_international_law.yaml | 2 +- .../loglikelihood/mmlu_jurisprudence.yaml | 2 +- .../loglikelihood/mmlu_logical_fallacies.yaml | 2 +- .../loglikelihood/mmlu_machine_learning.yaml | 2 +- .../loglikelihood/mmlu_management.yaml | 2 +- .../loglikelihood/mmlu_marketing.yaml | 2 +- .../loglikelihood/mmlu_medical_genetics.yaml | 2 +- .../loglikelihood/mmlu_miscellaneous.yaml | 2 +- .../loglikelihood/mmlu_moral_disputes.yaml | 2 +- .../loglikelihood/mmlu_moral_scenarios.yaml | 2 +- .../loglikelihood/mmlu_nutrition.yaml | 2 +- .../loglikelihood/mmlu_philosophy.yaml | 2 +- .../loglikelihood/mmlu_prehistory.yaml | 2 +- .../mmlu_professional_accounting.yaml | 2 +- .../loglikelihood/mmlu_professional_law.yaml | 2 +- .../mmlu_professional_medicine.yaml | 2 +- .../mmlu_professional_psychology.yaml | 2 +- .../loglikelihood/mmlu_public_relations.yaml | 2 +- .../loglikelihood/mmlu_security_studies.yaml | 2 +- .../loglikelihood/mmlu_sociology.yaml | 2 +- .../loglikelihood/mmlu_us_foreign_policy.yaml | 2 +- .../loglikelihood/mmlu_virology.yaml | 2 +- .../loglikelihood/mmlu_world_religions.yaml | 2 +- lm_eval/tasks/mmlu/generative/_mmlu.yaml | 34 +- .../generative/mmlu_abstract_algebra.yaml | 3 +- .../tasks/mmlu/generative/mmlu_anatomy.yaml | 3 +- .../tasks/mmlu/generative/mmlu_astronomy.yaml | 3 +- .../mmlu/generative/mmlu_business_ethics.yaml | 3 +- .../generative/mmlu_clinical_knowledge.yaml | 3 +- .../mmlu/generative/mmlu_college_biology.yaml | 3 +- .../generative/mmlu_college_chemistry.yaml | 3 +- .../mmlu_college_computer_science.yaml | 3 +- .../generative/mmlu_college_mathematics.yaml | 3 +- .../generative/mmlu_college_medicine.yaml | 3 +- .../mmlu/generative/mmlu_college_physics.yaml | 3 +- .../generative/mmlu_computer_security.yaml | 3 +- .../generative/mmlu_conceptual_physics.yaml | 3 +- .../mmlu/generative/mmlu_econometrics.yaml | 3 +- .../mmlu_electrical_engineering.yaml | 3 +- .../mmlu_elementary_mathematics.yaml | 3 +- .../mmlu/generative/mmlu_formal_logic.yaml | 3 +- .../mmlu/generative/mmlu_global_facts.yaml | 3 +- .../generative/mmlu_high_school_biology.yaml | 3 +- .../mmlu_high_school_chemistry.yaml | 3 +- .../mmlu_high_school_computer_science.yaml | 3 +- .../mmlu_high_school_european_history.yaml | 3 +- .../mmlu_high_school_geography.yaml | 3 +- ...u_high_school_government_and_politics.yaml | 3 +- .../mmlu_high_school_macroeconomics.yaml | 3 +- .../mmlu_high_school_mathematics.yaml | 3 +- .../mmlu_high_school_microeconomics.yaml | 3 +- .../generative/mmlu_high_school_physics.yaml | 3 +- .../mmlu_high_school_psychology.yaml | 3 +- .../mmlu_high_school_statistics.yaml | 3 +- .../mmlu_high_school_us_history.yaml | 3 +- 
.../mmlu_high_school_world_history.yaml | 3 +- .../mmlu/generative/mmlu_human_aging.yaml | 3 +- .../mmlu/generative/mmlu_human_sexuality.yaml | 3 +- .../generative/mmlu_international_law.yaml | 3 +- .../mmlu/generative/mmlu_jurisprudence.yaml | 3 +- .../generative/mmlu_logical_fallacies.yaml | 3 +- .../generative/mmlu_machine_learning.yaml | 3 +- .../mmlu/generative/mmlu_management.yaml | 3 +- .../tasks/mmlu/generative/mmlu_marketing.yaml | 3 +- .../generative/mmlu_medical_genetics.yaml | 3 +- .../mmlu/generative/mmlu_miscellaneous.yaml | 3 +- .../mmlu/generative/mmlu_moral_disputes.yaml | 3 +- .../mmlu/generative/mmlu_moral_scenarios.yaml | 3 +- .../tasks/mmlu/generative/mmlu_nutrition.yaml | 3 +- .../mmlu/generative/mmlu_philosophy.yaml | 3 +- .../mmlu/generative/mmlu_prehistory.yaml | 3 +- .../mmlu_professional_accounting.yaml | 3 +- .../generative/mmlu_professional_law.yaml | 3 +- .../mmlu_professional_medicine.yaml | 3 +- .../mmlu_professional_psychology.yaml | 3 +- .../generative/mmlu_public_relations.yaml | 3 +- .../generative/mmlu_security_studies.yaml | 3 +- .../tasks/mmlu/generative/mmlu_sociology.yaml | 3 +- .../generative/mmlu_us_foreign_policy.yaml | 3 +- .../tasks/mmlu/generative/mmlu_virology.yaml | 3 +- .../mmlu/generative/mmlu_world_religions.yaml | 3 +- .../advanced_ai_risk/_template_yaml | 2 +- .../persona/_template_yaml | 2 +- .../sycophancy/sycophancy_on_nlp_survey.yaml | 2 +- .../sycophancy_on_philpapers2020.yaml | 2 +- ...sycophancy_on_political_typology_quiz.yaml | 2 +- .../{_template_yaml => winogenerated.yaml} | 2 +- .../tasks/okapi/arc_multilingual/_arc_yaml | 2 +- .../hellaswag_multilingual/_hellaswag_yaml | 2 +- .../okapi/mmlu_multilingual/_default_yaml | 2 +- .../_truthfulqa_mc1_yaml | 2 +- lm_eval/tasks/paloma/_paloma_template | 2 +- lm_eval/tasks/paws-x/_pawsx.yaml | 15 + lm_eval/tasks/paws-x/pawsx_template_yaml | 1 - lm_eval/tasks/pile/pile_arxiv.yaml | 2 - lm_eval/tasks/polemo2/polemo2_in.yaml | 2 +- lm_eval/tasks/qa4mre/qa4mre_2011.yaml | 2 +- lm_eval/tasks/qasper/bool.yaml | 2 +- lm_eval/tasks/qasper/freeform.yaml | 2 +- lm_eval/tasks/squad_completion/task.py | 2 +- lm_eval/tasks/storycloze/storycloze_2016.yaml | 2 +- lm_eval/tasks/storycloze/storycloze_2018.yaml | 2 +- lm_eval/tasks/super_glue/README.md | 6 +- lm_eval/tasks/super_glue/boolq/default.yaml | 2 +- lm_eval/tasks/super_glue/boolq/seq2seq.yaml | 2 +- lm_eval/tasks/super_glue/boolq/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/cb/default.yaml | 2 +- lm_eval/tasks/super_glue/cb/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/copa/default.yaml | 2 +- lm_eval/tasks/super_glue/copa/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/multirc/default.yaml | 2 +- .../tasks/super_glue/multirc/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/record/default.yaml | 2 +- .../tasks/super_glue/record/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/rte/default.yaml | 2 +- lm_eval/tasks/super_glue/rte/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/wic/default.yaml | 2 +- lm_eval/tasks/super_glue/wic/t5-prompt.yaml | 2 +- lm_eval/tasks/super_glue/wsc/default.yaml | 2 +- lm_eval/tasks/super_glue/wsc/t5-prompt.yaml | 2 +- lm_eval/tasks/swde/task.py | 2 +- .../tasks/translation/iwslt2017_ar-en.yaml | 3 +- .../tasks/translation/iwslt2017_en-ar.yaml | 3 +- lm_eval/tasks/translation/wmt14_en-fr.yaml | 3 +- lm_eval/tasks/translation/wmt14_fr-en.yaml | 3 +- lm_eval/tasks/translation/wmt16_de-en.yaml | 3 +- lm_eval/tasks/translation/wmt16_en-de.yaml | 3 +- lm_eval/tasks/translation/wmt16_en-ro.yaml | 3 +- 
lm_eval/tasks/translation/wmt16_ro-en.yaml | 3 +- lm_eval/tasks/truthfulqa/truthfulqa_gen.yaml | 2 +- lm_eval/tasks/truthfulqa/truthfulqa_mc1.yaml | 2 +- lm_eval/tasks/unscramble/anagrams1.yaml | 2 +- lm_eval/tasks/unscramble/anagrams2.yaml | 2 +- lm_eval/tasks/unscramble/cycle_letters.yaml | 2 +- .../tasks/unscramble/random_insertion.yaml | 2 +- lm_eval/tasks/unscramble/reversed_words.yaml | 2 +- lm_eval/tasks/webqs/webqs.yaml | 2 +- lm_eval/tasks/wmdp/README.md | 2 +- lm_eval/tasks/wmdp/_default_template_yaml | 1 - lm_eval/tasks/wmdp/_wmdp.yaml | 11 + lm_eval/tasks/wmt2016/README.md | 6 +- lm_eval/tasks/wmt2016/ro_en-t5_prompt.yaml | 2 - lm_eval/tasks/xcopa/_xcopa.yaml | 19 + lm_eval/tasks/xcopa/default_et.yaml | 1 - lm_eval/tasks/xnli/_xnli.yaml | 23 ++ lm_eval/tasks/xnli/xnli_common_yaml | 1 - lm_eval/tasks/xnli_eu/README.md | 4 +- lm_eval/tasks/xnli_eu/xnli_common_yaml | 1 - lm_eval/tasks/xnli_eu/xnli_eu_mt.yaml | 2 +- lm_eval/tasks/xnli_eu/xnli_eu_native.yaml | 2 +- lm_eval/tasks/xstorycloze/_xstorycloze.yaml | 19 + lm_eval/tasks/xstorycloze/default_ar.yaml | 1 - lm_eval/tasks/xwinograd/_xwinograd.yaml | 14 + lm_eval/tasks/xwinograd/xwinograd_common_yaml | 2 - lm_eval/utils.py | 16 +- templates/new_yaml_task/README.md | 6 +- tests/test_evaluator.py | 2 +- ...AI-pythia-14m-dtype-float32-device-cpu.txt | 14 +- ...AI-pythia-14m-dtype-float32-device-cpu.txt | 8 +- ...AI-pythia-14m-dtype-float32-device-cpu.txt | 5 - ...AI-pythia-14m-dtype-float32-device-cpu.txt | 22 ++ ...AI-pythia-14m-dtype-float32-device-cpu.txt | 6 +- tests/testyamls/test-01.yaml | 45 +++ 774 files changed, 3114 insertions(+), 1447 deletions(-) create mode 100644 lm_eval/api/group.py create mode 100644 lm_eval/tasks/aclue/_aclue.yaml create mode 100644 lm_eval/tasks/aexams/_aexams.yaml create mode 100644 lm_eval/tasks/agieval/agieval.yaml create mode 100644 lm_eval/tasks/agieval/agieval_cn.yaml create mode 100644 lm_eval/tasks/agieval/agieval_en.yaml create mode 100644 lm_eval/tasks/agieval/agieval_nous.yaml rename lm_eval/tasks/arabicmmlu/{arabicmmlu.yaml => _arabicmmlu.yaml} (60%) create mode 100644 lm_eval/tasks/arabicmmlu/_arabicmmlu_humanities.yaml create mode 100644 lm_eval/tasks/arabicmmlu/_arabicmmlu_language.yaml create mode 100644 lm_eval/tasks/arabicmmlu/_arabicmmlu_other.yaml create mode 100644 lm_eval/tasks/arabicmmlu/_arabicmmlu_social_science.yaml create mode 100644 lm_eval/tasks/arabicmmlu/_arabicmmlu_stem.yaml rename lm_eval/tasks/arabicmmlu/{_default_template_yaml => _default_arabicmmlu_template_yaml} (92%) create mode 100644 lm_eval/tasks/bbh/cot_fewshot/_bbh.yaml create mode 100644 lm_eval/tasks/bbh/cot_fewshot/_bbh_cot_fewshot.yaml create mode 100644 lm_eval/tasks/bbh/cot_zeroshot/_bbh_cot_zeroshot.yaml create mode 100644 lm_eval/tasks/bbh/fewshot/_bbh_fewshot.yaml create mode 100644 lm_eval/tasks/bbh/zeroshot/_bbh_zeroshot.yaml create mode 100644 lm_eval/tasks/belebele/_belebele.yaml create mode 100644 lm_eval/tasks/blimp/_blimp.yaml create mode 100644 lm_eval/tasks/ceval/_ceval-valid.yaml create mode 100644 lm_eval/tasks/cmmlu/_cmmlu.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_agronomy.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_anatomy.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_ancient_chinese.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_arts.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_astronomy.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_business_ethics.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_chinese_civil_service_exam.yaml create mode 100644 
lm_eval/tasks/cmmlu/cmmlu_chinese_driving_rule.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_chinese_food_culture.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_chinese_foreign_policy.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_chinese_history.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_chinese_literature.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_chinese_teacher_qualification.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_clinical_knowledge.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_college_actuarial_science.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_college_education.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_college_engineering_hydrology.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_college_law.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_college_mathematics.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_college_medical_statistics.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_college_medicine.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_computer_science.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_computer_security.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_conceptual_physics.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_construction_project_management.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_economics.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_education.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_electrical_engineering.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_elementary_chinese.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_elementary_commonsense.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_elementary_information_and_technology.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_elementary_mathematics.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_ethnology.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_food_science.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_genetics.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_global_facts.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_high_school_biology.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_high_school_chemistry.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_high_school_geography.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_high_school_mathematics.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_high_school_physics.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_high_school_politics.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_human_sexuality.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_international_law.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_journalism.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_jurisprudence.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_legal_and_moral_basis.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_logical.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_machine_learning.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_management.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_marketing.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_marxist_theory.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_modern_chinese.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_nutrition.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_philosophy.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_professional_accounting.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_professional_law.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_professional_medicine.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_professional_psychology.yaml 
create mode 100644 lm_eval/tasks/cmmlu/cmmlu_public_relations.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_security_study.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_sociology.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_sports_science.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_traditional_chinese_medicine.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_virology.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_world_history.yaml create mode 100644 lm_eval/tasks/cmmlu/cmmlu_world_religions.yaml create mode 100644 lm_eval/tasks/csatqa/_csatqa.yaml create mode 100644 lm_eval/tasks/haerae/_haerae.yaml create mode 100644 lm_eval/tasks/inverse_scaling/_some_results create mode 100644 lm_eval/tasks/kormedmcqa/_kormedmcqa.yaml create mode 100644 lm_eval/tasks/leaderboard/bbh_mc/_leaderboard_bbh.yaml create mode 100644 lm_eval/tasks/leaderboard/gpqa/_leaderboard_gpqa.yaml create mode 100644 lm_eval/tasks/leaderboard/ifeval/_leaderboard_instruction_following.yaml create mode 100644 lm_eval/tasks/leaderboard/math/_leaderboard_math.yaml create mode 100644 lm_eval/tasks/leaderboard/musr/_musr.yaml create mode 100644 lm_eval/tasks/mmlu/default/_mmlu_humanities.yaml create mode 100644 lm_eval/tasks/mmlu/default/_mmlu_other.yaml create mode 100644 lm_eval/tasks/mmlu/default/_mmlu_social_sciences.yaml create mode 100644 lm_eval/tasks/mmlu/default/_mmlu_stem.yaml rename lm_eval/tasks/model_written_evals/winogenerated/{_template_yaml => winogenerated.yaml} (93%) create mode 100644 lm_eval/tasks/paws-x/_pawsx.yaml create mode 100644 lm_eval/tasks/wmdp/_wmdp.yaml create mode 100644 lm_eval/tasks/xcopa/_xcopa.yaml create mode 100644 lm_eval/tasks/xnli/_xnli.yaml create mode 100644 lm_eval/tasks/xstorycloze/_xstorycloze.yaml create mode 100644 lm_eval/tasks/xwinograd/_xwinograd.yaml delete mode 100644 tests/testdata/mmlu_abstract_algebra-mmlu_global_facts-mmlu_public_relations_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt create mode 100644 tests/testdata/mmlu_stem_10_hf_pretrained-EleutherAI-pythia-14m-dtype-float32-device-cpu.txt create mode 100644 tests/testyamls/test-01.yaml diff --git a/docs/new_task_guide.md b/docs/new_task_guide.md index 2f6d3217..23f8dd81 100644 --- a/docs/new_task_guide.md +++ b/docs/new_task_guide.md @@ -285,7 +285,7 @@ As a heuristic check: For more detail on the task system and advanced features, see [`docs/task_guide.md`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md) . If none of the above sound like they apply to your task, it's time to continue onto checking your task performance! -### Task name + groups (registering a task) +### Task name + tags (registering a task) To test a task conveniently, it helps to *register* the task--that is, to give it a name and make the `lm-eval` library aware it exists! @@ -296,14 +296,14 @@ task: ``` Including a task name is mandatory. -It is often also convenient to label your task with several `groups`, or tags, though this field is optional: +It is often also convenient to label your task with several `tag` values, though this field is optional: ```yaml -group: - - group1 - - group2 +tag: + - tag1 + - tag2 ``` -This will add your task to the `group1` and `group2` groups, enabling people to know how to categorize your task, and if desired run all tasks in one of these groups at once, your task along with them. 
+This will add your task to the `tag1` and `tag2` tags, enabling people to categorize your task and, if desired, to run all tasks with one of these tags at once, your task along with them. If your task is not in the `lm_eval/tasks` folder, you'll need to tell the Eval Harness where to look for YAML files. @@ -319,7 +319,48 @@ Passing `--tasks /path/to/yaml/file` is also accepted. ### Advanced Group Configs -You can make more complete group config while also tailoring parameters for individual tasks. +While `tag` values are helpful when you want to quickly and conveniently run a set of related tasks via `--tasks my_tag_name`, we often wish to implement more complex logic. For example, the MMLU benchmark contains 57 *subtasks* that must all be *averaged* together in order to report a final 'MMLU score'. + +Groupings of tasks might also use particular variants of a task--for example, we might want to default to evaluating a task as 5-shot when called as part of a given grouping, but not have a preference for the number of shots when evaluating it as a standalone task. + +We implement this via **groups**, which are distinct from tags. Groups can be implemented via *group config* YAML files, which are laid out similarly to tasks' YAML configs, with a few differences. + +The most basic form of group can be defined via a YAML config similar to the following: + +```yaml +group: nli_tasks +task: + - cb + - anli_r1 + - rte +metadata: + version: 1.0 +``` + +This will behave almost identically to a `tag` that includes these 3 tasks, but with one key distinction: we'll print the `nli_tasks` group as a row (with no associated metrics) in our table of outputs, and visually show that these 3 tasks appear under its subheader. + + +Now, let's assume we actually want to report an aggregate score for `nli_tasks`. We would instead use a YAML config like the following: + +```yaml +group: nli_tasks +task: + - cb + - anli_r1 + - rte +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: true # defaults to `true`. Set this to `false` to do a "macro" average (taking each subtask's average accuracy, and summing those accuracies and dividing by 3)--by default we do a "micro" average (retain all subtasks' per-document accuracies, and take the mean over all documents' accuracies to get our aggregate mean). +metadata: + version: 1.0 +``` + +Similar to our `metric_list` for listing out the metrics we want to calculate for a given task, we use an `aggregate_metric_list` field to specify which metric name to aggregate across subtasks, what aggregation function to use, and whether we should micro- or macro- average these metrics. See [./task_guide.md](./task_guide.md) for a full list of related sub-keys. + +**[!Tip]: currently, we only support the aggregation of group metrics that use `mean` (either micro- or macro- averaged) over their subtasks. If you require more complex aggregation rules, you may want to perform aggregation offline.** + +Group configs can be fairly complex! We can do various operations, such as defining new subtask(s) inline in our group YAML, overriding an existing task's specific config value, or nesting existing groups within our new group. For example, let's build a config for evaluating MMLU and a few natural language inference tasks. For MMLU, we can write the name of the benchmark as a subtask under `task`. You can configure parameters such as `num_fewshot`.
If the task being configured is a group such as `mmlu` or `super_glue`, the parameter set will be applied to all of the subtasks. @@ -331,33 +372,13 @@ task: - cb - anli_r1 - rte + aggregate_metric_list: + - metric: acc + aggregation: mean + higher_is_better: true - task: mmlu num_fewshot: 2 ``` -It's also important to note how you can basically insert a group config as a task. Here, to make a group of natural language inference tasks, you simply write like how you would normally write a group config but this time place that as part of a task list under the main group being built. - -### Duplicate Tasks in Group Configs - -There might be cases where you might want to evaluate prompts and how models perform over prompt variations. You can list an existing task (In the example below, `anli_r1`) which varying `doc_to_text` implementation. To differentiate from each variation, we can utilize `task_alias`. LM-Eval will recognize that there are multiple variations of the same tasks and differentiate them. -```yaml -group: flan_held_in -group_alias: Flan (Held-In) -task: - # ANLI R1 - - group: anli_r1_flan - group_alias: ANLI R1 - task: - - task: anli_r1 - task_alias: prompt-0 - include: _held_in_template_yaml - doc_to_text: "{{premise}}\n\nChoose your answer ..." - ... - - task: anli_r1 - task_alias: prompt-1 - include: _held_in_template_yaml - doc_to_text: "{{premise}}\n\nBased on ..." - ... -``` ### Configuring python classes @@ -384,19 +405,16 @@ task: ## Beautifying Table Display -To avoid conflict, each task needs to be registered with a unique name. Because of this, slight variations of task are still counted as unique tasks and need to be named uniquely. This could be done by appending an additional naming that may refer to the variation such as in MMLU where the template used to evaluated for flan are differentiated from the default by the prefix `mmlu_flan_*`. Printing the full task names can easily clutter the results table at the end of the evaluation especially when you have a long list of tasks or are using a benchmark that comprises of many tasks. To make it more legible, you can use `task_alias` and `group_alias` to provide an alternative task name and group name that will be printed. For example in `mmlu_abstract_algebra.yaml` we set `group_alias` to `stem` and `task_alias` to `abstract_algebra`. +To avoid conflict, each task needs to be registered with a unique name. Because of this, slight variations of a task are still counted as unique tasks and need to be named uniquely. This can be done by appending a suffix that refers to the variation, as in MMLU, where the FLAN-template variants are differentiated from the default by the `mmlu_flan_*` prefix. Printing the full task names can easily clutter the results table at the end of the evaluation, especially when you have a long list of tasks or are using a benchmark that comprises many tasks. To make the table more legible, you can use `task_alias` and `group_alias` to provide alternative task and group names that will be printed. For example, in `mmlu_abstract_algebra.yaml` we set `task_alias` to `abstract_algebra`. In group configs, a `group_alias` for a group can also be set.
``` "dataset_name": "abstract_algebra" "description": "The following are multiple choice questions (with answers) about abstract\ \ algebra.\n\n" -"group": "mmlu_stem" -"group_alias": "stem" "include": "_default_template_yaml" "task": "mmlu_abstract_algebra" "task_alias": "abstract_algebra" ``` -Note: Even though `group` can be a list, for now, `group_alias` can only be a single string. ## Checking validity @@ -416,9 +434,9 @@ a simple eye test. ## Versioning -One key feature in LM Evaluation Harness is the ability to version tasks--that is, mark them with a specific version number that can be bumped whenever a breaking change is made. +One key feature in LM Evaluation Harness is the ability to version tasks and groups--that is, mark them with a specific version number that can be bumped whenever a breaking change is made. -This version info can be provided by adding the following to your new task config file: +This version info can be provided by adding the following to your new task or group config file: ``` metadata: diff --git a/docs/task_guide.md b/docs/task_guide.md index b10ca7d9..34e47c41 100644 --- a/docs/task_guide.md +++ b/docs/task_guide.md @@ -16,7 +16,8 @@ Tasks are configured via the `TaskConfig` object. Below, we describe all fields Task naming + registration: - **task** (`str`, defaults to None) — name of the task. -- **group** (`str`, *optional*) — name of the task group(s) a task belongs to. Enables one to run all tasks with a specified tag or group name at once. +- **task_alias** (`str`, defaults to None) - Alias of the task name that will be printed in the final table results. +- **tag** (`str`, *optional*) — name of the task tags(s) a task belongs to. Enables one to run all tasks with a specified tag name at once. Dataset configuration options: - **dataset_path** (`str`) — The name of the dataset as listed by HF in the datasets Hub. @@ -55,8 +56,6 @@ Other: ## Filters -Explain: What are filters? What is their place in the pipeline? - A key component of the `lm-evaluation-harness` library is the `Filter` object. In a typical evaluation run of the harness, we take the formatted inputs and run them through our LM, with the appropriate output type (greedy or free-form generation, or loglikelihood-based comparative scoring). After getting scores or output text from our LM on each `Instance` or document in the dataset, we then need to feed these responses into a metric or scoring function to return scores to a user. @@ -295,105 +294,24 @@ Generative tasks: Tasks using complex filtering: - GSM8k with CoT (+ with Self-Consistency): (`lm_eval/tasks/gsm8k/gsm8k-cot.yaml` ; `lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml`) - -## Benchmarks +# Group Configuration When evaluating a language model, it's is not unusual to test across a number of tasks that may not be related to one another in order to assess a variety of capabilities. To this end, it may be combursome to have to list the set of tasks or add a new group name to each yaml of each individual task. -To solve this, we can create a benchmark yaml config. This is a config that contains the names of the tasks that should be included in a particular benchmark. The config consists of two main keys `group` which denotes the name of the benchmark and `task` which is where we can list the tasks. The tasks listed in `task` are the task names that have been registered. A good example would be the list of tasks used to evaluate the Pythia Suite. 
- -```yaml -group: pythia -task: - - lambada_openai - - wikitext - - piqa - - sciq - - wsc - - winogrande - - arc - - logiqa - - blimp - - hendrycksTest* -``` - -It is also possible to list an existing task in your benchmark configuration with some adjustments. For example, a few tasks from mmlu is included `multimedqa`. There, the `task_alias` and `group_alias` (See [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/new_task_guide.md#beautifying-table-display) for more details) are modified to suit the benchmark. - -```yaml -group: multimedqa -task: - - pubmedqa - - medmcqa - - medqa_4options - - task: mmlu_anatomy - task_alias: "anatomy (mmlu)" - group_alias: null - - task: mmlu_clinical_knowledge - task_alias: "clinical_knowledge (mmlu)" - group_alias: null - ... -``` +To solve this, we can create a **group** yaml config. This is a config that contains the names of the tasks that should be included in a particular group. The config consists of two main keys: a `group` key which denotes the name of the group (as it would be called from the command line, e.g. `mmlu`) and a `task` key which is where we can list the tasks. The tasks listed in `task` are the task names that have been registered. A good example of a group yaml config can be found at [../lm_eval/tasks/mmlu/default/_mmlu.yaml]. See also the [New Task Guide](./new_task_guide.md) for a more in-depth and tutorial-esque explanation of how to write complex GroupConfigs. -Alternatively, benchmarks can have tasks that are customizable for each task. They can be defined like how a yaml task is usually set. +## Configurations -```yaml -group: t0_eval -task: - # Coreference Resolution - - dataset_path: super_glue - dataset_name: wsc.fixed - use_prompt: promptsource:* - training_split: train - validation_split: validation - metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - # Coreference Resolution - - dataset_path: winogrande - dataset_name: winogrande_xl - use_prompt: promptsource:* - training_split: train - validation_split: validation - metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - ... -``` +Groups are configured via the `GroupConfig` object. Below, we describe all fields usable within the object, and their role in defining a task. -If the benchmark contains the same dataset but with different configurations, use `task` to differentiate between them. For example, T0-Eval evaluates on 3 versions of ANLI but the huggingface dataset collects them in one dataset. - -```YAML -group: t0_eval -task: - ... - - task: anli_r1 - dataset_path: anli - use_prompt: promptsource:* - training_split: train_r1 - validation_split: dev_r1 - metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true - - task: anli_r2 - dataset_path: anli - use_prompt: promptsource:* - training_split: train_r2 - validation_split: dev_r2 - metric_list: - - metric: exact_match - aggregation: mean - higher_is_better: true - ignore_case: true - ignore_punctuation: true -``` +### Parameters -Calling the benchmark is done the same way we would call any task with `--tasks`. Benchmarks can be added in `lm_eval/tasks/benchmarks/` +- **group** (`str`, defaults to `None`) — name of the group. Used to invoke it from the command line. 
+- **group_alias** (`str`, defaults to `None`) - Alternative name for the group that will be printed in the table output. +- **task** (`Union[str, list]`, defaults to `None`) - List of tasks that constitute the group. +- **aggregate_metric_list** (`list`, defaults to `None`) - similar to `metric_list` in TaskConfigs, provide a list of configurations for metrics that should be aggregated across subtasks. Leaving empty will result in no aggregation being performed for this group. Keys for each list entry are: + - `metric: str` - the name of the metric to aggregate over (all subtasks must report a metric holding this name). + - `aggregation: str` - what aggregation function to apply to aggregate these per-subtask metrics. **currently, only `mean` is supported.** + - `weight_by_size: bool = True` - whether to perform micro- averaging (`True`) or macro- averaging (`False`) of subtasks' accuracy scores when reporting the group's metric. MMLU, for example, averages over per-document accuracies (the *micro average*), resulting in the same accuracy as if one simply concatenated all 57 subjects into a single dataset and evaluated accuracy on that dataset. + - `filter_list: Union[str, List[str]] = "none"` - what filter keys one should match on to aggregate results. For example, if trying to aggregate over the `exact_match` metric using `strict-match` filter for `bbh_cot_zeroshot`, then set this to be `filter_list: "strict-match"`. +- **metadata** (`dict`, *optional*) - As with TaskConfigs, a field where extra config metadata can be passed. Set the `num_fewshot` key within this to override the printed n_shot value in a results table for your group, for example. diff --git a/examples/lm-eval-overview.ipynb b/examples/lm-eval-overview.ipynb index 898192ce..6ba5cdf2 100644 --- a/examples/lm-eval-overview.ipynb +++ b/examples/lm-eval-overview.ipynb @@ -377,7 +377,7 @@ "id": "LOUHK7PtQfq4" }, "source": [ - "Often, tasks are part of a larger group used to measure different capabilities. The dynamism of the field today means new dimensions of evaluation can come about which would mix and match new and older tasks alike. In LM-Eval, We can also group tasks and call that the group name to evaluate on a set of tasks easily. In this instance, let's evaluate the group `yes_or_no_tasks` which comprise of the tasks `demo_boolq` and `demo_cola`; tasks which are multiple choice tasks with options `yes` and `no` as the name suggests.\n", + "Often, tasks are part of a larger group used to measure different capabilities. The dynamism of the field today means new dimensions of evaluation can come about which would mix and match new and older tasks alike. In LM-Eval, we can also group tasks under a tag and call that tag name to evaluate on a set of tasks easily. In this instance, let's evaluate the tag `yes_or_no_tasks`, which comprises the tasks `demo_boolq` and `demo_cola`; as the name suggests, these are multiple choice tasks with options `yes` and `no`.\n", "\n", "
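To tie the `GroupConfig` fields documented in this patch together, here is a minimal sketch of a group config that exercises them. The group name `my_nli_suite`, its alias, and the choice of subtasks are invented for illustration; only the keys themselves follow the parameter list above.

```yaml
# Hypothetical group config sketch (illustrative names, not shipped with lm-eval).
group: my_nli_suite            # invoked from the command line via `--tasks my_nli_suite`
group_alias: NLI Suite         # printed in the results table instead of the raw group name
task:
  - cb
  - anli_r1
  - rte
aggregate_metric_list:
  - metric: acc
    aggregation: mean          # currently only `mean` is supported
    weight_by_size: false      # macro-average: mean of the per-subtask accuracies
    filter_list: "none"        # aggregate results produced by the default filter
metadata:
  version: 1.0
```

If this sketch matches the behavior described above, running `lm_eval --tasks my_nli_suite` should print an `NLI Suite` row whose `acc` is the macro-average over the three subtasks, with each subtask listed under its subheader.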