Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
gaoqiong
lm-evaluation-harness
Commits
d5071b70
Commit
d5071b70
authored
Jan 19, 2024
by
lintangsutawika
Browse files
task for testing recursive
parent
4f69410c
Changes
16
Hide whitespace changes
Inline
Side-by-side
Showing
16 changed files
with
45 additions
and
274 deletions
+45
-274
lm_eval/tasks/__init__.py
lm_eval/tasks/__init__.py
+33
-12
lm_eval/tasks/benchmarks/flan/flan_anli.yaml
lm_eval/tasks/benchmarks/flan/flan_anli.yaml
+0
-17
lm_eval/tasks/benchmarks/flan/flan_arc.yaml
lm_eval/tasks/benchmarks/flan/flan_arc.yaml
+0
-14
lm_eval/tasks/benchmarks/flan/flan_boolq.yaml
lm_eval/tasks/benchmarks/flan/flan_boolq.yaml
+0
-7
lm_eval/tasks/benchmarks/flan/flan_cot.yaml
lm_eval/tasks/benchmarks/flan/flan_cot.yaml
+0
-11
lm_eval/tasks/benchmarks/flan/flan_held_in.yaml
lm_eval/tasks/benchmarks/flan/flan_held_in.yaml
+0
-6
lm_eval/tasks/benchmarks/flan/flan_held_in_yaml
lm_eval/tasks/benchmarks/flan/flan_held_in_yaml
+0
-39
lm_eval/tasks/benchmarks/flan/flan_held_out.yaml
lm_eval/tasks/benchmarks/flan/flan_held_out.yaml
+0
-13
lm_eval/tasks/benchmarks/flan/flan_rte.yaml
lm_eval/tasks/benchmarks/flan/flan_rte.yaml
+0
-7
lm_eval/tasks/benchmarks/flan/prompt_templates/anli.yaml
lm_eval/tasks/benchmarks/flan/prompt_templates/anli.yaml
+0
-29
lm_eval/tasks/benchmarks/flan/prompt_templates/arc.yaml
lm_eval/tasks/benchmarks/flan/prompt_templates/arc.yaml
+0
-23
lm_eval/tasks/benchmarks/flan/prompt_templates/boolq.yaml
lm_eval/tasks/benchmarks/flan/prompt_templates/boolq.yaml
+0
-33
lm_eval/tasks/benchmarks/flan/prompt_templates/rte.yaml
lm_eval/tasks/benchmarks/flan/prompt_templates/rte.yaml
+0
-29
lm_eval/tasks/benchmarks/flan/yaml_templates/cot_template_yaml
...al/tasks/benchmarks/flan/yaml_templates/cot_template_yaml
+0
-21
lm_eval/tasks/benchmarks/flan/yaml_templates/held_in_template_yaml
...asks/benchmarks/flan/yaml_templates/held_in_template_yaml
+0
-13
lm_eval/tasks/benchmarks/test.yaml
lm_eval/tasks/benchmarks/test.yaml
+12
-0
No files found.
lm_eval/tasks/__init__.py
View file @
d5071b70
...
...
@@ -56,17 +56,27 @@ def register_configurable_task(config: Dict[str, str]) -> int:
def
register_configurable_group
(
config
:
Dict
[
str
,
str
],
yaml_path
:
str
=
None
)
->
int
:
group
=
config
[
"group"
]
all_task_list
=
config
[
"task"
]
config_list
=
[
task
for
task
in
all_task_list
if
type
(
task
)
!=
str
]
task_list
=
[
task
for
task
in
all_task_list
if
type
(
task
)
==
str
]
for
task_config
in
config_list
:
if
group
not
in
[
"grouptest"
,
"arc_stuff"
]:
return
0
task_config_list
=
[]
group_config_list
=
[]
registered_task_or_group_list
=
[]
for
task
in
config
[
"task"
]:
if
isinstance
(
task
,
str
):
registered_task_or_group_list
.
append
(
task
)
elif
list
(
task
.
keys
())
==
[
"group"
,
"task"
]:
group_config_list
.
append
(
task
)
else
:
task_config_list
.
append
(
task
)
for
task_config
in
task_config_list
:
base_config
=
{}
task_name_config
=
{}
if
"task"
in
task_config
:
task_name
=
task_config
[
"task"
]
if
task_name
in
ALL_TASKS
:
if
task_name
in
TASK_REGISTRY
:
task_obj
=
get_task_dict
(
task_name
)[
task_name
]
if
type
(
task_obj
)
==
tuple
:
_
,
task_obj
=
task_obj
...
...
@@ -74,6 +84,8 @@ def register_configurable_group(config: Dict[str, str], yaml_path: str = None) -
if
task_obj
is
not
None
:
base_config
=
task_obj
.
_config
.
to_dict
(
keep_callable
=
True
)
task_name_config
[
"task"
]
=
f
"
{
group
}
_
{
task_name
}
"
# elif task_name in GROUP_REGISTRY:
task_config
=
utils
.
load_yaml_config
(
yaml_path
,
task_config
)
var_configs
=
check_prompt_config
(
...
...
@@ -88,7 +100,16 @@ def register_configurable_group(config: Dict[str, str], yaml_path: str = None) -
for
config
in
var_configs
:
register_configurable_task
(
config
)
task_names
=
utils
.
pattern_match
(
task_list
,
ALL_TASKS
)
for
group_config
in
group_config_list
:
sub_group
=
group_config
[
"group"
]
register_configurable_group
(
group_config
,
yaml_path
)
if
group
in
GROUP_REGISTRY
:
GROUP_REGISTRY
[
group
].
append
(
sub_group
)
else
:
GROUP_REGISTRY
[
group
]
=
[
sub_group
]
ALL_TASKS
.
add
(
group
)
task_names
=
utils
.
pattern_match
(
registered_task_or_group_list
,
ALL_TASKS
)
for
task
in
task_names
:
if
(
task
in
TASK_REGISTRY
)
or
(
task
in
GROUP_REGISTRY
):
if
group
in
GROUP_REGISTRY
:
...
...
@@ -143,7 +164,7 @@ def get_task_name_from_config(task_config: Dict[str, str]) -> str:
return
"{dataset_path}"
.
format
(
**
task_config
)
def
include_task_folder
(
task_dir
:
str
,
register_task
:
bool
=
True
)
->
None
:
def
include_task_folder
(
task_dir
:
str
,
register_task
:
bool
=
True
,
task_name
:
str
=
None
)
->
None
:
"""
Calling this function
"""
...
...
@@ -198,18 +219,18 @@ def include_task_folder(task_dir: str, register_task: bool = True) -> None:
return
0
def
include_path
(
task_dir
):
include_task_folder
(
task_dir
)
def
include_path
(
task_dir
,
task_name
=
None
):
include_task_folder
(
task_dir
,
task_name
=
task_name
)
# Register Benchmarks after all tasks have been added
include_task_folder
(
task_dir
,
register_task
=
False
)
include_task_folder
(
task_dir
,
register_task
=
False
,
task_name
=
task_name
)
return
0
def
initialize_tasks
(
verbosity
=
"INFO"
):
def
initialize_tasks
(
verbosity
=
"INFO"
,
task_name
=
None
):
eval_logger
.
setLevel
(
getattr
(
logging
,
f
"
{
verbosity
}
"
))
task_dir
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
+
"/"
include_path
(
task_dir
)
include_path
(
task_dir
,
task_name
=
task_name
)
def
get_task
(
task_name
,
config
):
...
...
lm_eval/tasks/benchmarks/flan/flan_anli.yaml
deleted
100644 → 0
View file @
4f69410c
# FLAN held-in group for ANLI: one sub-task per round (r1-r3), each built
# from the shared held-in template and all ANLI prompt templates
# (`prompt_templates/anli.yaml:*` selects every template in that file).
group: flan_anli
task:
  - include: yaml_templates/held_in_template_yaml
    task: anli_r1
    dataset_path: anli
    use_prompt: prompt_templates/anli.yaml:*
    validation_split: dev_r1
  - include: yaml_templates/held_in_template_yaml
    task: anli_r2
    dataset_path: anli
    use_prompt: prompt_templates/anli.yaml:*
    validation_split: dev_r2
  - include: yaml_templates/held_in_template_yaml
    task: anli_r3
    dataset_path: anli
    use_prompt: prompt_templates/anli.yaml:*
    validation_split: dev_r3
lm_eval/tasks/benchmarks/flan/flan_arc.yaml
deleted
100644 → 0
View file @
4f69410c
# FLAN held-in group for AI2 ARC: easy and challenge splits, each built
# from the shared held-in template and all ARC prompt templates.
group: flan_arc
task:
  - include: yaml_templates/held_in_template_yaml
    task: arc_easy
    dataset_path: ai2_arc
    dataset_name: ARC-Easy
    use_prompt: prompt_templates/arc.yaml:*
    validation_split: validation
  - include: yaml_templates/held_in_template_yaml
    task: arc_challenge
    dataset_path: ai2_arc
    dataset_name: ARC-Challenge
    use_prompt: prompt_templates/arc.yaml:*
    validation_split: validation
lm_eval/tasks/benchmarks/flan/flan_boolq.yaml
deleted
100644 → 0
View file @
4f69410c
# FLAN held-in group for BoolQ (SuperGLUE), using every BoolQ prompt
# template; no explicit `task:` name, so the name is derived from config.
group: flan_boolq
task:
  - include: yaml_templates/held_in_template_yaml
    dataset_path: super_glue
    dataset_name: boolq
    use_prompt: prompt_templates/boolq.yaml:*
    validation_split: validation
lm_eval/tasks/benchmarks/flan/flan_cot.yaml
deleted
100644 → 0
View file @
4f69410c
# FLAN chain-of-thought group: GSM8K and ASDiv rendered through the shared
# CoT template with promptsource prompts (`promptsource:*` = all prompts).
group: flan_cot
task:
  - include: yaml_templates/cot_template_yaml
    # Fixed: "gsmk" is not a Hugging Face dataset path; the grade-school
    # math dataset is published as "gsm8k".
    dataset_path: gsm8k
    # Fixed: "boolq" is not a gsm8k config (valid configs: main, socratic);
    # it appears to be a copy-paste leftover from the BoolQ task entry.
    dataset_name: main
    use_prompt: promptsource:*
    validation_split: validation
  - include: yaml_templates/cot_template_yaml
    dataset_path: EleutherAI/asdiv
    use_prompt: promptsource:*
    validation_split: validation
lm_eval/tasks/benchmarks/flan/flan_held_in.yaml
deleted
100644 → 0
View file @
4f69410c
# Aggregate FLAN held-in benchmark: composes the four per-dataset groups
# by name (each is registered by its own YAML file in this directory).
group: flan_held_in
task:
  - flan_boolq
  - flan_rte
  - flan_anli
  - flan_arc
lm_eval/tasks/benchmarks/flan/flan_held_in_yaml
deleted
100644 → 0
View file @
4f69410c
# Flattened FLAN held-in benchmark: all sub-tasks (BoolQ, RTE, ANLI r1-r3,
# ARC easy/challenge) declared inline in a single group instead of via the
# per-dataset group files. Paths are relative to the tasks root, hence the
# leading `flan/` prefix on include/use_prompt entries.
group: flan_held_in
task:
  # BoolQ (SuperGLUE); `:*` selects every template in the prompt file.
  - include: flan/yaml_templates/held_in_template_yaml
    dataset_path: super_glue
    dataset_name: boolq
    use_prompt: flan/prompt_templates/boolq.yaml:*
    validation_split: validation
  # RTE (SuperGLUE)
  - include: flan/yaml_templates/held_in_template_yaml
    dataset_path: super_glue
    dataset_name: rte
    use_prompt: flan/prompt_templates/rte.yaml:*
    validation_split: validation
  # ANLI rounds 1-3; each round has its own dev split.
  - include: flan/yaml_templates/held_in_template_yaml
    task: anli_r1
    dataset_path: anli
    use_prompt: flan/prompt_templates/anli.yaml:*
    validation_split: dev_r1
  - include: flan/yaml_templates/held_in_template_yaml
    task: anli_r2
    dataset_path: anli
    use_prompt: flan/prompt_templates/anli.yaml:*
    validation_split: dev_r2
  - include: flan/yaml_templates/held_in_template_yaml
    task: anli_r3
    dataset_path: anli
    use_prompt: flan/prompt_templates/anli.yaml:*
    validation_split: dev_r3
  # AI2 ARC, easy and challenge configurations.
  - include: flan/yaml_templates/held_in_template_yaml
    task: arc_easy
    dataset_path: ai2_arc
    dataset_name: ARC-Easy
    use_prompt: flan/prompt_templates/arc.yaml:*
    validation_split: validation
  - include: flan/yaml_templates/held_in_template_yaml
    task: arc_challenge
    dataset_path: ai2_arc
    dataset_name: ARC-Challenge
    use_prompt: flan/prompt_templates/arc.yaml:*
    validation_split: validation
lm_eval/tasks/benchmarks/flan/flan_held_out.yaml
deleted
100644 → 0
View file @
4f69410c
# FLAN held-out benchmark: composes already-registered BBH and MMLU task
# groups by name; no inline task configs.
group: flan_held_out
task:
  # BBH
  - bbh_flan_zeroshot
  - bbh_flan_fewshot
  - bbh_flan_cot_fewshot
  - bbh_flan_cot_zeroshot
  # MMLU
  - mmlu
  - mmlu_flan_n_shot_generative
  - mmlu_flan_n_shot_loglikelihood
  - mmlu_flan_cot_zeroshot
  - mmlu_flan_cot_fewshot
lm_eval/tasks/benchmarks/flan/flan_rte.yaml
deleted
100644 → 0
View file @
4f69410c
# FLAN held-in group for RTE (SuperGLUE), using every RTE prompt template.
group: flan_rte
task:
  - include: yaml_templates/held_in_template_yaml
    dataset_path: super_glue
    dataset_name: rte
    use_prompt: prompt_templates/rte.yaml:*
    validation_split: validation
lm_eval/tasks/benchmarks/flan/prompt_templates/anli.yaml
deleted
100644 → 0
View file @
4f69410c
# Flan Prompt Templates
# Nine FLAN NLI prompt variants for ANLI. Every template maps the integer
# `label` field (0/1/2) to "Yes" / "It's impossible to say" / "No".
prompts:
  "template-0":
    doc_to_text: "{{premise}}\n\nChoose your answer: based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nI think the answer is"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-1":
    doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that this sentence is true?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-2":
    doc_to_text: "{{premise}}\n\nCan we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-3":
    doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-4":
    doc_to_text: "{{premise}}\nCan we infer the following?\n{{hypothesis}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nThe answer is:"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-5":
    doc_to_text: "Read the following paragraph and determine if the hypothesis is true:\n\n{{premise}}\n\nOPTIONS:\n- Yes\n- It's impossible to say\n- No\nHypothesis: {{hypothesis}}\n\n\n"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-6":
    doc_to_text: "Read the text and determine if the sentence is true (see options at the end):\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-7":
    doc_to_text: "Can we draw the following hypothesis from the context (see options)?\n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
  "template-8":
    doc_to_text: "Choose from options: Determine if the sentence is true based on the text below:\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- Yes\n- It's impossible to say\n- No"
    doc_to_target: "{{[\"Yes\", \"It's impossible to say\", \"No\"][label]}}"
lm_eval/tasks/benchmarks/flan/prompt_templates/arc.yaml
deleted
100644 → 0
View file @
4f69410c
# Flan Prompt Templates
# Seven FLAN multiple-choice prompt variants for ARC. The target picks the
# answer text whose label matches `answerKey`.
prompts:
  "template-0":
    doc_to_text: "{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-1":
    doc_to_text: "Question: {{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}\nAnswer:"
    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-2":
    doc_to_text: "Question: {{question}}\n\nWhat is the correct answer to the question from the following choices?\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-3":
    doc_to_text: "Q: {{question}}\nWhat is the correct answer to this question?\nOPTIONS:\n- {{choices.text|join('\n- ')}}...A:"
    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-4":
    doc_to_text: "Choose your answer?\n\n{{question}}\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-5":
    doc_to_text: "Answer the question\n\n{{question}}\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
  "template-6":
    doc_to_text: "{{question}}\n\nPick the answer from these options\n\nOPTIONS:\n- {{choices.text|join('\n- ')}}"
    doc_to_target: "{{choices.text[choices.label.index(answerKey)]}}"
lm_eval/tasks/benchmarks/flan/prompt_templates/boolq.yaml
deleted
100644 → 0
View file @
4f69410c
# Flan Prompt Templates
# Ten FLAN yes/no prompt variants for BoolQ. Every template maps the integer
# `label` (0/1) to "no"/"yes". Wording (including the "supposed by the text"
# phrasing in template-6) is reproduced verbatim from the FLAN templates.
prompts:
  "template-0":
    doc_to_text: "{{passage}}\n\nCan we conclude that {{question}}?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-1":
    doc_to_text: "{{passage}}\n\nIs it true that {{question}}?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-2":
    doc_to_text: "{{passage}}\n\n{{question}}?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-3":
    doc_to_text: "Text: {{passage}}\n\nQuestion: {{question}}?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-4":
    doc_to_text: "{{passage}}\n\nWhat's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-5":
    doc_to_text: "{{passage}}\nBased on the above text what's the best answer to this question: {{question}}?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-6":
    doc_to_text: "{{passage}}\nAnswer this question making sure that the answer is supposed by the text: {{question}}?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-7":
    doc_to_text: "{{passage}}\n\nIs the following statement correct based on the text\n\n{{question}}\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-8":
    # doc_to_text: "{{title}}\n\n{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
    doc_to_text: "{{passage}}\n\nIs this statement correct \"{{question}}\"?\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
  "template-9":
    doc_to_text: "Is it true that {{question}} based on the following text?\n\n{{passage}}\n\nOPTIONS:\n- no\n- yes"
    doc_to_target: "{{['no', 'yes'][label]}}"
lm_eval/tasks/benchmarks/flan/prompt_templates/rte.yaml
deleted
100644 → 0
View file @
4f69410c
# Flan Prompt Templates
# Nine FLAN entailment prompt variants for RTE (SuperGLUE). Every template
# maps the integer `label` (0/1) to "yes"/"no".
prompts:
  "template-0":
    doc_to_text: "{{premise}}\n\nQuestion with options: Based on the paragraph above can we conclude that \"{{hypothesis}}\"?\n\nOPTIONS:\n- yes\n- no"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-1":
    doc_to_text: "{{premise}}\n\nBased on that paragraph can we conclude that the sentence below is true?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-2":
    doc_to_text: "{{premise}}\n\nQ with options: Can we draw the following conclusion?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-3":
    doc_to_text: "{{premise}}\nDoes this next sentence follow, given the preceding text?\n{{hypothesis}}\n\nOPTIONS:\n- yes\n- no"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-4":
    doc_to_text: "{{premise}}\nOPTIONS:\n- yes\n- no\nQuestion: Can we infer the following?\n{{hypothesis}}"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-5":
    doc_to_text: "Read the following paragraph and determine if the hypothesis is true. Select from options at the end:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nThe answer is"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-6":
    doc_to_text: "Read the text and determine if the sentence is true:\n\n{{premise}}\n\nSentence: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nA:"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-7":
    doc_to_text: "Question with options: can we draw the following hypothesis from the context?\n\nContext:\n\n{{premise}}\n\nHypothesis: {{hypothesis}}\nOPTIONS:\n- yes\n- no\nA:"
    doc_to_target: "{{['yes', 'no'][label]}}"
  "template-8":
    doc_to_text: "Determine if the sentence is true based on the text below. Choose from options.\n{{hypothesis}}\n\n{{premise}}\nOPTIONS:\n- yes\n- no"
    doc_to_target: "{{['yes', 'no'][label]}}"
lm_eval/tasks/benchmarks/flan/yaml_templates/cot_template_yaml
deleted
100644 → 0
View file @
4f69410c
# Shared template for FLAN chain-of-thought tasks (included by flan_cot.yaml).
# Generates freeform answers, stops on a blank line, and extracts the final
# numeric answer via regex before exact-match scoring.
group: flan-cot
output_type: generate_until
validation_split: validation
doc_to_target: "{{answer}}"
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
generation_kwargs:
  until:
    - "\n\n"
  do_sample: false
  temperature: 0.0
filter_list:
  - name: "get-answer"
    filter:
      # Pull the number following "The answer is", then keep only the first match.
      - function: "regex"
        regex_pattern: "The answer is (\\-?[0-9\\.\\,]+)"
      - function: "take_first"
metadata:
  version: 1.0
lm_eval/tasks/benchmarks/flan/yaml_templates/held_in_template_yaml
deleted
100644 → 0
View file @
4f69410c
# Shared template for FLAN held-in tasks (included by the flan_* group files).
# Greedy generation until the end-of-sequence marker, scored by exact match.
output_type: generate_until
validation_split: validation
metric_list:
  - metric: exact_match
    aggregation: mean
    higher_is_better: true
generation_kwargs:
  until:
    - "</s>"
  do_sample: false
  temperature: 0.0
metadata:
  version: 1.0
lm_eval/tasks/benchmarks/test.yaml
0 → 100644
View file @
d5071b70
# Test fixture for recursive group registration ("task for testing
# recursive"): a top-level group containing a plain task name, a nested
# sub-group, and an inline task-config override.
group: grouptest
task:
  - boolq
  - group: arc_stuff
    task:
      - arc_challenge
      - task: arc_easy
        metric_list:
          - metric: acc
        num_fewshot: 3
# - task: mmlu_stem
#   num_fewshot: 2
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment