Fix task name to template creation

9f388461 · jon-tow · 9484eecc · 9f388461
Commit 9f388461 authored Apr 25, 2022 by jon-tow
Hide whitespace changes
Inline Side-by-side

Showing with 42 additions and 24 deletions

lm_eval/tasks/__init__.py lm_eval/tasks/__init__.py +42 -24

No files found.
--- a/lm_eval/tasks/__init__.py
+++ b/lm_eval/tasks/__init__.py
 from promptsource.templates import DatasetTemplates
 from pprint import pprint
 from typing import List, Union
@@ -60,8 +59,8 @@ from . import storycloze
 # 6 total
 gpt3_translation_benchmarks = {
-    "wmt14": ["en-fr", "fr-en"],  # French
+    "wmt14": ['en-fr', 'fr-en'],  # French
-    "wmt16": ["en-ro", "ro-en", "de-en", "en-de"],  # German, Romanian
+    "wmt16": ['en-ro', 'ro-en', 'de-en', 'en-de'],  # German, Romanian
 }
@@ -69,7 +68,7 @@ gpt3_translation_benchmarks = {
 selected_translation_benchmarks = {
    **gpt3_translation_benchmarks,
    "wmt20": sacrebleu.get_langpairs_for_testset("wmt20"),
-    "iwslt17": ["en-ar", "ar-en"],  # Arabic
+    "iwslt17": ['en-ar', 'ar-en']  # Arabic
 }
 # 319 total
@@ -93,7 +92,7 @@ TASK_REGISTRY = {
    "rte": glue.RTE,
    "qnli": glue.QNLI,
    "qqp": glue.QQP,
-    # "stsb": glue.STSB, # not implemented yet
+    #"stsb": glue.STSB, # not implemented yet
    "sst": glue.SST,
    "wnli": glue.WNLI,
    # SuperGLUE
@@ -104,26 +103,34 @@ TASK_REGISTRY = {
    "record": superglue.ReCoRD,
    "wic": superglue.WordsInContext,
    "wsc": superglue.SGWinogradSchemaChallenge,
    # Order by benchmark/genre?
    "coqa": coqa.CoQA,
    "drop": drop.DROP,
    "lambada": lambada.LAMBADA,
    "lambada_cloze": lambada_cloze.LAMBADA_cloze,
    # multilingual lambada
    **lambada_multilingual.construct_tasks(),
    "wikitext": wikitext.WikiText,
    # "cbt-cn": cbt.CBTCN, # disabled pending context length fix
    # "cbt-ne": cbt.CBTNE, # disabled pending context length fix
    "piqa": piqa.PiQA,
    "prost": prost.PROST,
    "mc_taco": mc_taco.MCTACO,
    # Science related
-    "pubmedqa": pubmedqa.Pubmed_QA,
+    "pubmedqa" : pubmedqa.Pubmed_QA,
-    "sciq": sciq.SciQ,
+    "sciq" : sciq.SciQ,
    "qasper": qasper.QASPER,
-    "qa4mre_2011": qa4mre.QA4MRE_2011,
-    "qa4mre_2012": qa4mre.QA4MRE_2012,
+    "qa4mre_2011" : qa4mre.QA4MRE_2011,
-    "qa4mre_2013": qa4mre.QA4MRE_2013,
+    "qa4mre_2012" : qa4mre.QA4MRE_2012,
+    "qa4mre_2013" : qa4mre.QA4MRE_2013,
    "triviaqa": triviaqa.TriviaQA,
    "arc_easy": arc.ARCEasy,
    "arc_challenge": arc.ARCChallenge,
@@ -134,7 +141,7 @@ TASK_REGISTRY = {
    "squad2": squad.SQuAD2,
    "race": race.RACE,
    # "naturalqs": naturalqs.NaturalQs, # not implemented yet
-    "headqa": headqa.HeadQAEsDeprecated,  # for backwards compat - headqa used to default to es
+    "headqa": headqa.HeadQAEsDeprecated, # for backwards compat - headqa used to default to es
    "headqa_es": headqa.HeadQAEs,
    "headqa_en": headqa.HeadQAEn,
    "mathqa": mathqa.MathQA,
@@ -144,17 +151,21 @@ TASK_REGISTRY = {
    "anli_r1": anli.ANLIRound1,
    "anli_r2": anli.ANLIRound2,
    "anli_r3": anli.ANLIRound3,
    "ethics_cm": hendrycks_ethics.EthicsCM,
    "ethics_deontology": hendrycks_ethics.EthicsDeontology,
    "ethics_justice": hendrycks_ethics.EthicsJustice,
    "ethics_utilitarianism_original": hendrycks_ethics.EthicsUtilitarianismOriginal,
    "ethics_utilitarianism": hendrycks_ethics.EthicsUtilitarianism,
    "ethics_virtue": hendrycks_ethics.EthicsVirtue,
-    "truthfulqa_mc": truthfulqa.TruthfulQAMultipleChoice,
-    "truthfulqa_gen": truthfulqa.TruthfulQAGeneration,
+     "truthfulqa_mc": truthfulqa.TruthfulQAMultipleChoice,
+     "truthfulqa_gen": truthfulqa.TruthfulQAGeneration,
    # dialogue
    "mutual": mutual.MuTual,
    "mutual_plus": mutual.MuTualPlus,
    # math
    "math_algebra": hendrycks_math.MathAlgebra,
    "math_counting_and_prob": hendrycks_math.MathCountingAndProbability,
@@ -165,6 +176,7 @@ TASK_REGISTRY = {
    "math_precalc": hendrycks_math.MathPrecalculus,
    "math_asdiv": asdiv.Asdiv,
    "gsm8k": gsm8k.GradeSchoolMath8K,
    # arithmetic
    "arithmetic_2da": arithmetic.Arithmetic2DPlus,
    "arithmetic_2ds": arithmetic.Arithmetic2DMinus,
@@ -178,18 +190,22 @@ TASK_REGISTRY = {
    "arithmetic_1dc": arithmetic.Arithmetic1DComposite,
    # TODO Perhaps make these groups of tasks
    #   e.g. anli, arithmetic, openai_translations, harness_translations
    # hendrycksTest (57 tasks)
    **hendrycks_test.create_all_tasks(),
    # e.g. wmt14-fr-en
    **translation.create_tasks_from_benchmarks(gpt3_translation_benchmarks),
    # chef's selection, mostly wmt20
    **translation.create_tasks_from_benchmarks(selected_translation_benchmarks),
    # Word Scrambling and Manipulation Tasks
    "anagrams1": unscramble.Anagrams1,
    "anagrams2": unscramble.Anagrams2,
    "cycle_letters": unscramble.CycleLetters,
    "random_insertion": unscramble.RandomInsertion,
    "reversed_words": unscramble.ReversedWords,
    # Pile
    "pile_arxiv": pile.PileArxiv,
    "pile_books3": pile.PileBooks3,
@@ -213,6 +229,7 @@ TASK_REGISTRY = {
    "pile_ubuntu-irc": pile.PileUbuntuIrc,
    "pile_wikipedia": pile.PileWikipedia,
    "pile_youtubesubtitles": pile.PileYoutubeSubtitles,
    # BLiMP
    "blimp_adjunct_island": blimp.BlimpAdjunctIsland,
    "blimp_anaphor_gender_agreement": blimp.BlimpAnaphorGenderAgreement,
@@ -281,6 +298,7 @@ TASK_REGISTRY = {
    "blimp_wh_vs_that_no_gap_long_distance": blimp.BlimpWhVsThatNoGapLongDistance,
    "blimp_wh_vs_that_with_gap": blimp.BlimpWhVsThatWithGap,
    "blimp_wh_vs_that_with_gap_long_distance": blimp.BlimpWhVsThatWithGapLongDistance,
    # Requires manual download of data.
    # "storycloze_2016": storycloze.StoryCloze2016,
    # "storycloze_2018": storycloze.StoryCloze2018,
@@ -304,25 +322,19 @@ def get_task_name_from_object(task_object):
    for name, class_ in TASK_REGISTRY.items():
        if class_ is task_object:
            return name
    # this gives a mechanism for non-registered tasks to have a custom name anyways when reporting
-    return (
+    return task_object.EVAL_HARNESS_NAME if hasattr(task_object, "EVAL_HARNESS_NAME") else type(task_object).__name__
-        task_object.EVAL_HARNESS_NAME
-        if hasattr(task_object, "EVAL_HARNESS_NAME")
-        else type(task_object).__name__
-    )
 def get_task_dict(task_name_list: List[Union[str, lm_eval.base.Task]]):
    task_name_dict = {
        task_name: get_task(task_name)()
-        for task_name in task_name_list
+        for task_name in task_name_list if isinstance(task_name, str)
-        if isinstance(task_name, str)
    }
    task_name_from_object_dict = {
        get_task_name_from_object(task_object): task_object
-        for task_object in task_name_list
+        for task_object in task_name_list if not isinstance(task_object, str)
-        if not isinstance(task_object, str)
    }
    assert set(task_name_dict.keys()).isdisjoint(set(task_name_from_object_dict.keys()))
    return {**task_name_dict, **task_name_from_object_dict}
@@ -334,8 +346,14 @@ def get_task_dict_promptsource(task_name_list: List[str]):
    for task_name in task_name_list:
        assert isinstance(task_name, str)
-        task_prompts = DatasetTemplates(task_name)
+        # Static version of the Task. Use this to get the HF dataset path / name.
+        static_task_obj = get_task(task_name)
+        # Create the proper task name arg for DatasetTemplates.
+        sub_task = f"/{static_task_obj.DATASET_NAME}" if static_task_obj.DATASET_NAME else ""
+        ps_task_name = f"{static_task_obj.DATASET_PATH}{sub_task}"
+        task_prompts = DatasetTemplates(ps_task_name)
        for prompt_name in task_prompts.all_template_names:
            prompt = task_prompts[prompt_name]
            # NOTE: We choose a sep that can be easily split.