**EgyHellaSwag** is a benchmark for evaluating commonsense reasoning in Egyptian Arabic. It is a translated version of the HellaSwag benchmark, consisting of multiple-choice sentence completion tasks. The dataset was translated using the `google/gemma-3-27b-it` model.
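A quick way to sanity-check the task is through the `lm-evaluation-harness` Python API. The sketch below is a minimal example, not a reference setup: the model identifier is a placeholder, and only the task name `egyhellaswag` comes from this README.

```python
# Minimal sketch: score a Hugging Face causal LM on EgyHellaSwag via the
# lm-evaluation-harness Python API. The pretrained model below is a
# placeholder; swap in whichever model you want to evaluate.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=Qwen/Qwen2.5-0.5B-Instruct",  # placeholder model
    tasks=["egyhellaswag"],
    num_fewshot=0,
)

# HellaSwag-style tasks report accuracy over the candidate completions.
print(results["results"]["egyhellaswag"])
```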
### Citation
```
@misc{mekki2025nilechat,
      title={NileChat: Towards Linguistically Diverse and Culturally Aware LLMs for Local Communities},
      author={Abdellah El Mekki and Houdaifa Atou and Omer Nacar and Shady Shehata and Muhammad Abdul-Mageed},
      year={2025},
      eprint={2505.18383},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2505.18383},
}
```
### Groups and Tasks
#### Groups
* Not part of a group.
#### Tags
* `egyhellaswag`
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
**EgyMMLU** is a benchmark designed to evaluate the performance of large language models in Egyptian Arabic. It contains 22,027 multiple-choice questions covering 44 subjects, translated from parts of the Massive Multitask Language Understanding (MMLU) and ArabicMMLU benchmarks. The dataset was translated using `google/gemma-3-27b-it`.
### Groups and Tasks
#### Tags
* `egymmlu_language`: evaluates EgyMMLU language tasks.
* `egymmlu_other`: evaluates the remaining EgyMMLU tasks.
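As with EgyHellaSwag above, these tags can be passed straight to the harness. Another minimal sketch follows, again with a placeholder model; the `limit` argument is only there to keep the run to a quick smoke test.

```python
# Minimal sketch: run both EgyMMLU tags in one call. The pretrained model
# is a placeholder, and `limit` caps the examples per task so the run
# finishes quickly; drop it for a full evaluation.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=Qwen/Qwen2.5-0.5B-Instruct",  # placeholder model
    tasks=["egymmlu_language", "egymmlu_other"],
    limit=20,  # smoke test only
)

# Print the accuracy metrics for each EgyMMLU subtask that ran.
for task_name, metrics in results["results"].items():
    print(task_name, metrics)
```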
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?