Unverified commit 89ad0186, authored by Lintang Sutawika, committed by GitHub

Merge branch 'big-refactor' into seq2seq-refactor

parents 6122efac 9a8fee14
......@@ -22,7 +22,7 @@ def include_task_folder(task_dir):
Calling this function
"""
for root, subdirs, file_list in os.walk(task_dir):
if len(file_list) > 0:
if (subdirs == [] or subdirs == ["__pycache__"]) and (len(file_list) > 0):
for f in file_list:
if f.endswith(".yaml"):
yaml_path = os.path.join(root, f)
......
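
The tightened condition makes the walk pick up YAML configs only from leaf task folders (directories whose only subdirectory, if any, is `__pycache__`), rather than from every directory that happens to contain files. A minimal sketch of the resulting behavior; the function name here is hypothetical, not the repository's:

```python
import os

def find_task_yamls(task_dir):
    """Collect .yaml configs, but only from leaf directories, i.e. those
    whose only subdirectory (if any) is __pycache__."""
    yaml_paths = []
    for root, subdirs, file_list in os.walk(task_dir):
        is_leaf = subdirs == [] or subdirs == ["__pycache__"]
        if is_leaf and len(file_list) > 0:
            yaml_paths += [
                os.path.join(root, f) for f in file_list if f.endswith(".yaml")
            ]
    return yaml_paths
```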
# HellaSwag
### Paper
Title: `HellaSwag: Can a Machine Really Finish Your Sentence?`
Abstract: ```Recent work by Zellers et al. (2018) introduced a new task of commonsense natural language inference: given an event description such as "A woman sits at a piano," a machine must select the most likely followup: "She sets her fingers on the keys." With the introduction of BERT, near human-level performance was reached. Does this mean that machines can perform human level commonsense inference?
In this paper, we show that commonsense inference still proves difficult for even state-of-the-art models, by presenting HellaSwag, a new challenge dataset. Though its questions are trivial for humans (>95% accuracy), state-of-the-art models struggle (<48%). We achieve this via Adversarial Filtering (AF), a data collection paradigm wherein a series of discriminators iteratively select an adversarial set of machine-generated wrong answers. AF proves to be surprisingly robust. The key insight is to scale up the length and complexity of the dataset examples towards a critical 'Goldilocks' zone wherein generated text is ridiculous to humans, yet often misclassified by state-of-the-art models.
Our construction of HellaSwag, and its resulting difficulty, sheds light on the inner workings of deep pretrained models. More broadly, it suggests a new path forward for NLP research, in which benchmarks co-evolve with the evolving state-of-the-art in an adversarial way, so as to present ever-harder challenges.```
Homepage: `https://rowanzellers.com/hellaswag/`
### Citation
```
@inproceedings{zellers2019hellaswag,
title={HellaSwag: Can a Machine Really Finish Your Sentence?},
author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},
booktitle={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
year={2019}
}
```
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group:
  - multiple_choice
task: hellaswag
dataset_path: hellaswag
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
test_split: null
template_aliases: "{% set gold = label %}{% set answer_choices = endings|map('trim')|map('replace', ' [title]', '. ')|map('regex_replace', '\\[.*?\\]', '')|map('replace', '  ', ' ')|list %}"
doc_to_text: "{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace('  ', ' ')}}"
doc_to_target: "{{answer_choices[gold]}}"
gold_alias: "{{gold}}"
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
  - metric: acc_norm
    aggregation: mean
    higher_is_better: true
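
For a concrete sense of what these templates produce, below is a hedged, self-contained rendering of `doc_to_text` against one made-up HellaSwag-style record. `regex_replace` is not a built-in Jinja2 filter, so a stand-in matching the filter added in the hunk further down is registered by hand; the field values are invented, only the field names mirror the dataset:

```python
import re
from jinja2 import BaseLoader, Environment, StrictUndefined

def regex_replace(string, pattern, repl, count=0):
    """`re.sub` exposed as a Jinja filter, mirroring the diff below."""
    return re.sub(pattern, repl, string, count=count)

env = Environment(loader=BaseLoader, undefined=StrictUndefined)
env.filters["regex_replace"] = regex_replace

# Raw strings so the template body matches the YAML exactly.
doc_to_text = (
    r"{% set text = activity_label ~ ': ' ~ ctx_a ~ ' ' ~ ctx_b.capitalize() %}"
    r"{{text|trim|replace(' [title]', '. ')|regex_replace('\\[.*?\\]', '')|replace('  ', ' ')}}"
)

# Invented record; the bracketed "[step]" tag exercises the regex filter.
doc = {
    "activity_label": "Removing ice from car",
    "ctx_a": "Then, the man writes over the snow covering the window of a car,",
    "ctx_b": "then [step] a man begins to scrape the ice off.",
}
print(env.from_string(doc_to_text).render(**doc))
# Removing ice from car: Then, the man writes over the snow covering the
# window of a car, Then a man begins to scrape the ice off.
```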
......@@ -400,7 +400,13 @@ def load_yaml_config(yaml_path):
return yaml_config
def regex_replace(string, pattern, repl, count=0):
"""Implements the `re.sub` function as a custom Jinja filter."""
return re.sub(pattern, repl, string, count=count)
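# Shared Jinja2 environment used to render prompt templates; StrictUndefined
# makes rendering fail loudly when a template references a missing doc field.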
env = Environment(loader=BaseLoader, undefined=StrictUndefined)
env.filters["regex_replace"] = regex_replace
def apply_template(template, doc):
......
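
The body of `apply_template` is elided by the collapsed diff; as a rough sketch (an assumption, not the repository's exact code), a renderer built on this environment would look like:

```python
def apply_template(template: str, doc: dict) -> str:
    # Compile with the shared env (so regex_replace is available)
    # and render against a single dataset record.
    rtemplate = env.from_string(template)
    return rtemplate.render(**doc)
```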
......@@ -95,8 +95,7 @@ def main():
if results is not None:
samples = results.pop("samples")
dumped = json.dumps(results, indent=2)
dumped = json.dumps(results, indent=2, default=lambda o: str(o))
print(dumped)
batch_sizes = ",".join(map(str, results["config"]["batch_sizes"]))
......
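
`json.dumps` raises `TypeError` for values that are not JSON-serializable, which can happen when the results config carries objects such as a `pathlib.Path` or a torch dtype; the added `default=` hook falls back to `str()` instead of crashing. A tiny illustration with an invented value:

```python
import json
from pathlib import Path

results = {"config": {"model_path": Path("/tmp/model")}}  # invented non-serializable value

# json.dumps(results, indent=2)  # TypeError: Object of type PosixPath is not JSON serializable
print(json.dumps(results, indent=2, default=lambda o: str(o)))
```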
......@@ -30,6 +30,7 @@ setuptools.setup(
"accelerate>=0.18.0",
"evaluate",
"datasets>=2.0.0",
"evaluate>=0.4.0",
"jsonlines",
"numexpr",
"openai>=0.6.4",
......