Merge branch 'big-refactor' into benchmark-scripts

027fe14c · Lintang Sutawika · GitHub · 32a70d89 · 4dfa8aba · 027fe14c
Unverified Commit 027fe14c authored Jul 17, 2023 by Lintang Sutawika Committed by GitHub Jul 17, 2023
7 changed files
--- a/lm_eval/api/model.py
+++ b/lm_eval/api/model.py
@@ -114,6 +114,8 @@ class LM(abc.ABC):
        additional_config = {} if additional_config is None else additional_config
        args = utils.simple_parse_args_string(arg_string)
        args2 = {k: v for k, v in additional_config.items() if v is not None}
+        if args2.get("device") == "mps" or args.get("device") == "mps":
+            args["dtype"] = "float32"
        return cls(**args, **args2)

    @property

--- a/lm_eval/models/huggingface.py
+++ b/lm_eval/models/huggingface.py
@@ -99,7 +99,7 @@ class HFLM(LM):
        if not (parallelize or accelerator.num_processes > 1):
            # use user-passed device
            device_list = set(
-                ["cuda", "cpu"]
+                ["cuda", "cpu", "mps"]
                + [f"cuda:{i}" for i in range(torch.cuda.device_count())]
            )
            if device:
@@ -107,6 +107,10 @@ class HFLM(LM):
                    device = int(device)
                self._device = torch.device(device)
                eval_logger.info(f"Using device '{device}'")
+                if device == "mps":
+                    eval_logger.info(
+                        "MPS is still in beta and only supports float32; setting dtype to float32."
+                    )
            else:
                eval_logger.info("Device not specified")
                eval_logger.info(f"Cuda Available? {torch.cuda.is_available()}")

--- a/lm_eval/tasks/anli/README.md
+++ b/lm_eval/tasks/anli/README.md
+# ANLI
+
+### Paper
+
+Title: `Adversarial NLI: A New Benchmark for Natural Language Understanding`
+
+Abstract: `https://arxiv.org/abs/1910.14599`
+
+Adversarial NLI (ANLI) is a dataset collected via an iterative, adversarial
+human-and-model-in-the-loop procedure. It consists of three rounds that progressively
+increase in difficulty and complexity, and each question-answer pair includes
+annotator-provided explanations.
+
+Homepage: `https://github.com/facebookresearch/anli`
+
+
+### Citation
+
+```
+@inproceedings{nie-etal-2020-adversarial,
+    title = "Adversarial {NLI}: A New Benchmark for Natural Language Understanding",
+    author = "Nie, Yixin  and
+      Williams, Adina  and
+      Dinan, Emily  and
+      Bansal, Mohit  and
+      Weston, Jason  and
+      Kiela, Douwe",
+    booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
+    year = "2020",
+    publisher = "Association for Computational Linguistics",
+}
+```
+
+### Subtasks
+
+The following tasks are defined in this folder:
+* `anli_r1`: The data collected adversarially in the first round.
+* `anli_r2`: The data collected adversarially in the second round, after training on the previous round's data.
+* `anli_r3`: The data collected adversarially in the third round, after training on the previous multiple rounds of data.
+
+### Checklist
+
+For adding novel benchmarks/datasets to the library:
+  * [x] Is the task an existing benchmark in the literature?
+  * [x] Have you referenced the original paper that introduced the task?
+  * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
+
+
+If other tasks on this dataset are already supported:
+* [ ] Is the "Main" variant of this task clearly denoted?
+* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates?
+* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
--- a/lm_eval/tasks/anli/anli_r1.yaml
+++ b/lm_eval/tasks/anli/anli_r1.yaml
 group:
-  - anli
+  - multiple_choice
+  - natural_language_inference
+  - nli
+  - adverserial
 task: anli_r1
 dataset_path: anli
+dataset_name: null
 output_type: multiple_choice
 training_split: train_r1
 validation_split: dev_r1
 test_split: test_r1
-doc_to_text: "{{premise}}\nQuestion: {{hypothesis}}. True, False, or Neither?\nAnswer:"
-doc_to_target: " {{label}}" # this will be cast to an int.
-template_aliases: "{% set answer_choices = ['True', 'False', 'Neither'] %}"
+doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither?\nAnswer:"
+# True = entailment
+# False = contradiction
+# Neither = neutral
+doc_to_target: "{{['True', 'Neither', 'False'][label]}}"
+doc_to_choice:
+  - "True"
+  - "Neither"
+  - "False"
+should_decontaminate: true
+doc_to_decontamination_query: premise
 metric_list:
  - metric: acc
    aggregation: mean

--- a/lm_eval/tasks/anli/anli_r2.yaml
+++ b/lm_eval/tasks/anli/anli_r2.yaml
-include: anli_r1.yaml
+group:
+  - multiple_choice
+  - natural_language_inference
+  - nli
+  - adverserial
 task: anli_r2
+dataset_path: anli
+dataset_name: null
+output_type: multiple_choice
 training_split: train_r2
 validation_split: dev_r2
 test_split: test_r2
+doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither?\nAnswer:"
+# True = entailment
+# False = contradiction
+# Neither = neutral
+doc_to_target: "{{['True', 'Neither', 'False'][label]}}"
+doc_to_choice:
+  - "True"
+  - "Neither"
+  - "False"
+should_decontaminate: true
+doc_to_decontamination_query: premise
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
--- a/lm_eval/tasks/anli/anli_r3.yaml
+++ b/lm_eval/tasks/anli/anli_r3.yaml
-include: anli_r1.yaml
+group:
+  - multiple_choice
+  - natural_language_inference
+  - nli
+  - adverserial
 task: anli_r3
+dataset_path: anli
+dataset_name: null
+output_type: multiple_choice
 training_split: train_r3
 validation_split: dev_r3
 test_split: test_r3
+doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither?\nAnswer:"
+# True = entailment
+# False = contradiction
+# Neither = neutral
+doc_to_target: "{{['True', 'Neither', 'False'][label]}}"
+doc_to_choice:
+  - "True"
+  - "Neither"
+  - "False"
+should_decontaminate: true
+doc_to_decontamination_query: premise
+metric_list:
+  - metric: acc
+    aggregation: mean
+    higher_is_better: true
--- a/scripts/write_out.py
+++ b/scripts/write_out.py
@@ -58,7 +58,6 @@ def main():
                ctx = task.fewshot_context(
                    doc=doc,
                    num_fewshot=args.num_fewshot,
-                    rnd=rnd,
                )
                f.write(ctx + "\n")