Merge pull request #859 from baberabb/nqopen_baber_fi

[Refactor] NQopen

Merge pull request #859 from baberabb/nqopen_baber_fi
[Refactor] NQopen
fa8f2381 · Hailey Schoelkopf · GitHub · 57b20eef · 5f205bd5 · fa8f2381
Unverified Commit fa8f2381 authored Sep 14, 2023 by Hailey Schoelkopf Committed by GitHub Sep 14, 2023
Showing with 51 additions and 14 deletions

lm_eval/api/task.py lm_eval/api/task.py +21 -14

lm_eval/tasks/nq_open/README.md lm_eval/tasks/nq_open/README.md +0 -0

lm_eval/tasks/nq_open/nq_open.yaml lm_eval/tasks/nq_open/nq_open.yaml +30 -0

No files found.
--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -674,22 +674,22 @@ class ConfigurableTask(Task):
            check_choices = test_choice
        else:
            check_choices = [test_target]
+        if self.config.doc_to_choice is not None:
-        for choice in check_choices:
+            for choice in check_choices:
-            choice_has_whitespace = True if " " in choice else False
+                choice_has_whitespace = True if choice[0].isspace() else False
-            delimiter_has_whitespace = (
+                delimiter_has_whitespace = (
-                True if " " in self.config.target_delimiter else False
+                    True if self.config.target_delimiter[-1].isspace() else False
-            )
-            if delimiter_has_whitespace and choice_has_whitespace:
-                eval_logger.warning(
-                    f'Both target_delimiter and target choice: "{choice}" have whitespace'
-                )
-            elif (not delimiter_has_whitespace) and (not choice_has_whitespace):
-                eval_logger.warning(
-                    f'Both target_delimiter and target choice: "{choice}" does not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
                )
+                if delimiter_has_whitespace and choice_has_whitespace:
+                    eval_logger.warning(
+                        f'Both target_delimiter and target choice: "{choice}" have whitespace'
+                    )
+                elif (not delimiter_has_whitespace) and (not choice_has_whitespace):
+                    eval_logger.warning(
+                        f'Both target_delimiter and target choice: "{choice}" does not have whitespace, ignore if the language you are evaluating on does not require/use whitespace'
+                    )
    def download(self, dataset_kwargs=None) -> None:
        self.dataset = datasets.load_dataset(
            path=self.DATASET_PATH,
@@ -1067,6 +1067,9 @@ class ConfigurableTask(Task):
                # it assumes that doc_to_target returns a number.
                choices = self.doc_to_choice(doc)
                gold = choices[gold]
+            # we expect multiple_targets to be a list.
+            elif self.multiple_target:
+                gold = list(gold)
            else:
                gold = str(gold)
@@ -1077,6 +1080,10 @@ class ConfigurableTask(Task):
                    # return true if any are true
                    # TODO: this may break for multipLe_target, non zero-or-1 metrics
                    scores = []
+                    if not isinstance(gold, list):
+                        # sometimes, a multiple_target dataset has exceptions where one doc has only one string answer
+                        # print(gold)
+                        gold = [gold]
                    for gold_option in gold:
                        try:
                            result_score = self._metric_fn_list[metric](

--- a/lm_eval/tasks/nq_open/README.md
+++ b/lm_eval/tasks/nq_open/README.md
--- a/lm_eval/tasks/nq_open/nq_open.yaml
+++ b/lm_eval/tasks/nq_open/nq_open.yaml
+task: nq_open
+dataset_path: nq_open
+output_type: greedy_until
+training_split: train
+validation_split: validation
+description: "Answer these questions:\n"
+doc_to_text: "Q: {{question}}?\nA:"
+doc_to_target: "{{answer}}" # TODO: should be multi-target
+fewshot_delimiter: "\n"
+generation_kwargs:
+  until:
+    - "\n"
+    - "."
+    - ","
+  do_sample: false
+  temperature: 0.0
+filter_list:
+  - name: remove_whitespace
+    filter:
+      - function: remove_whitespace
+      - function: take_first
+target_delimiter: " "
+metric_list:
+  - metric: exact_match
+    aggregation: mean
+    higher_is_better: true
+    ignore_case: true
+    ignore_punctuation: true
+    regexes_to_ignore:
+    - "\ban|a|the\b"