Merge pull request #819 from EleutherAI/811-fix-indexerror

[Refactor] Fix IndexError

Merge pull request #819 from EleutherAI/811-fix-indexerror
[Refactor] Fix IndexError
6342636e · Lintang Sutawika · GitHub · dc544beb · ebfff08f · 6342636e
Unverified commit 6342636e, authored Aug 30, 2023 by Lintang Sutawika; committed by GitHub on Aug 30, 2023.
Show whitespace changes
Inline Side-by-side

Showing 1 changed file with 22 additions and 4 deletions

lm_eval/api/task.py lm_eval/api/task.py +22 -4

No files found.
--- a/lm_eval/api/task.py
+++ b/lm_eval/api/task.py
@@ -1020,18 +1020,36 @@ class ConfigurableTask(Task):
                gold = self.doc_to_text(doc)
            else:
                gold = self.doc_to_target(doc)
-                if type(gold) is str:
-                    gold = choices.index(gold)
+            gold_index_error = False
+            if type(gold) is list:
+                gold = [i if i < len(choices) else -100 for i in gold]
+                if -100 in gold:
+                    gold_index_error = True
+            else:
+                if type(gold) is int:
+                    gold = gold if gold < len(choices) else -100
+                elif type(gold) is str:
+                    gold = choices.index(gold) if gold in choices else -100
+                if gold == -100:
+                    gold_index_error = True
+            if gold_index_error:
+                eval_logger.warning(
+                    f"Label index was not in within range of available choices,"
+                    f"Sample:\n\n{doc}\n\n"
+                )
            if self.multiple_target:
                acc = 1.0 if pred in gold else 0.0
                acc_norm = 1.0 if pred_norm in gold else 0.0
-                exact_match = int(any([is_greedy[i] for i in gold]))
+                exact_match = int(any([is_greedy[i] if i != -100 else 0 for i in gold]))
            else:
                acc = 1.0 if pred == gold else 0.0
                acc_norm = 1.0 if pred_norm == gold else 0.0
                # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
-                exact_match = int(is_greedy[gold])
+                exact_match = int(is_greedy[gold]) if gold != -100 else 0
            result_dict = {
                **({"acc": acc} if "acc" in use_metric else {}),