gaoqiong / lm-evaluation-harness · Commits · 0d187eda

Commit 0d187eda, authored Sep 17, 2024 by Baber

add mathvista

Parent: fb963f0f

Showing 2 changed files with 142 additions and 0 deletions:

lm_eval/tasks/mathvista/mathvista.yaml   +18   -0
lm_eval/tasks/mathvista/utils.py         +124  -0
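With this branch installed (for example via pip install -e .), the new task should be selectable as lm_eval --tasks mathvista_mcq, presumably together with a multimodal backend such as --model hf-multimodal; the backend choice is an assumption here, since this commit only adds the task definition and touches no model code.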
lm_eval/tasks/mathvista/mathvista.yaml · 0 → 100644 (new file)

dataset_path: AI4Math/MathVista
task: mathvista_mcq
test_split: testmini
output_type: "greedy_until"
process_docs: !function utils.process_docs
doc_to_image: !function utils.doc_to_image
doc_to_text: "<image>{{query}}"
#doc_to_choice: '{{ ["A", "B", "C", "D", "E", "F"][:choices.length] }}'
doc_to_target: answer
process_results: !function utils.process_results
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
dataset_kwargs:
  trust_remote_code: true
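Note that the config above references utils.process_docs and utils.doc_to_image, but the utils.py introduced below defines neither yet, so the task would not load as committed. A minimal sketch of what the two helpers might look like, assuming the AI4Math/MathVista field names (question_type, decoded_image) and inferring from the task name mathvista_mcq that docs are filtered to multiple-choice questions:

def process_docs(dataset):
    # hypothetical helper: keep only the multiple-choice questions
    return dataset.filter(lambda doc: doc["question_type"] == "multi_choice")


def doc_to_image(doc):
    # hypothetical helper: MathVista ships a pre-decoded PIL image per example
    return [doc["decoded_image"]]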
lm_eval/tasks/mathvista/utils.py · 0 → 100644 (new file)

import re

from Levenshtein import distance


# taken from https://github.com/lupantech/MathVista/blob/main/evaluation/calculate_score.py
def get_most_similar(prediction: str, choices: list):
    """
    Use the Levenshtein distance (or edit distance) to determine which of the choices is most similar to the given prediction
    """
    distances = [distance(prediction, choice) for choice in choices]
    ind = distances.index(min(distances))
    return choices[ind]
    # return min(choices, key=lambda choice: distance(prediction, choice))
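For illustration only, a standalone check of the nearest-choice logic above (assumes the Levenshtein package that utils.py imports is installed):

from Levenshtein import distance

choices = ["red", "green", "blue"]
prediction = "gren"
# pick the choice with the smallest edit distance, exactly as get_most_similar does
distances = [distance(prediction, choice) for choice in choices]
print(choices[distances.index(min(distances))])  # -> "green"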
def normalize_extracted_answer(
    extraction,
    choices: list,
    question_type: str,
    answer_type: str,
    precision,
    ignore_empty_extractions=False,
):
    """
    Normalize the extracted answer to match the answer type
    """
    if question_type == "multi_choice":
        # make sure the extraction is a string
        if isinstance(extraction, str):
            extraction = extraction.strip()
        else:
            try:
                extraction = str(extraction)
            except Exception:
                extraction = ""

        # if the extraction is empty, return None
        if ignore_empty_extractions and not extraction:
            return None

        # extract "A" from "(A) text"
        letter = re.findall(r"\(([a-zA-Z])\)", extraction)
        if len(letter) > 0:
            extraction = letter[0].upper()

        sequential_characters = [chr(ord("A") + i) for i in range(len(choices))]

        # if model output a character, use it as index of available choices
        if extraction in sequential_characters:
            option_index = sequential_characters.index(extraction)
            normalized_extraction = choices[option_index]
        else:
            # select the most similar option
            normalized_extraction = get_most_similar(extraction, choices)
        assert normalized_extraction in choices

    elif answer_type == "integer":
        try:
            normalized_extraction = str(int(float(extraction)))
        except Exception:
            normalized_extraction = None

    elif answer_type == "float":
        try:
            normalized_extraction = str(round(float(extraction), precision))
        except Exception:
            normalized_extraction = None

    elif answer_type == "list":
        try:
            normalized_extraction = str(extraction)
        except Exception:
            normalized_extraction = None

    return normalized_extraction
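For illustration, two calls showing how normalization behaves on the main branches (assuming the module is importable under the path shown in the diff header; that import path is an inference, not documented anywhere in this commit):

from lm_eval.tasks.mathvista.utils import normalize_extracted_answer

# "(b) 12" -> letter "B" -> the second listed choice
print(normalize_extracted_answer("(b) 12", ["10", "12", "14"], "multi_choice", "text", None))  # "12"
# a free-form float is rounded to the requested precision
print(normalize_extracted_answer("3.14159", [], "free_form", "float", 2))  # "3.14"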
def safe_equal(prediction, answer):
    """
    Check if the prediction is equal to the answer, even if they are of different types
    """
    try:
        if prediction == answer:
            return True
        return False
    except Exception:
        return False
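A quick note on behavior: Python's == across mismatched types returns False rather than raising, so the try/except mainly guards against exotic __eq__ implementations (e.g. array-like answers). For example:

print(safe_equal("3", 3))      # False: values must already be normalized to strings
print(safe_equal("12", "12"))  # True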
def get_acc_with_contion(res_pd, key, value):
    if key == "skills":
        # the "skills" column holds a list of tags per row, so test membership
        total_pd = res_pd[res_pd[key].apply(lambda x: value in x)]
    else:
        total_pd = res_pd[res_pd[key] == value]
    correct_pd = total_pd[total_pd["true_false"] == True]  # noqa: E712
    acc = len(correct_pd) / len(total_pd)
    return len(correct_pd), len(total_pd), acc
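For illustration, a sketch of the DataFrame shape this helper appears to expect (the column names true_false and skills come from the function body itself; the helper is not called anywhere else in this diff, so the example assumes it is in scope):

import pandas as pd

res_pd = pd.DataFrame(
    {
        "true_false": [True, False, True],
        "question_type": ["multi_choice", "multi_choice", "free_form"],
        "skills": [["algebra"], ["algebra", "geometry"], ["geometry"]],
    }
)
# 1 of the 2 multi-choice rows is correct -> (1, 2, 0.5)
print(get_acc_with_contion(res_pd, "question_type", "multi_choice"))
# 1 of the 2 rows tagged "algebra" is correct -> (1, 2, 0.5)
print(get_acc_with_contion(res_pd, "skills", "algebra"))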
# adapted from https://github.com/lupantech/MathVista/blob/main/evaluation/extract_answer.py
def process_results(doc, results):
    response = results[0]
    choices = doc["choices"]
    question_type = doc["question_type"]
    answer_type = doc["answer_type"]
    precision = doc["precision"]  # noqa: F841
    extraction = doc["extraction"]  # noqa: F841
    if question_type == "multi_choice" and response in choices:
        return {"acc": 1.0}
    if answer_type == "integer":
        try:
            extraction = int(response)
            return str(extraction)
        except Exception:
            pass
    if answer_type == "float":
        try:
            extraction = str(float(response))
            return extraction
        except Exception:
            pass
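As committed, process_results returns the {"acc": ...} dict that lm-eval expects only on the exact multi-choice match path; the integer/float branches return bare strings, and every other path falls through to None. A minimal sketch of how the remaining paths might be closed out with the helpers defined above (hypothetical, not part of this commit):

def process_results_completed(doc, results):
    # hypothetical completion reusing normalize_extracted_answer and safe_equal
    response = results[0].strip()
    if doc["question_type"] == "multi_choice" or doc["answer_type"] in ("integer", "float", "list"):
        prediction = normalize_extracted_answer(
            response,
            doc["choices"],
            doc["question_type"],
            doc["answer_type"],
            doc["precision"],
        )
    else:
        prediction = response  # free-form text answers are compared as-is
    return {"acc": 1.0 if safe_equal(prediction, doc["answer"]) else 0.0}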