add demo prompt

4354fe46 · Baber · f3d2bf90 · 4354fe46
Commit 4354fe46 authored Dec 03, 2024 by Baber
Hide whitespace changes
Inline Side-by-side

Showing with 49 additions and 0 deletions

lm_eval/tasks/mathvista/utils.py lm_eval/tasks/mathvista/utils.py +49 -0

No files found.
--- a/lm_eval/tasks/mathvista/utils.py
+++ b/lm_eval/tasks/mathvista/utils.py
@@ -4,6 +4,55 @@ from typing import Optional
 from Levenshtein import distance
+# Few-shot demo prompt for answer extraction (five Hint/Question/Model response/Extracted answer examples: integer, 1-decimal float, 2-decimal float, Python list, option letter); required for external LM call
+DEMO_PROMPT = """
+Please read the following example. Then extract the answer from the model response and type it at the end of the prompt.
+Hint: Please answer the question requiring an integer answer and provide the final value, e.g., 1, 2, 3, at the end.
+Question: Which number is missing?
+Model response: The number missing in the sequence is 14.
+Extracted answer: 14
+Hint: Please answer the question requiring a floating-point number with one decimal place and provide the final value, e.g., 1.2, 1.3, 1.4, at the end.
+Question: What is the fraction of females facing the camera?
+Model response: The fraction of females facing the camera is 0.6, which means that six out of ten females in the group are facing the camera.
+Extracted answer: 0.6
+Hint: Please answer the question requiring a floating-point number with two decimal places and provide the final value, e.g., 1.23, 1.34, 1.45, at the end.
+Question: How much money does Luca need to buy a sour apple candy and a butterscotch candy? (Unit: $)
+Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy.
+Extracted answer: 1.45
+Hint: Please answer the question requiring a Python list as an answer and provide the final list, e.g., [1, 2, 3], [1.2, 1.3, 1.4], at the end.
+Question: Between which two years does the line  graph saw its maximum peak?
+Model response: The line graph saw its maximum peak between 2007 and 2008.
+Extracted answer: [2007, 2008]
+Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.
+Question: What fraction of the shape is blue?\nChoices:\n(A) 3/11\n(B) 8/11\n(C) 6/11\n(D) 3/5
+Model response: The correct answer is (B) 8/11.
+Extracted answer: B
+"""
+def create_test_prompt(demo_prompt, query, response):  # build one answer-extraction prompt string from a demo prompt, a query and a model response
+    demo_prompt = demo_prompt.strip()  # drop leading/trailing whitespace, e.g. the newlines just inside a triple-quoted literal
+    test_prompt = f"{query}\n\n{response}"  # query and response separated by a blank line
+    full_prompt = f"{demo_prompt}\n\n{test_prompt}\n\nExtracted answer: "  # ends with the same "Extracted answer: " cue (trailing space included), left open for completion
+    return full_prompt  # returns the assembled str; nothing is sent to any model here
 # taken from https://github.com/lupantech/MathVista/blob/main/evaluation/calculate_score.py
 def get_most_similar(prediction: str, choices: list) -> float:
    """