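"""Utilities for the MathVista task: answer extraction, normalization, and scoring.

Extraction and normalization follow the official MathVista evaluation scripts;
free-form responses can be sent to an external OpenAI model for answer extraction.
"""
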
import os
import re
from typing import Optional

import requests

# from api_model import make_concurrent_requests
from Levenshtein import distance

from lm_eval.models.utils import retry_on_specific_exceptions


API_KEY = os.getenv("OPENAI_API_KEY")
assert API_KEY is not None, (
    "Please set the OPENAI_API_KEY environment variable to use the MathVista task."
)
API_URL = "https://api.openai.com/v1/chat/completions"
MODEL = "gpt-4"

# few-shot demonstration prompt for the external LM answer-extraction call

DEMO_PROMPT = """
Please read the following example. Then extract the answer from the model response and type it at the end of the prompt.

Hint: Please answer the question requiring an integer answer and provide the final value, e.g., 1, 2, 3, at the end.
Question: Which number is missing?

Model response: The number missing in the sequence is 14.

Extracted answer: 14

Hint: Please answer the question requiring a floating-point number with one decimal place and provide the final value, e.g., 1.2, 1.3, 1.4, at the end.
Question: What is the fraction of females facing the camera?

Model response: The fraction of females facing the camera is 0.6, which means that six out of ten females in the group are facing the camera.

Extracted answer: 0.6

Hint: Please answer the question requiring a floating-point number with two decimal places and provide the final value, e.g., 1.23, 1.34, 1.45, at the end.
Question: How much money does Luca need to buy a sour apple candy and a butterscotch candy? (Unit: $)

Model response: Luca needs $1.45 to buy a sour apple candy and a butterscotch candy.

Extracted answer: 1.45

Hint: Please answer the question requiring a Python list as an answer and provide the final list, e.g., [1, 2, 3], [1.2, 1.3, 1.4], at the end.
Question: Between which two years does the line  graph saw its maximum peak?

Model response: The line graph saw its maximum peak between 2007 and 2008.

Extracted answer: [2007, 2008]

Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.
Question: What fraction of the shape is blue?\nChoices:\n(A) 3/11\n(B) 8/11\n(C) 6/11\n(D) 3/5

Model response: The correct answer is (B) 8/11.

Extracted answer: B
"""


# Function to send a single request to the OpenAI API
@retry_on_specific_exceptions(
    on_exceptions=[requests.exceptions.RequestException], max_retries=5
)
def send_request(prompt: str) -> str:
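    """Send one chat-completion request to the OpenAI API and return the reply text."""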
    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json",
    }
    data = {
        "model": MODEL,
        "messages": [
            {"role": "user", "content": prompt},
        ],
        "max_tokens": 100,
    }
    response = requests.post(API_URL, headers=headers, json=data)
    response.raise_for_status()
    result = response.json()
    return result["choices"][0]["message"]["content"]


def create_test_prompt(demo_prompt, query, response):
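    """Append the query/response pair to the few-shot demo prompt used for answer extraction."""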
    demo_prompt = demo_prompt.strip()
    test_prompt = f"{query}\n\n{response}"
    full_prompt = f"{demo_prompt}\n\n{test_prompt}\n\nExtracted answer: "
    return full_prompt


def verify_extraction(extraction):
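    """Check that the extraction is a non-empty string after stripping whitespace."""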
    extraction = extraction.strip()
    if not extraction:
        return False
    return True


# taken from https://github.com/lupantech/MathVista/blob/main/evaluation/calculate_score.py
def get_most_similar(prediction: str, choices: list) -> str:
    """
    Use the Levenshtein distance (or edit distance) to determine which of the choices is most similar to the given prediction
    """
    distances = [distance(prediction, choice) for choice in choices]
    ind = distances.index(min(distances))
    return choices[ind]
    # return min(choices, key=lambda choice: distance(prediction, choice))


# taken from https://github.com/lupantech/MathVista/blob/main/evaluation/extract_answer.py
def normalize_extracted_answer(
    extraction: str,
    choices: list,
    question_type: str,
    answer_type: str,
    precision,
    ignore_empty_extractions=True,
) -> Optional[str]:
    """
    Normalize the extracted answer to match the answer type
    """

    if question_type == "multi_choice":
        # make sure the extraction is a string
        if isinstance(extraction, str):
            extraction = extraction.strip()
        else:
            try:
                extraction = str(extraction)
            except Exception:
                extraction = ""

        # if the extraction is empty, return None
        if ignore_empty_extractions and not extraction:
            return None

        # extract "A" from "(A) text"
        letter = re.findall(r"\(([a-zA-Z])\)", extraction)
        if len(letter) > 0:
            extraction = letter[0].upper()

        sequential_characters = [chr(ord("A") + i) for i in range(len(choices))]

        # if model output a character, use it as index of available choices
        if extraction in sequential_characters:
            option_index = sequential_characters.index(extraction)
            normalized_extraction = choices[option_index]
        else:
            # select the most similar option
            normalized_extraction = get_most_similar(extraction, choices)
        assert normalized_extraction in choices

    elif answer_type == "integer":
        try:
            normalized_extraction = str(int(float(extraction)))
        except Exception:
            normalized_extraction = None

    elif answer_type == "float":
        try:
            normalized_extraction = str(round(float(extraction), int(precision)))
        except Exception:
            normalized_extraction = None

    elif answer_type == "list":
        try:
            normalized_extraction = str(extraction)
        except Exception:
            normalized_extraction = None

    return normalized_extraction


def safe_equal(prediction, answer):
    """
    Check if the prediction is equal to the answer, even if they are of different types
    """
    try:
        return prediction == answer
    except Exception:
        return False


def extract_answer(response: str, problem: dict, quick_extract=True) -> str:
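    """Extract the final answer from a model response.

    Cheap heuristics are tried first (exact choice match, numeric parsing, the
    'The answer is "..."' pattern); otherwise the response is sent to an
    external LM for extraction.
    """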
    question_type = problem["question_type"]
    answer_type = problem["answer_type"]
    choices = problem["choices"]
    query = problem["query"]
    pid = problem["pid"]

    if response == "":
        return ""

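    # the response already matches one of the answer choices verbatim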
    if question_type == "multi_choice" and response in choices:
        return response

    if answer_type == "integer":
        try:
            extraction = int(response)
            return str(extraction)
        except Exception:
            pass

    if answer_type == "float":
        try:
            extraction = str(float(response))
            return extraction
        except Exception:
            pass

    # quick extraction
    if quick_extract:
        # The answer is "text". -> "text"
        try:
            result = re.search(r'The answer is "(.*)"\.', response)
            if result:
                extraction = result.group(1)
                return extraction
        except Exception:
            pass

    # general extraction
    try:
        full_prompt = create_test_prompt(DEMO_PROMPT, query, response)
        extraction = send_request(full_prompt)
        return extraction
    except Exception:
        print(
            f"Error in extracting answer for problem: {pid} with response: {response}"
        )

    return ""


def extract_all_answers(
    resps: list[list[str]], docs: list[dict], quick_extract=True
) -> list[str]:
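    """Apply extract_answer to the first response of each document."""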
    return [
        extract_answer(resp[0], doc, quick_extract=quick_extract)
        for resp, doc in zip(resps, docs)
    ]


# adapted from https://github.com/lupantech/MathVista/blob/main/evaluation/extract_answer.py
def process_results(doc: dict, results: list[str]):
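    """Score a single document.

    The extracted answer is normalized and compared to the gold answer; returns
    {"acc": 1.0} on a match and {"acc": 0.0} otherwise.
    """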
    response = results[0]
    choices = doc["choices"]
    question_type = doc["question_type"]
    answer_type = doc["answer_type"]
    precision = doc["precision"]
    answer = doc["answer"]
    # step 1: extract the answer from the model response
    # (extraction is assumed to have been applied upstream, e.g. via the
    # extract_all_answers filter, so the response is used as-is)
    # extracted_answer = extract_answer(response, doc)
    extracted_answer = response
    if verify_extraction(extracted_answer):
        normalized_extraction = normalize_extracted_answer(
            extracted_answer, choices, question_type, answer_type, precision
        )
        res = safe_equal(normalized_extraction, answer)
    else:
        res = False
    return {"acc": 1.0} if res else {"acc": 0.0}


### MathVista MCQ ###


def process_docs_mcq(dataset):
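    """Keep only the multiple-choice questions from the dataset."""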
    return dataset.filter(lambda x: x["question_type"] == "multi_choice")