gaoqiong / lm-evaluation-harness · Commits

Commit 8bce8cf6
authored Jul 02, 2024 by lintangsutawika

add mmmu tasks

parent 34a079e3
Showing 9 changed files with 560 additions and 0 deletions.
lm_eval/tasks/mmmu/_template_yaml                        +20  -0
lm_eval/tasks/mmmu/art_and_design.yaml                   +19  -0
lm_eval/tasks/mmmu/business.yaml                         +23  -0
lm_eval/tasks/mmmu/health_and_medicine.yaml              +23  -0
lm_eval/tasks/mmmu/humanities_and_social_sciences.yaml   +19  -0
lm_eval/tasks/mmmu/mmmu.yaml                             +8   -0
lm_eval/tasks/mmmu/science.yaml                          +23  -0
lm_eval/tasks/mmmu/tech_and_engineering.yaml             +31  -0
lm_eval/tasks/mmmu/utils.py                              +394 -0
lm_eval/tasks/mmmu/_template_yaml · new file (mode 100644)
dataset_path: MMMU/MMMU
validation_split: validation
output_type: generate_until
doc_to_image: !function utils.doc_to_image
doc_to_text: !function utils.doc_to_text
doc_to_target: "answer"
process_results: !function utils.process_results
generation_kwargs:
  until:
    - "<|endoftext|>"
  temperature: 1.0
  do_sample: false
  max_gen_toks: 512
  top_p: 0.94
  repetition_penalty: 1.0
  image_aspect_ratio: original
metric_list:
  - metric: mmmu_acc
    aggregation: mean
    higher_is_better: true
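The per-subject YAML files below reuse this template via `include: _template_yaml`, overriding only `task`, `task_alias`, and `dataset_name`. As a quick sanity check of the prompt format the template wires up (a minimal sketch; the `doc` dict below is an invented sample, not a real MMMU record, and `utils` is assumed to be imported from this task directory):

    # run from lm_eval/tasks/mmmu/
    import utils

    # hypothetical multiple-choice record; MMMU stores "options" as a stringified list
    doc = {
        "question": "What is shown in <image 1>?",
        "question_type": "multiple-choice",
        "options": "['a cat', 'a dog']",
    }
    print(utils.doc_to_text(doc))
    # Options are lettered and the <image 1> token is collapsed to <image>:
    #   What is shown in <image>?
    #   A. a cat
    #   B. a dog
    #   Answer with the option letter from the given choices directly.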
lm_eval/tasks/mmmu/art_and_design.yaml · new file (mode 100644)
group: mmmu_art_and_design
group_alias: Art and Design
task:
  - task: mmmu_art
    include: _template_yaml
    task_alias: Art
    dataset_name: Art
  - task: mmmu_art_theory
    include: _template_yaml
    task_alias: Art Theory
    dataset_name: Art_Theory
  - task: mmmu_design
    include: _template_yaml
    task_alias: Design
    dataset_name: Design
  - task: mmmu_music
    include: _template_yaml
    task_alias: Music
    dataset_name: Music
lm_eval/tasks/mmmu/business.yaml · new file (mode 100644)
group: mmmu_business
group_alias: Business
task:
  # - task: mmmu_accounting
  #   include: _template_yaml
  #   task_alias: Accounting
  #   dataset_name: Accounting
  - task: mmmu_economics
    include: _template_yaml
    task_alias: Economics
    dataset_name: Economics
  # - task: mmmu_finance
  #   include: _template_yaml
  #   task_alias: Finance
  #   dataset_name: Finance
  # - task: mmmu_manage
  #   include: _template_yaml
  #   task_alias: Manage
  #   dataset_name: Manage
  # - task: mmmu_marketing
  #   include: _template_yaml
  #   task_alias: Marketing
  #   dataset_name: Marketing
lm_eval/tasks/mmmu/health_and_medicine.yaml · new file (mode 100644)
group: mmmu_health_and_medicine
group_alias: Health and Medicine
task:
  - task: mmmu_basic_medical_science
    include: _template_yaml
    task_alias: Basic Medical Science
    dataset_name: Basic_Medical_Science
  - task: mmmu_clinical_medicine
    include: _template_yaml
    task_alias: Clinical Medicine
    dataset_name: Clinical_Medicine
  - task: mmmu_diagnostics_and_laboratory_medicine
    include: _template_yaml
    task_alias: Diagnostics and Laboratory Medicine
    dataset_name: Diagnostics_and_Laboratory_Medicine
  - task: mmmu_pharmacy
    include: _template_yaml
    task_alias: Pharmacy
    dataset_name: Pharmacy
  - task: mmmu_public_health
    include: _template_yaml
    task_alias: Public Health
    dataset_name: Public_Health
lm_eval/tasks/mmmu/humanities_and_social_sciences.yaml · new file (mode 100644)
group: mmmu_humanities_and_social_science
group_alias: Humanities and Social Science
task:
  - task: mmmu_history
    include: _template_yaml
    task_alias: History
    dataset_name: History
  - task: mmmu_literature
    include: _template_yaml
    task_alias: Literature
    dataset_name: Literature
  - task: mmmu_sociology
    include: _template_yaml
    task_alias: Sociology
    dataset_name: Sociology
  - task: mmmu_psychology
    include: _template_yaml
    task_alias: Psychology
    dataset_name: Psychology
lm_eval/tasks/mmmu/mmmu.yaml · new file (mode 100644)
group: mmmu
task:
  - mmmu_art_and_design
  - mmmu_business
  - mmmu_health_and_medicine
  - mmmu_humanities_and_social_science
  - mmmu_science
  - mmmu_tech_and_engineering
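With this top-level group in place, the whole suite (or any single subject group) can be requested by name. A minimal sketch via the harness's Python entry point, assuming a multimodal-capable model backend is available (this commit only adds the task definitions; the model name and arguments here are placeholders, not part of the commit):

    import lm_eval

    # "mmmu" resolves to the group defined above, which pulls in all six
    # subject groups and their per-subject tasks
    results = lm_eval.simple_evaluate(
        model="hf",                             # placeholder backend name
        model_args="pretrained=<some-vlm-id>",  # placeholder model id
        tasks=["mmmu"],
    )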
lm_eval/tasks/mmmu/science.yaml · new file (mode 100644)
group: mmmu_science
group_alias: Science
task:
  - task: mmmu_biology
    include: _template_yaml
    task_alias: Biology
    dataset_name: Biology
  - task: mmmu_chemistry
    include: _template_yaml
    task_alias: Chemistry
    dataset_name: Chemistry
  - task: mmmu_geography
    include: _template_yaml
    task_alias: Geography
    dataset_name: Geography
  - task: mmmu_math
    include: _template_yaml
    task_alias: Math
    dataset_name: Math
  - task: mmmu_physics
    include: _template_yaml
    task_alias: Physics
    dataset_name: Physics
lm_eval/tasks/mmmu/tech_and_engineering.yaml · new file (mode 100644)
group: mmmu_tech_and_engineering
group_alias: Tech and Engineering
task:
  - task: mmmu_agriculture
    include: _template_yaml
    task_alias: Agriculture
    dataset_name: Agriculture
  - task: mmmu_architecture_and_engineering
    include: _template_yaml
    task_alias: Architecture and Engineering
    dataset_name: Architecture_and_Engineering
  - task: mmmu_computer_science
    include: _template_yaml
    task_alias: Computer Science
    dataset_name: Computer_Science
  - task: mmmu_electronics
    include: _template_yaml
    task_alias: Electronics
    dataset_name: Electronics
  - task: mmmu_energy_and_power
    include: _template_yaml
    task_alias: Energy and Power
    dataset_name: Energy_and_Power
  - task: mmmu_materials
    include: _template_yaml
    task_alias: Materials
    dataset_name: Materials
  - task: mmmu_mechanical_engineering
    include: _template_yaml
    task_alias: Mechanical Engineering
    dataset_name: Mechanical_Engineering
lm_eval/tasks/mmmu/utils.py · new file (mode 100644)
import ast
import random
import re

import numpy as np

MULTI_CHOICE_PROMPT = "Answer with the option letter from the given choices directly."
OPEN_ENDED_PROMPT = "Answer the question using a single word or phrase."


def replace_images_tokens(input_string):
    # MMMU questions reference up to seven images as <image 1> ... <image 7>;
    # collapse them all to a single generic <image> placeholder
    for i in range(1, 8):
        question_text = f"<image {i}>"
        query_text = "<image>"
        if question_text in input_string:
            input_string = input_string.replace(question_text, query_text)
    return input_string


def parse_options(options):
    option_letters = [chr(ord("A") + i) for i in range(len(options))]
    choices_str = "\n".join(
        [f"{option_letter}. {option}" for option_letter, option in zip(option_letters, options)]
    )
    return choices_str


def construct_prompt(doc):
    question = doc["question"]
    if doc["question_type"] == "multiple-choice":
        # Weirdly, doc["options"] is a string in the MMMU Hugging Face dataset
        parsed_options = parse_options(ast.literal_eval(doc["options"]))
        # parsed_options already prepends a newline so no need to add space here
        question = f"{question}\n{parsed_options}\n{MULTI_CHOICE_PROMPT}"
    else:
        question = f"{question}\n{OPEN_ENDED_PROMPT}"
    return question


def doc_to_text(doc):
    question = construct_prompt(doc)
    return replace_images_tokens(question)


def doc_to_image(doc):
    prompt = construct_prompt(doc)
    image_tokens = re.findall(r"<image \d+>", prompt)
    # strip the angle brackets and replace the space with an underscore,
    # e.g. "<image 1>" -> "image_1", which is the dataset column name
    image_tokens = [image_token.strip("<>").replace(" ", "_") for image_token in image_tokens]
    visual = [doc[image_token].convert("RGB") for image_token in image_tokens]
    return visual


def process_results(doc, results):
    pred = results[0]
    if doc["question_type"] == "multiple-choice":
        index2ans, all_choices = get_multi_choice_info(ast.literal_eval(doc["options"]))
        parsed_pred = parse_multi_choice_response(pred, all_choices, index2ans)
    else:
        parsed_pred = parse_open_response(pred)

    sample_dict = {
        "id": doc["id"],
        "subdomain": extract_subset_name(doc["id"]),
        "question_type": doc["question_type"],
        "answer": doc["answer"],
        "parsed_pred": parsed_pred,
    }

    _, result_dict = evaluate_mmmu([sample_dict])

    return result_dict


def extract_subset_name(input_string):
    # Match the split prefix (e.g. "validation_") at the beginning and
    # "_<number>" at the end, returning the subject name in between
    split = input_string.split("_")[0]
    pattern = re.compile(rf"^{split}_(.+?)_\d+$")
    match = pattern.search(input_string)
    if match:
        return match.group(1)
    else:
        raise ValueError(f'No match found in "{input_string}"')


##################
# Helper functions written by the official MMMU repo.
##################


def calculate_ins_level_acc(results):
    """Calculate the instruction level accuracy for given Subject results
    https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L246
    """
    acc = 0
    ins_num = 0
    for cat_results in results.values():
        acc += cat_results["mmmu_acc"] * cat_results["num_example"]
        ins_num += cat_results["num_example"]
    if ins_num == 0:
        return 0
    return acc / ins_num


def eval_multi_choice(gold_i, pred_i):
    """
    Evaluate a multiple choice instance.
    https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L175
    """
    correct = False
    # the prediction is correct only if it exactly matches a gold answer
    if isinstance(gold_i, list):
        for answer in gold_i:
            if answer == pred_i:
                correct = True
                break
    else:  # gold_i is a string
        if gold_i == pred_i:
            correct = True
    return correct


def eval_open(gold_i, pred_i):
    """
    Evaluate an open question instance.
    https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L191
    """
    correct = False
    if isinstance(gold_i, list):
        # use float to avoid trivial matches
        norm_answers = []
        for answer in gold_i:
            norm_answers.extend(normalize_str(answer))
    else:
        norm_answers = normalize_str(gold_i)
    for pred in pred_i:  # pred is already normalized in the parse response phase
        if isinstance(pred, str):  # check whether any answer is a substring of the pred
            for norm_ans in norm_answers:
                # only compare string answers against string preds
                if isinstance(norm_ans, str) and norm_ans in pred:
                    if not correct:
                        correct = True
                    break
        else:  # it's a float number
            if pred in norm_answers:
                if not correct:
                    correct = True
                break
    return correct


def evaluate_mmmu(samples):
    """
    Batch evaluation for multiple choice and open questions.
    https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L219
    """
    pred_correct = 0
    judge_dict = dict()
    for sample in samples:
        gold_i = sample["answer"]
        pred_i = sample["parsed_pred"]
        if sample["question_type"] == "multiple-choice":
            correct = eval_multi_choice(gold_i, pred_i)
        else:  # open question
            correct = eval_open(gold_i, pred_i)

        if correct:
            judge_dict[sample["id"]] = "Correct"
            pred_correct += 1
        else:
            judge_dict[sample["id"]] = "Wrong"

    if len(samples) == 0:
        # keep the return shape consistent for callers that unpack two values
        return {}, {"mmmu_acc": 0}
    return judge_dict, {"mmmu_acc": pred_correct / len(samples)}


def parse_multi_choice_response(response, all_choices, index2ans):
    """
    Parse the prediction from the generated response.
    Return the predicted index, e.g., A, B, C, D.
    https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L10
    """
    for char in [",", ".", "!", "?", ";", ":", "'"]:
        response = response.strip(char)
    response = " " + response + " "  # add spaces to avoid partial matches

    index_ans = True
    ans_with_brack = False
    candidates = []
    for choice in all_choices:  # e.g., (A) (B) (C) (D)
        if f"({choice})" in response:
            candidates.append(choice)
            ans_with_brack = True

    if len(candidates) == 0:
        for choice in all_choices:  # e.g., A B C D
            if f" {choice} " in response:
                candidates.append(choice)

    if len(candidates) == 0:
        for choice in all_choices:  # e.g., A. B. C. D.
            if f"{choice}." in response:
                candidates.append(choice)

    # if none of the above yields candidates and the response is longer than
    # 5 tokens, try to match the full answer text instead of the letter
    if len(candidates) == 0 and len(response.split()) > 5:
        for index, ans in index2ans.items():
            if ans.lower() in response.lower():
                candidates.append(index)
                index_ans = False  # it's a content answer, not a letter

    if len(candidates) == 0:
        # still no answer found, randomly choose one
        pred_index = random.choice(all_choices)
    elif len(candidates) > 1:
        start_indexes = []
        if index_ans:
            if ans_with_brack:
                for can in candidates:
                    index = response.rfind(f"({can})")
                    start_indexes.append(index)  # -1 will be ignored anyway
            else:
                for can in candidates:
                    index = response.rfind(f" {can} ")
                    start_indexes.append(index)
        else:
            for can in candidates:
                index = response.lower().rfind(index2ans[can].lower())
                start_indexes.append(index)
        # keep the candidate that appears last in the response
        pred_index = candidates[np.argmax(start_indexes)]
    else:
        # if only one candidate, use it
        pred_index = candidates[0]

    return pred_index


def extract_numbers(string):
    """
    Extract all forms of numbers from a string with regex.
    https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L100
    """
    # Pattern for numbers with commas
    pattern_commas = r"-?\b\d{1,3}(?:,\d{3})+\b"
    # Pattern for scientific notation
    pattern_scientific = r"-?\d+(?:\.\d+)?[eE][+-]?\d+"
    # Pattern for simple numbers without commas
    pattern_simple = r"-?(?:\d+\.\d+|\.\d+|\d+\b)(?![eE][+-]?\d+)(?![,\d])"

    # Extract numbers with commas
    numbers_with_commas = re.findall(pattern_commas, string)
    # Extract numbers in scientific notation
    numbers_scientific = re.findall(pattern_scientific, string)
    # Extract simple numbers without commas
    numbers_simple = re.findall(pattern_simple, string)

    # Combine all extracted numbers
    all_numbers = numbers_with_commas + numbers_scientific + numbers_simple
    return all_numbers


def check_is_number(string):
    """
    Check if the given string is a number.
    https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L65
    """
    try:
        float(string.replace(",", ""))
        return True
    except ValueError:
        # not parseable as a number even with commas removed
        return False


def normalize_str(string):
    """
    Normalize the str to lower case and make them float numbers if possible.
    https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L76
    """
    # if the string is a number, convert it to a float
    string = string.strip()

    is_number = check_is_number(string)

    if is_number:
        string = string.replace(",", "")
        string = float(string)
        # leave 2 decimals
        string = round(string, 2)
        return [string]
    else:
        # it's likely to be a string; lower it
        string = string.lower()
        if len(string) == 1:
            # pad single characters to avoid trivial matches
            return [" " + string, string + " "]
        return [string]


def parse_open_response(response):
    """
    Parse the prediction from the generated response.
    Return a list of predicted strings or numbers.
    https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/eval_utils.py#L122
    """

    def get_key_subresponses(response):
        response = response.strip().strip(".").lower()
        sub_responses = re.split(r"\.\s(?=[A-Z])|\n", response)
        indicators_of_keys = [
            "could be ",
            "so ",
            "is ",
            "thus ",
            "therefore ",
            "final ",
            "answer ",
            "result ",
        ]
        key_responses = []
        for index, resp in enumerate(sub_responses):
            # for the last sub-response, also accept an equation (the entire
            # response can be a single sentence containing an equation)
            if index == len(sub_responses) - 1:
                indicators_of_keys.extend(["="])
            # the shortest tail that may contain the answer
            shortest_key_response = None
            for indicator in indicators_of_keys:
                if indicator in resp:
                    if not shortest_key_response:
                        shortest_key_response = resp.split(indicator)[-1].strip()
                    else:
                        if len(resp.split(indicator)[-1].strip()) < len(shortest_key_response):
                            shortest_key_response = resp.split(indicator)[-1].strip()

            if shortest_key_response:
                # keep it only if it is not trivial punctuation
                if shortest_key_response.strip() not in [
                    ":",
                    ",",
                    ".",
                    "!",
                    "?",
                    ";",
                    "'",
                ]:
                    key_responses.append(shortest_key_response)
        if len(key_responses) == 0:
            # did not find any indicator; fall back to the whole response
            return [response]
        return key_responses

    key_responses = get_key_subresponses(response)

    pred_list = key_responses.copy()  # keep the original string responses
    for resp in key_responses:
        pred_list.extend(extract_numbers(resp))

    tmp_pred_list = []
    for i in range(len(pred_list)):
        tmp_pred_list.extend(normalize_str(pred_list[i]))
    pred_list = tmp_pred_list

    # remove duplicates
    pred_list = list(set(pred_list))

    return pred_list


def get_multi_choice_info(options):
    """
    Given the list of options for a multiple choice question,
    return index2ans (letter -> option text) and all_choices (list of letters).
    https://github.com/MMMU-Benchmark/MMMU/blob/51ce7f3e829c16bb44bc5445782686b4c3508794/eval/data_utils.py#L54
    """
    start_chr = "A"
    all_choices = []
    index2ans = {}
    for i, option in enumerate(options):
        index2ans[chr(ord(start_chr) + i)] = option
        all_choices.append(chr(ord(start_chr) + i))

    return index2ans, all_choices
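To make the parsing behavior above concrete, a few examples of what these helpers return (a minimal sketch; the response strings are invented model outputs, and `utils` is assumed to be imported from this task directory):

    # run from lm_eval/tasks/mmmu/
    import utils

    # multiple-choice: "(B)" matches on the bracketed-letter pass
    utils.parse_multi_choice_response(
        "The correct option is (B).",
        ["A", "B", "C", "D"],
        {"A": "red", "B": "blue", "C": "green", "D": "yellow"},
    )  # -> "B"

    # open-ended: the "is " indicator isolates the tail, which is then
    # number-extracted and normalized to a float
    utils.parse_open_response("Therefore the answer is 42.")  # -> [42.0]

    # normalization: numbers are rounded to two decimals, strings lowercased
    utils.normalize_str("3.14159")  # -> [3.14]
    utils.normalize_str("Paris")    # -> ["paris"]

    # subdomain extraction from MMMU ids
    utils.extract_subset_name("validation_Art_Theory_12")  # -> "Art_Theory"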