data_utils.py

"""Utils for data load, save, and process (e.g., prompt construction)"""

import json
import os
import re

import yaml

DOMAIN_CAT2SUB_CAT = {
    "Art and Design": ["Art", "Art_Theory", "Design", "Music"],
    "Business": ["Accounting", "Economics", "Finance", "Manage", "Marketing"],
    "Science": [
        "Biology",
        "Chemistry",
        "Geography",
        "Math",
        "Physics",
    ],
    "Health and Medicine": [
        "Basic_Medical_Science",
        "Clinical_Medicine",
        "Diagnostics_and_Laboratory_Medicine",
        "Pharmacy",
        "Public_Health",
    ],
    "Humanities and Social Science": [
        "History",
        "Literature",
        "Sociology",
        "Psychology",
    ],
    "Tech and Engineering": [
        "Agriculture",
        "Architecture_and_Engineering",
        "Computer_Science",
        "Electronics",
        "Energy_and_Power",
        "Materials",
        "Mechanical_Engineering",
    ],
}


CAT_SHORT2LONG = {
    "acc": "Accounting",
    "agri": "Agriculture",
    "arch": "Architecture_and_Engineering",
    "art": "Art",
    "art_theory": "Art_Theory",
    "bas_med": "Basic_Medical_Science",
    "bio": "Biology",
    "chem": "Chemistry",
    "cli_med": "Clinical_Medicine",
    "cs": "Computer_Science",
    "design": "Design",
    "diag_med": "Diagnostics_and_Laboratory_Medicine",
    "econ": "Economics",
    "elec": "Electronics",
    "ep": "Energy_and_Power",
    "fin": "Finance",
    "geo": "Geography",
    "his": "History",
    "liter": "Literature",
    "manage": "Manage",
    "mark": "Marketing",
    "mate": "Materials",
    "math": "Math",
    "mech": "Mechanical_Engineering",
    "music": "Music",
    "phar": "Pharmacy",
    "phys": "Physics",
    "psy": "Psychology",
    "pub_health": "Public_Health",
    "socio": "Sociology",
}


def get_multi_choice_info(options):
    """
    Given the list of options for multiple choice question
    Return the index2ans and all_choices
    """

    start_chr = "A"
    all_choices = []
    index2ans = {}
    for i, option in enumerate(options):
        index2ans[chr(ord(start_chr) + i)] = option
        all_choices.append(chr(ord(start_chr) + i))

    return index2ans, all_choices


def load_yaml(file_path):
    with open(file_path, "r") as stream:
        try:
            yaml_dict = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)

    return yaml_dict


def parse_img_path(text):
    matches = re.findall("<img='(.*?)'>", text)
    return matches


def process_single_sample(data):
    question = data["question"]
    o_imgs_paths = []
    for option in data["options"]:
        current_o_imgs_paths = parse_img_path(option)
        for img_path in current_o_imgs_paths:
            o_imgs_paths.append(img_path)

    if len(o_imgs_paths) > 1:  # multiple images in options, used for random selection
        return {
            "id": data["id"],
            "question": question,
            "options": data["options"],
            "answer": data["answer"],
            "image": None,
            "question_type": data["question_type"],
        }
    else:
        return {
            "id": data["id"],
            "question": question,
            "options": data["options"],
            "answer": data["answer"],
            "image": data["image_1"],
            "question_type": data["question_type"],
        }


# DATA SAVING
def save_json(filename, ds):
    print(f"answers saved to: {filename}")
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, "w") as f:
        json.dump(ds, f, indent=4)


def save_jsonl(filename, data):
    """
    Save a dictionary of data to a JSON Lines file with the filename as key and caption as value.

    Args:
        filename (str): The path to the file where the data should be saved.
        data (dict): The dictionary containing the data to save where key is the image path and value is the caption.
    """
    with open(filename, "w", encoding="utf-8") as f:
        for img_path, caption in data.items():
            # Extract the base filename without the extension
            base_filename = os.path.basename(img_path)
            # Create a JSON object with the filename as the key and caption as the value
            json_record = json.dumps({base_filename: caption}, ensure_ascii=False)
            # Write the JSON object to the file, one per line
            f.write(json_record + "\n")


def save_args(args, path_dir):
    argsDict = args.__dict__
    with open(path_dir + "setting.txt", "w") as f:
        f.writelines("------------------ start ------------------" + "\n")
        for eachArg, value in argsDict.items():
            f.writelines(eachArg + " : " + str(value) + "\n")
        f.writelines("------------------- end -------------------")


# DATA PROCESSING
def construct_prompt(sample, config):
    question = sample["question"]
    options = eval(sample["options"])
    example = ""
    if sample["question_type"] == "multiple-choice":
        start_chr = "A"
        prediction_range = []
        index2ans = {}
        for option in options:
            prediction_range.append(start_chr)
            example += f"({start_chr}) {option}\n"
            index2ans[start_chr] = option
            start_chr = chr(ord(start_chr) + 1)
        empty_prompt_sample_structure = config["multi_choice_example_format"]
        empty_prompt = empty_prompt_sample_structure.format(question, example)
        res_dict = {}
        res_dict["index2ans"] = index2ans
        res_dict["correct_choice"] = sample["answer"]
        res_dict["all_choices"] = prediction_range
        res_dict["empty_prompt"] = empty_prompt
        if config["task_instructions"]:
            res_dict["final_input_prompt"] = (
                config["task_instructions"].strip() + "\n\n" + empty_prompt
            )
        else:
            res_dict["final_input_prompt"] = empty_prompt

        res_dict["gt_content"] = options[ord(sample["answer"].upper()) - ord("A")]
    else:
        empty_prompt_sample_structure = config["short_ans_example_format"]
        empty_prompt = empty_prompt_sample_structure.format(question)
        res_dict = {}
        res_dict["empty_prompt"] = empty_prompt
        if config["task_instructions"]:
            res_dict["final_input_prompt"] = (
                config["task_instructions"].strip() + "\n\n" + empty_prompt
            )
        else:
            res_dict["final_input_prompt"] = empty_prompt
        res_dict["gt_content"] = sample["answer"]

    res_dict.update(sample)
    return res_dict