utils.py

import random

import datasets


def process_docs_hard(dataset: datasets.Dataset):
    return dataset


def process_docs(dataset: datasets.Dataset):
    def _helper(doc):
        return doc

    num_entries = len(dataset)
    ten_percent_index = int(0.1 * num_entries)

    # Select the first 10% of the dataset
    filtered_dataset = dataset.select(range(ten_percent_index))

    return filtered_dataset.map(_helper)


def doc_to_choice_easy(doc):
    return [
        "neoplasms",
        "digestive system diseases",
        "nervous system diseases",
        "cardiovascular diseases",
        "general pathological conditions",
    ]


def doc_to_text_easy(doc) -> str:
    choices = doc_to_choice_easy(doc)
    prompt = (
        "Classify the topic of the following medical text into one of the following choices. \n"
        "Text: {} \n"
        "Choices: \n"
        "A. {} \n"
        "B. {} \n"
        "C. {} \n"
        "D. {} \n"
        "E. {} \n Answer:".format(
            doc["text"], choices[0], choices[1], choices[2], choices[3], choices[4]
        )
    )

    return prompt


def doc_to_target_easy(doc):
    return int(doc["class"]) - 1


def doc_to_text_hard(doc) -> str:
    choices = doc_to_choice_hard(doc)
    prompt = (
        "Select the medical specialty the following text is talking about among the following choices. \n"
        "Text: {} \n"
        "Choices: {}\n"
        " Answer:".format(doc["transcription"], choices)
    )

    return prompt


def doc_to_choice_hard(doc):
    choices_list = [
        " Bariatrics",
        " Allergy / Immunology",
        " Dentistry",
        " Cardiovascular / Pulmonary",
        " Urology",
        " Hospice - Palliative Care",
        " Radiology",
        " Pediatrics - Neonatal",
        " Neurology",
        " Neurosurgery",
        " Emergency Room Reports",
        " IME-QME-Work Comp etc.",
        " Office Notes",
        " Surgery",
        " Letters",
        " Ophthalmology",
        " Hematology - Oncology",
        " Endocrinology",
        " Cosmetic / Plastic Surgery",
        " Diets and Nutritions",
        " Rheumatology",
        " Nephrology",
        " Physical Medicine - Rehab",
        " Podiatry",
        " Chiropractic",
        " Lab Medicine - Pathology",
        " Orthopedic",
        " Autopsy",
        " Psychiatry / Psychology",
        " Speech - Language",
        " ENT - Otolaryngology",
        " Sleep Medicine",
        " Dermatology",
        " SOAP / Chart / Progress Notes",
        " General Medicine",
        " Consult - History and Phy.",
        " Obstetrics / Gynecology",
        " Gastroenterology",
        " Pain Management",
        " Discharge Summary",
    ]
    return choices_list


def doc_to_target_hard(doc):
    choices = doc_to_choice_hard(doc)
    gold = doc["medical_specialty"]
    idx = choices.index(gold)
    return idx