# utils.py

import argparse
import os

import yaml


class FunctionTag:
    """Minimal container that stores a single value on ``self.value``.

    NOTE(review): nothing in the visible part of this file references this
    class; its intended consumer is presumably elsewhere -- confirm before
    removing.
    """

    def __init__(self, value) -> None:
        """Store *value* unchanged on the instance."""
        self.value = value


def prompt_func(mode, lang):
    """Return the prompt template for the given prompt variant.

    The returned string contains the literal placeholder ``{{text}}``
    (double braces are kept as-is; presumably filled in later by the
    consumer of the generated yaml -- this function does not substitute it).

    :param mode: One of ``"prompt_1"`` .. ``"prompt_5"``.
    :param lang: Human-readable language name; interpolated into
        prompt_2, prompt_3 and prompt_5 only.
    :return: The prompt template string.
    :raises KeyError: If ``mode`` is not a known prompt key.
    """
    # NOTE(review): the templates below emit a literal backslash followed by
    # "category" ("\category"). The original source spelled this with the
    # invalid escape sequence "\c"; it is written as "\\c" here so the emitted
    # text is byte-identical while avoiding the invalid-escape SyntaxWarning.
    # "\ncategory" was probably intended -- confirm before changing, since
    # that would alter every generated prompt.
    prompt_map = {
        "prompt_1": "Given the categories science/technology, travel, politics, sports, health, entertainment, or geography; what category does the text: '{{text}}' belong to: \n\n",
        "prompt_2": f"Does this {lang} topic; "
        "'{{text}}' belong to one of the following categories: science/technology, travel, politics, sports, health, entertainment, or geography? category only\n\n",
        "prompt_3": "You are an assistant able to classify topics in texts. \n\n"
        "Given the categories science/technology, travel, politics, sports, health, entertainment, or geography; what is "
        f"the topic of the {lang} statement below? Return only the category. "
        "\n\ntext: {{text}} \\category:\n\n",
        "prompt_4": "Label the following text as science/technology, travel, politics, sports, health, entertainment, or geography. Provide only the category as your "
        "response. \n\ntext: {{text}} \\category: \n\n",
        "prompt_5": f"You are tasked with performing topic classification on the following {lang} text. "
        "For each input, classify the topic as science/technology, travel, politics, sports, health, entertainment, or geography. "
        "Use the following guidelines: \n\n "
        "science/technology: The text discusses scientific discoveries, technological advancements, or related topics. \n"
        "travel: The text describes travel experiences, destinations, or related topics. \n"
        "politics: The text covers political events, policies, or related topics. \n"
        "sports: The text talks about sports events, athletes, or related topics. \n"
        "health: The text addresses health issues, medical advancements, or related topics. \n"
        "entertainment: The text pertains to movies, music, celebrities, or related topics. \n"
        "geography: The text involves geographical information, locations, or related topics. \n\n"
        "If the text contains multiple topics, choose the dominant topic. "
        "For ambiguous or unclear topics, select the category that best reflects the overall content. "
        "Please provide a single classification for each input.\n\ntext: {{text}} \\category: \n\n",
    }
    return prompt_map[mode]


def gen_lang_yamls(output_dir: str, overwrite: bool, mode: str) -> None:
    """
    Generate a yaml file for each language.

    One ``sib_<lang>.yaml`` file per language is written into
    ``<output_dir>/<mode>/``; that directory is created if it does not exist.
    Each file starts with a "Generated by" comment followed by the dumped
    task configuration (``include``, ``task``, ``dataset_name``,
    ``doc_to_text``).

    :param output_dir: The directory to output the files to.
    :param overwrite: Whether to overwrite files if they already exist.
    :param mode: Prompt variant key passed to ``prompt_func``; also used as
        the output sub-directory name and as the task-name suffix.
    :raises FileExistsError: If ``overwrite`` is false and at least one target
        file already exists. Files that did not exist are still written; the
        error lists the ones that were skipped.
    :raises KeyError: If ``mode`` is not a prompt key known to ``prompt_func``.
    """
    # Language code -> human-readable language name (interpolated into prompts).
    languages = {
        "aeb": "Tunisian Arabic",
        "afr": "Afrikaans",
        "aka": "Akan",
        "amh": "Amharic",
        "ary": "Moroccan Arabic",
        "arz": "Egyptian Arabic",
        "bam": "Bambara",
        "bem": "Bemba",
        "cjk": "Chokwe",
        "dik": "Southwestern Dinka",
        "dyu": "Dyula",
        "eng": "English",
        "ewe": "Ewe",
        "fon": "Fon",
        "fra": "French",
        "fuv": "Nigerian Fulfulde",
        "gaz": "West Central Oromo",
        "hau": "Hausa",
        "ibo": "Igbo",
        "kab": "Kabyle",
        "kam": "Kamba",
        "kmb": "Kimbundu",
        "kbp": "Kabiye",
        "kea": "Kabuverdianu",
        "kik": "Kikuyu",
        "kin": "Kinyarwanda",
        "kon": "Kikongo",
        "knc": "Central Kanuri",
        "lua": "Luba-Kasai",
        "lug": "Luganda",
        "luo": "Luo",
        "lin": "Lingala",
        "mos": "Mossi",
        "nus": "Nuer",
        "nso": "Northern Sotho",
        # ISO 639-3 "nya" is Nyanja (Chichewa); "Nyanga" is a different language.
        "nya": "Nyanja",
        "plt": "Plateau Malagasy",
        "por": "Portuguese",
        "run": "Rundi",
        "sag": "Sango",
        "sna": "Shona",
        "som": "Somali",
        "sot": "Southern Sotho",
        "ssw": "Swazi",
        "swa": "Swahili",
        "taq": "Tamasheq",
        "tir": "Tigrinya",
        "tum": "Tumbuka",
        "tso": "Tsonga",
        "twi": "Twi",
        "tzm": "Tamazight",
        "umb": "Umbundu",
        "wol": "Wolof",
        "xho": "Xhosa",
        "yor": "Yoruba",
        "zul": "Zulu",
    }

    # Language code -> dataset config name (code plus script suffix).
    # Note "swa" maps to "swh_Latn", not "swa_Latn".
    lang_2_dataset_lang_code = {
        "aeb": "aeb_Arab",
        "afr": "afr_Latn",
        "aka": "aka_Latn",
        "amh": "amh_Ethi",
        "ary": "ary_Arab",
        "arz": "arz_Arab",
        "bam": "bam_Latn",
        "bem": "bem_Latn",
        "cjk": "cjk_Latn",
        "dik": "dik_Latn",
        "dyu": "dyu_Latn",
        "eng": "eng_Latn",
        "ewe": "ewe_Latn",
        "fon": "fon_Latn",
        "fra": "fra_Latn",
        "fuv": "fuv_Latn",
        "gaz": "gaz_Latn",
        "hau": "hau_Latn",
        "ibo": "ibo_Latn",
        "kab": "kab_Latn",
        "kam": "kam_Latn",
        "kmb": "kmb_Latn",
        "kbp": "kbp_Latn",
        "kea": "kea_Latn",
        "kik": "kik_Latn",
        "kin": "kin_Latn",
        "kon": "kon_Latn",
        "knc": "knc_Latn",
        "lua": "lua_Latn",
        "lug": "lug_Latn",
        "luo": "luo_Latn",
        "lin": "lin_Latn",
        "mos": "mos_Latn",
        "nus": "nus_Latn",
        "nso": "nso_Latn",
        "nya": "nya_Latn",
        "plt": "plt_Latn",
        "por": "por_Latn",
        "run": "run_Latn",
        "sag": "sag_Latn",
        "sna": "sna_Latn",
        "som": "som_Latn",
        "sot": "sot_Latn",
        "ssw": "ssw_Latn",
        "swa": "swh_Latn",
        "taq": "taq_Latn",
        "tir": "tir_Ethi",
        "tum": "tum_Latn",
        "tso": "tso_Latn",
        "twi": "twi_Latn",
        "tzm": "tzm_Tfng",
        "umb": "umb_Latn",
        "wol": "wol_Latn",
        "xho": "xho_Latn",
        "yor": "yor_Latn",
        "zul": "zul_Latn",
    }

    # Build every config up front so an invalid ``mode`` (KeyError from
    # prompt_func) or a missing dataset code fails before anything is
    # created on disk.
    configs = {
        f"sib_{lang}.yaml": {
            "include": "sib",
            "task": f"sib_{lang}_{mode}",
            "dataset_name": lang_2_dataset_lang_code[lang],
            "doc_to_text": prompt_func(mode, lang_name),
        }
        for lang, lang_name in languages.items()
    }

    target_dir = os.path.join(output_dir, mode)
    os.makedirs(target_dir, exist_ok=True)

    # "x" (exclusive creation) makes open() raise FileExistsError instead of
    # clobbering an existing file.
    open_mode = "w" if overwrite else "x"

    err = []
    for file_name, yaml_details in configs.items():
        try:
            with open(
                os.path.join(target_dir, file_name),
                open_mode,
                encoding="utf8",
            ) as f:
                f.write("# Generated by utils.py\n")
                yaml.dump(
                    yaml_details,
                    f,
                    allow_unicode=True,
                )
        except FileExistsError:
            # Keep going so every conflicting file is reported in one error.
            err.append(file_name)

    if err:
        raise FileExistsError(
            "Files were not created because they already exist (use --overwrite flag):"
            f" {', '.join(err)}"
        )


def main() -> None:
    """Parse CLI args and generate language-specific yaml files.

    Options:
      --overwrite / --no-overwrite  Whether existing files are replaced
                                    (overwriting is the default).
      --output-dir                  Base directory for the generated files.
      --mode                        Prompt variant, ``prompt_1`` .. ``prompt_5``.
    """
    parser = argparse.ArgumentParser()
    # ``--overwrite`` defaults to True, so on its own the flag could never be
    # switched off. ``--no-overwrite`` shares the same destination and lets
    # callers opt out while keeping the existing default and flag intact.
    parser.add_argument(
        "--overwrite",
        dest="overwrite",
        default=True,
        action="store_true",
        help="Overwrite files if they already exist",
    )
    parser.add_argument(
        "--no-overwrite",
        dest="overwrite",
        action="store_false",
        help="Fail instead of overwriting files that already exist",
    )
    parser.add_argument(
        "--output-dir",
        default="./",
        help="Directory to write yaml files to",
    )
    parser.add_argument(
        "--mode",
        default="prompt_3",
        choices=["prompt_1", "prompt_2", "prompt_3", "prompt_4", "prompt_5"],
        help="Prompt number",
    )
    args = parser.parse_args()

    gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite, mode=args.mode)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()