utils_completion.py 3.33 KB
Newer Older
Boda Sadallah's avatar
Boda Sadallah committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os

from lm_eval.tasks.arab_culture_completion.prompts import (
    BASE_PROMPT,
    BASE_PROMPT_AR,
    JAIS_CHAT_AR,
    JAIS_CHAT_EN,
    REGION_COUNTRY_PROMPT,
    REGION_COUNTRY_PROMPT_AR,
    REGION_PROMPT,
    REGION_PROMPT_AR,
)


### get the conutry variable from environment


### Set this to one to add the country and region information to the prompt
COUNTRY = True if os.getenv("COUNTRY", True) == "True" else False
### Set this to one to add the region information to the prompt
REGION = True if os.getenv("REGION", True) == "True" else False
### Set this to change between Arabic and English for the answer keys and the choices keys
ARABIC = True if os.getenv("ARABIC", True) == "True" else False
### Get the model name
MODEL_NAME = os.getenv("MODEL_NAME")

## Uncomment this to check if the environment variables are set correctly
# print(f'Task settings: COUNTRY: {COUNTRY}, REGION: {REGION}, ARABIC: {ARABIC}', MODEL_NAME: {MODEL_NAME})

en_ar_countries_regions = {
    "Egypt": "مصر",
    "Morocco": "المغرب",
    "Algeria": "الجزائر",
    "Libya": "ليبيا",
    "Sudan": "السودان",
    "Tunisia": "تونس",
    "Jordan": "الأردن",
    "Lebanon": "لبنان",
    "Syria": "سوريا",
    "Palestine": "فلسطين",
    "Yemen": "اليمن",
    "UAE": "الإمارات",
    "KSA": "السعودية",
    "Gulf": "الخليج",
    "Levant": "الشام",
    "North Africa": "شمال أفريقيا",
    "Nile Valley": "وادي النيل",
}


# here, we only give the question to the model
def doc_to_text(doc):
    country = "" if not doc["country"] else doc["country"]
    region = "" if not doc["region"] else doc["region"]
    first_statement = doc["first_statement"].strip()

    ## We don't have a setting for only information about the country without the region
    if COUNTRY:
        assert REGION, (
            "If you want to add the country information, you must also add the region information"
        )

    ## convert contry and region name to arabic if the language is arabic
    if ARABIC:
        country = en_ar_countries_regions[country]
        region = en_ar_countries_regions[region]

    if COUNTRY and REGION:
        cur_prompt = REGION_COUNTRY_PROMPT_AR if ARABIC else REGION_COUNTRY_PROMPT
        doc_text = cur_prompt.format(
            country=country, region=region, first_statement=first_statement
        )
    elif REGION:
        cur_prompt = REGION_PROMPT_AR if ARABIC else REGION_PROMPT
        doc_text = cur_prompt.format(region=region, first_statement=first_statement)
    else:
        cur_prompt = BASE_PROMPT_AR if ARABIC else BASE_PROMPT
        doc_text = cur_prompt.format(first_statement=first_statement)

    ### apply jais chat tempelate
    if MODEL_NAME and "jais" in MODEL_NAME and "chat" in MODEL_NAME:
        if ARABIC:
            doc_text = JAIS_CHAT_AR.format(question=doc_text)
        else:
            doc_text = JAIS_CHAT_EN.format(question=doc_text)

    return doc_text


### Here we give the choices themsleves to the model
def doc_to_choice(doc):
    return doc["options"]["text"]


## The target is the choice text
def doc_to_target(doc):
    answer_key = doc["answer_key"]["english_answer_key"]
    answer_text = doc["options"]["text"][
        doc["options"]["english_keys"].index(answer_key)
    ]
    answer_text = answer_text.strip()
    return answer_text