Commit a27ea4bd authored by JessicaOjo
Browse files

add afrixnli to task

parent e6394715
import re
def wikitext_detokenizer(doc):
    """Reduce the text stored under ``doc["label"]`` to an NLI class name.

    The text is stripped of '.' and ',', lower-cased, trimmed to the part
    before the first literal ``\\n\\n`` marker, stripped of ``<pad>`` /
    ``</s>`` wrapper tokens, and then passed through ``extract_answer`` and
    ``verbalizer``.

    :param doc: Mapping with a string value under the ``"label"`` key.
    :return: ``"entailment"``, ``"contradiction"`` or ``"neutral"`` when one
        of them can be recognised, otherwise the cleaned-up text.
    """
    text = doc["label"]
    # str.replace() matches its argument literally, so the character class
    # '[.,]' has to go through re.sub to actually drop '.' and ','.
    text = re.sub(r"[.,]", "", text).lower()
    # str.split() returns a list; calling .split() on that list raised
    # AttributeError. Keep only the text before the first separator.
    # NOTE(review): assumes the answer precedes the separator - confirm
    # against real model outputs.
    text = text.split("\\n\\n")[0]
    # Take what follows the last <pad> and precedes the first </s>.
    text = text.split("<pad>")[-1].split("</s>")[0].strip()
    text = extract_answer(text)
    return verbalizer(text.strip())
def extract_answer(string):
    """Pull an NLI label out of free-form answer text.

    Searches case-insensitively for one of a fixed set of lead-in phrases
    (e.g. ``label:``, ``it is``, ``therefore``) followed, after optional
    whitespace, by ``neutral``, ``entailment`` or ``contradiction``.

    :param string: Text to search.
    :return: The label exactly as it appears in ``string`` (original casing)
        when a lead-in + label pair is found; otherwise ``string`` unchanged.
    """
    # Group 1: the phrase that introduces the verdict.
    cue = (
        r'(\*\*answer:\*\*|\*answer is:\*|\*\*|\*\*|\*answer is exact\*|label:|the premise and hypothesis '
        r'are|the premise and the hypothesis is|the premise and the hypothesis is a|described as|therefore they '
        r'are|therefore|are considered|is an exact|it is|is a|is)'
    )
    # Group 2: the verdict itself.
    label = r'(neutral|entailment|contradiction)'
    found = re.compile(cue + r'\s*' + label, re.IGNORECASE).search(string)
    if found is None:
        return string
    return found.group(2)
def verbalizer(string):
    """Map a loosely worded answer onto a canonical NLI label.

    Each canonical label has a list of accepted surface forms; the first
    label (in the order entailment, contradiction, neutral) with a surface
    form occurring as a substring of ``string`` is returned.

    :param string: Text to classify.
    :return: The canonical label, or ``string`` unchanged when no surface
        form occurs in it.
    """
    surface_forms = {
        "entailment": ['encouragement', 'entitlement', 'entails', 'entailed', 'entailment'],
        "contradiction": ['contradictory', 'contradicts', 'contradiction'],
        "neutral": ['neutral'],
    }
    for label, variants in surface_forms.items():
        if any(variant in string for variant in variants):
            return label
    return string
import yaml
import argparse
class FunctionTag:
    """Holder for a value that is dumped to yaml as a ``!function`` scalar."""

    def __init__(self, value):
        # Stored as given; read back through `.value` when the object is dumped.
        self.value = value
def function_representer(dumper, data):
    """Represent ``data`` as a ``!function``-tagged scalar.

    :param dumper: Object exposing ``represent_scalar(tag, value, style=...)``.
    :param data: Object whose ``value`` attribute holds the scalar text.
    :return: Whatever ``dumper.represent_scalar`` returns.
    """
    tag = '!function'
    node = dumper.represent_scalar(tag, data.value, style='')
    return node
# Register function_representer with yaml as the representer for FunctionTag objects.
yaml.add_representer(FunctionTag, function_representer)
def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
    """
    Write one ``afrixnli_<lang>.yaml`` task config per supported language.

    :param output_dir: Directory that receives the generated files.
    :param overwrite: If true, existing files are replaced. Otherwise existing
        files are left untouched and reported together at the end.
    :raises FileExistsError: When ``overwrite`` is false and at least one
        target file already exists; the message lists every such file.
    """
    lang_codes = ['amh', 'ibo', 'fra', 'sna', 'lin', 'wol', 'ewe', 'lug', 'xho', 'kin', 'twi', 'zul', 'orm',
                  'yor', 'hau', 'sot', 'swa']
    # "x" makes open() fail with FileExistsError instead of clobbering a file.
    open_mode = "w" if overwrite else "x"
    skipped = []

    for lang in lang_codes:
        file_name = f"afrixnli_{lang}.yaml"
        config = {
            "include": "afrixnli_common_yaml",
            "task": f"afrixnli_{lang}",
            "dataset_name": lang,
            # The target is the document's `label` field.
            "doc_to_target": "{{label}}",
            "doc_to_text": "Premise: {{premise}} \nHypothesis: {{hypothesis}} \nIs it entailment, "
                           "contradiction, or neutral?",
        }
        try:
            with open(f"{output_dir}/{file_name}", open_mode, encoding="utf8") as out:
                out.write("# Generated by utils.py\n")
                yaml.dump(config, out, allow_unicode=True)
        except FileExistsError:
            # Keep going so the final error names every conflicting file.
            skipped.append(file_name)

    if skipped:
        raise FileExistsError(
            "Files were not created because they already exist (use --overwrite flag):"
            f" {', '.join(skipped)}"
        )
def main() -> None:
    """Parse CLI args and generate language-specific yaml files.

    Command-line options:
        --overwrite   Replace yaml files that already exist. Off unless given.
        --output-dir  Directory to write the yaml files to (default: ".").
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--overwrite",
        # A store_true flag needs a False default. With default=True the flag
        # was a no-op: files were always overwritten, and the "use --overwrite
        # flag" error raised by gen_lang_yamls could never be reached.
        default=False,
        action="store_true",
        help="Overwrite files if they already exist",
    )
    parser.add_argument(
        "--output-dir", default=".", help="Directory to write yaml files to"
    )
    args = parser.parse_args()

    gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite)
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment