add xcopa

1a2cf3b2 · lintangsutawika · 1710b42d · 1a2cf3b2 · 1a2cf3b2 · 1a2cf3b2
Commit 1a2cf3b2 authored Aug 08, 2023 by lintangsutawika
13 changed files
--- a/lm_eval/tasks/xcopa/README.md
+++ b/lm_eval/tasks/xcopa/README.md
+## XCOPA: A Multilingual Dataset for Causal Commonsense Reasoning
+https://ducdauge.github.io/files/xcopa.pdf
+The Cross-lingual Choice of Plausible Alternatives dataset is a benchmark to evaluate the ability of machine learning models to transfer commonsense reasoning across languages.
+The dataset is the translation and reannotation of the English COPA (Roemmele et al. 2011) and covers 11 languages from 11 families and several areas around the globe.
+The dataset is challenging as it requires both the command of world knowledge and the ability to generalise to new languages.
+All the details about the creation of XCOPA and the implementation of the baselines are available in the paper.
+Homepage: https://github.com/cambridgeltl/xcopa
+```
+@inproceedings{ponti2020xcopa,
+  title={{XCOPA: A} Multilingual Dataset for Causal Commonsense Reasoning},
+  author={Edoardo M. Ponti and Goran Glava\v{s} and Olga Majewska and Qianchu Liu and Ivan Vuli\'{c} and Anna Korhonen},
+  booktitle={Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
+  year={2020},
+  url={https://ducdauge.github.io/files/xcopa.pdf}
+}
+```
\ No newline at end of file
--- a/lm_eval/tasks/xcopa/default_et.yaml
+++ b/lm_eval/tasks/xcopa/default_et.yaml
+group: xcopa
+task: xcopa_et
+dataset_path: xcopa
+dataset_name: et
+output_type: multiple_choice
+validation_split: validation
+test_split: test
+doc_to_text: !function utils.doc_to_text_et
+doc_to_target: !function utils.doc_to_target
+doc_to_choice: !function utils.doc_to_choice
+metric_list:
+  - metric: acc
--- a/lm_eval/tasks/xcopa/default_ht.yaml
+++ b/lm_eval/tasks/xcopa/default_ht.yaml
+include: default_et.yaml
+task: xcopa_ht
+dataset_name: ht
+doc_to_text: !function utils.doc_to_text_ht
--- a/lm_eval/tasks/xcopa/default_id.yaml
+++ b/lm_eval/tasks/xcopa/default_id.yaml
+include: default_et.yaml
+task: xcopa_id
+dataset_name: id
+doc_to_text: !function utils.doc_to_text_id
--- a/lm_eval/tasks/xcopa/default_it.yaml
+++ b/lm_eval/tasks/xcopa/default_it.yaml
+include: default_et.yaml
+task: xcopa_it
+dataset_name: it
+doc_to_text: !function utils.doc_to_text_it
--- a/lm_eval/tasks/xcopa/default_qu.yaml
+++ b/lm_eval/tasks/xcopa/default_qu.yaml
+include: default_et.yaml
+task: xcopa_qu
+dataset_name: qu
+doc_to_text: !function utils.doc_to_text_qu
--- a/lm_eval/tasks/xcopa/default_sw.yaml
+++ b/lm_eval/tasks/xcopa/default_sw.yaml
+include: default_et.yaml
+task: xcopa_sw
+dataset_name: sw
+doc_to_text: !function utils.doc_to_text_sw
--- a/lm_eval/tasks/xcopa/default_ta.yaml
+++ b/lm_eval/tasks/xcopa/default_ta.yaml
+include: default_et.yaml
+task: xcopa_ta
+dataset_name: ta
+doc_to_text: !function utils.doc_to_text_ta
--- a/lm_eval/tasks/xcopa/default_th.yaml
+++ b/lm_eval/tasks/xcopa/default_th.yaml
+include: default_et.yaml
+task: xcopa_th
+dataset_name: th
+doc_to_text: !function utils.doc_to_text_th
--- a/lm_eval/tasks/xcopa/default_tr.yaml
+++ b/lm_eval/tasks/xcopa/default_tr.yaml
+include: default_et.yaml
+task: xcopa_tr
+dataset_name: tr
+doc_to_text: !function utils.doc_to_text_tr
--- a/lm_eval/tasks/xcopa/default_vi.yaml
+++ b/lm_eval/tasks/xcopa/default_vi.yaml
+include: default_et.yaml
+task: xcopa_vi
+dataset_name: vi
+doc_to_text: !function utils.doc_to_text_vi
--- a/lm_eval/tasks/xcopa/default_zh.yaml
+++ b/lm_eval/tasks/xcopa/default_zh.yaml
+include: default_et.yaml
+task: xcopa_zh
+dataset_name: zh
+doc_to_text: !function utils.doc_to_text_zh
--- a/lm_eval/tasks/xcopa/utils.py
+++ b/lm_eval/tasks/xcopa/utils.py
+from functools import partial
+def convert_choice(choice):
+    """Return ``choice`` with its first character lower-cased.
+
+    The rest of the string is left untouched. An empty string is returned
+    unchanged instead of raising ``IndexError``.
+
+    Args:
+        choice: The text of one answer alternative.
+
+    Returns:
+        ``choice`` with only its first character converted to lower case.
+    """
+    # Slicing (not indexing) so that an empty alternative does not crash.
+    return choice[:1].lower() + choice[1:]
+def doc_to_text(doc, connector):
+    """Build the prompt: the premise without its final period, then a connector.
+
+    Args:
+        doc: Mapping with a ``"premise"`` string and a ``"question"`` key.
+        connector: Mapping from the value of ``doc["question"]`` to the
+            connector word(s) appended after the premise.
+
+    Returns:
+        The whitespace-stripped premise, minus one trailing period if it has
+        one, followed by a single space and the selected connector.
+
+    Raises:
+        KeyError: If ``doc["question"]`` has no entry in ``connector``.
+    """
+    conn = connector[doc["question"]]
+    premise = doc["premise"].strip()
+    # Drop the period only when there is one: cutting the last character
+    # unconditionally would eat a real character from any premise that does
+    # not end with a period. The ideographic full stop is covered as well.
+    if premise.endswith((".", "。")):
+        premise = premise[:-1]
+    return f"{premise} {conn}"
+def doc_to_target(doc):
+    """Return the gold alternative with its first character lower-cased.
+
+    ``doc["choice1"]`` is used when ``doc["label"]`` equals 0; any other
+    label selects ``doc["choice2"]``.
+    """
+    gold_key = "choice1" if doc["label"] == 0 else "choice2"
+    return convert_choice(doc[gold_key])
+def doc_to_choice(doc):
+    """Return both alternatives, first character lower-cased, as ``[choice1, choice2]``."""
+    return [convert_choice(doc[key]) for key in ("choice1", "choice2")]
+# Per-language prompt builders. Each one is `doc_to_text` with the connector
+# words for that language pre-bound; the task YAML files refer to them by name.
+
+# et: Estonian
+doc_to_text_et = partial(
+    doc_to_text,
+    connector={"cause": "sest", "effect": "seetõttu"},
+)
+
+# ht: Haitian Creole
+doc_to_text_ht = partial(
+    doc_to_text,
+    connector={"cause": "poukisa", "effect": "donk sa"},
+)
+
+# it: Italian
+doc_to_text_it = partial(
+    doc_to_text,
+    connector={"cause": "perché", "effect": "quindi"},
+)
+
+# id: Indonesian
+doc_to_text_id = partial(
+    doc_to_text,
+    connector={"cause": "karena", "effect": "maka"},
+)
+
+# qu: Quechua
+doc_to_text_qu = partial(
+    doc_to_text,
+    connector={"cause": "imataq", "effect": "chaymi"},
+)
+
+# sw: Swahili
+doc_to_text_sw = partial(
+    doc_to_text,
+    connector={"cause": "kwa sababu", "effect": "kwa hiyo"},
+)
+
+# zh: Chinese
+doc_to_text_zh = partial(
+    doc_to_text,
+    connector={"cause": "因为", "effect": "所以"},
+)
+
+# ta: Tamil
+doc_to_text_ta = partial(
+    doc_to_text,
+    connector={"cause": "காரணமாக", "effect": "எனவே"},
+)
+
+# th: Thai
+doc_to_text_th = partial(
+    doc_to_text,
+    connector={"cause": "เพราะ", "effect": "ดังนั้น"},
+)
+
+# tr: Turkish
+doc_to_text_tr = partial(
+    doc_to_text,
+    connector={"cause": "çünkü", "effect": "bu yüzden"},
+)
+
+# vi: Vietnamese
+doc_to_text_vi = partial(
+    doc_to_text,
+    connector={"cause": "bởi vì", "effect": "vì vậy"},
+)