"...lm-evaluation-harness.git" did not exist on "a48e3cf2c9f865fea3b3bafa2b3c4ab21011c78c"
Unverified commit 35a24652, authored by Aflah, committed by GitHub

Merge pull request #1 from EleutherAI/toxicity-test

Toxicity test
Parents: 52213e29 0021de21
task: realtoxicityprompts
dataset_path: "allenai/real-toxicity-prompts"
dataset_name: null
dataset_kwargs: null
training_split: 'train'
validation_split: null
test_split: 'train'
doc_to_text: "{{' '+prompt.text}}"
doc_to_target: ""
metric_list:
  - metric: !function metric.toxicity_perspective_api
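
The `!function` directive points the metric at `toxicity_perspective_api` in the task's `metric.py`, which this diff does not include. As a minimal sketch of what such a scorer could look like, assuming Google's public Perspective API with an API key in the environment (the function's signature and return convention here are illustrative, not the harness's actual contract):

```python
import os

import requests

# Public endpoint documented for Google's Perspective API.
PERSPECTIVE_URL = "https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze"


def toxicity_perspective_api(predictions, **kwargs):
    """Return a Perspective TOXICITY score in [0, 1] per generated continuation.

    Sketch only: the harness's real metric contract lives in metric.py,
    which this diff does not show.
    """
    scores = []
    for text in predictions:
        resp = requests.post(
            PERSPECTIVE_URL,
            params={"key": os.environ["PERSPECTIVE_API_KEY"]},
            json={
                "comment": {"text": text},
                "requestedAttributes": {"TOXICITY": {}},
            },
            timeout=30,
        )
        resp.raise_for_status()
        scores.append(
            resp.json()["attributeScores"]["TOXICITY"]["summaryScore"]["value"]
        )
    return scores
```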
## XCOPA: A Multilingual Dataset for Causal Commonsense Reasoning
https://ducdauge.github.io/files/xcopa.pdf
The Cross-lingual Choice of Plausible Alternatives dataset is a benchmark to evaluate the ability of machine learning models to transfer commonsense reasoning across languages.
The dataset is the translation and reannotation of the English COPA (Roemmele et al. 2011) and covers 11 languages from 11 families and several areas around the globe.
The dataset is challenging as it requires both the command of world knowledge and the ability to generalise to new languages.
All the details about the creation of XCOPA and the implementation of the baselines are available in the paper.
Homepage: https://github.com/cambridgeltl/xcopa
```
@inproceedings{ponti2020xcopa,
title={{XCOPA: A} Multilingual Dataset for Causal Commonsense Reasoning},
author={Edoardo M. Ponti and Goran Glava\v{s} and Olga Majewska and Qianchu Liu and Ivan Vuli\'{c} and Anna Korhonen},
booktitle={Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
year={2020},
url={https://ducdauge.github.io/files/xcopa.pdf}
}
```
group: xcopa
task: xcopa_et
dataset_path: xcopa
dataset_name: et
output_type: multiple_choice
validation_split: validation
test_split: test
doc_to_text: !function utils.doc_to_text_et
doc_to_target: label
doc_to_choice: !function utils.doc_to_choice
metric_list:
  - metric: acc
include: default_et.yaml
task: xcopa_ht
dataset_name: ht
doc_to_text: !function utils.doc_to_text_ht
include: default_et.yaml
task: xcopa_id
dataset_name: id
doc_to_text: !function utils.doc_to_text_id
include: default_et.yaml
task: xcopa_it
dataset_name: it
doc_to_text: !function utils.doc_to_text_it
include: default_et.yaml
task: xcopa_qu
dataset_name: qu
doc_to_text: !function utils.doc_to_text_qu
include: default_et.yaml
task: xcopa_sw
dataset_name: sw
doc_to_text: !function utils.doc_to_text_sw
include: default_et.yaml
task: xcopa_ta
dataset_name: ta
doc_to_text: !function utils.doc_to_text_ta
include: default_et.yaml
task: xcopa_th
dataset_name: th
doc_to_text: !function utils.doc_to_text_th
include: default_et.yaml
task: xcopa_tr
dataset_name: tr
doc_to_text: !function utils.doc_to_text_tr
include: default_et.yaml
task: xcopa_vi
dataset_name: vi
doc_to_text: !function utils.doc_to_text_vi
include: default_et.yaml
task: xcopa_zh
dataset_name: zh
doc_to_text: !function utils.doc_to_text_zh
from functools import partial

def convert_choice(choice):
    # Lowercase the first character so the choice reads as a mid-sentence continuation.
    return choice[0].lower() + choice[1:]


def doc_to_text(doc, connector):
    # Pick the language-specific connective ("because"/"therefore") for the
    # question type, drop the premise's trailing period, and append it.
    conn = connector[doc["question"]]
    return doc["premise"].strip()[:-1] + f" {conn}"


def doc_to_choice(doc):
    # Both alternatives, lowercased to follow the connective.
    return [convert_choice(doc["choice1"]), convert_choice(doc["choice2"])]

doc_to_text_et = partial(
    doc_to_text,
    connector={
        "cause": "sest",
        "effect": "seetõttu",
    },
)

doc_to_text_ht = partial(
    doc_to_text,
    connector={
        "cause": "poukisa",
        "effect": "donk sa",
    },
)

doc_to_text_it = partial(
    doc_to_text,
    connector={
        "cause": "perché",
        "effect": "quindi",
    },
)

doc_to_text_id = partial(
    doc_to_text,
    connector={
        "cause": "karena",
        "effect": "maka",
    },
)

doc_to_text_qu = partial(
    doc_to_text,
    connector={
        "cause": "imataq",
        "effect": "chaymi",
    },
)

doc_to_text_sw = partial(
    doc_to_text,
    connector={
        "cause": "kwa sababu",
        "effect": "kwa hiyo",
    },
)

doc_to_text_zh = partial(
    doc_to_text,
    connector={
        "cause": "因为",
        "effect": "所以",
    },
)

doc_to_text_ta = partial(
    doc_to_text,
    connector={
        "cause": "காரணமாக",
        "effect": "எனவே",
    },
)

doc_to_text_th = partial(
    doc_to_text,
    connector={
        "cause": "เพราะ",
        "effect": "ดังนั้น",
    },
)

doc_to_text_tr = partial(
    doc_to_text,
    connector={
        "cause": "çünkü",
        "effect": "bu yüzden",
    },
)

doc_to_text_vi = partial(
    doc_to_text,
    connector={
        "cause": "bởi vì",
        "effect": "vì vậy",
    },
)
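
To see how these helpers fit together, here is a small worked example for Italian (the document fields follow the XCOPA schema used above; the premise and choices are invented for illustration):

```python
doc = {
    "premise": "L'uomo aprì il rubinetto.",
    "question": "effect",
    "choice1": "L'acqua scorreva.",
    "choice2": "Il rubinetto si ruppe.",
}

doc_to_text_it(doc)   # -> "L'uomo aprì il rubinetto quindi"
doc_to_choice(doc)    # -> ["l'acqua scorreva.", "il rubinetto si ruppe."]
```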
# XNLI
### Paper
Title: `XNLI: Evaluating Cross-lingual Sentence Representations`
Abstract: https://arxiv.org/abs/1809.05053
Based on the implementation of @yongzx (see https://github.com/EleutherAI/lm-evaluation-harness/pull/258)
Prompt format (same as XGLM and mGPT):
sentence1 + ", right? " + mask = (Yes|Also|No) + ", " + sentence2
The prediction is the full candidate sequence with the highest likelihood.
Language-specific prompts were translated word by word with Google Translate
and may differ from those used by mGPT and XGLM (neither paper provides its prompts).
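
To make the format concrete, here is the construction spelled out in Python for English (label words taken from the prompt table in `utils.py` below; the premise/hypothesis pair is invented):

```python
premise = "The cat sat on the mat."
hypothesis = "An animal was on the mat."

# One candidate sequence per label; the predicted label is whichever
# candidate the model assigns the highest likelihood.
candidates = [
    premise + ", right? Yes, " + hypothesis,   # entailment
    premise + ", right? Also, " + hypothesis,  # neutral
    premise + ", right? No, " + hypothesis,    # contradiction
]
```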
Homepage: https://github.com/facebookresearch/XNLI
### Citation
"""
@InProceedings{conneau2018xnli,
author = "Conneau, Alexis
and Rinott, Ruty
and Lample, Guillaume
and Williams, Adina
and Bowman, Samuel R.
and Schwenk, Holger
and Stoyanov, Veselin",
title = "XNLI: Evaluating Cross-lingual Sentence Representations",
booktitle = "Proceedings of the 2018 Conference on Empirical Methods
in Natural Language Processing",
year = "2018",
publisher = "Association for Computational Linguistics",
location = "Brussels, Belgium",
}
"""
### Groups and Tasks
#### Groups
* `xnli`
#### Tasks
* `xnli_ar`: Arabic
* `xnli_bg`: Bulgarian
* `xnli_de`: German
* `xnli_el`: Greek
* `xnli_en`: English
* `xnli_es`: Spanish
* `xnli_fr`: French
* `xnli_hi`: Hindi
* `xnli_ru`: Russian
* `xnli_sw`: Swahili
* `xnli_th`: Thai
* `xnli_tr`: Turkish
* `xnli_ur`: Urdu
* `xnli_vi`: Vietnamese
* `xnli_zh`: Chinese
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
import argparse
from typing import Dict, List
import yaml
# Different languages that are part of xnli.
# These correspond to dataset names (Subsets) on HuggingFace.
# A yaml file is generated by this script for each language.
LANGUAGES = {
    "ar": {  # Arabic
        "QUESTION_WORD": "صحيح",
        "ENTAILMENT_LABEL": "نعم",
        "NEUTRAL_LABEL": "لذا",
        "CONTRADICTION_LABEL": "رقم",
    },
    "bg": {  # Bulgarian
        "QUESTION_WORD": "правилно",
        "ENTAILMENT_LABEL": "да",
        "NEUTRAL_LABEL": "така",
        "CONTRADICTION_LABEL": "не",
    },
    "de": {  # German
        "QUESTION_WORD": "richtig",
        "ENTAILMENT_LABEL": "Ja",
        "NEUTRAL_LABEL": "Auch",
        "CONTRADICTION_LABEL": "Nein",
    },
    "el": {  # Greek
        "QUESTION_WORD": "σωστός",
        "ENTAILMENT_LABEL": "Ναί",
        "NEUTRAL_LABEL": "Έτσι",
        "CONTRADICTION_LABEL": "όχι",
    },
    "en": {  # English
        "QUESTION_WORD": "right",
        "ENTAILMENT_LABEL": "Yes",
        "NEUTRAL_LABEL": "Also",
        "CONTRADICTION_LABEL": "No",
    },
    "es": {  # Spanish
        "QUESTION_WORD": "correcto",
        "ENTAILMENT_LABEL": "Sí",
        "NEUTRAL_LABEL": "Asi que",
        "CONTRADICTION_LABEL": "No",
    },
    "fr": {  # French
        "QUESTION_WORD": "correct",
        "ENTAILMENT_LABEL": "Oui",
        "NEUTRAL_LABEL": "Aussi",
        "CONTRADICTION_LABEL": "Non",
    },
    "hi": {  # Hindi
        "QUESTION_WORD": "सही",
        "ENTAILMENT_LABEL": "हाँ",
        "NEUTRAL_LABEL": "इसलिए",
        "CONTRADICTION_LABEL": "नहीं",
    },
    "ru": {  # Russian
        "QUESTION_WORD": "правильно",
        "ENTAILMENT_LABEL": "Да",
        "NEUTRAL_LABEL": "Так",
        "CONTRADICTION_LABEL": "Нет",
    },
    "sw": {  # Swahili
        "QUESTION_WORD": "sahihi",
        "ENTAILMENT_LABEL": "Ndiyo",
        "NEUTRAL_LABEL": "Hivyo",
        "CONTRADICTION_LABEL": "Hapana",
    },
    "th": {  # Thai
        "QUESTION_WORD": "ถูกต้อง",
        "ENTAILMENT_LABEL": "ใช่",
        "NEUTRAL_LABEL": "ดังนั้น",
        "CONTRADICTION_LABEL": "ไม่",
    },
    "tr": {  # Turkish
        "QUESTION_WORD": "doğru",
        "ENTAILMENT_LABEL": "Evet",
        "NEUTRAL_LABEL": "Böylece",
        "CONTRADICTION_LABEL": "Hayır",
    },
    "ur": {  # Urdu
        "QUESTION_WORD": "صحیح",
        "ENTAILMENT_LABEL": "جی ہاں",
        "NEUTRAL_LABEL": "اس لئے",
        "CONTRADICTION_LABEL": "نہیں",
    },
    "vi": {  # Vietnamese
        "QUESTION_WORD": "đúng",
        "ENTAILMENT_LABEL": "Vâng",
        "NEUTRAL_LABEL": "Vì vậy",
        "CONTRADICTION_LABEL": "Không",
    },
    "zh": {  # Chinese
        "QUESTION_WORD": "正确",
        "ENTAILMENT_LABEL": "是的",
        "NEUTRAL_LABEL": "所以",
        "CONTRADICTION_LABEL": "不是的",
    },
}

def gen_lang_yamls(output_dir: str, overwrite: bool) -> None:
    """
    Generate a yaml file for each language.

    :param output_dir: The directory to output the files to.
    :param overwrite: Whether to overwrite files if they already exist.
    """
    err = []
    for lang in LANGUAGES.keys():
        file_name = f"xnli_{lang}.yaml"
        try:
            QUESTION_WORD = LANGUAGES[lang]["QUESTION_WORD"]
            ENTAILMENT_LABEL = LANGUAGES[lang]["ENTAILMENT_LABEL"]
            NEUTRAL_LABEL = LANGUAGES[lang]["NEUTRAL_LABEL"]
            CONTRADICTION_LABEL = LANGUAGES[lang]["CONTRADICTION_LABEL"]
            with open(
                f"{output_dir}/{file_name}", "w" if overwrite else "x", encoding="utf8"
            ) as f:
                f.write("# Generated by utils.py\n")
                yaml.dump(
                    {
                        "include": "xnli_common_yaml",
                        "dataset_name": lang,
                        "task": f"xnli_{lang}",
                        "doc_to_text": "",
                        "doc_to_choice": f"{{{{["
                        f"""premise+\", {QUESTION_WORD}? {ENTAILMENT_LABEL}, \"+hypothesis,"""
                        f"""premise+\", {QUESTION_WORD}? {NEUTRAL_LABEL}, \"+hypothesis,"""
                        f"""premise+\", {QUESTION_WORD}? {CONTRADICTION_LABEL}, \"+hypothesis"""
                        f"]}}}}",
                    },
                    f,
                    allow_unicode=True,
                )
        except FileExistsError:
            err.append(file_name)

    if len(err) > 0:
        raise FileExistsError(
            "Files were not created because they already exist (use --overwrite flag):"
            f" {', '.join(err)}"
        )

def main() -> None:
    """Parse CLI args and generate language-specific yaml files."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--overwrite",
        default=False,
        action="store_true",
        help="Overwrite files if they already exist",
    )
    parser.add_argument(
        "--output-dir", default=".", help="Directory to write yaml files to"
    )
    args = parser.parse_args()

    gen_lang_yamls(output_dir=args.output_dir, overwrite=args.overwrite)


if __name__ == "__main__":
    main()
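
When run as a script, `utils.py` writes one config per language; the same can be done programmatically (a minimal sketch, assuming this file is importable as `utils`):

```python
# Equivalent to: python utils.py --overwrite --output-dir .
from utils import gen_lang_yamls

# Writes xnli_ar.yaml, xnli_bg.yaml, ..., xnli_zh.yaml into the current
# directory, overwriting configs from a previous run.
gen_lang_yamls(output_dir=".", overwrite=True)
```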
# Generated by utils.py
dataset_name: ar
doc_to_choice: '{{[premise+", صحيح? نعم, "+hypothesis,premise+", صحيح? لذا, "+hypothesis,premise+",
صحيح? رقم, "+hypothesis]}}'
doc_to_text: ''
include: xnli_common_yaml
task: xnli_ar
# Generated by utils.py
dataset_name: bg
doc_to_choice: '{{[premise+", правилно? да, "+hypothesis,premise+", правилно? така,
"+hypothesis,premise+", правилно? не, "+hypothesis]}}'
doc_to_text: ''
include: xnli_common_yaml
task: xnli_bg
# This file will be included in the generated language-specific task configs.
# It doesn't have a yaml file extension as it is not meant to be imported directly
# by the harness.
group: xnli
task: null
dataset_path: xnli
dataset_name: null
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: null
doc_to_target: label
doc_to_choice: null
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
# Generated by utils.py
dataset_name: de
doc_to_choice: '{{[premise+", richtig? Ja, "+hypothesis,premise+", richtig? Auch,
"+hypothesis,premise+", richtig? Nein, "+hypothesis]}}'
doc_to_text: ''
include: xnli_common_yaml
task: xnli_de