Commit 2106fbeb authored by Baber

Merge branch 'main' into mathvista

# Conflicts:
#	lm_eval/models/openai_completions.py
parents 4354fe46 703fbffd
# Generated by _generate_configs.py
dataset_name: yo
include: _default_yaml
task: global_mmlu_yo
# Generated by _generate_configs.py
dataset_name: zh
include: _default_yaml
task: global_mmlu_zh
# HumanEval
## Paper
Evaluating Large Language Models Trained on Code
https://arxiv.org/abs/2107.03374
We introduce Codex, a GPT language model fine-tuned on publicly available code from GitHub, and study its Python code-writing capabilities. A distinct production version of Codex powers GitHub Copilot. On HumanEval, a new evaluation set we release to measure functional correctness for synthesizing programs from docstrings, our model solves 28.8% of the problems, while GPT-3 solves 0% and GPT-J solves 11.4%. Furthermore, we find that repeated sampling from the model is a surprisingly effective strategy for producing working solutions to difficult prompts. Using this method, we solve 70.2% of our problems with 100 samples per problem. Careful investigation of our model reveals its limitations, including difficulty with docstrings describing long chains of operations and with binding operations to variables. Finally, we discuss the potential broader impacts of deploying powerful code generation technologies, covering safety, security, and economics.
Homepage: https://github.com/openai/human-eval
## Citation
```
@article{chen2021codex,
title={Evaluating Large Language Models Trained on Code},
author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser and Mohammad Bavarian and Clemens Winter and Philippe Tillet and Felipe Petroski Such and Dave Cummings and Matthias Plappert and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain and William Saunders and Christopher Hesse and Andrew N. Carr and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba},
year={2021},
eprint={2107.03374},
archivePrefix={arXiv},
primaryClass={cs.LG}
}
```
### Groups and Tasks
#### Groups
* Not part of a group yet.
#### Tasks
- `humaneval`: greedy decoding (one sample per problem), reports pass@1
- `humaneval_64`: sampling variant (64 completions per problem at temperature 0.2, top-p 0.95), reports pass@k for k ∈ {2, 8, 16, 32, 64}; see the estimator sketch below
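Both variants score generations with the unbiased pass@k estimator from the paper above; the task's `utils.py` delegates the actual computation to Hugging Face `evaluate`'s `code_eval` metric. For reference, a minimal sketch of the per-problem estimator:

```python
import math


def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k for one problem, where n = samples drawn and c = samples
    passing the unit tests. Equals 1 - C(n - c, k) / C(n, k), computed stably."""
    if n - c < k:
        return 1.0
    return 1.0 - math.prod(1.0 - k / i for i in range(n - c + 1, n + 1))


# e.g. 64 samples with 16 passing: chance that at least one of 8 random samples passes
print(pass_at_k(n=64, c=16, k=8))  # ≈ 0.91
```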
### Checklist
For adding novel benchmarks/datasets to the library:
* [ ] Is the task an existing benchmark in the literature?
* [ ] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
task: humaneval
dataset_path: openai/openai_humaneval
unsafe_code: true
output_type: generate_until
test_split: test
doc_to_text: "{{prompt}}"
doc_to_target: "{{test}}\ncheck({{entry_point}})"
metric_list:
  - metric: !function utils.pass_at_k
    aggregation: mean
    higher_is_better: true
    k: [1]
generation_kwargs:
  until:
    - "\nclass"
    - "\ndef"
    - "\n#"
    - "\nif"
    - "\nprint"
  max_gen_toks: 1024
  do_sample: false
repeats: 1
num_fewshot: 0
filter_list:
  - name: "create_test"
    filter:
      - function: "custom"
        filter_fn: !function utils.build_predictions
metadata:
  version: 1.0
include: humaneval.yaml
task: humaneval_64
repeats: 64
metric_list:
  - metric: !function utils.pass_at_k
    aggregation: mean
    higher_is_better: true
    k: [2,8,16,32,64]
generation_kwargs:
  until:
    - "\nclass"
    - "\ndef"
    - "\n#"
    - "\nif"
    - "\nprint"
  max_gen_toks: 1024
  do_sample: true
  temperature: 0.2
  top_p: 0.95
import evaluate as hf_evaluate


try:
    # Load HF's code_eval metric and run a one-off smoke test so that a missing
    # dependency or a disallowed execution environment fails loudly at import time.
    compute_ = hf_evaluate.load("code_eval")
    test_cases = ["assert add(2, 3)==5"]
    candidates = [["def add(a,b): return a*b"]]
    results = compute_.compute(references=test_cases, predictions=candidates, k=[1])
except Exception as e:
    raise e


def pass_at_k(references: list[str], predictions: list[list[str]], k: list[int] = None):
    """Aggregate pass@k over all problems via HF's code_eval metric."""
    global compute_
    assert k is not None
    if isinstance(k, int):
        k = [k]
    res = compute_.compute(
        references=references,
        predictions=predictions,
        k=k,
    )
    # compute() returns (pass_at_k_dict, per-candidate results); keep only the scores.
    return res[0]


def build_predictions(resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
    """Prepend each doc's prompt to every sampled continuation so candidates are full programs."""
    return [[doc["prompt"] + r for r in resp] for resp, doc in zip(resps, docs)]
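To show how the hooks above fit together with `humaneval.yaml` (the `create_test` filter assembles each raw continuation into a full program, `doc_to_target` supplies the unit tests plus the `check(entry_point)` call, and `pass_at_k` scores everything through `code_eval`), here is a minimal, hypothetical example. `HF_ALLOW_CODE_EVAL=1` is required because HF `evaluate` otherwise refuses to execute generated code:

```python
import os

os.environ["HF_ALLOW_CODE_EVAL"] = "1"  # opt in to running generated code (required by code_eval)

# Hypothetical doc and model response, shaped like the HumanEval fields used above.
docs = [{"prompt": "def add(a, b):\n",
         "test": "def check(candidate):\n    assert candidate(2, 3) == 5\n",
         "entry_point": "add"}]
resps = [["    return a + b"]]                   # raw continuation(s) sampled from the model

preds = build_predictions(resps, docs)           # prompt + continuation -> full candidate program
refs = [d["test"] + "\ncheck(" + d["entry_point"] + ")" for d in docs]  # mirrors doc_to_target

print(pass_at_k(refs, preds, k=[1]))             # -> {'pass@1': 1.0}
```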
@@ -15,13 +15,14 @@
 """Utility library of instructions."""
 import functools
+import os
 import random
 import re
+from importlib.metadata import version
 import immutabledict
 import nltk
-import pkg_resources
-from packaging import version
 # Downloading 'punkt' with nltk<3.9 has a remote code vuln.
+from packaging.version import parse as parse_version
@@ -29,19 +30,22 @@ from packaging import version
 # and https://github.com/nltk/nltk/issues/3266
 # for more information.
 NLTK_MIN_VERSION = "3.9.1"
+RANK = os.environ.get("LOCAL_RANK", "0")
 def download_nltk_resources():
     """Download 'punkt' if not already installed"""
-    nltk_version = pkg_resources.get_distribution("nltk").version
     assert (
-        version.parse(nltk_version) >= version.parse(NLTK_MIN_VERSION)
+        (nltk_version := parse_version(version("nltk")))
+        >= parse_version(NLTK_MIN_VERSION)
     ), f"`nltk` version {nltk_version} is not >= {NLTK_MIN_VERSION}. Please update `nltk` before proceeding--older versions are vulnerable to a remote code execution vulnerability."
     try:
         nltk.data.find("tokenizers/punkt_tab")
     except LookupError:
-        nltk.download("punkt_tab")
+        if RANK == "0":
+            nltk.download("punkt_tab")
+            print("Downloaded punkt_tab on rank 0")
 download_nltk_resources()
...
# Japanese Leaderboard
The Japanese LLM Leaderboard evaluates language models based on a wide range of NLP tasks that reflect the characteristics of the Japanese language.
### Groups, Tags, and Tasks
#### Groups
* `japanese_leaderboard`: runs all tasks defined in this directory (a usage sketch follows the task list below)
#### Tasks
##### Generation Evaluation
* `ja_leaderboard_jaqket_v2`: The JAQKET dataset is designed for Japanese question answering research, featuring quiz-like questions with answers derived from Wikipedia article titles. [Source](https://github.com/kumapo/JAQKET-dataset)
* `ja_leaderboard_mgsm`: Multilingual Grade School Math Benchmark (MGSM) is a benchmark of grade-school math problems, proposed in the paper Language models are multilingual chain-of-thought reasoners. [Source](https://huggingface.co/datasets/juletxara/mgsm)
* `ja_leaderboard_xlsum`: This is the filtered Japanese subset of XL-Sum. [Source](https://github.com/csebuetnlp/xl-sum)
* `ja_leaderboard_jsquad`: JSQuAD is a Japanese version of SQuAD, a reading comprehension dataset. Each instance in the dataset consists of a question regarding a given context (Wikipedia article) and its answer. JSQuAD is based on SQuAD 1.1 (there are no unanswerable questions). [Source](https://github.com/yahoojapan/JGLUE)
##### Multi-Choice/Classification Evaluation
* `ja_leaderboard_jcommonsenseqa`: JCommonsenseQA is a Japanese version of CommonsenseQA, which is a multiple-choice question answering dataset that requires commonsense reasoning ability. [Source](https://github.com/yahoojapan/JGLUE)
* `ja_leaderboard_jnli`: JNLI is a Japanese NLI (Natural Language Inference) dataset from JGLUE. The inference relations are entailment (含意), contradiction (矛盾), and neutral (中立). [Source](https://github.com/yahoojapan/JGLUE)
* `ja_leaderboard_marc_ja`: MARC-ja is a text classification dataset based on the Japanese portion of Multilingual Amazon Reviews Corpus (MARC). [Source](https://github.com/yahoojapan/JGLUE)
* `ja_leaderboard_xwinograd`: This is the Japanese portion of XWinograd. [Source](https://huggingface.co/datasets/polm-stability/xwinograd-ja)
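For reference, a minimal sketch of running the group (or any single task) through the harness's Python API; the model checkpoint below is only a placeholder:

```python
import lm_eval
from lm_eval.models.huggingface import HFLM

# Any Hugging Face causal LM works here; this checkpoint name is just an example.
lm = HFLM(pretrained="rinna/japanese-gpt-neox-small", batch_size=8)
results = lm_eval.simple_evaluate(model=lm, tasks=["japanese_leaderboard"])
print(results["results"])
```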
### Citation
```bibtex
@inproceedings{ja_leaderboard_jaqket_v2,
title = {JAQKET: クイズを題材にした日本語 QA データセットの構築},
author = {鈴木正敏 and 鈴木潤 and 松田耕史 and ⻄田京介 and 井之上直也},
year = 2020,
booktitle = {言語処理学会第26回年次大会},
url = {https://www.anlp.jp/proceedings/annual_meeting/2020/pdf_dir/P2-24.pdf}
}
@article{ja_leaderboard_mgsm_1,
title = {Training Verifiers to Solve Math Word Problems},
author = {
Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and
Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro
and Hesse, Christopher and Schulman, John
},
year = 2021,
journal = {arXiv preprint arXiv:2110.14168}
}
@misc{ja_leaderboard_mgsm_2,
title = {Language Models are Multilingual Chain-of-Thought Reasoners},
author = {
Freda Shi and Mirac Suzgun and Markus Freitag and Xuezhi Wang and Suraj Srivats and Soroush
Vosoughi and Hyung Won Chung and Yi Tay and Sebastian Ruder and Denny Zhou and Dipanjan Das and
Jason Wei
},
year = 2022,
eprint = {2210.03057},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
@inproceedings{ja_leaderboard_xlsum,
title = {{XL}-Sum: Large-Scale Multilingual Abstractive Summarization for 44 Languages},
author = {
Hasan, Tahmid and Bhattacharjee, Abhik and Islam, Md. Saiful and Mubasshir, Kazi and Li,
Yuan-Fang and Kang, Yong-Bin and Rahman, M. Sohel and Shahriyar, Rifat
},
year = 2021,
month = aug,
booktitle = {Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021},
publisher = {Association for Computational Linguistics},
address = {Online},
pages = {4693--4703},
url = {https://aclanthology.org/2021.findings-acl.413}
}
@article{jglue_2023,
title = {JGLUE: 日本語言語理解ベンチマーク},
author = {栗原 健太郎 and 河原 大輔 and 柴田 知秀},
year = 2023,
journal = {自然言語処理},
volume = 30,
number = 1,
pages = {63--87},
doi = {10.5715/jnlp.30.63},
url = {https://www.jstage.jst.go.jp/article/jnlp/30/1/30_63/_article/-char/ja}
}
@inproceedings{jglue_kurihara-etal-2022-jglue,
title = {{JGLUE}: {J}apanese General Language Understanding Evaluation},
author = {Kurihara, Kentaro and Kawahara, Daisuke and Shibata, Tomohide},
year = 2022,
month = jun,
booktitle = {Proceedings of the Thirteenth Language Resources and Evaluation Conference},
publisher = {European Language Resources Association},
address = {Marseille, France},
pages = {2957--2966},
url = {https://aclanthology.org/2022.lrec-1.317}
}
@inproceedings{jglue_kurihara_nlp2022,
title = {JGLUE: 日本語言語理解ベンチマーク},
author = {栗原健太郎 and 河原大輔 and 柴田知秀},
year = 2022,
booktitle = {言語処理学会第28回年次大会},
url = {https://www.anlp.jp/proceedings/annual_meeting/2022/pdf_dir/E8-4.pdf},
note = {in Japanese}
}
@misc{xwinograd_muennighoff2022crosslingual,
title = {Crosslingual Generalization through Multitask Finetuning},
author = {
Niklas Muennighoff and Thomas Wang and Lintang Sutawika and Adam Roberts and Stella Biderman
and Teven Le Scao and M Saiful Bari and Sheng Shen and Zheng-Xin Yong and Hailey Schoelkopf and
Xiangru Tang and Dragomir Radev and Alham Fikri Aji and Khalid Almubarak and Samuel Albanie and
Zaid Alyafeai and Albert Webson and Edward Raff and Colin Raffel
},
year = 2022,
eprint = {2211.01786},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
@misc{xwinograd_tikhonov2021heads,
title = {
It's All in the Heads: Using Attention Heads as a Baseline for Cross-Lingual Transfer in
Commonsense Reasoning
},
author = {Alexey Tikhonov and Max Ryabinin},
year = 2021,
eprint = {2106.12066},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
```
### Credit
* Prompts: https://github.com/Stability-AI/lm-evaluation-harness/tree/jp-stable/lm_eval/tasks/ja
### Checklist
For adding novel benchmarks/datasets to the library:
* [x] Is the task an existing benchmark in the literature?
* [x] Have you referenced the original paper that introduced the task?
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
If other tasks on this dataset are already supported:
* [ ] Is the "Main" variant of this task clearly denoted?
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
group: japanese_leaderboard
task:
  - ja_leaderboard_jaqket_v2
  - ja_leaderboard_jcommonsenseqa
  - ja_leaderboard_jnli
  - ja_leaderboard_jsquad
  - ja_leaderboard_marc_ja
  - ja_leaderboard_mgsm
  - ja_leaderboard_xlsum
  - ja_leaderboard_xwinograd
metadata:
  version: 1.0
task: ja_leaderboard_jaqket_v2
dataset_path: kumapo/JAQKET
dataset_name: v2.0
training_split: train
validation_split: validation
test_split: null
fewshot_split: train
num_fewshot: 1
description: "以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。\n\n"
doc_to_text: "### 指示:\n与えられた文脈から、質問に対する答えを抜き出してください。\n\n### 入力:\n文脈:{{ ctxs['text'] | join('\n') }}\n質問:{{ question }}\n\n### 応答:"
doc_to_target: "{{ answers['text'][0] }}"
target_delimiter: "\n"
output_type: generate_until
generation_kwargs:
  until:
    - "\n\n"
  do_sample: false
metric_list:
  - metric: exact_match
    regexes_to_ignore:
      - '^\s+'
      - '\s+$'
    aggregation: mean
    higher_is_better: true
filter_list:
  - name: whitespaces
    filter:
      - function: remove_whitespace
      - function: take_first
metadata:
  version: 2.0
def process_docs(dataset):
    def _add_choices(doc):
        # Collect the five answer-option columns into a single list for doc_to_choice.
        doc["choices"] = [doc[f"choice{i}"] for i in range(5)]
        return doc

    return dataset.map(_add_choices)
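For clarity, the map above adds a single `choices` list to each row (hypothetical field values; the `choice0`–`choice4` columns come from the JCommonsenseQA schema referenced in the config below):

```python
doc = {"question": "...", "label": 1,
       "choice0": "A", "choice1": "B", "choice2": "C", "choice3": "D", "choice4": "E"}
doc["choices"] = [doc[f"choice{i}"] for i in range(5)]   # -> ["A", "B", "C", "D", "E"]
# doc_to_choice reads `choices`; doc_to_target ("label") indexes into it.
```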
task: ja_leaderboard_jcommonsenseqa
dataset_path: Rakuten/JGLUE
dataset_name: JCommonsenseQA
training_split: train
validation_split: validation
test_split: null
fewshot_split: train
num_fewshot: 3
description: "以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。\n\n"
process_docs: !function ja_leaderboard_jcommonsenseqa.process_docs
doc_to_text: "### 指示:\n出力は以下から選択してください:\n{% for choice in choices %}- {{ choice }}\n{% endfor %}\n### 入力:\n{{ question }}\n\n### 応答:"
doc_to_target: label
doc_to_choice: choices
target_delimiter: "\n"
output_type: multiple_choice
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
task: ja_leaderboard_jnli
dataset_path: Rakuten/JGLUE
dataset_name: JNLI
training_split: train
validation_split: validation
test_split: null
fewshot_split: train
num_fewshot: 3
description: "以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。\n\n"
doc_to_text: "### 指示:\n与えられた前提と仮説の関係を回答してください。\n\n出力は以下から選択してください:\n含意\n矛盾\n中立\n\n### 入力:\n前提:{{ sentence1 }}\n仮説:{{ sentence2 }}\n\n### 応答:"
doc_to_target: label
doc_to_choice: ["含意", "矛盾", "中立"]
target_delimiter: "\n"
output_type: multiple_choice
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
task: ja_leaderboard_jsquad
dataset_path: Rakuten/JGLUE
dataset_name: JSQuAD
training_split: train
validation_split: validation
test_split: null
fewshot_split: train
num_fewshot: 2
description: "以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。\n\n"
doc_to_text: "### 指示:\n与えられた文脈から、質問に対する答えを抜き出してください。\n\n### 入力:\n文脈:{% set _context = context.split('[SEP]')[-1] %}{{ _context | trim }}\n質問:{{ question }}\n\n### 応答:"
doc_to_target: "{{ answers['text'][0] }}"
target_delimiter: "\n"
output_type: generate_until
generation_kwargs:
  until:
    - "\n\n"
  do_sample: false
metric_list:
  - metric: exact_match
    regexes_to_ignore:
      - '^\s+'
      - '\s+$'
    aggregation: mean
    higher_is_better: true
filter_list:
  - name: whitespaces
    filter:
      - function: remove_whitespace
      - function: take_first
metadata:
  version: 1.0
task: ja_leaderboard_marc_ja
dataset_path: Rakuten/JGLUE
dataset_name: MARC-ja
training_split: train
validation_split: validation
test_split: null
fewshot_split: train
num_fewshot: 3
description: "以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。\n\n"
doc_to_text: "### 指示:\n以下の製品レビューを、ポジティブまたはネガティブの感情クラスのいずれかに分類してください。\n\n### 入力:\n{{ sentence }}\n\n### 応答:"
doc_to_target: label
doc_to_choice: ["ポジティブ", "ネガティブ"]
target_delimiter: "\n"
output_type: multiple_choice
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
metadata:
  version: 1.0
import re


_INVALID_ANSWER = "[invalid]"
_ANSWER_REGEX = re.compile(r"(\-?[0-9\.\,]+)")


def _extract_answer(completion):
    """Pull the last number out of a completion; only integer answers are considered valid."""
    matches = _ANSWER_REGEX.findall(completion)
    if matches:
        match_str = matches[-1].strip(".")
        match_str = match_str.replace(",", "")
        try:
            match_float = float(match_str)
        except ValueError:
            return _INVALID_ANSWER
        if match_float.is_integer():
            return int(match_float)
    return _INVALID_ANSWER


def process_results(doc, results):
    assert (
        len(results) == 1
    ), f"results should be a list with 1 str element, but is {results}"
    completion = results[0]
    extracted_answer = _extract_answer(completion)
    answer = doc["answer_number"]
    acc = extracted_answer == answer
    return {
        "acc": acc,
    }
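A few made-up completions illustrate the extraction heuristic: the last number in the completion is taken, thousands separators are stripped, and anything that is not an integer is rejected:

```python
assert _extract_answer("3 + 4 = 7 なので、答えは 7 です。") == 7
assert _extract_answer("合計は 1,200 円です。") == 1200          # commas are stripped
assert _extract_answer("答えはおよそ 3.5 です。") == "[invalid]"  # non-integers rejected
assert _extract_answer("わかりません。") == "[invalid]"           # no number found
```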
task: ja_leaderboard_mgsm
dataset_path: juletxara/mgsm
dataset_name: ja
training_split: train
validation_split: null
test_split: test
fewshot_split: train
num_fewshot: 5
description: "以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。\n\n"
doc_to_text: "### 指示:\n与えられた問題に対して、ステップごとに答えを導き出してください。\n\n### 入力:\n{{ question | replace('問題:', '') }}\n\n### 応答:"
doc_to_target: "{{ answer | replace('ステップごとの答え:', '') }}"
target_delimiter: "\n"
output_type: generate_until
process_results: !function ja_leaderboard_mgsm.process_results
metric_list:
  - metric: acc
    aggregation: mean
    higher_is_better: true
generation_kwargs:
  until:
    - "\n\n"
  do_sample: false
metadata:
  version: 1.0
import re


def _missing_module_message(name):
    return f"`{name}` is required for `japanese_leaderboard`, please install `{name}` via pip install lm_eval[japanese_leaderboard] or pip install -e .[japanese_leaderboard]"


try:
    import emoji
    import neologdn
    from fugashi import Tagger
    from rouge_score import rouge_scorer, scoring
except ModuleNotFoundError as err:
    raise ModuleNotFoundError(_missing_module_message(err.name)) from err


class MecabTokenizer:
    def __init__(self) -> None:
        self.tagger = Tagger("-Owakati")

    def normalize_answer(self, text):
        """Lower case text, remove punctuation and extra whitespace, etc."""

        def white_space_fix(text):
            return " ".join(text.split())

        def remove_emoji(text):
            text = "".join(["" if emoji.is_emoji(c) else c for c in text])
            emoji_pattern = re.compile(
                "["
                "\U0001f600-\U0001f64f"  # emoticons
                "\U0001f300-\U0001f5ff"  # symbols & pictographs
                "\U0001f680-\U0001f6ff"  # transport & map symbols
                "\U0001f1e0-\U0001f1ff"  # flags (iOS)
                "\U00002702-\U000027b0"
                "]+",
                flags=re.UNICODE,
            )
            return emoji_pattern.sub(r"", text)

        text = remove_emoji(text)
        # see neologdn docs for details, but handles things like full/half width variation
        text = neologdn.normalize(text)
        text = white_space_fix(text)
        return text

    def tokenize(self, text):
        return self.tagger.parse(self.normalize_answer(text)).split()


def rouge2(items):
    # Per-instance passthrough: the (reference, prediction) pair is aggregated in rouge2_agg.
    return items


def rouge2_agg(items):
    tokenizer = MecabTokenizer()
    refs = list(zip(*items))[0]
    preds = list(zip(*items))[1]
    rouge_type = "rouge2"
    # mecab-based rouge
    scorer = rouge_scorer.RougeScorer(
        rouge_types=[rouge_type],
        tokenizer=tokenizer,
    )
    # Accumulate confidence intervals.
    aggregator = scoring.BootstrapAggregator()
    for ref, pred in zip(refs, preds):
        aggregator.add_scores(scorer.score(ref, pred))
    result = aggregator.aggregate()
    return result[rouge_type].mid.fmeasure
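A minimal sketch of how the harness drives these two hooks: each instance contributes a `(reference, prediction)` pair via `rouge2`, and `rouge2_agg` bootstraps a corpus-level ROUGE-2 F1 over MeCab tokens. Running it assumes `fugashi` plus a MeCab dictionary (e.g. `unidic-lite`) is installed, as enforced by the import guard above:

```python
pairs = [
    ("東京で大規模な会議が開かれた。", "大規模な会議が東京で開催された。"),
    ("新しい駅が来年開業する。", "新しい駅が来年オープンする予定だ。"),
]
per_instance = [rouge2(p) for p in pairs]   # passthrough: one (ref, pred) pair per doc
print(rouge2_agg(per_instance))             # mid-bootstrap ROUGE-2 F1 in [0, 1]
```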
task: ja_leaderboard_xlsum
dataset_path: mkshing/xlsum_ja
dataset_name: null
training_split: train
validation_split: validation
test_split: test
fewshot_split: train
num_fewshot: 1
description: "以下は、タスクを説明する指示と、文脈のある入力の組み合わせです。要求を適切に満たす応答を書きなさい。\n\n"
doc_to_text: "### 指示:\n与えられたニュース記事を要約してください。\n\n### 入力:\nニュース記事:{{ text }}\n\n### 応答:"
doc_to_target: "{{ summary }}"
target_delimiter: "\n"
output_type: generate_until
metric_list:
  - metric: !function ja_leaderboard_xlsum.rouge2
    aggregation: !function ja_leaderboard_xlsum.rouge2_agg
    higher_is_better: true
filter_list:
  - name: whitespaces
    filter:
      - function: remove_whitespace
      - function: take_first
generation_kwargs:
  until:
    - "\n\n"
  do_sample: false
metadata:
  version: 1.0
def process_docs(dataset):
    def _add_choices_and_label(doc):
        # `answer` is "1" or "2"; convert to a 0-based index over the two candidate sentences.
        doc["label"] = int(doc["answer"]) - 1
        doc["choices"] = [doc["sentence1"].strip(), doc["sentence2"].strip()]
        return doc

    return dataset.map(_add_choices_and_label)
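For clarity, the map above turns each xwinograd-ja row into a two-way multiple-choice doc (hypothetical field values; `answer` is the 1-based index of the correct sentence):

```python
doc = {"sentence1": " 文その一。 ", "sentence2": " 文その二。 ", "answer": "2"}
doc["label"] = int(doc["answer"]) - 1                                   # -> 1 (0-based index)
doc["choices"] = [doc["sentence1"].strip(), doc["sentence2"].strip()]   # stripped candidates
```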