Commit abd17276 authored by Baber's avatar Baber
Browse files

Merge branch 'smolrefact' into tasklist

# Conflicts:
#	lm_eval/__main__.py
#	lm_eval/api/group.py
#	lm_eval/api/task.py
#	lm_eval/evaluator_utils.py
#	lm_eval/tasks/__init__.py
#	lm_eval/utils.py
#	pyproject.toml
parents 00afd536 70314843
dataset_name: lm_syneval__reflexives__simple_reflexives__sing_MS_ANPHR
include: _template_yaml
task: lm_syneval__reflexives__simple_reflexives__sing_MS_ANPHR
group: lm_syneval
task:
- group: lm_syneval__reflexives
task:
- group: lm_syneval__reflexives__simple_reflexives
task:
- lm_syneval__reflexives__simple_reflexives__sing_MS_ANPHR
- lm_syneval__reflexives__simple_reflexives__plur_MS_ANPHR
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__reflexives__reflexive_sent_comp
task:
- lm_syneval__reflexives__reflexive_sent_comp__sing_MS_ANPHR_sing_BS
- lm_syneval__reflexives__reflexive_sent_comp__sing_MS_ANPHR_plur_BS
- lm_syneval__reflexives__reflexive_sent_comp__plur_MS_ANPHR_sing_BS
- lm_syneval__reflexives__reflexive_sent_comp__plur_MS_ANPHR_plur_BS
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__reflexives__reflexives_across
task:
- lm_syneval__reflexives__reflexives_across__sing_MS_ANPHR_sing_ES_EV
- lm_syneval__reflexives__reflexives_across__sing_MS_ANPHR_plur_ES_EV
- lm_syneval__reflexives__reflexives_across__plur_MS_ANPHR_sing_ES_EV
- lm_syneval__reflexives__reflexives_across__plur_MS_ANPHR_plur_ES_EV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement
task:
- group: lm_syneval__agreement__obj_rel_within_inanim
task:
- lm_syneval__agreement__obj_rel_within_inanim__sing_ES_EV_sing_IS_IV
- lm_syneval__agreement__obj_rel_within_inanim__sing_ES_EV_plur_IS_IV
- lm_syneval__agreement__obj_rel_within_inanim__plur_ES_EV_sing_IS_IV
- lm_syneval__agreement__obj_rel_within_inanim__plur_ES_EV_plur_IS_IV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__vp_coord
task:
- lm_syneval__agreement__vp_coord__sing_MS_MV_MV
- lm_syneval__agreement__vp_coord__plur_MS_MV_MV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__sent_comp
task:
- lm_syneval__agreement__sent_comp__sing_MS_MV_sing_BS
- lm_syneval__agreement__sent_comp__sing_MS_MV_plur_BS
- lm_syneval__agreement__sent_comp__plur_MS_MV_sing_BS
- lm_syneval__agreement__sent_comp__plur_MS_MV_plur_BS
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__obj_rel_no_comp_within_inanim
task:
- lm_syneval__agreement__obj_rel_no_comp_within_inanim__sing_ES_EV_sing_IS_IV
- lm_syneval__agreement__obj_rel_no_comp_within_inanim__sing_ES_EV_plur_IS_IV
- lm_syneval__agreement__obj_rel_no_comp_within_inanim__plur_ES_EV_sing_IS_IV
- lm_syneval__agreement__obj_rel_no_comp_within_inanim__plur_ES_EV_plur_IS_IV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__obj_rel_within_anim
task:
- lm_syneval__agreement__obj_rel_within_anim__sing_ES_EV_sing_MS_MV
- lm_syneval__agreement__obj_rel_within_anim__sing_ES_EV_plur_MS_MV
- lm_syneval__agreement__obj_rel_within_anim__plur_ES_EV_sing_MS_MV
- lm_syneval__agreement__obj_rel_within_anim__plur_ES_EV_plur_MS_MV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__subj_rel
task:
- lm_syneval__agreement__subj_rel__sing_MS_EV_MV_sing_ES
- lm_syneval__agreement__subj_rel__sing_MS_EV_MV_plur_ES
- lm_syneval__agreement__subj_rel__plur_MS_EV_MV_sing_ES
- lm_syneval__agreement__subj_rel__plur_MS_EV_MV_plur_ES
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__prep_inanim
task:
- lm_syneval__agreement__prep_inanim__sing_IS_IV_sing_ES
- lm_syneval__agreement__prep_inanim__sing_IS_IV_plur_ES
- lm_syneval__agreement__prep_inanim__plur_IS_IV_sing_ES
- lm_syneval__agreement__prep_inanim__plur_IS_IV_plur_ES
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__long_vp_coord
task:
- lm_syneval__agreement__long_vp_coord__sing_MS_LMV_LMV
- lm_syneval__agreement__long_vp_coord__plur_MS_LMV_LMV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__obj_rel_across_anim
task:
- lm_syneval__agreement__obj_rel_across_anim__sing_MS_MV_sing_ES_EV
- lm_syneval__agreement__obj_rel_across_anim__sing_MS_MV_plur_ES_EV
- lm_syneval__agreement__obj_rel_across_anim__plur_MS_MV_sing_ES_EV
- lm_syneval__agreement__obj_rel_across_anim__plur_MS_MV_plur_ES_EV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__obj_rel_across_inanim
task:
- lm_syneval__agreement__obj_rel_across_inanim__sing_IS_IV_sing_ES_EV
- lm_syneval__agreement__obj_rel_across_inanim__sing_IS_IV_plur_ES_EV
- lm_syneval__agreement__obj_rel_across_inanim__plur_IS_IV_sing_ES_EV
- lm_syneval__agreement__obj_rel_across_inanim__plur_IS_IV_plur_ES_EV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__obj_rel_no_comp_across_anim
task:
- lm_syneval__agreement__obj_rel_no_comp_across_anim__sing_MS_MV_sing_ES_EV
- lm_syneval__agreement__obj_rel_no_comp_across_anim__sing_MS_MV_plur_ES_EV
- lm_syneval__agreement__obj_rel_no_comp_across_anim__plur_MS_MV_sing_ES_EV
- lm_syneval__agreement__obj_rel_no_comp_across_anim__plur_MS_MV_plur_ES_EV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__obj_rel_no_comp_across_inanim
task:
- lm_syneval__agreement__obj_rel_no_comp_across_inanim__sing_IS_IV_sing_ES_EV
- lm_syneval__agreement__obj_rel_no_comp_across_inanim__sing_IS_IV_plur_ES_EV
- lm_syneval__agreement__obj_rel_no_comp_across_inanim__plur_IS_IV_sing_ES_EV
- lm_syneval__agreement__obj_rel_no_comp_across_inanim__plur_IS_IV_plur_ES_EV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__simple_agrmt
task:
- lm_syneval__agreement__simple_agrmt__sing_MS_MV
- lm_syneval__agreement__simple_agrmt__plur_MS_MV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__prep_anim
task:
- lm_syneval__agreement__prep_anim__sing_MS_MV_sing_ES
- lm_syneval__agreement__prep_anim__sing_MS_MV_plur_ES
- lm_syneval__agreement__prep_anim__plur_MS_MV_sing_ES
- lm_syneval__agreement__prep_anim__plur_MS_MV_plur_ES
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__agreement__obj_rel_no_comp_within_anim
task:
- lm_syneval__agreement__obj_rel_no_comp_within_anim__sing_ES_EV_sing_MS_MV
- lm_syneval__agreement__obj_rel_no_comp_within_anim__sing_ES_EV_plur_MS_MV
- lm_syneval__agreement__obj_rel_no_comp_within_anim__plur_ES_EV_sing_MS_MV
- lm_syneval__agreement__obj_rel_no_comp_within_anim__plur_ES_EV_plur_MS_MV
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__npi
task:
- group: lm_syneval__npi__npi_across_anim
task:
- lm_syneval__npi__npi_across_anim__past
- lm_syneval__npi__npi_across_anim__future
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__npi__npi_across_inanim
task:
- lm_syneval__npi__npi_across_inanim__past
- lm_syneval__npi__npi_across_inanim__future
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__npi__simple_npi_anim
task:
- lm_syneval__npi__simple_npi_anim__past
- lm_syneval__npi__simple_npi_anim__future
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
- group: lm_syneval__npi__simple_npi_inanim
task:
- lm_syneval__npi__simple_npi_inanim__past
- lm_syneval__npi__simple_npi_inanim__future
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
aggregate_metric_list:
- metric: acc
aggregation: mean
weight_by_size: false
......@@ -5,17 +5,17 @@ task: longbench_2wikimqa
dataset_path: THUDM/LongBench
test_split: test
dataset_name: 2wikimqa
doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:'
doc_to_text: "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_qa_f1_score
generation_kwargs:
max_gen_toks: 32
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "qa_f1_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_2wikimqa_e
dataset_path: THUDM/LongBench
test_split: test
dataset_name: 2wikimqa_e
doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:'
doc_to_text: "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_qa_f1_score
generation_kwargs:
max_gen_toks: 32
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "qa_f1_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -101,4 +101,7 @@ If other tasks on this dataset are already supported:
### Changelog
v2: fix doc_to_target; add vcsum
v3: properly use all answers for metric calculation; trim whitespace from resps; fix stop sequences not parsing correctly.
v4: fixed special characters in prompts; use greedy decoding by default.
......@@ -149,7 +149,7 @@ task: {{ task }}
dataset_path: {{ dataset_path }}
test_split: {{ test_split }}
dataset_name: {{ dataset_name }}
doc_to_text: '{{ doc_to_text }}'
doc_to_text: "{{ doc_to_text }}"
doc_to_target: '{{ doc_to_target }}'
process_results: {{ process_results }}
generation_kwargs:
......@@ -180,13 +180,14 @@ if __name__ == "__main__":
generation_kwargs = {
"max_gen_toks": dataset2maxlen[df],
"temperature": 1,
"do_sample": True,
"do_sample": False,
# We'll handle the until value directly in the template
}
raw_doc_to_text = (
dataset2prompt[df]
.replace("\n", "\\n")
.replace('"', '\\"')
.replace("{", "{{")
.replace("}", "}}")
)
......@@ -210,7 +211,7 @@ if __name__ == "__main__":
"generation_kwargs": generation_kwargs,
"has_newline": has_newline, # Add the flag to the template context
"metric_list": metric_list,
"metadata": {"version": "3.0"},
"metadata": {"version": "4.0"},
}
# Render template
......
......@@ -5,17 +5,17 @@ task: longbench_dureader
dataset_path: THUDM/LongBench
test_split: test
dataset_name: dureader
doc_to_text: '请基于给定的文章回答下述问题。\n\n文章:{{context}}\n\n请基于上述文章回答下面的问题。\n\n问题:{{input}}\n回答:'
doc_to_text: "请基于给定的文章回答下述问题。\n\n文章:{{context}}\n\n请基于上述文章回答下面的问题。\n\n问题:{{input}}\n回答:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_rouge_zh_score
generation_kwargs:
max_gen_toks: 128
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "rouge_zh_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_gov_report
dataset_path: THUDM/LongBench
test_split: test
dataset_name: gov_report
doc_to_text: 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{{context}}\n\nNow, write a one-page summary of the report.\n\nSummary:'
doc_to_text: "You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{{context}}\n\nNow, write a one-page summary of the report.\n\nSummary:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_rouge_score
generation_kwargs:
max_gen_toks: 512
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "rouge_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_gov_report_e
dataset_path: THUDM/LongBench
test_split: test
dataset_name: gov_report_e
doc_to_text: 'You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{{context}}\n\nNow, write a one-page summary of the report.\n\nSummary:'
doc_to_text: "You are given a report by a government agency. Write a one-page summary of the report.\n\nReport:\n{{context}}\n\nNow, write a one-page summary of the report.\n\nSummary:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_rouge_score
generation_kwargs:
max_gen_toks: 512
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "rouge_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_hotpotqa
dataset_path: THUDM/LongBench
test_split: test
dataset_name: hotpotqa
doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:'
doc_to_text: "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_qa_f1_score
generation_kwargs:
max_gen_toks: 32
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "qa_f1_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_hotpotqa_e
dataset_path: THUDM/LongBench
test_split: test
dataset_name: hotpotqa_e
doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:'
doc_to_text: "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_qa_f1_score
generation_kwargs:
max_gen_toks: 32
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "qa_f1_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_lcc
dataset_path: THUDM/LongBench
test_split: test
dataset_name: lcc
doc_to_text: 'Please complete the code given below. \n{{context}}Next line of code:\n'
doc_to_text: "Please complete the code given below. \n{{context}}Next line of code:\n"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_code_sim_score
generation_kwargs:
max_gen_toks: 64
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "code_sim_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_lcc_e
dataset_path: THUDM/LongBench
test_split: test
dataset_name: lcc_e
doc_to_text: 'Please complete the code given below. \n{{context}}Next line of code:\n'
doc_to_text: "Please complete the code given below. \n{{context}}Next line of code:\n"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_code_sim_score
generation_kwargs:
max_gen_toks: 64
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "code_sim_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_lsht
dataset_path: THUDM/LongBench
test_split: test
dataset_name: lsht
doc_to_text: '请判断给定新闻的类别,下面是一些例子。\n\n{{context}}\n{{input}}'
doc_to_text: "请判断给定新闻的类别,下面是一些例子。\n\n{{context}}\n{{input}}"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_classification_score
generation_kwargs:
max_gen_toks: 64
temperature: 1
do_sample: True
do_sample: False
until: ["\n"]
metric_list:
- metric: "classification_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_multi_news
dataset_path: THUDM/LongBench
test_split: test
dataset_name: multi_news
doc_to_text: 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{{context}}\n\nNow, write a one-page summary of all the news.\n\nSummary:'
doc_to_text: "You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{{context}}\n\nNow, write a one-page summary of all the news.\n\nSummary:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_rouge_score
generation_kwargs:
max_gen_toks: 512
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "rouge_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_multi_news_e
dataset_path: THUDM/LongBench
test_split: test
dataset_name: multi_news_e
doc_to_text: 'You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{{context}}\n\nNow, write a one-page summary of all the news.\n\nSummary:'
doc_to_text: "You are given several news passages. Write a one-page summary of all news. \n\nNews:\n{{context}}\n\nNow, write a one-page summary of all the news.\n\nSummary:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_rouge_score
generation_kwargs:
max_gen_toks: 512
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "rouge_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_multifieldqa_en
dataset_path: THUDM/LongBench
test_split: test
dataset_name: multifieldqa_en
doc_to_text: 'Read the following text and answer briefly.\n\n{{context}}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:'
doc_to_text: "Read the following text and answer briefly.\n\n{{context}}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_qa_f1_score
generation_kwargs:
max_gen_toks: 64
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "qa_f1_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_multifieldqa_en_e
dataset_path: THUDM/LongBench
test_split: test
dataset_name: multifieldqa_en_e
doc_to_text: 'Read the following text and answer briefly.\n\n{{context}}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:'
doc_to_text: "Read the following text and answer briefly.\n\n{{context}}\n\nNow, answer the following question based on the above text, only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_qa_f1_score
generation_kwargs:
max_gen_toks: 64
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "qa_f1_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_multifieldqa_zh
dataset_path: THUDM/LongBench
test_split: test
dataset_name: multifieldqa_zh
doc_to_text: '阅读以下文字并用中文简短回答:\n\n{{context}}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{{input}}\n回答:'
doc_to_text: "阅读以下文字并用中文简短回答:\n\n{{context}}\n\n现在请基于上面的文章回答下面的问题,只告诉我答案,不要输出任何其他字词。\n\n问题:{{input}}\n回答:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_qa_f1_zh_score
generation_kwargs:
max_gen_toks: 64
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "qa_f1_zh_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
......@@ -5,17 +5,17 @@ task: longbench_musique
dataset_path: THUDM/LongBench
test_split: test
dataset_name: musique
doc_to_text: 'Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:'
doc_to_text: "Answer the question based on the given passages. Only give me the answer and do not output any other words.\n\nThe following are given passages.\n{{context}}\n\nAnswer the question based on the given passages. Only give me the answer and do not output any other words.\n\nQuestion: {{input}}\nAnswer:"
doc_to_target: '{{answers}}'
process_results: !function metrics.get_qa_f1_score
generation_kwargs:
max_gen_toks: 32
temperature: 1
do_sample: True
do_sample: False
until: []
metric_list:
- metric: "qa_f1_score"
aggregation: mean
higher_is_better: True
metadata:
version: 3.0
version: 4.0
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment