Merge branch 'main' into tasklist

# Conflicts:
#	pyproject.toml

Merge branch 'main' into tasklist
# Conflicts:
#	pyproject.toml
b58e5556 · Baber · 6e1866f5 · 4f8195f1 · b58e5556 · b58e5556
Commit b58e5556 authored Jul 27, 2025 by Baber
20 changed files
--- a/lm_eval/tasks/afrobench/masakhanews/utils.py
+++ b/lm_eval/tasks/afrobench/masakhanews/utils.py
@@ -12,9 +12,9 @@ def prompt_func(mode, lang):
        "prompt_3": f"You are an assistant able to classify topics in texts. \n\n"
        f"Given the categories technology, religion, politics, sports, health, entertainment, or business; what is "
        f"the topic of the {lang} statement below? Return only the category. "
-        "\n\ntext: {{headline}} \category:\n\n",
+        "\n\ntext: {{headline}} \\category:\n\n",
        "prompt_4": "Label the following text as technology, religion, politics, sports, health, entertainment, or geography. Provide only the category as your "
-        "response. \n\ntext: {{headline}} \category: \n\n",
+        "response. \n\ntext: {{headline}} \\category: \n\n",
        "prompt_5": f"You are tasked with performing topic classification on the following {lang} text. "
        f"For each input, classify the topic as technology, business, politics, sports, health, entertainment, or religion. "
        f"Use the following guidelines: \n\n "
@@ -27,7 +27,7 @@ def prompt_func(mode, lang):
        f"business: The text covers economy, business, or related topics. \n\n"
        f"If the text contains multiple topics, choose the dominant topic. "
        f"For ambiguous or unclear topics, select the category that best reflects the overall content. "
-        "Please provide a single classification for each input.\n\ntext: {{headline}} \category: \n\n",
+        "Please provide a single classification for each input.\n\ntext: {{headline}} \\category: \n\n",
    }
    return prompt_map[mode]


--- a/lm_eval/tasks/afrobench/sib/utils.py
+++ b/lm_eval/tasks/afrobench/sib/utils.py
@@ -17,9 +17,9 @@ def prompt_func(mode, lang):
        "prompt_3": f"You are an assistant able to classify topics in texts. \n\n"
        f"Given the categories science/technology, travel, politics, sports, health, entertainment, or geography; what is "
        f"the topic of the {lang} statement below? Return only the category. "
-        "\n\ntext: {{text}} \category:\n\n",
+        "\n\ntext: {{text}} \\category:\n\n",
        "prompt_4": "Label the following text as science/technology, travel, politics, sports, health, entertainment, or geography. Provide only the category as your "
-        "response. \n\ntext: {{text}} \category: \n\n",
+        "response. \n\ntext: {{text}} \\category: \n\n",
        "prompt_5": f"You are tasked with performing topic classification on the following {lang} text. "
        f"For each input, classify the topic as science/technology, travel, politics, sports, health, entertainment, or geography. "
        f"Use the following guidelines: \n\n "
@@ -32,7 +32,7 @@ def prompt_func(mode, lang):
        f"geography: The text involves geographical information, locations, or related topics. \n\n"
        f"If the text contains multiple topics, choose the dominant topic. "
        f"For ambiguous or unclear topics, select the category that best reflects the overall content. "
-        "Please provide a single classification for each input.\n\ntext: {{text}} \category: \n\n",
+        "Please provide a single classification for each input.\n\ntext: {{text}} \\category: \n\n",
    }
    return prompt_map[mode]


--- a/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum
+++ b/lm_eval/tasks/afrobench/xlsum/prompt_1/xlsum
@@ -4,8 +4,6 @@ tag:
 task: null
 dataset_path: csebuetnlp/xlsum
 dataset_name: null
-dataset_kwargs:
-  trust_remote_code: true
 output_type: generate_until
 generation_kwargs:
  until:

--- a/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum
+++ b/lm_eval/tasks/afrobench/xlsum/prompt_2/xlsum
@@ -4,8 +4,6 @@ tag:
 task: null
 dataset_path: csebuetnlp/xlsum
 dataset_name: null
-dataset_kwargs:
-  trust_remote_code: true
 output_type: generate_until
 generation_kwargs:
  until:

--- a/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum
+++ b/lm_eval/tasks/afrobench/xlsum/prompt_3/xlsum
@@ -4,8 +4,6 @@ tag:
 task: null
 dataset_path: csebuetnlp/xlsum
 dataset_name: null
-dataset_kwargs:
-  trust_remote_code: true
 output_type: generate_until
 generation_kwargs:
  until:

--- a/lm_eval/tasks/agieval/utils.py
+++ b/lm_eval/tasks/agieval/utils.py
@@ -47,7 +47,7 @@ def parse_math_answer(raw_string):
        return retval

    def get_answer_with_dollar_sign(s):
-        first_pattern = "\$(.*)\$"
+        first_pattern = r"\$(.*)\$"
        last_match = None
        matches = re.findall(first_pattern, s)
        if matches:
@@ -63,7 +63,7 @@ def parse_math_answer(raw_string):
            if "\\n" in last_match:
                last_match = last_match.split("\\n")[0]
        else:
-            pattern = "(?:\\$)?\d+(?:\.\d+)?(?![\w\d])"
+            pattern = "(?:\\$)?\\d+(?:\\.\\d+)?(?![\\w\\d])"
            matches = re.findall(pattern, s)
            if matches:
                last_match = matches[-1]
@@ -186,7 +186,7 @@ def _strip_string(string):

    # remove percentage
    string = string.replace("\\%", "")
-    string = string.replace("\%", "")
+    string = string.replace(r"\%", "")

    # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
    string = string.replace(" .", " 0.")

--- a/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_1dc.yaml
@@ -15,5 +15,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 2.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/arithmetic/arithmetic_2da.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_2da.yaml
 include: arithmetic_1dc.yaml
 task: arithmetic_2da
 dataset_name: arithmetic_2da
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/arithmetic/arithmetic_2dm.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_2dm.yaml
 include: arithmetic_1dc.yaml
 task: arithmetic_2dm
 dataset_name: arithmetic_2dm
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/arithmetic/arithmetic_2ds.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_2ds.yaml
 include: arithmetic_1dc.yaml
 task: arithmetic_2ds
 dataset_name: arithmetic_2ds
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/arithmetic/arithmetic_3da.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_3da.yaml
 include: arithmetic_1dc.yaml
 task: arithmetic_3da
 dataset_name: arithmetic_3da
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/arithmetic/arithmetic_3ds.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_3ds.yaml
 include: arithmetic_1dc.yaml
 task: arithmetic_3ds
 dataset_name: arithmetic_3ds
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/arithmetic/arithmetic_4da.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_4da.yaml
 include: arithmetic_1dc.yaml
 task: arithmetic_4da
 dataset_name: arithmetic_4da
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/arithmetic/arithmetic_4ds.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_4ds.yaml
 include: arithmetic_1dc.yaml
 task: arithmetic_4ds
 dataset_name: arithmetic_4ds
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/arithmetic/arithmetic_5da.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_5da.yaml
 include: arithmetic_1dc.yaml
 task: arithmetic_5da
 dataset_name: arithmetic_5da
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/arithmetic/arithmetic_5ds.yaml
+++ b/lm_eval/tasks/arithmetic/arithmetic_5ds.yaml
 include: arithmetic_1dc.yaml
 task: arithmetic_5ds
 dataset_name: arithmetic_5ds
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/asdiv/asdiv-cot-llama.yaml
+++ b/lm_eval/tasks/asdiv/asdiv-cot-llama.yaml
@@ -41,13 +41,13 @@ fewshot_config:
    target: Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15
      dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The final answer is 8
 filter_list:
- filter:
+  - filter:
      - function: regex
        group_select: -1
        regex_pattern: The final answer is ((-?[$0-9.,]{2,})|(-?[0-9]+))
      - function: take_first
    name: strict-match
- filter:
+  - filter:
      - function: regex
        group_select: -1
        regex_pattern: (-?[$0-9.,]{2,})|(-?[0-9]+)
@@ -62,11 +62,11 @@ generation_kwargs:
    - </s>
    - <|im_end|>
 tag:
- chain_of_thought
+  - chain_of_thought
 metadata:
  version: 1.0
 metric_list:
- aggregation: mean
+  - aggregation: mean
    higher_is_better: true
    ignore_case: true
    ignore_punctuation: false
@@ -84,5 +84,3 @@ validation_split: validation
 test_split: validation
 should_decontaminate: true
 doc_to_decontamination_query: "{{body}} {{question}}"
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/asdiv/default.yaml
+++ b/lm_eval/tasks/asdiv/default.yaml
@@ -12,5 +12,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 1.0
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/basque_bench/flores_eu/_flores_common_yaml
+++ b/lm_eval/tasks/basque_bench/flores_eu/_flores_common_yaml
@@ -23,5 +23,3 @@ metric_list:
    higher_is_better: true
 metadata:
  version: 0.1
-dataset_kwargs:
-  trust_remote_code: true
--- a/lm_eval/tasks/bbh/README.md
+++ b/lm_eval/tasks/bbh/README.md
@@ -53,4 +53,7 @@ None.
 - [ ] Majority voting "without CoT"

 ### Changelog
-no version change: changed dataset to `SaylorTwift/bbh`. Do not expect any change in the results.
+- no version change: changed dataset to `SaylorTwift/bbh`. Do not expect any change in the results.
+- `bbh_cot_fewshot` v.4.0; 2025-07-14:
+  - PR #3140. Removed duplicate "Let's think step by step" from the fewshots.
+  - set target_delimiter to "" as the fewshot samples end with a newline character.