Commit abd17276 authored by Baber

Merge branch 'smolrefact' into tasklist

# Conflicts:
#	lm_eval/__main__.py
#	lm_eval/api/group.py
#	lm_eval/api/task.py
#	lm_eval/evaluator_utils.py
#	lm_eval/tasks/__init__.py
#	lm_eval/utils.py
#	pyproject.toml
parents 00afd536 70314843
@@ -16,17 +16,16 @@ fewshot_split: train
 doc_to_target: !function utils.doc_to_target
 should_decontaminate: true
 doc_to_decontamination_query: "Sentence: {{token}}\nOutput:"
+process_results: !function utils.process_results
 filter_list:
   - filter:
-      - function: regex_pos
+      - function: "custom"
+        filter_fn: !function utils.extract_pos
       - function: "take_first"
+    name: flexible-extract
 metric_list:
   - metric: acc
-    aggregation: !function utils.acc_score
+    aggregation: mean
     higher_is_better: true
-    ignore_case: true
-    ignore_punctuation: true
-    regexes_to_ignore:
-      - ","
 metadata:
   version: 1.0
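
Under the new config, scoring happens in utils.process_results and the built-in mean aggregation simply averages the per-document accuracies it returns, replacing the corpus-level acc_score reducer. A minimal sketch of that aggregation semantics, with made-up numbers:

# Illustrative numbers only: per-document accuracies as process_results
# would return them under the new config.
per_doc_acc = [1.0, 0.5, 0.75]

# "aggregation: mean" averages the per-document scores into the task score.
task_score = sum(per_doc_acc) / len(per_doc_acc)
print(task_score)  # 0.75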
-from itertools import chain
+import re
+from collections.abc import Iterable
+from typing import Any
 from sklearn.metrics import accuracy_score
 from lm_eval.utils import weighted_f1_score


 def doc_to_target(doc):
     pos_tag_map = {
@@ -29,27 +29,40 @@ def doc_to_target(doc):
     return [pos_tag_map[tag] for tag in doc["upos"]]


-def acc_score(items):
-    unzipped_list = list(zip(*items))
-    golds, preds = unzipped_list[0], unzipped_list[1]
-    # Flatten preds' inner lists
-    flattened_preds = [list(chain.from_iterable(p)) for p in preds]
-    # Calculate the accuracy for each gold-pred pair
-    accuracy_scores = []
-    for gold, pred in zip(golds, flattened_preds):
-        # Ensure both lists are of the same length, otherwise truncate to match
-        min_length = min(len(gold), len(pred))
-        gold = gold[:min_length]
-        pred = pred[:min_length]
-        # Calculate accuracy for the current pair and add to the list
-        accuracy = accuracy_score(gold, pred)
-        accuracy_scores.append(accuracy)
-    mean_accuracy = (
-        sum(accuracy_scores) / len(accuracy_scores) if accuracy_scores else 0
-    )
-    return mean_accuracy
+def extract_pos(resps: Iterable[list[str]], *args) -> Iterable[list[str]]:
+    def extract_tagged_tokens(text: str) -> list[tuple[str, str]]:
+        # Extract the tagged-token tuples from the text input using a regex
+        tokens = re.findall(r"\('([^']*)', '([^']*)'\)", text)
+        return [(token, pos) for token, pos in tokens]
+
+    def extract_pos_tags(result: str) -> list[str]:
+        pos_tags = []
+        if isinstance(result, str):
+            result_ = extract_tagged_tokens(result)
+            pos_tags.extend(pos for _, pos in result_)
+        return pos_tags if pos_tags else ["invalid"]
+
+    def filter_set(inst: list[str]) -> list[list[str]]:
+        filtered = []
+        for resp in inst:
+            match = extract_pos_tags(resp)
+            filtered.append(match)
+        return filtered
+
+    filtered_resps = map(lambda x: filter_set(x), resps)
+    return filtered_resps
+
+
+def process_results(doc: dict[str, Any], results: list[list[str]]):
+    golds, preds = doc_to_target(doc), results[0]
+    # Ensure both lists are of the same length, otherwise truncate to match
+    min_length = min(len(golds), len(preds))
+    gold = golds[:min_length]
+    pred = preds[:min_length]
+    accuracy = accuracy_score(gold, pred)
+    return {"acc": accuracy}
 from lm_eval.utils import weighted_f1_score


 def doc_to_text(doc):
     output = """Please provide the POS tags for each word in the input sentence. The input will be a list of words in
     the sentence. The output format should be a list of tuples, where each tuple consists of a word from the input text
-    and its corresponding POS tag label from the tag label set: ["ADJ", "ADP", "ADV", "AUX", "CCONJ, "DET", "INTJ",
+    and its corresponding POS tag label from the tag label set: ["ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ",
     "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT" "SCONJ", "SYM", "VERB", "X"]. \nYour response should include only a
     list of tuples, in the order that the words appear in the input sentence, with each tuple containing the
     corresponding POS tag label for a word.
......
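For reference, a response that satisfies this prompt parses as a Python list of (word, tag) tuples with every tag drawn from the listed set. A small sanity-check sketch; the sentence and response are invented:

import ast

# Tag set from the prompt above.
ALLOWED = {
    "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM",
    "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X",
}

# Invented well-formed response for the sentence ["Ama", "runs", "."].
response = "[('Ama', 'PROPN'), ('runs', 'VERB'), ('.', 'PUNCT')]"

pairs = ast.literal_eval(response)  # -> list of (word, tag) tuples
assert all(tag in ALLOWED for _, tag in pairs)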
@@ -2,7 +2,6 @@ tag:
   - afrobench_sentiment_tasks
   - nollysenti_prompt_1
 dataset_path: Davlan/nollysenti
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test
......
@@ -2,7 +2,6 @@ tag:
   - afrobench_sentiment_tasks
   - nollysenti_prompt_2
 dataset_path: Davlan/nollysenti
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test
......
@@ -2,7 +2,6 @@ tag:
   - afrobench_sentiment_tasks
   - nollysenti_prompt_3
 dataset_path: Davlan/nollysenti
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test
......
@@ -2,7 +2,6 @@ tag:
   - afrobench_sentiment_tasks
   - nollysenti_prompt_4
 dataset_path: Davlan/nollysenti
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test
......
@@ -2,7 +2,6 @@ tag:
   - afrobench_sentiment_tasks
   - nollysenti_prompt_5
 dataset_path: Davlan/nollysenti
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test
......
@@ -4,7 +4,6 @@ tag:
   - ntrex_afr-eng_prompt_1
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
......
@@ -4,7 +4,6 @@ tag:
   - ntrex_eng-afr_prompt_1
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
......
@@ -3,7 +3,6 @@ tag:
   - ntrex_afr-eng_prompt_2
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
......
@@ -3,7 +3,6 @@ tag:
   - ntrex_eng-afr_prompt_2
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
......
@@ -3,7 +3,6 @@ tag:
   - ntrex_afr-eng_prompt_3
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
......
@@ -3,7 +3,6 @@ tag:
   - ntrex_eng-afr_prompt_3
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
......
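These config diffs all drop dataset_kwargs: {trust_remote_code: True}, which lm-eval forwards to datasets.load_dataset; the datasets now load without executing repository code. A hedged sketch of the resulting load call (the "en" subset name is an assumption for illustration):

from datasets import load_dataset

# dataset_kwargs entries are passed through to load_dataset, so removing
# trust_remote_code=True means a plain call like the one below is what runs.
# The "en" subset name is assumed here for illustration.
ds = load_dataset("Davlan/nollysenti", "en")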