Commit d58d67b3 authored by Baber's avatar Baber
Browse files

refactor masakhapos

parent 7f04db12
......@@ -73,3 +73,5 @@ HomePage: https://github.com/masakhane-io/masakhane-pos
abstract = "In this paper, we present AfricaPOS, the largest part-of-speech (POS) dataset for 20 typologically diverse African languages. We discuss the challenges in annotating POS for these languages using the universal dependencies (UD) guidelines. We conducted extensive POS baseline experiments using both conditional random field and several multilingual pre-trained language models. We applied various cross-lingual transfer models trained with data available in the UD. Evaluating on the AfricaPOS dataset, we show that choosing the best transfer language(s) in both single-source and multi-source setups greatly improves the POS tagging performance of the target languages, in particular when combined with parameter-fine-tuning methods. Crucially, transferring knowledge from a language that matches the language family and morphosyntactic properties seems to be more effective for POS tagging in unseen languages."
}
```
## Changelog
- 2025-07-21: Refactored. Scores should not be affected.
......@@ -14,19 +14,18 @@ validation_split: validation
test_split: test
fewshot_split: train
doc_to_target: !function utils.doc_to_target
process_results: !function utils.process_results
should_decontaminate: true
doc_to_decontamination_query: "Sentence: {{token}}\nOutput:"
filter_list:
- filter:
- function: regex_pos
- function: "custom"
filter_fn: !function utils.extract_pos
- function: "take_first"
name: flexible-extract
metric_list:
- metric: acc
aggregation: !function utils.acc_score
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
regexes_to_ignore:
- ","
metadata:
version: 1.0
from itertools import chain
import re
from collections.abc import Iterable
from typing import Any
from sklearn.metrics import accuracy_score
from lm_eval.utils import weighted_f1_score
def doc_to_target(doc):
pos_tag_map = {
......@@ -29,27 +29,40 @@ def doc_to_target(doc):
return [pos_tag_map[tag] for tag in doc["upos"]]
def acc_score(items):
unzipped_list = list(zip(*items))
def extract_pos(resps: Iterable[list[str]], *args) -> Iterable[list[str]]:
    """Filter model responses down to lists of predicted POS tags.

    Each raw response is expected to contain tuples formatted like
    ``('word', 'TAG')``; the tags are extracted in order of appearance.
    A response with no parsable tuples yields ``["invalid"]`` so that
    downstream scoring always receives a non-empty value.

    Args:
        resps: Per-document lists of raw model generations.
        *args: Extra positional args passed by the filter harness; unused.

    Returns:
        An iterator of per-document lists, each entry a list of tags.
    """

    def extract_tagged_tokens(text: str) -> list[tuple[str, str]]:
        # BUGFIX: previously matched against a hard-coded example string
        # instead of `text`, so every response produced the same tags.
        return re.findall(r"\('([^']*)', '([^']*)'\)", text)

    def extract_pos_tags(result: str) -> list[str]:
        # Keep only the tag element of each (token, tag) pair.
        pos_tags: list[str] = []
        if isinstance(result, str):
            pos_tags.extend(pos for _, pos in extract_tagged_tokens(result))
        # Sentinel so empty or garbled generations are never silently dropped.
        return pos_tags if pos_tags else ["invalid"]

    def filter_set(inst: list[str]) -> list[list[str]]:
        return [extract_pos_tags(resp) for resp in inst]

    return map(filter_set, resps)
# Calculate the accuracy for each gold-pred pair
accuracy_scores = []
for gold, pred in zip(golds, flattened_preds):
# Ensure both lists are of the same length, otherwise truncate to match
min_length = min(len(gold), len(pred))
gold = gold[:min_length]
pred = pred[:min_length]
# Calculate accuracy for the current pair and add to the list
accuracy = accuracy_score(gold, pred)
accuracy_scores.append(accuracy)
def process_results(doc: dict[str, Any], results: list[list[str]]):
    """Compute token-level POS accuracy for a single document.

    Args:
        doc: Dataset document; gold tags come from ``doc_to_target``.
        results: Filtered responses; ``results[0]`` is the predicted tag list.

    Returns:
        ``{"acc": accuracy}`` — fraction of matching positions over the
        overlapping prefix of gold and predicted tags (0.0 if no overlap).
    """
    golds, preds = doc_to_target(doc), results[0]
    # Truncate to the shorter sequence so ragged generations still score.
    min_length = min(len(golds), len(preds))
    gold = golds[:min_length]
    pred = preds[:min_length]
    # Inline positional accuracy instead of sklearn.metrics.accuracy_score:
    # same value on equal-length label lists, but no ValueError on empty
    # input (zero-overlap documents now score 0.0 instead of crashing).
    accuracy = (
        sum(g == p for g, p in zip(gold, pred)) / min_length if min_length else 0.0
    )
    return {"acc": accuracy}
......@@ -16,17 +16,16 @@ fewshot_split: train
doc_to_target: !function utils.doc_to_target
should_decontaminate: true
doc_to_decontamination_query: "Sentence: {{token}}\nOutput:"
process_results: !function utils.process_results
filter_list:
- filter:
- function: regex_pos
- function: "custom"
filter_fn: !function utils.extract_pos
- function: "take_first"
name: flexible-extract
metric_list:
- metric: acc
aggregation: !function utils.acc_score
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
regexes_to_ignore:
- ","
metadata:
version: 1.0
from itertools import chain
import re
from collections.abc import Iterable
from typing import Any
from sklearn.metrics import accuracy_score
from lm_eval.utils import weighted_f1_score
def doc_to_target(doc):
pos_tag_map = {
......@@ -29,27 +29,40 @@ def doc_to_target(doc):
return [pos_tag_map[tag] for tag in doc["upos"]]
def acc_score(items):
unzipped_list = list(zip(*items))
def extract_pos(resps: Iterable[list[str]], *args) -> Iterable[list[str]]:
    """Filter model responses down to lists of predicted POS tags.

    Each raw response is expected to contain tuples formatted like
    ``('word', 'TAG')``; the tags are extracted in order of appearance.
    A response with no parsable tuples yields ``["invalid"]`` so that
    downstream scoring always receives a non-empty value.

    Args:
        resps: Per-document lists of raw model generations.
        *args: Extra positional args passed by the filter harness; unused.

    Returns:
        An iterator of per-document lists, each entry a list of tags.
    """

    def extract_tagged_tokens(text: str) -> list[tuple[str, str]]:
        # BUGFIX: previously matched against a hard-coded example string
        # instead of `text`, so every response produced the same tags.
        return re.findall(r"\('([^']*)', '([^']*)'\)", text)

    def extract_pos_tags(result: str) -> list[str]:
        # Keep only the tag element of each (token, tag) pair.
        pos_tags: list[str] = []
        if isinstance(result, str):
            pos_tags.extend(pos for _, pos in extract_tagged_tokens(result))
        # Sentinel so empty or garbled generations are never silently dropped.
        return pos_tags if pos_tags else ["invalid"]

    def filter_set(inst: list[str]) -> list[list[str]]:
        return [extract_pos_tags(resp) for resp in inst]

    return map(filter_set, resps)
# Calculate the accuracy for each gold-pred pair
accuracy_scores = []
for gold, pred in zip(golds, flattened_preds):
# Ensure both lists are of the same length, otherwise truncate to match
min_length = min(len(gold), len(pred))
gold = gold[:min_length]
pred = pred[:min_length]
# Calculate accuracy for the current pair and add to the list
accuracy = accuracy_score(gold, pred)
accuracy_scores.append(accuracy)
def process_results(doc: dict[str, Any], results: list[list[str]]):
    """Compute token-level POS accuracy for a single document.

    Args:
        doc: Dataset document; gold tags come from ``doc_to_target``.
        results: Filtered responses; ``results[0]`` is the predicted tag list.

    Returns:
        ``{"acc": accuracy}`` — fraction of matching positions over the
        overlapping prefix of gold and predicted tags (0.0 if no overlap).
    """
    golds, preds = doc_to_target(doc), results[0]
    # Truncate to the shorter sequence so ragged generations still score.
    min_length = min(len(golds), len(preds))
    gold = golds[:min_length]
    pred = preds[:min_length]
    # Inline positional accuracy instead of sklearn.metrics.accuracy_score:
    # same value on equal-length label lists, but no ValueError on empty
    # input (zero-overlap documents now score 0.0 instead of crashing).
    accuracy = (
        sum(g == p for g, p in zip(gold, pred)) / min_length if min_length else 0.0
    )
    return {"acc": accuracy}
......@@ -16,17 +16,16 @@ fewshot_split: train
doc_to_target: !function utils.doc_to_target
should_decontaminate: true
doc_to_decontamination_query: "Sentence: {{token}}\nOutput:"
process_results: !function utils.process_results
filter_list:
- filter:
- function: regex_pos
- function: "custom"
filter_fn: !function utils.extract_pos
- function: "take_first"
name: flexible-extract
metric_list:
- metric: acc
aggregation: !function utils.acc_score
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
regexes_to_ignore:
- ","
metadata:
version: 1.0
from itertools import chain
import re
from collections.abc import Iterable
from typing import Any
from sklearn.metrics import accuracy_score
from lm_eval.utils import weighted_f1_score
def doc_to_target(doc):
pos_tag_map = {
......@@ -29,27 +29,40 @@ def doc_to_target(doc):
return [pos_tag_map[tag] for tag in doc["upos"]]
def acc_score(items):
unzipped_list = list(zip(*items))
def extract_pos(resps: Iterable[list[str]], *args) -> Iterable[list[str]]:
    """Filter model responses down to lists of predicted POS tags.

    Each raw response is expected to contain tuples formatted like
    ``('word', 'TAG')``; the tags are extracted in order of appearance.
    A response with no parsable tuples yields ``["invalid"]`` so that
    downstream scoring always receives a non-empty value.

    Args:
        resps: Per-document lists of raw model generations.
        *args: Extra positional args passed by the filter harness; unused.

    Returns:
        An iterator of per-document lists, each entry a list of tags.
    """

    def extract_tagged_tokens(text: str) -> list[tuple[str, str]]:
        # BUGFIX: previously matched against a hard-coded example string
        # instead of `text`, so every response produced the same tags.
        return re.findall(r"\('([^']*)', '([^']*)'\)", text)

    def extract_pos_tags(result: str) -> list[str]:
        # Keep only the tag element of each (token, tag) pair.
        pos_tags: list[str] = []
        if isinstance(result, str):
            pos_tags.extend(pos for _, pos in extract_tagged_tokens(result))
        # Sentinel so empty or garbled generations are never silently dropped.
        return pos_tags if pos_tags else ["invalid"]

    def filter_set(inst: list[str]) -> list[list[str]]:
        return [extract_pos_tags(resp) for resp in inst]

    return map(filter_set, resps)
# Calculate the accuracy for each gold-pred pair
accuracy_scores = []
for gold, pred in zip(golds, flattened_preds):
# Ensure both lists are of the same length, otherwise truncate to match
min_length = min(len(gold), len(pred))
gold = gold[:min_length]
pred = pred[:min_length]
# Calculate accuracy for the current pair and add to the list
accuracy = accuracy_score(gold, pred)
accuracy_scores.append(accuracy)
def process_results(doc: dict[str, Any], results: list[list[str]]):
    """Compute token-level POS accuracy for a single document.

    Args:
        doc: Dataset document; gold tags come from ``doc_to_target``.
        results: Filtered responses; ``results[0]`` is the predicted tag list.

    Returns:
        ``{"acc": accuracy}`` — fraction of matching positions over the
        overlapping prefix of gold and predicted tags (0.0 if no overlap).
    """
    golds, preds = doc_to_target(doc), results[0]
    # Truncate to the shorter sequence so ragged generations still score.
    min_length = min(len(golds), len(preds))
    gold = golds[:min_length]
    pred = preds[:min_length]
    # Inline positional accuracy instead of sklearn.metrics.accuracy_score:
    # same value on equal-length label lists, but no ValueError on empty
    # input (zero-overlap documents now score 0.0 instead of crashing).
    accuracy = (
        sum(g == p for g, p in zip(gold, pred)) / min_length if min_length else 0.0
    )
    return {"acc": accuracy}
......@@ -16,17 +16,16 @@ fewshot_split: train
doc_to_target: !function utils.doc_to_target
should_decontaminate: true
doc_to_decontamination_query: "Sentence: {{token}}\nOutput:"
process_results: !function utils.process_results
filter_list:
- filter:
- function: regex_pos
- function: "custom"
filter_fn: !function utils.extract_pos
- function: "take_first"
name: flexible-extract
metric_list:
- metric: acc
aggregation: !function utils.acc_score
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
regexes_to_ignore:
- ","
metadata:
version: 1.0
from itertools import chain
import re
from collections.abc import Iterable
from typing import Any
from sklearn.metrics import accuracy_score
from lm_eval.utils import weighted_f1_score
def doc_to_target(doc):
pos_tag_map = {
......@@ -29,27 +29,40 @@ def doc_to_target(doc):
return [pos_tag_map[tag] for tag in doc["upos"]]
def acc_score(items):
unzipped_list = list(zip(*items))
def extract_pos(resps: Iterable[list[str]], *args) -> Iterable[list[str]]:
    """Filter model responses down to lists of predicted POS tags.

    Each raw response is expected to contain tuples formatted like
    ``('word', 'TAG')``; the tags are extracted in order of appearance.
    A response with no parsable tuples yields ``["invalid"]`` so that
    downstream scoring always receives a non-empty value.

    Args:
        resps: Per-document lists of raw model generations.
        *args: Extra positional args passed by the filter harness; unused.

    Returns:
        An iterator of per-document lists, each entry a list of tags.
    """

    def extract_tagged_tokens(text: str) -> list[tuple[str, str]]:
        # BUGFIX: previously matched against a hard-coded example string
        # instead of `text`, so every response produced the same tags.
        return re.findall(r"\('([^']*)', '([^']*)'\)", text)

    def extract_pos_tags(result: str) -> list[str]:
        # Keep only the tag element of each (token, tag) pair.
        pos_tags: list[str] = []
        if isinstance(result, str):
            pos_tags.extend(pos for _, pos in extract_tagged_tokens(result))
        # Sentinel so empty or garbled generations are never silently dropped.
        return pos_tags if pos_tags else ["invalid"]

    def filter_set(inst: list[str]) -> list[list[str]]:
        return [extract_pos_tags(resp) for resp in inst]

    return map(filter_set, resps)
# Calculate the accuracy for each gold-pred pair
accuracy_scores = []
for gold, pred in zip(golds, flattened_preds):
# Ensure both lists are of the same length, otherwise truncate to match
min_length = min(len(gold), len(pred))
gold = gold[:min_length]
pred = pred[:min_length]
# Calculate accuracy for the current pair and add to the list
accuracy = accuracy_score(gold, pred)
accuracy_scores.append(accuracy)
def process_results(doc: dict[str, Any], results: list[list[str]]):
    """Compute token-level POS accuracy for a single document.

    Args:
        doc: Dataset document; gold tags come from ``doc_to_target``.
        results: Filtered responses; ``results[0]`` is the predicted tag list.

    Returns:
        ``{"acc": accuracy}`` — fraction of matching positions over the
        overlapping prefix of gold and predicted tags (0.0 if no overlap).
    """
    golds, preds = doc_to_target(doc), results[0]
    # Truncate to the shorter sequence so ragged generations still score.
    min_length = min(len(golds), len(preds))
    gold = golds[:min_length]
    pred = preds[:min_length]
    # Inline positional accuracy instead of sklearn.metrics.accuracy_score:
    # same value on equal-length label lists, but no ValueError on empty
    # input (zero-overlap documents now score 0.0 instead of crashing).
    accuracy = (
        sum(g == p for g, p in zip(gold, pred)) / min_length if min_length else 0.0
    )
    return {"acc": accuracy}
......@@ -16,17 +16,16 @@ fewshot_split: train
doc_to_target: !function utils.doc_to_target
should_decontaminate: true
doc_to_decontamination_query: "Sentence: {{token}}\nOutput:"
process_results: !function utils.process_results
filter_list:
- filter:
- function: regex_pos
- function: "custom"
filter_fn: !function utils.extract_pos
- function: "take_first"
name: flexible-extract
metric_list:
- metric: acc
aggregation: !function utils.acc_score
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true
regexes_to_ignore:
- ","
metadata:
version: 1.0
from itertools import chain
import re
from collections.abc import Iterable
from typing import Any
from sklearn.metrics import accuracy_score
from lm_eval.utils import weighted_f1_score
def doc_to_target(doc):
pos_tag_map = {
......@@ -29,27 +29,40 @@ def doc_to_target(doc):
return [pos_tag_map[tag] for tag in doc["upos"]]
def acc_score(items):
unzipped_list = list(zip(*items))
def extract_pos(resps: Iterable[list[str]], *args) -> Iterable[list[str]]:
    """Filter model responses down to lists of predicted POS tags.

    Each raw response is expected to contain tuples formatted like
    ``('word', 'TAG')``; the tags are extracted in order of appearance.
    A response with no parsable tuples yields ``["invalid"]`` so that
    downstream scoring always receives a non-empty value.

    Args:
        resps: Per-document lists of raw model generations.
        *args: Extra positional args passed by the filter harness; unused.

    Returns:
        An iterator of per-document lists, each entry a list of tags.
    """

    def extract_tagged_tokens(text: str) -> list[tuple[str, str]]:
        # BUGFIX: previously matched against a hard-coded example string
        # instead of `text`, so every response produced the same tags.
        return re.findall(r"\('([^']*)', '([^']*)'\)", text)

    def extract_pos_tags(result: str) -> list[str]:
        # Keep only the tag element of each (token, tag) pair.
        pos_tags: list[str] = []
        if isinstance(result, str):
            pos_tags.extend(pos for _, pos in extract_tagged_tokens(result))
        # Sentinel so empty or garbled generations are never silently dropped.
        return pos_tags if pos_tags else ["invalid"]

    def filter_set(inst: list[str]) -> list[list[str]]:
        return [extract_pos_tags(resp) for resp in inst]

    return map(filter_set, resps)
# Calculate the accuracy for each gold-pred pair
accuracy_scores = []
for gold, pred in zip(golds, flattened_preds):
# Ensure both lists are of the same length, otherwise truncate to match
min_length = min(len(gold), len(pred))
gold = gold[:min_length]
pred = pred[:min_length]
# Calculate accuracy for the current pair and add to the list
accuracy = accuracy_score(gold, pred)
accuracy_scores.append(accuracy)
def process_results(doc: dict[str, Any], results: list[list[str]]):
    """Compute token-level POS accuracy for a single document.

    Args:
        doc: Dataset document; gold tags come from ``doc_to_target``.
        results: Filtered responses; ``results[0]`` is the predicted tag list.

    Returns:
        ``{"acc": accuracy}`` — fraction of matching positions over the
        overlapping prefix of gold and predicted tags (0.0 if no overlap).
    """
    golds, preds = doc_to_target(doc), results[0]
    # Truncate to the shorter sequence so ragged generations still score.
    min_length = min(len(golds), len(preds))
    gold = golds[:min_length]
    pred = preds[:min_length]
    # Inline positional accuracy instead of sklearn.metrics.accuracy_score:
    # same value on equal-length label lists, but no ValueError on empty
    # input (zero-overlap documents now score 0.0 instead of crashing).
    accuracy = (
        sum(g == p for g, p in zip(gold, pred)) / min_length if min_length else 0.0
    )
    return {"acc": accuracy}
from lm_eval.utils import weighted_f1_score
def doc_to_text(doc):
output = """Please provide the POS tags for each word in the input sentence. The input will be a list of words in
the sentence. The output format should be a list of tuples, where each tuple consists of a word from the input text
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment