Commit abd17276 authored by Baber

Merge branch 'smolrefact' into tasklist

# Conflicts:
#	lm_eval/__main__.py
#	lm_eval/api/group.py
#	lm_eval/api/task.py
#	lm_eval/evaluator_utils.py
#	lm_eval/tasks/__init__.py
#	lm_eval/utils.py
#	pyproject.toml
parents 00afd536 70314843
@@ -16,17 +16,16 @@ fewshot_split: train
 doc_to_target: !function utils.doc_to_target
 should_decontaminate: true
 doc_to_decontamination_query: "Sentence: {{token}}\nOutput:"
+process_results: !function utils.process_results
 filter_list:
   - filter:
-      - function: regex_pos
+      - function: "custom"
+        filter_fn: !function utils.extract_pos
       - function: "take_first"
+    name: flexible-extract
 metric_list:
   - metric: acc
-    aggregation: !function utils.acc_score
+    aggregation: mean
     higher_is_better: true
-    ignore_case: true
-    ignore_punctuation: true
-    regexes_to_ignore:
-      - ","
 metadata:
   version: 1.0
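
Under the new config, scoring happens in utils.process_results and the built-in mean aggregation simply averages the per-document accuracies it returns, replacing the corpus-level acc_score reducer. A minimal sketch of that aggregation semantics, with made-up numbers:

# Illustrative numbers only: per-document accuracies as process_results
# would return them under the new config.
per_doc_acc = [1.0, 0.5, 0.75]

# "aggregation: mean" averages the per-document scores into the task score.
task_score = sum(per_doc_acc) / len(per_doc_acc)
print(task_score)  # 0.75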
-from itertools import chain
+import re
+from collections.abc import Iterable
+from typing import Any
 from sklearn.metrics import accuracy_score
 from lm_eval.utils import weighted_f1_score


 def doc_to_target(doc):
     pos_tag_map = {
@@ -29,27 +29,40 @@ def doc_to_target(doc):
     return [pos_tag_map[tag] for tag in doc["upos"]]


-def acc_score(items):
-    unzipped_list = list(zip(*items))
-    golds, preds = unzipped_list[0], unzipped_list[1]
-    # Flatten preds' inner lists
-    flattened_preds = [list(chain.from_iterable(p)) for p in preds]
-    # Calculate the accuracy for each gold-pred pair
-    accuracy_scores = []
-    for gold, pred in zip(golds, flattened_preds):
-        # Ensure both lists are of the same length, otherwise truncate to match
-        min_length = min(len(gold), len(pred))
-        gold = gold[:min_length]
-        pred = pred[:min_length]
-        # Calculate accuracy for the current pair and add to the list
-        accuracy = accuracy_score(gold, pred)
-        accuracy_scores.append(accuracy)
-    mean_accuracy = (
-        sum(accuracy_scores) / len(accuracy_scores) if accuracy_scores else 0
-    )
-    return mean_accuracy
+def extract_pos(resps: Iterable[list[str]], *args) -> Iterable[list[str]]:
+    def extract_tagged_tokens(text: str) -> list[tuple[str, str]]:
+        # Extract the tagged-token tuples from the text input using a regex
+        tokens = re.findall(r"\('([^']*)', '([^']*)'\)", text)
+        return [(token, pos) for token, pos in tokens]
+
+    def extract_pos_tags(result: str) -> list[str]:
+        pos_tags = []
+        if isinstance(result, str):
+            result_ = extract_tagged_tokens(result)
+            pos_tags.extend(pos for _, pos in result_)
+        return pos_tags if pos_tags else ["invalid"]
+
+    def filter_set(inst: list[str]) -> list[list[str]]:
+        filtered = []
+        for resp in inst:
+            match = extract_pos_tags(resp)
+            filtered.append(match)
+        return filtered
+
+    filtered_resps = map(lambda x: filter_set(x), resps)
+    return filtered_resps
+
+
+def process_results(doc: dict[str, Any], results: list[list[str]]):
+    golds, preds = doc_to_target(doc), results[0]
+    # Ensure both lists are of the same length, otherwise truncate to match
+    min_length = min(len(golds), len(preds))
+    gold = golds[:min_length]
+    pred = preds[:min_length]
+    accuracy = accuracy_score(gold, pred)
+    return {"acc": accuracy}
 from lm_eval.utils import weighted_f1_score


 def doc_to_text(doc):
     output = """Please provide the POS tags for each word in the input sentence. The input will be a list of words in
     the sentence. The output format should be a list of tuples, where each tuple consists of a word from the input text
-    and its corresponding POS tag label from the tag label set: ["ADJ", "ADP", "ADV", "AUX", "CCONJ, "DET", "INTJ",
+    and its corresponding POS tag label from the tag label set: ["ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ",
     "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT" "SCONJ", "SYM", "VERB", "X"]. \nYour response should include only a
     list of tuples, in the order that the words appear in the input sentence, with each tuple containing the
     corresponding POS tag label for a word.
......
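For reference, a response that satisfies this prompt parses as a Python list of (word, tag) tuples with every tag drawn from the listed set. A small sanity-check sketch; the sentence and response are invented:

import ast

# Tag set from the prompt above.
ALLOWED = {
    "ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ", "NOUN", "NUM",
    "PART", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "X",
}

# Invented well-formed response for the sentence ["Ama", "runs", "."].
response = "[('Ama', 'PROPN'), ('runs', 'VERB'), ('.', 'PUNCT')]"

pairs = ast.literal_eval(response)  # -> list of (word, tag) tuples
assert all(tag in ALLOWED for _, tag in pairs)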
@@ -2,7 +2,6 @@ tag:
   - afrobench_sentiment_tasks
   - nollysenti_prompt_1
 dataset_path: Davlan/nollysenti
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test
......
@@ -2,7 +2,6 @@ tag:
   - afrobench_sentiment_tasks
   - nollysenti_prompt_2
 dataset_path: Davlan/nollysenti
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test
......
@@ -2,7 +2,6 @@ tag:
   - afrobench_sentiment_tasks
   - nollysenti_prompt_3
 dataset_path: Davlan/nollysenti
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test
......
@@ -2,7 +2,6 @@ tag:
   - afrobench_sentiment_tasks
   - nollysenti_prompt_4
 dataset_path: Davlan/nollysenti
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test
......
@@ -2,7 +2,6 @@ tag:
   - afrobench_sentiment_tasks
   - nollysenti_prompt_5
 dataset_path: Davlan/nollysenti
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test
......
@@ -4,7 +4,6 @@ tag:
   - ntrex_afr-eng_prompt_1
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
......
@@ -4,7 +4,6 @@ tag:
   - ntrex_eng-afr_prompt_1
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
......
@@ -3,7 +3,6 @@ tag:
   - ntrex_afr-eng_prompt_2
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
......
@@ -3,7 +3,6 @@ tag:
   - ntrex_eng-afr_prompt_2
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
......
@@ -3,7 +3,6 @@ tag:
   - ntrex_afr-eng_prompt_3
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
......
@@ -3,7 +3,6 @@ tag:
   - ntrex_eng-afr_prompt_3
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
......
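These config diffs all drop dataset_kwargs: {trust_remote_code: True}, which lm-eval forwards to datasets.load_dataset; the datasets now load without executing repository code. A hedged sketch of the resulting load call (the "en" subset name is an assumption for illustration):

from datasets import load_dataset

# dataset_kwargs entries are passed through to load_dataset, so removing
# trust_remote_code=True means a plain call like the one below is what runs.
# The "en" subset name is assumed here for illustration.
ds = load_dataset("Davlan/nollysenti", "en")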