Commit abd17276 authored by Baber

Merge branch 'smolrefact' into tasklist

# Conflicts:
#	lm_eval/__main__.py
#	lm_eval/api/group.py
#	lm_eval/api/task.py
#	lm_eval/evaluator_utils.py
#	lm_eval/tasks/__init__.py
#	lm_eval/utils.py
#	pyproject.toml
parents 00afd536 70314843
@@ -16,17 +16,16 @@ fewshot_split: train
 doc_to_target: !function utils.doc_to_target
 should_decontaminate: true
 doc_to_decontamination_query: "Sentence: {{token}}\nOutput:"
+process_results: !function utils.process_results
 filter_list:
   - filter:
-      - function: regex_pos
+      - function: "custom"
+        filter_fn: !function utils.extract_pos
+      - function: "take_first"
     name: flexible-extract
 metric_list:
   - metric: acc
-    aggregation: !function utils.acc_score
+    aggregation: mean
     higher_is_better: true
-    ignore_case: true
-    ignore_punctuation: true
-    regexes_to_ignore:
-      - ","
 metadata:
   version: 1.0
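For context on what the new filter pipeline does: the built-in regex_pos filter is replaced by the generic "custom" filter, which runs utils.extract_pos over each instance's generations, followed by "take_first", which keeps the first filtered candidate. A minimal sketch of those semantics, with simplified stand-ins for the harness's filter classes (the sample response string is invented for illustration):

import re

def extract_pos(resps, *args):
    # resps: one list of generated strings per document
    def tags(result):
        pairs = re.findall(r"\('([^']*)', '([^']*)'\)", result)
        return [pos for _, pos in pairs] or ["invalid"]
    return [[tags(r) for r in inst] for inst in resps]

def take_first(resps):
    # keep only the first filtered candidate for each document
    return [inst[0] for inst in resps]

resps = [["[('the', 'DET'), ('cat', 'NOUN'), ('sleeps', 'VERB')]"]]
print(take_first(extract_pos(resps)))  # [['DET', 'NOUN', 'VERB']]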
-from itertools import chain
+import re
+from collections.abc import Iterable
+from typing import Any

 from sklearn.metrics import accuracy_score
-from lm_eval.utils import weighted_f1_score


 def doc_to_target(doc):
     pos_tag_map = {
@@ -29,27 +29,40 @@ def doc_to_target(doc):
     return [pos_tag_map[tag] for tag in doc["upos"]]


-def acc_score(items):
-    unzipped_list = list(zip(*items))
-    golds, preds = unzipped_list[0], unzipped_list[1]
-    # Flatten preds' inner lists
-    flattened_preds = [list(chain.from_iterable(p)) for p in preds]
-    # Calculate the accuracy for each gold-pred pair
-    accuracy_scores = []
-    for gold, pred in zip(golds, flattened_preds):
-        # Ensure both lists are of the same length, otherwise truncate to match
-        min_length = min(len(gold), len(pred))
-        gold = gold[:min_length]
-        pred = pred[:min_length]
-        # Calculate accuracy for the current pair and add to the list
-        accuracy = accuracy_score(gold, pred)
-        accuracy_scores.append(accuracy)
-    mean_accuracy = (
-        sum(accuracy_scores) / len(accuracy_scores) if accuracy_scores else 0
-    )
-    return mean_accuracy
+def extract_pos(resps: Iterable[list[str]], *args) -> Iterable[list[str]]:
+    def extract_tagged_tokens(text: str) -> list[tuple[str, str]]:
+        # Extract ('token', 'TAG') tuples from the response text using a regex
+        tokens = re.findall(r"\('([^']*)', '([^']*)'\)", text)
+        return [(token, pos) for token, pos in tokens]
+
+    def extract_pos_tags(result: str) -> list[str]:
+        pos_tags = []
+        if isinstance(result, str):
+            result_ = extract_tagged_tokens(result)
+            pos_tags.extend(pos for _, pos in result_)
+        return pos_tags if pos_tags else ["invalid"]
+
+    def filter_set(inst: list[str]) -> list[list[str]]:
+        filtered = []
+        for resp in inst:
+            match = extract_pos_tags(resp)
+            filtered.append(match)
+        return filtered
+
+    return map(filter_set, resps)
+
+
+def process_results(doc: dict[str, Any], results: list[list[str]]):
+    golds, preds = doc_to_target(doc), results[0]
+    # Ensure both lists are of the same length, otherwise truncate to match
+    min_length = min(len(golds), len(preds))
+    gold = golds[:min_length]
+    pred = preds[:min_length]
+    accuracy = accuracy_score(gold, pred)
+    return {"acc": accuracy}
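The scoring change deserves a note: the old acc_score ran once at aggregation time over all collected (gold, pred) pairs and returned a mean of per-document accuracies, whereas the new code computes token-level accuracy per document in process_results and lets the stock "mean" aggregation average the resulting "acc" values, which yields effectively the same mean-of-per-document-accuracy computed incrementally. A worked toy example of the per-document step (the tag sequences are made up):

from sklearn.metrics import accuracy_score

# two hypothetical documents: gold tags vs. tags parsed from the model output
docs = [
    (["DET", "NOUN", "VERB"], ["DET", "NOUN", "NOUN"]),  # 2 of 3 correct
    (["PRON", "VERB"], ["PRON", "VERB", "PUNCT"]),       # over-long prediction
]

per_doc = []
for golds, preds in docs:
    n = min(len(golds), len(preds))  # truncate to the shorter list
    per_doc.append(accuracy_score(golds[:n], preds[:n]))

print(per_doc)                      # [0.666..., 1.0]
print(sum(per_doc) / len(per_doc))  # 0.833..., what `aggregation: mean` reports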
-from lm_eval.utils import weighted_f1_score
 def doc_to_text(doc):
     output = """Please provide the POS tags for each word in the input sentence. The input will be a list of words in
     the sentence. The output format should be a list of tuples, where each tuple consists of a word from the input text
-    and its corresponding POS tag label from the tag label set: ["ADJ", "ADP", "ADV", "AUX", "CCONJ, "DET", "INTJ",
+    and its corresponding POS tag label from the tag label set: ["ADJ", "ADP", "ADV", "AUX", "CCONJ", "DET", "INTJ",
     "NOUN", "NUM", "PART", "PRON", "PROPN", "PUNCT" "SCONJ", "SYM", "VERB", "X"]. \nYour response should include only a
     list of tuples, in the order that the words appear in the input sentence, with each tuple containing the
     corresponding POS tag label for a word.
...
@@ -2,7 +2,6 @@ tag:
   - afrobench_sentiment_tasks
   - nollysenti_prompt_1
 dataset_path: Davlan/nollysenti
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test
...
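The dataset_kwargs: {trust_remote_code: True} deletions in this and the following nollysenti and ntrex configs mean the datasets are now loaded without executing a repository-hosted loading script, which newer releases of the datasets library discourage or refuse outright; presumably these Hub repos now ship plain data files. Roughly the equivalent call the harness makes (the "en" config name here is a placeholder):

from datasets import load_dataset

# previously: load_dataset("Davlan/nollysenti", "en", trust_remote_code=True)
ds = load_dataset("Davlan/nollysenti", "en")  # no remote code execution required
print(ds["test"][0])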
@@ -2,7 +2,6 @@ tag:
   - afrobench_sentiment_tasks
   - nollysenti_prompt_2
 dataset_path: Davlan/nollysenti
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test
...
@@ -2,7 +2,6 @@ tag:
   - afrobench_sentiment_tasks
   - nollysenti_prompt_3
 dataset_path: Davlan/nollysenti
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test
...
@@ -2,7 +2,6 @@ tag:
   - afrobench_sentiment_tasks
   - nollysenti_prompt_4
 dataset_path: Davlan/nollysenti
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test
...
@@ -2,7 +2,6 @@ tag:
   - afrobench_sentiment_tasks
   - nollysenti_prompt_5
 dataset_path: Davlan/nollysenti
-dataset_kwargs: {trust_remote_code: True}
 output_type: multiple_choice
 validation_split: validation
 test_split: test
...
@@ -4,7 +4,6 @@ tag:
   - ntrex_afr-eng_prompt_1
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
...
@@ -4,7 +4,6 @@ tag:
   - ntrex_eng-afr_prompt_1
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
...
@@ -3,7 +3,6 @@ tag:
   - ntrex_afr-eng_prompt_2
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
...
@@ -3,7 +3,6 @@ tag:
   - ntrex_eng-afr_prompt_2
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
...
@@ -3,7 +3,6 @@ tag:
   - ntrex_afr-eng_prompt_3
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
...
@@ -3,7 +3,6 @@ tag:
   - ntrex_eng-afr_prompt_3
   - afrobench_MT_tasks
 dataset_path: masakhane/ntrex_african
-dataset_kwargs: {trust_remote_code: True}
 output_type: generate_until
 validation_split: test
 fewshot_split: test
...