Unverified commit 7f15cce4 authored by Nathan Weinberg, committed by GitHub

refactor: limit usage of `scipy` and `sklearn` dependencies (#2097)



* refactor: move scipy and sklearn module-level imports to function-level imports
Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
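
For readers skimming the diff: the pattern applied throughout is to defer the heavy import into the function that needs it, so merely importing the module no longer requires sklearn or scipy to be installed. A minimal sketch of that pattern, with a hypothetical aggregation function (the names below are illustrative, not taken from the diff):

import numpy as np

def f1_aggregation(items):
    # sklearn is imported only when the aggregation actually runs,
    # so loading the module does not pull in the dependency.
    from sklearn.metrics import f1_score

    golds, preds = zip(*items)
    return np.max(f1_score(golds, preds))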

* refactor: consolidate the weighted_f1_score helper into lm_eval utils
Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
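
The shared helper ends up in lm_eval/utils.py (see the hunk near the bottom of this diff); a condensed sketch of what it does, assuming the same (gold, pred) item tuples used by the task aggregations:

def weighted_f1_score(items):
    # Lazy sklearn import, same as the other metric helpers in this change.
    from sklearn.metrics import f1_score

    golds, preds = zip(*items)
    return f1_score(golds, preds, average="weighted")

Task-level utils.py files then do `from lm_eval.utils import weighted_f1_score` instead of carrying their own copy.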

* lint: allow the utils files to have unused imports

this lets shared functions be defined only once while the YAML function
importing for task configs continues to work
Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
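
Concretely, the re-exported helper is looked up by name from the task YAML at runtime, so a static linter sees an apparently unused import. A hypothetical task utils.py after this change looks like:

# The import below looks unused to ruff (F401), but the task's YAML config
# resolves it by name at runtime, which is why pyproject.toml now ignores
# F401 for utils.py files.
from lm_eval.utils import weighted_f1_score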

---------
Signed-off-by: Nathan Weinberg <nweinber@redhat.com>
parent 63e76e89
@@ -8,7 +8,6 @@ from typing import List
 import numpy as np
 import sacrebleu
-import sklearn.metrics

 from lm_eval.api.registry import register_aggregation, register_metric

@@ -51,21 +50,24 @@ def bits_per_byte(items):
 @register_aggregation("f1")
 def f1_score(items):
+    from sklearn.metrics import f1_score
+
     unzipped_list = list(zip(*items))
     golds = unzipped_list[0]
     preds = unzipped_list[1]
-    fscore = sklearn.metrics.f1_score(golds, preds)
+    fscore = f1_score(golds, preds)

     return np.max(fscore)


 @register_aggregation("matthews_corrcoef")
 def matthews_corrcoef(items):
+    from sklearn.metrics import matthews_corrcoef
+
     unzipped_list = list(zip(*items))
     golds = unzipped_list[0]
     preds = unzipped_list[1]
-    # print(preds)
-    return sklearn.metrics.matthews_corrcoef(golds, preds)
+    return matthews_corrcoef(golds, preds)


 @register_aggregation("bleu")
...

-from sklearn.metrics import f1_score
+from lm_eval.utils import weighted_f1_score

 def doc_to_choice(doc):
@@ -30,11 +30,3 @@ def doc_to_text(doc):
         choice4=choices[3],
     )
     return text
-
-
-def weighted_f1_score(items):
-    unzipped_list = list(zip(*items))
-    golds = unzipped_list[0]
-    preds = unzipped_list[1]
-    fscore = f1_score(golds, preds, average="weighted")
-    return fscore

-from sklearn.metrics import f1_score
+from lm_eval.utils import weighted_f1_score

 def doc_to_choice(doc):
@@ -30,11 +30,3 @@ def doc_to_text(doc):
         choice4=choices[3],
     )
     return text
-
-
-def weighted_f1_score(items):
-    unzipped_list = list(zip(*items))
-    golds = unzipped_list[0]
-    preds = unzipped_list[1]
-    fscore = f1_score(golds, preds, average="weighted")
-    return fscore

-from sklearn.metrics import f1_score
+from lm_eval.utils import weighted_f1_score

 def doc_to_choice(doc):
@@ -30,11 +30,3 @@ def doc_to_text(doc):
         choice4=choices[3],
     )
     return text
-
-
-def weighted_f1_score(items):
-    unzipped_list = list(zip(*items))
-    golds = unzipped_list[0]
-    preds = unzipped_list[1]
-    fscore = f1_score(golds, preds, average="weighted")
-    return fscore

-from sklearn.metrics import f1_score
+from lm_eval.utils import weighted_f1_score

 def doc_to_target(doc):
     replacements = {0: "True", 1: "Neither", 2: "False"}
     return replacements[doc["label"]]
-
-
-def weighted_f1_score(items):
-    unzipped_list = list(zip(*items))
-    golds = unzipped_list[0]
-    preds = unzipped_list[1]
-    fscore = f1_score(golds, preds, average="weighted")
-    return fscore

-from sklearn.metrics import f1_score
+from lm_eval.utils import weighted_f1_score
-
-
-def weighted_f1_score(items):
-    unzipped_list = list(zip(*items))
-    golds = unzipped_list[0]
-    preds = unzipped_list[1]
-    fscore = f1_score(golds, preds, average="weighted")
-    return fscore

-from sklearn.metrics import f1_score
+from lm_eval.utils import weighted_f1_score

 def doc_to_target(doc):
     replacements = {0: "True", 1: "Neither", 2: "False"}
     return replacements[doc["label"]]
-
-
-def weighted_f1_score(items):
-    unzipped_list = list(zip(*items))
-    golds = unzipped_list[0]
-    preds = unzipped_list[1]
-    fscore = f1_score(golds, preds, average="weighted")
-    return fscore

-from sklearn.metrics import f1_score
+from lm_eval.utils import weighted_f1_score

 def doc_to_text(doc):
@@ -17,11 +17,3 @@ def doc_to_text(doc):
 def doc_to_target(doc):
     replacements = {0: "entailment", 1: "neutral", 2: "contradiction"}
     return replacements[doc["label"]]
-
-
-def weighted_f1_score(items):
-    unzipped_list = list(zip(*items))
-    golds = unzipped_list[0]
-    preds = unzipped_list[1]
-    fscore = f1_score(golds, preds, average="weighted")
-    return fscore

-from sklearn.metrics import f1_score
+from lm_eval.utils import weighted_f1_score

 def doc_to_text(doc):
@@ -17,11 +17,3 @@ def doc_to_text(doc):
 def doc_to_target(doc):
     replacements = {0: "entailment", 1: "neutral", 2: "contradiction"}
     return replacements[doc["label"]]
-
-
-def weighted_f1_score(items):
-    unzipped_list = list(zip(*items))
-    golds = unzipped_list[0]
-    preds = unzipped_list[1]
-    fscore = f1_score(golds, preds, average="weighted")
-    return fscore

@@ -2,7 +2,6 @@ import re
 import string

 import numpy as np
-from scipy.optimize import linear_sum_assignment

 _ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE)

@@ -117,6 +116,8 @@ def _align_bags(predicted, gold):
     Takes gold and predicted answer sets and first finds the optimal 1-1 alignment
     between them and gets maximum metric values over all the answers.
     """
+    from scipy.optimize import linear_sum_assignment
+
     scores = np.zeros([len(gold), len(predicted)])
     for gold_index, gold_item in enumerate(gold):
         for pred_index, pred_item in enumerate(predicted):
...
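
As an aside on the _align_bags hunk above: the "optimal 1-1 alignment" in its docstring is computed with scipy's assignment solver over a gold x predicted score matrix. A self-contained sketch of that idea, using a toy score_fn placeholder rather than the harness's own metric:

import numpy as np

def best_alignment_score(gold, predicted, score_fn):
    # Score every (gold, pred) pair, then let linear_sum_assignment choose
    # the one-to-one matching that maximizes the total score. scipy is
    # imported lazily, mirroring the diff above.
    from scipy.optimize import linear_sum_assignment

    scores = np.zeros([len(gold), len(predicted)])
    for gi, g in enumerate(gold):
        for pi, p in enumerate(predicted):
            scores[gi, pi] = score_fn(g, p)
    row_ind, col_ind = linear_sum_assignment(scores, maximize=True)
    return scores[row_ind, col_ind].sum()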

 from datasets import Dataset
-from sklearn.metrics import f1_score

 def copa_doc_to_text(doc: dict) -> str:
@@ -41,6 +40,8 @@ def hellaswag_process_doc(doc: Dataset) -> Dataset:

 def macro_f1_score(items):
+    from sklearn.metrics import f1_score
+
     unzipped_list = list(zip(*items))
     golds = unzipped_list[0]
     preds = unzipped_list[1]
...

 import numpy as np
-import sklearn

 def cb_multi_fi(items):
+    from sklearn.metrics import f1_score
+
     preds, golds = zip(*items)
     preds = np.array(preds)
     golds = np.array(golds)
-    f11 = sklearn.metrics.f1_score(y_true=golds == 0, y_pred=preds == 0)
-    f12 = sklearn.metrics.f1_score(y_true=golds == 1, y_pred=preds == 1)
-    f13 = sklearn.metrics.f1_score(y_true=golds == 2, y_pred=preds == 2)
+    f11 = f1_score(y_true=golds == 0, y_pred=preds == 0)
+    f12 = f1_score(y_true=golds == 1, y_pred=preds == 1)
+    f13 = f1_score(y_true=golds == 2, y_pred=preds == 2)
     avg_f1 = np.mean([f11, f12, f13])
     return avg_f1

-import sklearn.metrics

 def mean_3class_f1(predictions, references):  # This is a passthrough function
     string_label = ["entailment", "contradiction", "neutral"]
     predictions = (
@@ -23,6 +20,8 @@ def agg_mean_3class_f1(items):
     }

     def _fn(predictions, references):
+        import sklearn.metrics
+
         metric_fn = getattr(sklearn.metrics, metric_str)
         metric_val = metric_fn(references, predictions, **metric_fn_kwargs)
         return metric_val
...

 import collections

 import numpy as np
-import sklearn.metrics

 def f1(predictions, references):  # This is a passthrough function
@@ -19,10 +18,12 @@ def f1(predictions, references):  # This is a passthrough function
 def agg_f1(items):
+    from sklearn.metrics import f1_score
+
     predictions, references = zip(*items)
     references, predictions = np.asarray(references), np.asarray(predictions)
-    return sklearn.metrics.f1_score(references, predictions)
+    return f1_score(references, predictions)

 def em(predictions, references):  # This is a passthrough function
...

@@ -487,3 +487,13 @@ def create_iterator(raw_iterator, *, rank=0, world_size=1, limit=None):
     among ranks in multigpu setting or only pulling a sample of documents
     """
     return islice(raw_iterator, rank, limit, world_size)
+
+
+def weighted_f1_score(items):
+    from sklearn.metrics import f1_score
+
+    unzipped_list = list(zip(*items))
+    golds = unzipped_list[0]
+    preds = unzipped_list[1]
+    fscore = f1_score(golds, preds, average="weighted")
+    return fscore

@@ -104,3 +104,4 @@ known-first-party = ["lm_eval"]

 [tool.ruff.lint.extend-per-file-ignores]
 "__init__.py" = ["F401","F402","F403"]
+"utils.py" = ["F401"]

@@ -4,7 +4,6 @@ from typing import Dict, List, Tuple
 import numpy as np
 import pandas as pd
-import scipy.stats
 import torch

 import lm_eval.evaluator

@@ -23,11 +22,13 @@ def memory_stats():

 def calculate_z_value(res1: Dict, res2: Dict) -> Tuple[float, float]:
+    from scipy.stats import norm
+
     acc1, acc2 = res1["acc,none"], res2["acc,none"]
     st_err1, st_err2 = res1["acc_stderr,none"], res2["acc_stderr,none"]
     Z = (acc1 - acc2) / np.sqrt((st_err1**2) + (st_err2**2))
     # Determining the p-value
-    p_value = 2 * scipy.stats.norm.sf(abs(Z))  # two-tailed test
+    p_value = 2 * norm.sf(abs(Z))  # two-tailed test
     return Z, p_value
...