Commit 97e8278b authored by zzg_666

Adapt the backend to vLLM

from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.core import LLMServingABC
@OPERATOR_REGISTRY.register()
class PromptedVQAGenerator(OperatorABC):
def __init__(self, llm_serving: LLMServingABC, system_prompt: str = "You are a helpful assistant."):
self.logger = get_logger()
self.llm_serving = llm_serving
self.system_prompt = system_prompt
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于视觉问答生成,接收包含图像和问题的输入内容,使用大语言模型生成回答,"
"并将生成的回答保存到数据框中。\n"
"输入参数:\n"
"- llm_serving:LLM服务对象,需实现LLMServingABC接口\n"
"- system_prompt:系统提示词,用于定义模型行为,默认为'You are a helpful assistant.'\n"
"- input_key:输入内容的字段名,默认为'raw_content'\n"
"- output_key:输出生成内容的字段名,默认为'generated_content'\n"
"输出参数:\n"
"- 返回输出字段名,用于后续算子引用\n"
"- 在数据框中添加包含生成回答的新列"
)
elif lang == "en":
return (
"This operator generates visual question answering responses. It receives input content containing images and questions, "
"uses a large language model to generate answers, and saves the generated answers to the dataframe.\n"
"Input Parameters:\n"
"- llm_serving: LLM serving object implementing LLMServingABC interface\n"
"- system_prompt: System prompt to define model behavior, default is 'You are a helpful assistant.'\n"
"- input_key: Field name for input content, default is 'raw_content'\n"
"- output_key: Field name for output generated content, default is 'generated_content'\n\n"
"Output Parameters:\n"
"- Returns output field name for subsequent operator reference\n"
"- Adds a new column containing generated answers to the dataframe"
)
else:
return (
"PromptedVQAGenerator processes visual questions and generates answers using a large language model."
)
def run(self, storage: DataFlowStorage, input_key: str = "raw_content", output_key: str = "generated_content"):
self.input_key, self.output_key = input_key, output_key
self.logger.info("Running Prompted VQA Generator...")
dataframe = storage.read('dataframe')
self.logger.info(f"Loading, number of rows: {len(dataframe)}")
# Build one prompt per row (empty string when the input field is missing) so the generated
# outputs stay aligned with the dataframe rows when written back below.
llm_inputs = [str(row.get(self.input_key, '') or '') for _, row in dataframe.iterrows()]
llm_outputs = self.llm_serving.generate_from_input(llm_inputs, self.system_prompt)
dataframe[self.output_key] = llm_outputs
output_file = storage.write(dataframe)
self.logger.info(f"Saving to {output_file}")
self.logger.info("Prompted VQA Generator done")
return output_key
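# Minimal usage sketch: exercises PromptedVQAGenerator.run() with in-memory stand-ins.
# _EchoServing and _MemoryStorage below are illustrative only; a real pipeline would pass
# a vLLM-backed LLMServingABC implementation and a concrete DataFlowStorage.
if __name__ == "__main__":
    import pandas as pd

    class _EchoServing:
        def generate_from_input(self, inputs, system_prompt):
            # Stand-in for an LLM backend: echo the prompt back as the "answer".
            return [f"[answer to] {prompt}" for prompt in inputs]

    class _MemoryStorage:
        def __init__(self, dataframe):
            self._df = dataframe
        def read(self, _key):
            return self._df
        def write(self, dataframe):
            self._df = dataframe
            return "in-memory dataframe"

    df = pd.DataFrame({"raw_content": ["What is shown in the image?", "Describe the scene."]})
    op = PromptedVQAGenerator(llm_serving=_EchoServing())
    print(op.run(_MemoryStorage(df)))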
from typing import TYPE_CHECKING
if TYPE_CHECKING:
# DB
from .generate.db_operator import DBOperator
else:
import sys
from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking
cur_path = "dataflow/operators/db/"
_import_structure = generate_import_structure_from_type_checking(__file__, cur_path)
sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/db/", _import_structure)
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import MyScaleDBStorage
@OPERATOR_REGISTRY.register()
class DBOperator(OperatorABC):
def __init__(self, expr):
"""
Initialize the DBOperator with the provided expression.
Args:
expr (str): The SQL expression to execute.
"""
self.logger = get_logger()
self.expr = expr
def run(self, storage:MyScaleDBStorage, input_key:str) -> list:
"""
Execute the SQL expression against the database storage.
Args:
storage (DBStorage): The database storage instance to use.
input_key (str): The key for the input data.
Returns:
list: The result of the SQL query execution.
"""
self.logger.info(f"Executing SQL expression: {self.expr}")
result = storage.execute(self.expr, input_key)
self.logger.info(f"Query executed successfully, retrieved {len(result)} records.")
return result
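# Minimal usage sketch with an in-memory stand-in for MyScaleDBStorage; _FakeDBStorage is
# illustrative only, a real pipeline would pass a configured MyScaleDBStorage instance.
if __name__ == "__main__":
    class _FakeDBStorage:
        def execute(self, expr, input_key):
            print(f"would run {expr!r} against column {input_key!r}")
            return [{"id": 1}, {"id": 2}]

    op = DBOperator(expr="SELECT * FROM samples LIMIT 2")
    rows = op.run(_FakeDBStorage(), input_key="raw_content")
    print(f"retrieved {len(rows)} rows")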
from typing import TYPE_CHECKING
if TYPE_CHECKING:
# filter
from .filter.rule_based_filter import ColonEndFilter
from .filter.rule_based_filter import SentenceNumberFilter
from .filter.rule_based_filter import LineEndWithEllipsisFilter
from .filter.rule_based_filter import ContentNullFilter
from .filter.rule_based_filter import SymbolWordRatioFilter
from .filter.rule_based_filter import AlphaWordsFilter
from .filter.rule_based_filter import HtmlEntityFilter
from .filter.rule_based_filter import IDCardFilter
from .filter.rule_based_filter import NoPuncFilter
from .filter.rule_based_filter import SpecialCharacterFilter
from .filter.rule_based_filter import WatermarkFilter
from .filter.rule_based_filter import MeanWordLengthFilter
from .filter.rule_based_filter import StopWordFilter
from .filter.rule_based_filter import CurlyBracketFilter
from .filter.rule_based_filter import CapitalWordsFilter
from .filter.rule_based_filter import LoremIpsumFilter
from .filter.rule_based_filter import UniqueWordsFilter
from .filter.rule_based_filter import CharNumberFilter
from .filter.rule_based_filter import LineStartWithBulletpointFilter
from .filter.rule_based_filter import LineWithJavascriptFilter
from .filter.langkit_filter import LangkitFilter
from .filter.lexical_diversity_filter import LexicalDiversityFilter
from .filter.ngram_filter import NgramFilter
from .filter.presidio_filter import PresidioFilter
from .filter.blocklist_filter import BlocklistFilter
from .filter.hash_deduplicate_filter import HashDeduplicateFilter
from .filter.language_filter import LanguageFilter
from .filter.llm_language_filter import LLMLanguageFilter
from .filter.minhash_deduplicate_filter import MinHashDeduplicateFilter
from .filter.ngramhash_deduplicate_filter import NgramHashDeduplicateFilter
from .filter.perspective_filter import PerspectiveFilter
from .filter.sem_deduplicate_filter import SemDeduplicateFilter
from .filter.simhash_deduplicate_filter import SimHashDeduplicateFilter
from .filter.word_number_filter import WordNumberFilter
# refine
from .refine.html_entity_refiner import HtmlEntityRefiner
from .refine.html_url_remover_refiner import HtmlUrlRemoverRefiner
from .refine.lowercase_refiner import LowercaseRefiner
from .refine.ner_refiner import NERRefiner
from .refine.pii_anonymize_refiner import PIIAnonymizeRefiner
from .refine.ref_removal_refiner import ReferenceRemoverRefiner
from .refine.remove_contractions_refiner import RemoveContractionsRefiner
from .refine.remove_emoji_refiner import RemoveEmojiRefiner
from .refine.remove_emoticons_refiner import RemoveEmoticonsRefiner
from .refine.remove_extra_spaces_refiner import RemoveExtraSpacesRefiner
from .refine.remove_image_ref_refiner import RemoveImageRefsRefiner
from .refine.remove_number_refiner import RemoveNumberRefiner
from .refine.remove_punctuation_refiner import RemovePunctuationRefiner
from .refine.remove_repetitions_punctuation_refiner import RemoveRepetitionsPunctuationRefiner
from .refine.remove_stopwords_refiner import RemoveStopwordsRefiner
from .refine.spelling_correction_refiner import SpellingCorrectionRefiner
from .refine.stemming_lemmatization_refiner import StemmingLemmatizationRefiner
from .refine.text_normalization_refiner import TextNormalizationRefiner
# eval
from .eval.ngram_sample_evaluator import NgramSampleEvaluator
from .eval.lexical_diversity_sample_evaluator import LexicalDiversitySampleEvaluator
from .eval.langkit_sample_evaluator import LangkitSampleEvaluator
from .eval.presidio_sample_evaluator import PresidioSampleEvaluator
from .eval.bert_sample_evaluator import BertSampleEvaluator
from .eval.bleu_sample_evaluator import BleuSampleEvaluator
from .eval.cider_sample_evaluator import CiderSampleEvaluator
from .eval.perspective_sample_evaluator import PerspectiveSampleEvaluator
from .eval.task2vec_dataset_evaluator import Task2VecDatasetEvaluator
from .eval.vendi_dataset_evaluator import VendiDatasetEvaluator
else:
import sys
from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking
cur_path = "dataflow/operators/general_text/"
_import_structure = generate_import_structure_from_type_checking(__file__, cur_path)
sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/general_text/", _import_structure)
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
import evaluate
@OPERATOR_REGISTRY.register()
class BertSampleEvaluator(OperatorABC):
def __init__(self, lang='en', model_cache_dir='./dataflow_cache'):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.data_type = "text"
self.score_name = "BERTScore"
self.lang = lang
self.model_type = "distilbert-base-uncased"
self.idf = False
self.rescale_with_baseline = False
self.bertscore = evaluate.load("bertscore", cache_dir=model_cache_dir)
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"使用BERTScore评估生成文本与参考文本的相似度,基于上下文嵌入计算P/R/F1分数。\n"
"输入参数:\n"
"- lang:语言类型,默认为'en'\n"
"- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n"
"- input_key:生成文本字段名\n"
"- reference_key:参考文本字段名\n"
"- output_key:输出得分字段名,默认为'BertScore'\n"
"输出参数:\n"
"- 包含F1相似度得分的DataFrame"
)
elif lang == "en":
return (
"Evaluate similarity between generated and reference text using BERTScore with contextual embeddings.\n"
"Input Parameters:\n"
"- lang: Language type, default 'en'\n"
"- model_cache_dir: Model cache directory, default './dataflow_cache'\n"
"- input_key: Field name for generated text\n"
"- reference_key: Field name for reference text\n"
"- output_key: Field name for output score, default 'BertScore'\n"
"Output Parameters:\n"
"- DataFrame containing F1 similarity scores"
)
else:
return "Evaluate text similarity using BERTScore."
def eval(self, dataframe, input_key, reference_key):
eval_data = dataframe[input_key].to_list()
ref_data = dataframe[reference_key].to_list()
self.logger.info(f"Evaluating {self.score_name}...")
if ref_data is None:
raise ValueError("Reference data must be provided for BERTScorer")
results = self.bertscore.compute(
predictions=eval_data,
references=ref_data,
lang=self.lang,
model_type=self.model_type,
idf=self.idf,
rescale_with_baseline=self.rescale_with_baseline
)
scores = results["f1"]
self.logger.info("Evaluation complete!")
return scores
def run(self, storage: DataFlowStorage, input_key: str, input_reference_key: str, output_key: str='BertScore'):
self.input_key = input_key
self.reference_key = input_reference_key
self.output_key = output_key
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, input_key, self.reference_key)
dataframe[self.output_key] = scores
storage.write(dataframe)
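# Minimal usage sketch: calls eval() directly on a small DataFrame instead of going through
# DataFlowStorage. The constructor downloads the bertscore metric and its default model on
# first use, so network access (or a warm cache) is assumed.
if __name__ == "__main__":
    import pandas as pd
    df = pd.DataFrame({
        "generated": ["a cat sits on the mat", "the weather is nice today"],
        "reference": ["the cat is sitting on the mat", "it is a nice day"],
    })
    evaluator = BertSampleEvaluator(lang="en", model_cache_dir="./dataflow_cache")
    print(evaluator.eval(df, input_key="generated", reference_key="reference"))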
import copy
import sys, math, re
from collections import defaultdict
import six
from six.moves import xrange as range
def precook(s, n=4, out=False):
words = s.split()
counts = defaultdict(int)
for k in range(1,n+1):
for i in range(len(words)-k+1):
ngram = tuple(words[i:i+k])
counts[ngram] += 1
return (len(words), counts)
def cook_refs(refs, eff=None, n=4):
reflen = []
maxcounts = {}
for ref in refs:
rl, counts = precook(ref, n)
reflen.append(rl)
for (ngram,count) in six.iteritems(counts):
maxcounts[ngram] = max(maxcounts.get(ngram,0), count)
if eff == "shortest":
reflen = min(reflen)
elif eff == "average":
reflen = float(sum(reflen))/len(reflen)
return (reflen, maxcounts)
def cook_test(test, reflen_refmaxcounts, eff=None, n=4):
reflen, refmaxcounts = reflen_refmaxcounts
testlen, counts = precook(test, n, True)
result = {}
if eff == "closest":
result["reflen"] = min((abs(l-testlen), l) for l in reflen)[1]
else:
result["reflen"] = reflen
result["testlen"] = testlen
result["guess"] = [max(0,testlen-k+1) for k in range(1,n+1)]
result['correct'] = [0]*n
for (ngram, count) in six.iteritems(counts):
result["correct"][len(ngram)-1] += min(refmaxcounts.get(ngram,0), count)
return result
class Bleu(object):
"""Bleu scorer.
"""
__slots__ = "n", "crefs", "ctest", "_score", "_ratio", "_testlen", "_reflen", "special_reflen"
def copy(self):
new = Bleu(n=self.n)
new.ctest = copy.copy(self.ctest)
new.crefs = copy.copy(self.crefs)
new._score = None
return new
def __init__(self, test=None, refs=None, n=4, special_reflen=None):
self.n = n
self.crefs = []
self.ctest = []
self.cook_append(test, refs)
self.special_reflen = special_reflen
def cook_append(self, test, refs):
if refs is not None:
self.crefs.append(cook_refs(refs))
if test is not None:
cooked_test = cook_test(test, self.crefs[-1])
self.ctest.append(cooked_test) ## N.B.: -1
else:
self.ctest.append(None) # lens of crefs and ctest have to match
self._score = None ## need to recompute
def ratio(self, option=None):
self.compute_score(option=option)
return self._ratio
def score_ratio(self, option=None):
return (self.fscore(option=option), self.ratio(option=option))
def score_ratio_str(self, option=None):
return "%.4f (%.2f)" % self.score_ratio(option)
def reflen(self, option=None):
self.compute_score(option=option)
return self._reflen
def testlen(self, option=None):
self.compute_score(option=option)
return self._testlen
def retest(self, new_test):
if type(new_test) is str:
new_test = [new_test]
assert len(new_test) == len(self.crefs), new_test
self.ctest = []
for t, rs in zip(new_test, self.crefs):
self.ctest.append(cook_test(t, rs))
self._score = None
return self
def rescore(self, new_test):
''' replace test(s) with new test(s), and returns the new score.'''
return self.retest(new_test).compute_score()
def size(self):
assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
return len(self.crefs)
def __iadd__(self, other):
'''add an instance (e.g., from another sentence).'''
if type(other) is tuple:
## avoid creating new BleuScorer instances
self.cook_append(other[0], other[1])
else:
assert self.compatible(other), "incompatible BLEUs."
self.ctest.extend(other.ctest)
self.crefs.extend(other.crefs)
self._score = None ## need to recompute
return self
def compatible(self, other):
return isinstance(other, Bleu) and self.n == other.n
def single_reflen(self, option="average"):
return self._single_reflen(self.crefs[0][0], option)
def _single_reflen(self, reflens, option=None, testlen=None):
if option == "shortest":
reflen = min(reflens)
elif option == "average":
reflen = float(sum(reflens))/len(reflens)
elif option == "closest":
reflen = min((abs(l-testlen), l) for l in reflens)[1]
else:
assert False, "unsupported reflen option %s" % option
return reflen
def recompute_score(self, option=None, verbose=0):
self._score = None
return self.compute_score(option, verbose)
def compute_score(self, option=None, verbose=0):
n = self.n
small = 1e-9
tiny = 1e-15 ## so that if guess is 0 still return 0
bleu_list = [[] for _ in range(n)]
if self._score is not None:
return self._score
if option is None:
option = "average" if len(self.crefs) == 1 else "closest"
self._testlen = 0
self._reflen = 0
totalcomps = {'testlen':0, 'reflen':0, 'guess':[0]*n, 'correct':[0]*n}
# for each sentence
for comps in self.ctest:
testlen = comps['testlen']
self._testlen += testlen
if self.special_reflen is None: ## need computation
reflen = self._single_reflen(comps['reflen'], option, testlen)
else:
reflen = self.special_reflen
self._reflen += reflen
for key in ['guess','correct']:
for k in range(n):
totalcomps[key][k] += comps[key][k]
# append per image bleu score
bleu = 1.
for k in range(n):
bleu *= (float(comps['correct'][k]) + tiny) \
/(float(comps['guess'][k]) + small)
bleu_list[k].append(bleu ** (1./(k+1)))
ratio = (testlen + tiny) / (reflen + small) ## N.B.: avoid zero division
if ratio < 1:
for k in range(n):
bleu_list[k][-1] *= math.exp(1 - 1/ratio)
if verbose > 1:
print(comps, reflen)
totalcomps['reflen'] = self._reflen
totalcomps['testlen'] = self._testlen
bleus = []
bleu = 1.
for k in range(n):
bleu *= float(totalcomps['correct'][k] + tiny) \
/ (totalcomps['guess'][k] + small)
bleus.append(bleu ** (1./(k+1)))
ratio = (self._testlen + tiny) / (self._reflen + small) ## N.B.: avoid zero division
if ratio < 1:
for k in range(n):
bleus[k] *= math.exp(1 - 1/ratio)
if verbose > 0:
print(totalcomps)
print("ratio:", ratio)
self._score = bleus
return self._score, bleu_list
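# Minimal usage sketch for the Bleu scorer: one hypothesis against one reference list.
# compute_score() returns corpus-level BLEU-1..BLEU-n plus per-sentence score lists.
if __name__ == "__main__":
    scorer = Bleu(test="the cat sat on the mat",
                  refs=["the cat is sitting on the mat"],
                  n=4)
    corpus_bleu, per_sentence = scorer.compute_score(option="average")
    print(corpus_bleu)  # [BLEU-1, BLEU-2, BLEU-3, BLEU-4]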
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.operators.general_text.eval.bleu.bleu import Bleu
from tqdm import tqdm
@OPERATOR_REGISTRY.register()
class BleuSampleEvaluator(OperatorABC):
def __init__(self, n=4, eff="average", special_reflen=None):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.score_name = 'BleuScore'
valid_eff_options = ["shortest", "average", "longest"]
if eff not in valid_eff_options:
raise ValueError(f"Invalid value for 'eff'. Must be one of {valid_eff_options}, but got '{eff}'.")
self.n = n # Max n-gram length (default: 4)
self.eff = eff # [shortest, average, longest]
self.special_reflen = special_reflen # Special reference length if specified
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"计算BLEU分数评估生成文本与参考文本的n-gram重叠度,支持1-4元语法分析。\n"
"输入参数:\n"
"- n:最大n-gram长度,默认为4\n"
"- eff:参考长度计算方式,可选'shortest'/'average'/'longest',默认为'average'\n"
"- special_reflen:特殊参考长度,默认为None\n"
"- input_key:生成文本字段名\n"
"- reference_key:参考文本字段名\n"
"- output_key:输出得分字段名,默认为'BleuScore'\n"
"输出参数:\n"
"- 包含BLEU得分的DataFrame"
)
elif lang == "en":
return (
"Evaluate n-gram overlap between generated and reference text using BLEU score (1-4 grams supported).\n"
"Input Parameters:\n"
"- n: Maximum n-gram length, default 4\n"
"- eff: Reference length calculation method, 'shortest'/'average'/'longest', default 'average'\n"
"- special_reflen: Special reference length, default None\n"
"- input_key: Field name for generated text\n"
"- reference_key: Field name for reference text\n"
"- output_key: Field name for output score, default 'BleuScore'\n"
"Output Parameters:\n"
"- DataFrame containing BLEU scores"
)
else:
return "Evaluate text similarity using BLEU score."
def _score_func(self, eval_text, ref_text):
bleu_scorer = Bleu(
test=eval_text,
refs=[ref_text],
n=self.n,
special_reflen=self.special_reflen,
)
bleu_score, _ = bleu_scorer.compute_score(option=self.eff)
return bleu_score[0]
def eval(self, dataframe, input_key, reference_key):
eval_data = dataframe[input_key]
ref_data = dataframe[reference_key]
self.logger.info(f"Evaluating {self.score_name}...")
scores = [self._score_func(eval_text, ref_text) for eval_text, ref_text in tqdm(zip(eval_data, ref_data), desc="BleuScorer Evaluating...")]
self.logger.info("Evaluation complete!")
return scores
def run(self, storage: DataFlowStorage, input_key: str, input_reference_key: str, output_key: str='BleuScore'):
self.input_key = input_key
self.reference_key = input_reference_key
self.output_key = output_key
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, input_key, self.reference_key)
dataframe[self.output_key] = scores
storage.write(dataframe)
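# Minimal usage sketch: scores each row by calling eval() directly on a small DataFrame;
# a full pipeline would call run() with a DataFlowStorage implementation instead.
if __name__ == "__main__":
    import pandas as pd
    df = pd.DataFrame({
        "generated": ["the cat sat on the mat"],
        "reference": ["the cat is sitting on the mat"],
    })
    evaluator = BleuSampleEvaluator(n=4, eff="average")
    print(evaluator.eval(df, input_key="generated", reference_key="reference"))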
import copy
import math
import pickle
import numpy as np
from collections import defaultdict
import os
from six.moves import xrange
import six
def precook(s, n=4, out=False):
words = s.split()
counts = defaultdict(int)
for k in xrange(1, n+1):
for i in xrange(len(words)-k+1):
ngram = tuple(words[i:i+k])
counts[ngram] += 1
return counts
def cook_refs(refs, n=4):
return [precook(ref, n) for ref in refs]
def cook_test(test, n=4):
return precook(test, n, True)
class Cider(object):
"""CIDEr scorer."""
def copy(self):
new = Cider(n=self.n)
new.ctest = copy.copy(self.ctest)
new.crefs = copy.copy(self.crefs)
return new
def __init__(self, test=None, refs=None, n=4, sigma=6.0, idf=None):
self.n = n
self.sigma = sigma
self.crefs = []
self.ctest = []
self.document_frequency = defaultdict(float)
self.ref_len = None
if idf:
self.document_frequency = idf['df']
self.ref_len = np.log(float(idf['ref_len'])) # Use reference length from the IDF
self.cook_append(test, refs)
def cook_append(self, test, refs):
if refs is not None:
self.crefs.append(cook_refs(refs))
if test is not None:
self.ctest.append(cook_test(test))
else:
self.ctest.append(None)
def size(self):
assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest))
return len(self.crefs)
def __iadd__(self, other):
if type(other) is tuple:
self.cook_append(other[0], other[1])
else:
self.ctest.extend(other.ctest)
self.crefs.extend(other.crefs)
return self
def compute_doc_freq(self):
'''Compute term frequency for reference data to generate IDF.'''
if not self.document_frequency: # Handle empty DF (for 'corpus' mode)
for refs in self.crefs:
for ngram in set([ngram for ref in refs for (ngram, count) in ref.items()]):
self.document_frequency[ngram] += 1
def compute_cider(self, df_mode):
def counts2vec(cnts):
vec = [defaultdict(float) for _ in range(self.n)]
length = 0
norm = [0.0 for _ in range(self.n)]
for (ngram, term_freq) in cnts.items():
df = np.log(max(1.0, self.document_frequency[ngram]))
n = len(ngram) - 1
vec[n][ngram] = float(term_freq) * (self.ref_len - df)
norm[n] += pow(vec[n][ngram], 2)
if n == 1:
length += term_freq
norm = [np.sqrt(n) for n in norm]
return vec, norm, length
def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref):
delta = float(length_hyp - length_ref)
val = np.array([0.0 for _ in range(self.n)])
for n in range(self.n):
for (ngram, count) in vec_hyp[n].items():
val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram]
if (norm_hyp[n] != 0) and (norm_ref[n] != 0):
val[n] /= (norm_hyp[n] * norm_ref[n])
val[n] *= np.e**(-(delta**2) / (2 * self.sigma**2))
return val
if df_mode == "corpus":
self.ref_len = np.log(float(len(self.crefs))) # Use total references in corpus as ref_len
scores = []
for test, refs in zip(self.ctest, self.crefs):
vec, norm, length = counts2vec(test)
score = np.array([0.0 for _ in range(self.n)])
for ref in refs:
vec_ref, norm_ref, length_ref = counts2vec(ref)
score += sim(vec, vec_ref, norm, norm_ref, length, length_ref)
score_avg = np.mean(score)
score_avg /= len(refs)
score_avg *= 10.0
scores.append(score_avg)
return scores
def compute_score(self, df_mode, option=None, verbose=0):
'''Compute the CIDEr score based on df_mode (corpus or IDF-based).'''
self.compute_doc_freq()
if df_mode == "corpus":
if not self.document_frequency: # Handle the case where DF is empty
raise ValueError("Document frequency is empty. Please check the corpus data.")
min_required_data = max(self.document_frequency.values())
# print(min_required_data)# For corpus mode, we require at least one reference
# if len(self.ctest) < min_required_data:
# raise ValueError(f"Insufficient test data: {len(self.ctest)} samples, but at least {min_required_data} are required.")
score = self.compute_cider(df_mode)
return np.mean(np.array(score)), np.array(score)
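# Minimal usage sketch in "corpus" mode: document frequencies are computed from the
# references fed in, so at least a couple of samples are needed for meaningful IDF weights.
if __name__ == "__main__":
    scorer = Cider(n=4, sigma=6.0)
    scorer += ("a cat sits on the mat", ["the cat is on the mat"])
    scorer += ("a dog runs in the park", ["the dog is running in the park"])
    mean_score, per_sample = scorer.compute_score(df_mode="corpus")
    print(mean_score, per_sample)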
import os
import json
import pickle
from tqdm import tqdm
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.operators.general_text.eval.cider.cider import Cider
def load_idf(idf_path):
with open(idf_path, 'rb') as f:
idf = pickle.load(f, encoding='utf-8')
return idf
@OPERATOR_REGISTRY.register()
class CiderSampleEvaluator(OperatorABC):
def __init__(self, n=4, sigma=6.0, df_mode="coco-val-df", idf_path="./dataflow/operators/general_pt/eval/cider/coco-val-df.p"):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.score_name = 'CiderScore'
self.n = n # Max n-gram length (default: 4)
self.sigma = sigma # Sigma for Gaussian penalty (default: 6.0)
self.df_mode = df_mode
if self.df_mode != "corpus":
# The idf file can be downloaded at https://github.com/ramavedantam/coco-caption/blob/master/data/coco-val-df.p
# Put the file in the correct idf_path
self.idf = load_idf(idf_path)
else:
self.idf = None # No need to load IDF for 'corpus' mode
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"使用CIDEr指标评估生成文本与参考文本的相似度,基于TF-IDF加权的n-gram重叠度。\n"
"输入参数:\n"
"- n:最大n-gram长度,默认为4\n"
"- sigma:高斯惩罚参数,默认为6.0\n"
"- df_mode:文档频率模式,默认为'coco-val-df'\n"
"- idf_path:IDF文件路径,默认为预训练COCO数据集IDF\n"
"- input_key:生成文本字段名\n"
"- reference_key:参考文本字段名\n"
"- output_key:输出得分字段名,默认为'CiderScore'\n"
"输出参数:\n"
"- 包含CIDEr得分的DataFrame"
)
elif lang == "en":
return (
"Evaluate text similarity using CIDEr metric with TF-IDF weighted n-gram overlap.\n"
"Input Parameters:\n"
"- n: Maximum n-gram length, default 4\n"
"- sigma: Gaussian penalty parameter, default 6.0\n"
"- df_mode: Document frequency mode, default 'coco-val-df'\n"
"- idf_path: Path to IDF file, default pre-trained COCO dataset IDF\n"
"- input_key: Field name for generated text\n"
"- reference_key: Field name for reference text\n"
"- output_key: Field name for output score, default 'CiderScore'\n"
"Output Parameters:\n"
"- DataFrame containing CIDEr scores"
)
else:
return "Evaluate text similarity using CIDEr metric."
def _score_func(self, eval_text, ref_text):
cider_scorer = Cider(
test=eval_text,
refs=[ref_text],
n=self.n,
sigma=self.sigma,
idf=self.idf # Pass IDF (None if using 'corpus')
)
# Pass df_mode dynamically based on the argument
cider_score, _ = cider_scorer.compute_score(df_mode='corpus' if self.idf is None else 'coco-val-df')
return cider_score
def eval(self, dataframe, input_key, reference_key):
eval_data = dataframe[input_key]
ref_data = dataframe[reference_key]
self.logger.info(f"Evaluating {self.score_name}...")
scores = [self._score_func(eval_text, ref_text) for eval_text, ref_text in tqdm(zip(eval_data, ref_data), desc="CiderScorer Evaluating...")]
self.logger.info("Evaluation complete!")
return scores
def run(self, storage: DataFlowStorage, input_key: str, input_reference_key: str, output_key: str='CiderScore'):
self.input_key = input_key
self.reference_key = input_reference_key
self.output_key = output_key
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, input_key, self.reference_key)
dataframe[self.output_key] = scores
storage.write(dataframe)
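# Minimal usage sketch using df_mode="corpus", which avoids loading the coco-val-df.p IDF
# pickle; the default df_mode expects that file to exist at idf_path.
if __name__ == "__main__":
    import pandas as pd
    df = pd.DataFrame({
        "generated": ["a cat sits on the mat", "a dog runs in the park"],
        "reference": ["the cat is on the mat", "the dog is running in the park"],
    })
    evaluator = CiderSampleEvaluator(df_mode="corpus")
    print(evaluator.eval(df, input_key="generated", reference_key="reference"))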
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.storage import DataFlowStorage
from dataflow import get_logger
import pandas as pd
from langkit import light_metrics, extract
from tqdm import tqdm
@OPERATOR_REGISTRY.register()
class LangkitSampleEvaluator(OperatorABC):
def __init__(self):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.llm_schema = light_metrics.init()
self.score_name = 'LangkitScore'
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"使用Langkit工具包计算文本统计信息,帮助评估文本结构复杂性和可读性。提取多种语言特征,包括句子长度、词汇多样性、情感倾向等。\n\n"
"输出参数:\n"
"- LangkitNumSentencesScore: 句子数量\n"
"- LangkitNumWordsScore: 单词数量\n"
"- LangkitAvgWordLengthScore: 平均单词长度\n"
"- LangkitFleschReadingEaseScore: 可读性评分(Flesch公式)\n"
"- LangkitSentimentScore: 情感倾向(-1到1之间)"
)
else:
return (
"Uses Langkit toolkit to calculate text statistics for evaluating structural complexity and readability. Extracts multiple linguistic features including sentence length, lexical diversity, and sentiment.\n\n"
"Output Parameters:\n"
"- LangkitNumSentencesScore: Number of sentences\n"
"- LangkitNumWordsScore: Number of words\n"
"- LangkitAvgWordLengthScore: Average word length\n"
"- LangkitFleschReadingEaseScore: Readability score (Flesch formula)\n"
"- LangkitSentimentScore: Sentiment polarity (between -1 and 1)"
)
def _score_func(self, sample):
df = pd.DataFrame({'prompt': [sample]})
df['response'] = ''
enhanced_df = extract(df, schema=self.llm_schema)
scores_dict = enhanced_df.to_dict(orient='records')[0]
processed_scores = {}
for k, v in scores_dict.items():
if k == 'prompt':
continue
elif k.startswith('prompt.'):
new_key = k[len('prompt.'):]
processed_scores[new_key] = v
elif not (k == 'response' or k.startswith('response.')):
processed_scores[k] = v
processed_scores.pop('has_patterns', None)
result = {}
for k, v in processed_scores.items():
score_key = f"Langkit{''.join([word.capitalize() for word in k.split('_')])}Score"
result[score_key] = v
return result
def eval(self, dataframe, input_key):
scores_list = []
self.logger.info(f"Evaluating {self.score_name}...")
for sample in tqdm(dataframe[input_key], desc="LangkitScore Evaluating..."):
scores = self._score_func(sample)
scores_list.append(scores)
self.logger.info("Evaluation complete!")
return scores_list
def run(self, storage: DataFlowStorage, input_key: str):
self.input_key = input_key
dataframe = storage.read("dataframe")
self.logger.info("LangkitScore ready to evaluate.")
scores = self.eval(dataframe, input_key)
for idx, score_dict in enumerate(scores):
for key, value in score_dict.items():
dataframe.at[idx, key] = value
storage.write(dataframe)
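# Minimal usage sketch: assumes the langkit package (light_metrics) is installed. eval()
# returns one dict of Langkit*Score values per row; run() flattens them into columns.
if __name__ == "__main__":
    import pandas as pd
    df = pd.DataFrame({"text": ["This is a short, readable sentence. It has two sentences."]})
    evaluator = LangkitSampleEvaluator()
    print(evaluator.eval(df, input_key="text")[0])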
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.storage import DataFlowStorage
from dataflow import get_logger
import string
from tqdm import tqdm
remove_punctuation = str.maketrans('', '', string.punctuation)
# Lexical-diversity utilities: mtld_calc, mtld, factorial, combination, hypergeometric, hdd
def mtld_calc(word_array, ttr_threshold):
current_ttr = 1.0
token_count = 0
type_count = 0
types = set()
factors = 0.0
for token in word_array:
token = token.translate(remove_punctuation).lower()
token_count += 1
if token not in types:
type_count +=1
types.add(token)
current_ttr = type_count / token_count
if current_ttr <= ttr_threshold:
factors += 1
token_count = 0
type_count = 0
types = set()
current_ttr = 1.0
excess = 1.0 - current_ttr
excess_val = 1.0 - ttr_threshold
factors += excess / excess_val
if factors != 0:
return len(word_array) / factors
return -1
def mtld(word_array, ttr_threshold=0.72):
if isinstance(word_array, str):
raise ValueError("The input should be a list of str")
if len(word_array) < 50:
raise ValueError("The input length should be larger than 50")
return (mtld_calc(word_array, ttr_threshold) + mtld_calc(word_array[::-1], ttr_threshold)) / 2
def factorial(x):
x=int(x)
result = 1
for i in range(2, x + 1):
result *= i
return result
def combination(n, r):
r_fact = factorial(r)
numerator = 1.0
num = n-r+1.0
while num < n+1.0:
numerator *= num
num += 1.0
return numerator / r_fact
def hypergeometric(population, population_successes, sample, sample_successes):
return (combination(population_successes, sample_successes) *
combination(population - population_successes, sample - sample_successes)) /\
combination(population, sample)
def hdd(word_array, sample_size=42.0):
if isinstance(word_array, str):
raise ValueError("The input should be a list of str")
if len(word_array) < 50:
raise ValueError("The input length should be larger than 50")
type_counts = {}
for token in word_array:
token = token.translate(remove_punctuation).lower()
if token in type_counts:
type_counts[token] += 1.0
else:
type_counts[token] = 1.0
hdd_value = 0.0
for token_type in type_counts.keys():
contribution = (1.0 - hypergeometric(len(word_array), sample_size, type_counts[token_type], 0.0)) / sample_size
hdd_value += contribution
return hdd_value
@OPERATOR_REGISTRY.register()
class LexicalDiversitySampleEvaluator(OperatorABC):
def __init__(self):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.metrics_to_keep = {'mtld': True, 'hdd': True}
self.score_name = 'LexicalDiversityScore'
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"使用MTLD(词汇多样性测量)和HDD(移动平均类型-标记比)方法计算文本词汇多样性。\n\n"
"功能说明:\n"
"- MTLD(词汇多样性测量):通过计算维持特定TTR阈值所需的单词数量来评估词汇多样性\n"
"- HDD(移动平均类型-标记比):基于样本的词汇丰富度估计\n\n"
"输入要求:文本长度需大于50个单词\n\n"
"输出参数:\n"
"- LexicalDiversityMTLDScore: MTLD多样性得分(值越高表示多样性越好)\n"
"- LexicalDiversityHD-DScore: HDD多样性得分(值越高表示多样性越好)"
)
else:
return (
"Measures text lexical diversity using MTLD (Measure of Textual Lexical Diversity) and HDD (Hypergeometric Distribution Diversity) methods.\n\n"
"Features:\n"
"- MTLD: Evaluates lexical diversity by calculating the number of words needed to maintain a specific TTR threshold\n"
"- HDD: Estimates vocabulary richness based on sampling\n\n"
"Input Requirement: Text length must exceed 50 words\n\n"
"Output Parameters:\n"
"- LexicalDiversityMTLDScore: MTLD diversity score (higher = more diverse)\n"
"- LexicalDiversityHD-DScore: HDD diversity score (higher = more diverse)"
)
def _score_func(self, sample):
text = sample
words = text.split()
scores = {}
# must ensure text length in the given interval
if self.metrics_to_keep.get('mtld'):
if len(words) > 50:
scores['LexicalDiversityMTLDScore'] = mtld(words)
else:
scores['LexicalDiversityMTLDScore'] = None
if self.metrics_to_keep.get('hdd'):
if 50 < len(words) < 1000:
scores['LexicalDiversityHD-DScore'] = hdd(words)
else:
scores['LexicalDiversityHD-DScore'] = None
return scores
def eval(self, dataframe, input_key):
scores_list = []
self.logger.info(f"Evaluating {self.score_name}...")
for sample in tqdm(dataframe[input_key], desc="LexicalDiversityScore Evaluating..."):
scores = self._score_func(sample)
scores_list.append(scores)
self.logger.info("Evaluation complete!")
return scores_list
def run(self, storage: DataFlowStorage, input_key: str):
self.input_key = input_key
dataframe = storage.read("dataframe")
self.logger.info("LexicalDiversityScore ready to evaluate.")
scores = self.eval(dataframe, input_key)
# Flatten the nested dictionary of scores into the dataframe
for idx, score_dict in enumerate(scores):
for key, value in score_dict.items():
dataframe.at[idx, key] = value
storage.write(dataframe)
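# Minimal usage sketch: mtld() and hdd() operate on token lists of at least 50 words; the
# operator applies the same length checks per row and stores None for texts that are too short.
if __name__ == "__main__":
    import pandas as pd
    text = ("the quick brown fox jumps over the lazy dog and then runs across the wide "
            "green field while a small bird watches quietly from a tall old tree before "
            "flying away toward the distant hills under a bright clear morning sky as "
            "children play near the river and boats drift slowly past the quiet village")
    words = text.split()
    print("MTLD:", mtld(words), "HD-D:", hdd(words))
    evaluator = LexicalDiversitySampleEvaluator()
    print(evaluator.eval(pd.DataFrame({"text": [text]}), input_key="text")[0])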
import re
from tqdm import tqdm
import pandas as pd
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
@OPERATOR_REGISTRY.register()
class NgramSampleEvaluator(OperatorABC):
def __init__(self, ngrams=5):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.ngrams = ngrams
self.score_name = 'NgramScore'
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"计算文本中n-gram的重复比例,评估文本冗余度。通过比较唯一n-gram数量与总n-gram数量的比值来衡量文本原创性。\n\n"
"初始化参数:\n"
"- ngrams: n-gram的长度,默认为5\n\n"
"输出参数:\n"
"- NgramScore: n-gram重复比例得分(0到1之间,得分越高表示重复比例越低)"
)
else:
return (
"Evaluates text redundancy by calculating n-gram repetition ratio. Measures text originality by comparing the ratio of unique n-grams to total n-grams.\n\n"
"Initialization Parameters:\n"
"- ngrams: Length of n-grams, default is 5\n\n"
"Output Parameters:\n"
"- NgramScore: N-gram repetition ratio score (0-1, higher = less repetition)"
)
def _score_func(self, sample):
content = sample
content = content.lower()
content = re.sub(r'[^\w\s]', '', content)
words = content.split()
ngrams = [' '.join(words[i:i + self.ngrams]) for i in range(len(words) - (self.ngrams - 1))]
unique_ngrams = set(ngrams)
total_ngrams = len(ngrams)
unique_ngrams_count = len(unique_ngrams)
repetition_score = unique_ngrams_count / total_ngrams if total_ngrams > 0 else 0.0
return repetition_score
def eval(self, dataframe: pd.DataFrame, input_key: str):
self.logger.info(f"Evaluating {self.score_name}...")
scores = [self._score_func(sample) for sample in tqdm(dataframe[input_key], desc="NgramScorer Evaluating...")]
self.logger.info("Evaluation complete!")
return scores
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='NgramScore'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, input_key)
dataframe[self.output_key] = scores
storage.write(dataframe)
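# Minimal usage sketch (only pandas is needed beyond this file): a score near 1.0 means few
# repeated 5-grams, while heavily repeated text scores much lower.
if __name__ == "__main__":
    import pandas as pd
    df = pd.DataFrame({"text": [
        "the quick brown fox jumps over the lazy dog near the river bank",
        "spam spam spam spam spam spam spam spam spam spam spam spam",
    ]})
    evaluator = NgramSampleEvaluator(ngrams=5)
    print(evaluator.eval(df, input_key="text"))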
from googleapiclient import discovery
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
import pandas as pd
from dataflow.core import LLMServingABC
from dataflow.serving import PerspectiveAPIServing
@OPERATOR_REGISTRY.register()
class PerspectiveSampleEvaluator(OperatorABC):
"""Operator that assigns Perspective API toxicity scores to text inputs."""
def __init__(self, serving: PerspectiveAPIServing = None):
self.logger = get_logger()
self.logger.info(f"Initializing {self.__class__.__name__}...")
self.serving = serving
self.score_name = 'PerspectiveScore'
self.logger.info(f"{self.__class__.__name__} initialized.")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"使用Perspective API评估文本的毒性,返回毒性概率,得分越高表明文本毒性越高。\n"
"输入参数:\n"
"- serving:Perspective API服务对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出得分字段名,默认'PerspectiveScore'\n"
"输出参数:\n"
"- 包含毒性评估得分的DataFrame"
)
elif lang == "en":
return (
"Assess text toxicity using PerspectiveAPI; higher scores indicate more toxicity.\n"
"Input Parameters:\n"
"- serving: Perspective API serving object\n"
"- input_key: Field name for input text\n"
"- output_key: Field name for output score, default 'PerspectiveScore'\n"
"Output Parameters:\n"
"- DataFrame containing toxicity assessment scores"
)
else:
return "Assess text toxicity using PerspectiveAPI; higher scores indicate more toxicity."
def get_score(self, samples: list[dict], input_key: str) -> list[float]:
# Extract texts, truncate if needed
texts = []
max_bytes = 20480
for sample in samples:
text = sample.get(input_key, '') or ''
encoded = text.encode('utf-8')
if len(encoded) > max_bytes:
text = encoded[:max_bytes].decode('utf-8', errors='ignore')
texts.append(text)
# Delegate to serving
return self.serving.generate_from_input(texts)
def eval(self, dataframe: pd.DataFrame, input_key: str) -> list[float]:
self.logger.info(f"Evaluating {self.score_name}...")
samples = dataframe.to_dict(orient='records')
scores = self.get_score(samples, input_key)
self.logger.info("Evaluation complete!")
return scores
def run(self,
storage: DataFlowStorage,
input_key: str,
output_key: str = 'PerspectiveScore'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
dataframe[self.output_key] = self.eval(dataframe, self.input_key)
storage.write(dataframe)
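# Minimal usage sketch with an illustrative stand-in for PerspectiveAPIServing; a real run
# needs a PerspectiveAPIServing configured with a Google Perspective API key.
if __name__ == "__main__":
    import pandas as pd

    class _FakePerspectiveServing:
        def generate_from_input(self, texts):
            # Stand-in: pretend every text is mildly non-toxic.
            return [0.1 for _ in texts]

    df = pd.DataFrame({"text": ["have a nice day", "thanks for the help"]})
    op = PerspectiveSampleEvaluator(serving=_FakePerspectiveServing())
    print(op.eval(df, input_key="text"))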
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.nlp_engine import TransformersNlpEngine
from transformers import AutoModelForTokenClassification, AutoTokenizer
import warnings
from dataflow import get_logger
from dataflow.utils.storage import DataFlowStorage
# Presidio PII detection Scorer
@OPERATOR_REGISTRY.register()
class PresidioSampleEvaluator(OperatorABC):
def __init__(self, device='cuda', lang='en', model_cache_dir='./dataflow_cache'):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.language = lang
self.device = device
self.model_cache_dir = model_cache_dir
self.model_name = 'dslim/bert-base-NER'
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=self.model_cache_dir)
self.model = AutoModelForTokenClassification.from_pretrained(self.model_name, cache_dir=self.model_cache_dir).to(self.device)
warnings.filterwarnings("ignore", category=UserWarning, module="spacy_huggingface_pipelines")
model_config = [{
"lang_code": self.language,
"model_name": {
"spacy": "en_core_web_sm",
"transformers": self.model_name
}
}]
self.nlp_engine = TransformersNlpEngine(models=model_config)
self.analyzer = AnalyzerEngine(nlp_engine=self.nlp_engine)
self.score_name = 'PresidioScore'
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"使用Microsoft Presidio模型识别文本中的个人身份信息(PII),返回检测到的PII实体数量。支持多种实体类型如姓名、邮箱、电话号码等,"
"基于dslim/bert-base-NER模型实现。适用于评估文本的隐私安全风险。\n"
"输入参数:\n"
"- text: 待检测的文本字符串\n"
"- lang: 语言类型,默认为'en'\n"
"输出参数:\n"
"- int: 检测到的PII实体数量"
)
else:
return (
"Detect personally identifiable information (PII) in text using the Microsoft Presidio model and return the count of detected PII entities. "
"Supports various entity types such as names, emails, phone numbers, etc., implemented based on the dslim/bert-base-NER model. Suitable for assessing text privacy and security risks.\n"
"Input parameters:\n"
"- text: Text string to be detected\n"
"- lang: Language type, default 'en'\n"
"Output parameters:\n"
"- int: Count of detected PII entities"
)
def eval(self, dataframe, input_key):
input_texts = dataframe.get(input_key, '').to_list()
results = []
self.logger.info(f"Evaluating {self.score_name}...")
for text in input_texts:
analysis_results = self.analyzer.analyze(text=text, language=self.language)
pii_count = len(analysis_results)
results.append(pii_count)
self.logger.info("Evaluation complete!")
return results
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='PresidioScore'):
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, input_key)
dataframe[output_key] = scores
storage.write(dataframe)
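# Minimal usage sketch: constructing the evaluator downloads dslim/bert-base-NER and needs
# the spaCy en_core_web_sm model, so those dependencies are assumed to be available.
if __name__ == "__main__":
    import pandas as pd
    df = pd.DataFrame({"text": [
        "My name is John Smith and my email is john.smith@example.com",
        "No personal information here.",
    ]})
    evaluator = PresidioSampleEvaluator(device="cpu", lang="en")
    print(evaluator.eval(df, input_key="text"))  # PII entity counts per row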
# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import itertools
import math
import random
from abc import ABC, abstractmethod
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Optimizer
import numpy as np
from tqdm.auto import tqdm, trange
import logging
from torch.utils.data import DataLoader, Dataset
from .utils import AverageMeter, get_error, get_device
## LLM DIV
def set_seed(seed):
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
## LLM DIV
def get_loss(logits: torch.tensor, targets: torch.tensor, ignore_index=None) -> torch.tensor:
"""
Computes the cross-entropy loss for either sequence classification or generation.
"""
assert logits.dim() == 3 and ignore_index is not None
loss = nn.CrossEntropyLoss(ignore_index=ignore_index)
logits = logits[:,:-1,:]
logits = logits.transpose(1, 2) # batch_size, vocab_size (i.e. num_classes), sequence_length
targets = targets[:,1:]
return loss(logits, targets)
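# Minimal sanity-check sketch for get_loss(): logits at position t are scored against the
# token at position t+1 (next-token prediction) and ignore_index masks padding tokens.
if __name__ == "__main__":
    batch, seq_len, vocab, pad_id = 2, 8, 50257, 50256
    dummy_logits = torch.randn(batch, seq_len, vocab)
    dummy_targets = torch.randint(0, vocab, (batch, seq_len))
    print(get_loss(dummy_logits, dummy_targets, ignore_index=pad_id))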
class Embedding:
"""
task_embedding = diagonal of the FIM for the filters of size [F_total, 1] total filters for a network.
Notes:
- the diagonal of the Fisher Information Matrix for each layer.
- embedding size should be the size of the total number of filters for the network.
"""
def __init__(self, hessian, scale, meta=None):
self.hessian = np.array(hessian)
self.scale = np.array(scale)
self.meta = meta
def __repr__(self):
return f'{self.hessian}'
class ProbeNetwork(ABC, nn.Module):
"""Abstract class that all probe networks should inherit from.
This is a standard torch.nn.Module but needs to expose a classifier property that returns the final classification
module (e.g., the last fully connected layer).
"""
@property
@abstractmethod
def classifier(self):
raise NotImplementedError("Override the classifier property to return the submodules of the network that"
" should be interpreted as the classifier")
@classifier.setter
@abstractmethod
def classifier(self, val):
raise NotImplementedError("Override the classifier setter to set the submodules of the network that"
" should be interpreted as the classifier")
class Task2Vec:
def __init__(self, model: ProbeNetwork, skip_layers=0, max_samples=None, classifier_opts=None,
method='montecarlo', method_opts=None, loader_opts=None, bernoulli=False, mode='autoregressive'): ## LLM DIV
if classifier_opts is None:
classifier_opts = {}
if method_opts is None:
method_opts = {}
if loader_opts is None:
loader_opts = {}
assert method in ('variational', 'montecarlo')
assert skip_layers >= 0
self.model = model
# Fix batch norm running statistics (i.e., put batch_norm layers in eval mode)
self.model.train()
self.device = get_device(self.model)
self.skip_layers = skip_layers
self.max_samples = max_samples
self.classifier_opts = classifier_opts
self.method = method
self.method_opts = method_opts
self.loader_opts = loader_opts
self.bernoulli = bernoulli
self.mode = mode
if self.mode == "autoregressive":
self.loss_fn = get_loss
else:
self.loss_fn = nn.CrossEntropyLoss() if not self.bernoulli else nn.BCEWithLogitsLoss()
self.loss_fn = self.loss_fn.to(self.device)
def embed(self, dataset: Dataset, epochs: int = 5):
## LLM DIV
# Cache the last layer features (needed to train the classifier) and (if needed) the intermediate layer features
# so that we can skip the initial layers when computing the embedding
# dataset.train()
if self.mode == "autoregressive":
loss = None
print(f'{self.classifier_opts=}')
if self.classifier_opts: # is it something truthy? e.g., dict with something in it?
if self.classifier_opts.get('finetune', False): # finetune only if specified True, else no finetuning if not specified or False.
epochs = 0
print(f'Warning: classifier_opts does not specify finetune or break early, so no finetuning is being done. See: {self.classifier_opts=} {epochs=}')
loss = self._finetune_classifier(dataset, loader_opts=self.loader_opts, classifier_opts=self.classifier_opts, max_samples=self.max_samples, epochs=epochs)
else:
loss = self._finetune_classifier(dataset, loader_opts=self.loader_opts, classifier_opts=self.classifier_opts, max_samples=self.max_samples, epochs=epochs)
else: # self.classifier_opts might be None or {}
loss = self._finetune_classifier(dataset, loader_opts=self.loader_opts, classifier_opts=self.classifier_opts, max_samples=self.max_samples, epochs=epochs)
print(f'{loss=} (after fine tune, if not done it will be None)')
assert loss is not None, f'Err: {loss=}'
self.compute_fisher(dataset)
embedding = self.extract_embedding(self.model)
return embedding, loss
else:
if self.skip_layers > 0:
self._cache_features(dataset, indexes=(self.skip_layers, -1), loader_opts=self.loader_opts,
max_samples=self.max_samples)
else:
self._cache_features(dataset, max_samples=self.max_samples)
# Fits the last layer classifier using cached features
self._fit_classifier(**self.classifier_opts)
if self.skip_layers > 0:
dataset = torch.utils.data.TensorDataset(self.model.layers[self.skip_layers].input_features,
self.model.layers[-1].targets)
# dataset.eval() # I added this so that the embedding is computed on the val set
self.compute_fisher(dataset)
embedding = self.extract_embedding(self.model)
# dataset.train() # returns to using the support set
return embedding
### LLM DIV
def _finetune_classifier(self, dataset: Dataset, loader_opts: dict = None, classifier_opts: dict = None, max_samples=None, epochs = 5, learning_rate = 5e-5, adam_epsilon = 1e-8):
"""Fits the last layer of the HuggingFace transformer probe network."""
logging.info("Finetune classifier...")
if loader_opts is None:
loader_opts = {}
if classifier_opts is None:
classifier_opts = {}
data_loader = DataLoader(dataset, shuffle=False, batch_size=loader_opts.get('batch_size', 8),
num_workers=loader_opts.get('num_workers', 0), drop_last=False)
device = next(self.model.parameters()).device
print("MODEL DEVICE: ", device)
# num_examples = int(classifier_opts.get("task_batch_size", 256) / loader_opts.get('batch_size', 8))
num_examples = len(list(data_loader)) # not ideal but it's quicker in dev time, usually we won't feed the entire data set to task2vec so this should be fine
n_batches = num_examples
optimizer_grouped_parameters = [
{'params': [p for p in self.model.lm_head.parameters()],
'weight_decay': classifier_opts.get("weight_decay",0.0001)},
]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=classifier_opts.get("learning_rate",learning_rate), eps=classifier_opts.get("adam_epsilon",adam_epsilon))
# Train!
logging.info("***** Running training *****")
# logging.info(" Num examples = %d", num_examples)
logging.info(" Num Epochs = %d", epochs)
logging.info(" Batch size = %d", loader_opts.get('batch_size', 8))
train_iterator = trange(classifier_opts.get("epochs", epochs), desc="Epoch", leave=False)
set_seed(classifier_opts.get("seed", 42)) # Added here for reproducibility (even between python 2 and 3)
self.model.train()
for epoch in train_iterator:
metrics = AverageMeter()
epoch_iterator = tqdm(data_loader, desc="Iteration", total=n_batches, leave=False)
for step, batch in enumerate(epoch_iterator):
optimizer.zero_grad()
inputs = {'input_ids': batch['input_ids'].to(device),
'attention_mask': batch['attention_mask'].to(device)}
logits = self.model(**inputs, labels=inputs["input_ids"]).logits
loss = self.loss_fn(logits, inputs["input_ids"], ignore_index=50256)
print(f'\nInitial loss {loss.item()} ({step=} {epoch=})') if step == 0 else None
error = get_error(logits, inputs['input_ids'], ignore_index=50256)
loss.backward()
optimizer.step()
metrics.update(n=batch['input_ids'].shape[0], loss=loss.item(), error=error)
epoch_iterator.update(1)
if classifier_opts.get("break_early", False):
print("----> breaking early")
break
if classifier_opts.get("break_early", False):
break
logging.info(f"[epoch {epoch}]: " + "\t".join(f"{k}: {v}" for k, v in metrics.avg.items()))
print(f'\nfinal loss {step=} {epoch=} of final layer loss {loss.item()} (note we are not recomputing loss after a step so this loss printed is larger than it should be/one off)')
return loss.item()
### LLM DIV
def montecarlo_fisher_autoregressive(self, dataset: Dataset, epochs: int = 1):
logging.info("Using montecarlo Fisher")
if self.loader_opts is None:
loader_opts = {}
else:
loader_opts = self.loader_opts
data_loader = DataLoader(dataset, shuffle=False, batch_size=loader_opts.get('batch_size', 8),
num_workers=loader_opts.get('num_workers', 0), drop_last=False)
device = get_device(self.model)
# num_examples = int(classifier_opts.get("task_batch_size", 256) / loader_opts.get('batch_size', 8))
num_examples = len(list(data_loader)) # not ideal but it's quicker in dev time, usually we won't feed the entire data set to task2vec so this should be fine
n_batches = num_examples
logging.info("Computing Fisher...")
for p in self.model.parameters():
p.grad2_acc = torch.zeros_like(p.data)
p.grad_counter = 0
for k in range(epochs):
logging.info(f"\tepoch {k + 1}/{epochs}")
epoch_iterator = tqdm(data_loader, desc="Iteration", total=n_batches, leave=False)
for step, batch in enumerate(epoch_iterator):
inputs = {'input_ids': batch['input_ids'].to(device),
'attention_mask': batch['attention_mask'].to(device)}
logits = self.model(**inputs, labels=inputs["input_ids"]).logits
# The gradients used to compute the FIM need to be for y sampled from
# the model distribution y ~ p_w(y|x), not for y from the dataset
if self.bernoulli:
target = torch.bernoulli(F.sigmoid(logits[:,:-1,:])).detach()
else:
softmax_output = F.softmax(logits, dim=-1)
lst = [torch.multinomial(softmax_output[i,:,:], 1).detach().view(-1) for i in range(len(softmax_output))]
target = torch.stack(lst, dim=0)
loss = self.loss_fn(logits, target, ignore_index=50256)
self.model.zero_grad()
loss.backward()
for p in self.model.parameters():
if p.grad is not None:
p.grad2_acc += p.grad.data ** 2
p.grad_counter += 1
if self.classifier_opts.get("break_early", False):
break # for debugging faster, otherwise FIM is really slow
if self.classifier_opts.get("break_early", False):
break # for debugging faster, otherwise FIM is really slow
for p in self.model.parameters():
if p.grad_counter == 0:
del p.grad2_acc
else:
p.grad2_acc /= p.grad_counter
logging.info("done")
def montecarlo_fisher(self, dataset: Dataset, epochs: int = 1):
logging.info("Using montecarlo Fisher")
if self.skip_layers > 0:
dataset = torch.utils.data.TensorDataset(self.model.layers[self.skip_layers].input_features,
self.model.layers[-1].targets)
data_loader = _get_loader(dataset, **self.loader_opts)
device = get_device(self.model)
logging.info("Computing Fisher...")
for p in self.model.parameters():
p.grad2_acc = torch.zeros_like(p.data)
p.grad_counter = 0
for k in range(epochs):
logging.info(f"\tepoch {k + 1}/{epochs}")
for i, (data, target) in enumerate(tqdm(data_loader, leave=False, desc="Computing Fisher")):
data = data.to(device)
output = self.model(data, start_from=self.skip_layers)
# The gradients used to compute the FIM need to be for y sampled from
# the model distribution y ~ p_w(y|x), not for y from the dataset
if self.bernoulli:
target = torch.bernoulli(F.sigmoid(output)).detach()
else:
target = torch.multinomial(F.softmax(output, dim=-1), 1).detach().view(-1)
loss = self.loss_fn(output, target)
self.model.zero_grad()
loss.backward()
for p in self.model.parameters():
if p.grad is not None:
p.grad2_acc += p.grad.data ** 2
p.grad_counter += 1
for p in self.model.parameters():
if p.grad_counter == 0:
del p.grad2_acc
else:
p.grad2_acc /= p.grad_counter
logging.info("done")
def _run_epoch(self, data_loader: DataLoader, model: ProbeNetwork, loss_fn,
optimizer: Optimizer, epoch: int, train: bool = True,
add_compression_loss: bool = False, skip_layers=0, beta=1.0e-7):
metrics = AverageMeter()
device = get_device(model)
for i, (input, target) in enumerate(tqdm(data_loader, leave=False, desc="Computing Fisher")):
input = input.to(device)
target = target.to(device)
output = model(input, start_from=skip_layers)
loss = loss_fn(output, target)
lz = beta * variational.get_compression_loss(model) if add_compression_loss else torch.zeros_like(loss)
loss += lz
error = get_error(output, target)
metrics.update(n=input.size(0), loss=loss.item(), lz=lz.item(), error=error)
if train:
optimizer.zero_grad()
loss.backward()
optimizer.step()
# logging.info(
print(
"{}: [{epoch}] ".format('Epoch' if train else '', epoch=epoch) +
"Data/Batch: {:.3f}/{:.3f} ".format(metrics.avg["data_time"], metrics.avg["batch_time"]) +
"Loss {:.3f} Lz: {:.3f} ".format(metrics.avg["loss"], metrics.avg["lz"]) +
"Error: {:.2f}".format(metrics.avg["error"])
)
return metrics.avg
def variational_fisher(self, dataset: Dataset, epochs=1, beta=1e-7):
logging.info("Training variational fisher...")
parameters = []
for layer in self.model.layers[self.skip_layers:-1]:
if isinstance(layer, nn.Module): # Skip lambda functions
variational.make_variational(layer)
parameters += variational.get_variational_vars(layer)
bn_params = []
# Allows batchnorm parameters to change
for m in self.model.modules():
if isinstance(m, nn.BatchNorm2d):
bn_params += list(m.parameters())
# Avoids computing the gradients wrt to the weights to save time and memory
for p in self.model.parameters():
if p not in set(parameters) and p not in set(self.model.classifier.parameters()):
p.old_requires_grad = p.requires_grad
p.requires_grad = False
optimizer = torch.optim.Adam([
{'params': parameters},
{'params': bn_params, 'lr': 5e-4},
{'params': self.model.classifier.parameters(), 'lr': 5e-4}],
lr=1e-2, betas=(.9, 0.999))
if self.skip_layers > 0:
dataset = torch.utils.data.TensorDataset(self.model.layers[self.skip_layers].input_features,
self.model.layers[-1].targets)
train_loader = _get_loader(dataset, **self.loader_opts)
for epoch in range(epochs):
self._run_epoch(train_loader, self.model, self.loss_fn, optimizer, epoch, beta=beta,
add_compression_loss=True, train=True)
# Resets original value of requires_grad
for p in self.model.parameters():
if hasattr(p, 'old_requires_grad'):
p.requires_grad = p.old_requires_grad
del p.old_requires_grad
def compute_fisher(self, dataset: Dataset):
"""
Computes the Fisher Information of the weights of the model wrt the model output on the dataset and stores it.
The Fisher Information Matrix is defined as:
F = E_{x ~ dataset} E_{y ~ p_w(y|x)} [\nabla_w log p_w(y|x) \nabla_w log p_w(y|x)^t]
where p_w(y|x) is the output probability vector of the network and w are the weights of the network.
Notice that the label y is sampled from the model output distribution and not from the dataset.
This code only approximates the diagonal of F. The result is stored in the model layers and can be extracted
using the `get_fisher` method. Different approximation methods of the Fisher information matrix are available,
and can be selected in the __init__.
:param dataset: dataset with the task to compute the Fisher on
"""
if self.mode == 'autoregressive' and self.method == 'montecarlo':
fisher_fn = self.montecarlo_fisher_autoregressive
elif self.method == 'variational':
fisher_fn = self.variational_fisher
elif self.method == 'montecarlo':
fisher_fn = self.montecarlo_fisher
else:
raise ValueError(f"Invalid Fisher method {self.method}")
fisher_fn(dataset, **self.method_opts)
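    # Note added for clarity (not part of the original implementation): for the Monte Carlo method,
    # the quantity accumulated in `grad2_acc` approximates the diagonal of F defined above, i.e. roughly
    #     F_ii ≈ E_x E_{y ~ p_w(y|x)} [ (d log p_w(y|x) / d w_i)^2 ],
    # estimated by averaging the squared gradients of the log-likelihood of sampled labels over batches.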
def _cache_features(self, dataset: Dataset, indexes=(-1,), max_samples=None, loader_opts: dict = None):
logging.info("Caching features...")
if loader_opts is None:
loader_opts = {}
data_loader = DataLoader(dataset, shuffle=False, batch_size=loader_opts.get('batch_size', 64),
num_workers=loader_opts.get('num_workers', 0), drop_last=False)
device = next(self.model.parameters()).device
def _hook(layer, inputs):
if not hasattr(layer, 'input_features'):
layer.input_features = []
layer.input_features.append(inputs[0].data.cpu().clone())
hooks = [self.model.layers[index].register_forward_pre_hook(_hook)
for index in indexes]
if max_samples is not None:
n_batches = min(
math.floor(max_samples / data_loader.batch_size) - 1, len(data_loader))
else:
n_batches = len(data_loader)
targets = []
for i, (input, target) in tqdm(enumerate(itertools.islice(data_loader, 0, n_batches)), total=n_batches,
leave=False,
desc="Caching features"):
targets.append(target.clone())
self.model(input.to(device))
for hook in hooks:
hook.remove()
for index in indexes:
self.model.layers[index].input_features = torch.cat(self.model.layers[index].input_features)
self.model.layers[-1].targets = torch.cat(targets)
def _fit_classifier(self, optimizer='adam', learning_rate=0.0004, weight_decay=0.0001,
epochs=10):
"""Fits the last layer of the network using the cached features."""
logging.info("Fitting final classifier...")
if not hasattr(self.model.classifier, 'input_features'):
raise ValueError("You need to run `cache_features` on model before running `fit_classifier`")
targets = self.model.classifier.targets.to(self.device)
features = self.model.classifier.input_features.to(self.device)
dataset = torch.utils.data.TensorDataset(features, targets)
data_loader = _get_loader(dataset, **self.loader_opts)
if optimizer == 'adam':
optimizer = torch.optim.Adam(self.model.fc.parameters(), lr=learning_rate, weight_decay=weight_decay)
elif optimizer == 'sgd':
optimizer = torch.optim.SGD(self.model.fc.parameters(), lr=learning_rate, weight_decay=weight_decay)
else:
raise ValueError(f'Unsupported optimizer {optimizer}')
loss_fn = nn.CrossEntropyLoss()
for epoch in tqdm(range(epochs), desc="Fitting classifier", leave=False):
metrics = AverageMeter()
for data, target in data_loader:
optimizer.zero_grad()
                output = self.model.classifier(data)
                loss = loss_fn(output, target)
error = get_error(output, target)
loss.backward()
optimizer.step()
metrics.update(n=data.size(0), loss=loss.item(), error=error)
logging.info(f"[epoch {epoch}]: " + "\t".join(f"{k}: {v}" for k, v in metrics.avg.items()))
print(f'\nfinal loss after fitting final layer {loss=}')
def extract_embedding(self, model: ProbeNetwork):
"""
Reads the values stored by `compute_fisher` and returns them in a common format that describes the diagonal of the
Fisher Information Matrix for each layer.
:param model:
:return:
"""
if self.mode == 'autoregressive':
hess, scale = [], []
for name, module in model.named_modules():
if module is model.lm_head:
continue
                # For the autoregressive Monte Carlo estimate, grad2_acc directly holds the accumulated
                # squared gradients, i.e. the diagonal Fisher approximation of the hessian at the minimum
if hasattr(module, 'weight') and hasattr(module.weight, 'grad2_acc'):
grad2 = module.weight.grad2_acc.cpu().detach().numpy()
filterwise_hess = grad2.reshape(grad2.shape[0], -1).mean(axis=1)
hess.append(filterwise_hess)
scale.append(np.ones_like(filterwise_hess))
else:
hess, scale = [], []
for name, module in model.named_modules():
if module is model.classifier:
continue
# The variational Fisher approximation estimates the variance of noise that can be added to the weights
# without increasing the error more than a threshold. The inverse of this is proportional to an
# approximation of the hessian in the local minimum.
if hasattr(module, 'logvar0') and hasattr(module, 'loglambda2'):
logvar = module.logvar0.view(-1).detach().cpu().numpy()
hess.append(np.exp(-logvar))
loglambda2 = module.loglambda2.detach().cpu().numpy()
scale.append(np.exp(-loglambda2).repeat(logvar.size))
# The other Fisher approximation methods directly approximate the hessian at the minimum
elif hasattr(module, 'weight') and hasattr(module.weight, 'grad2_acc'):
grad2 = module.weight.grad2_acc.cpu().detach().numpy()
filterwise_hess = grad2.reshape(grad2.shape[0], -1).mean(axis=1)
hess.append(filterwise_hess)
scale.append(np.ones_like(filterwise_hess))
return Embedding(hessian=np.concatenate(hess), scale=np.concatenate(scale), meta=None)
def _get_loader(trainset, testset=None, batch_size=64, num_workers=0, num_samples=10000, drop_last=True):
if getattr(trainset, 'is_multi_label', False):
raise ValueError("Multi-label datasets not supported")
# TODO: Find a way to standardize this
if hasattr(trainset, 'labels'):
labels = trainset.labels
elif hasattr(trainset, 'targets'):
labels = trainset.targets
else:
labels = list(trainset.tensors[1].cpu().numpy())
num_classes = int(getattr(trainset, 'num_classes', max(labels) + 1))
class_count = np.eye(num_classes)[labels].sum(axis=0)
weights = 1. / class_count[labels] / num_classes
weights /= weights.sum()
sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, num_samples=num_samples)
    # No need for multi-threaded loading if everything is already in memory,
# and would raise an error if TensorDataset is on CUDA
num_workers = num_workers if not isinstance(trainset, torch.utils.data.TensorDataset) else 0
trainloader = torch.utils.data.DataLoader(trainset, sampler=sampler, batch_size=batch_size,
num_workers=num_workers, drop_last=drop_last)
if testset is None:
return trainloader
else:
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size, pin_memory=True, shuffle=False,
num_workers=num_workers)
return trainloader, testloader
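# --- Usage sketch (illustrative only, not part of the original file) --------------------------
# Assuming the methods above belong to a Task2Vec-style wrapper around a ProbeNetwork, a typical
# flow for embedding one task would look roughly like this (the wrapper name, the `get_model`
# helper, and the constructor arguments are assumptions, not definitions from this file):
#
#     probe = get_model('resnet34', pretrained=True, num_classes=task_num_classes)   # hypothetical helper
#     embedder = Task2Vec(probe, method='montecarlo', skip_layers=0)                 # hypothetical wrapper
#     embedder.compute_fisher(task_dataset)           # accumulates grad2_acc / variational parameters
#     embedding = embedder.extract_embedding(probe)   # Embedding(hessian=..., scale=..., meta=None)
#
# The resulting `embedding.hessian` is the concatenated per-filter diagonal Fisher, which the
# distance functions in the task-similarity module below consume.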
#!/usr/bin/env python3
# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import itertools
from typing import Tuple
import scipy.spatial.distance as distance
import numpy as np
import copy
import pickle
# import uutils
_DISTANCES = {}
# TODO: Remove methods that do not perform well
def _register_distance(distance_fn):
_DISTANCES[distance_fn.__name__] = distance_fn
return distance_fn
def is_excluded(k):
exclude = ['fc', 'linear']
return any([e in k for e in exclude])
def load_embedding(filename):
with open(filename, 'rb') as f:
e = pickle.load(f)
return e
def get_trivial_embedding_from(e):
trivial_embedding = copy.deepcopy(e)
for l in trivial_embedding['layers']:
a = np.array(l['filter_logvar'])
a[:] = l['filter_lambda2']
l['filter_logvar'] = list(a)
return trivial_embedding
def binary_entropy(p):
from scipy.special import xlogy
return - (xlogy(p, p) + xlogy(1. - p, 1. - p))
def get_layerwise_variance(e, normalized=False):
var = [np.exp(l['filter_logvar']) for l in e['layers']]
if normalized:
var = [v / np.linalg.norm(v) for v in var]
return var
def get_variance(e, normalized=False):
var = 1. / np.array(e.hessian)
if normalized:
lambda2 = 1. / np.array(e.scale)
var = var / lambda2
return var
def get_variances(*embeddings, normalized=False):
return [get_variance(e, normalized=normalized) for e in embeddings]
def get_hessian(e, normalized=False):
hess = np.array(e.hessian)
if normalized:
scale = np.array(e.scale)
hess = hess / scale
return hess
def get_hessians(*embeddings, normalized=False):
return [get_hessian(e, normalized=normalized) for e in embeddings]
def get_scaled_hessian(e0, e1):
h0, h1 = get_hessians(e0, e1, normalized=False)
return h0 / (h0 + h1 + 1e-8), h1 / (h0 + h1 + 1e-8)
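# Note (added for clarity): the two scaled hessians returned above sum to ~1 elementwise, so each
# entry can be read as the relative importance of a weight for task 0 versus task 1; this is what
# makes the binary-entropy based `entropy` distance below well defined.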
def get_full_kl(e0, e1):
var0, var1 = get_variance(e0), get_variance(e1)
kl0 = .5 * (var0 / var1 - 1 + np.log(var1) - np.log(var0))
kl1 = .5 * (var1 / var0 - 1 + np.log(var0) - np.log(var1))
return kl0, kl1
def layerwise_kl(e0, e1):
layers0, layers1 = get_layerwise_variance(e0), get_layerwise_variance(e1)
kl0 = []
for var0, var1 in zip(layers0, layers1):
kl0.append(np.sum(.5 * (var0 / var1 - 1 + np.log(var1) - np.log(var0))))
return kl0
def layerwise_cosine(e0, e1):
layers0, layers1 = get_layerwise_variance(e0, normalized=True), get_layerwise_variance(e1, normalized=True)
res = []
for var0, var1 in zip(layers0, layers1):
res.append(distance.cosine(var0, var1))
return res
@_register_distance
def kl(e0, e1):
var0, var1 = get_variance(e0), get_variance(e1)
kl0 = .5 * (var0 / var1 - 1 + np.log(var1) - np.log(var0))
kl1 = .5 * (var1 / var0 - 1 + np.log(var0) - np.log(var1))
return np.maximum(kl0, kl1).sum()
@_register_distance
def asymmetric_kl(e0, e1):
var0, var1 = get_variance(e0), get_variance(e1)
kl0 = .5 * (var0 / var1 - 1 + np.log(var1) - np.log(var0))
kl1 = .5 * (var1 / var0 - 1 + np.log(var0) - np.log(var1))
return kl0.sum()
@_register_distance
def jsd(e0, e1):
var0, var1 = get_variance(e0), get_variance(e1)
var = .5 * (var0 + var1)
kl0 = .5 * (var0 / var - 1 + np.log(var) - np.log(var0))
kl1 = .5 * (var1 / var - 1 + np.log(var) - np.log(var1))
return (.5 * (kl0 + kl1)).mean()
@_register_distance
def cosine(e0, e1):
h1, h2 = get_scaled_hessian(e0, e1)
return distance.cosine(h1, h2)
@_register_distance
def normalized_cosine(e0, e1):
h1, h2 = get_variances(e0, e1, normalized=True)
return distance.cosine(h1, h2)
@_register_distance
def correlation(e0, e1):
v1, v2 = get_variances(e0, e1, normalized=False)
return distance.correlation(v1, v2)
@_register_distance
def entropy(e0, e1):
h1, h2 = get_scaled_hessian(e0, e1)
return np.log(2) - binary_entropy(h1).mean()
def get_normalized_embeddings(embeddings, normalization=None):
F = [1. / get_variance(e, normalized=False) if e is not None else None for e in embeddings]
zero_embedding = np.zeros_like([x for x in F if x is not None][0])
F = np.array([x if x is not None else zero_embedding for x in F])
# FIXME: compute variance using only valid embeddings
if normalization is None:
normalization = np.sqrt((F ** 2).mean(axis=0, keepdims=True))
F /= normalization
return F, normalization
def pdist(embeddings, distance='cosine') -> np.ndarray:
distance_fn = _DISTANCES[distance]
n = len(embeddings)
distance_matrix = np.zeros([n, n])
if distance != 'asymmetric_kl':
for (i, e1), (j, e2) in itertools.combinations(enumerate(embeddings), 2):
distance_matrix[i, j] = distance_fn(e1, e2)
distance_matrix[j, i] = distance_matrix[i, j]
else:
for (i, e1) in enumerate(embeddings):
for (j, e2) in enumerate(embeddings):
distance_matrix[i, j] = distance_fn(e1, e2)
return distance_matrix
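# Minimal runnable sketch (added for illustration, not in the original file): builds two toy
# embeddings with `.hessian` / `.scale` attributes, as produced by `extract_embedding`, and
# computes their pairwise cosine distance matrix with `pdist`. The toy values are arbitrary.
def _example_pdist_usage():
    from types import SimpleNamespace
    e0 = SimpleNamespace(hessian=np.array([1.0, 2.0, 0.5]), scale=np.ones(3), meta=None)
    e1 = SimpleNamespace(hessian=np.array([0.9, 0.1, 1.5]), scale=np.ones(3), meta=None)
    dist = pdist([e0, e1], distance='cosine')  # 2x2 symmetric matrix with zeros on the diagonal
    return dist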
def cross_pdist(embeddings1, embeddings2, distance='cosine') -> np.ndarray :
"""
Compute pairwise distance between embeddings1 and embeddings2.
ref: https://chat.openai.com/share/a5ca38dc-3393-4cfd-971c-4a29b0c56b63
"""
distance_fn = _DISTANCES[distance]
n1 = len(embeddings1)
n2 = len(embeddings2)
distance_matrix = np.zeros([n1, n2])
    # Unlike pdist, the cross-distance matrix is not symmetric in general, so every (i, j) pair is
    # evaluated directly; the same loop also covers asymmetric distances such as asymmetric_kl.
    for i, e1 in enumerate(embeddings1):
        for j, e2 in enumerate(embeddings2):
            distance_matrix[i, j] = distance_fn(e1, e2)
return distance_matrix
def cdist(from_embeddings, to_embeddings, distance='cosine'):
distance_fn = _DISTANCES[distance]
distance_matrix = np.zeros([len(from_embeddings), len(to_embeddings)])
for (i, e1) in enumerate(from_embeddings):
for (j, e2) in enumerate(to_embeddings):
if e1 is None or e2 is None:
continue
distance_matrix[i, j] = distance_fn(e1, e2)
return distance_matrix
def plot_distance_matrix(embeddings, labels=None, distance='cosine', show_plot=True):
import seaborn as sns
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform
import pandas as pd
import matplotlib.pyplot as plt
distance_matrix = pdist(embeddings, distance=distance)
cond_distance_matrix = squareform(distance_matrix, checks=False)
linkage_matrix = linkage(cond_distance_matrix, method='complete', optimal_ordering=True)
if labels is not None:
distance_matrix = pd.DataFrame(distance_matrix, index=labels, columns=labels)
sns.clustermap(distance_matrix, row_linkage=linkage_matrix, col_linkage=linkage_matrix, cmap='viridis_r')
if show_plot:
plt.show()
## LLM DIV
def plot_distance_matrix_heatmap_only(embeddings, labels=None, distance='cosine', show_plot=True, title=None, save_file=None):
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
distance_matrix = pdist(embeddings, distance=distance)
if labels is not None:
distance_matrix = pd.DataFrame(distance_matrix, index=labels, columns=labels)
sns.heatmap(distance_matrix, cmap='viridis_r')
if title:
plt.title(title)
if save_file:
_ = plt.savefig("plots/" + save_file + ".png", bbox_inches='tight')
if show_plot:
plt.show()
## LLM DIV
def plot_distance_matrix_from_distance_matrix(distance_matrix, labels=None, show_plot=True, title=None, save_file=None, cluster=False, plot_multi=False):
import seaborn as sns
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform
import pandas as pd
import matplotlib.pyplot as plt
cond_distance_matrix = squareform(distance_matrix, checks=False)
linkage_matrix = linkage(cond_distance_matrix, method='complete', optimal_ordering=True)
if labels is not None:
distance_matrix = pd.DataFrame(distance_matrix, index=labels, columns=labels)
# plot multiple subplots in one figure
# distance_matrix passed in is a list of distance_matrix (np.arrays)
if plot_multi and not cluster:
num_rows, num_cols = 3, 2
f, ax = plt.subplots(num_rows, num_cols)#, figsize=(12, 15))
i = 0
        for row_ind in range(num_rows):
            for col_ind in range(num_cols):
                if i >= len(distance_matrix):
                    break
                sns.heatmap(distance_matrix[i], cmap='viridis_r', ax=ax[row_ind, col_ind])
                i += 1
else:
if cluster:
sns.clustermap(distance_matrix, row_linkage=linkage_matrix, col_linkage=linkage_matrix, cmap='viridis_r')
else:
sns.heatmap(distance_matrix, cmap='viridis_r')
if title:
plt.title(title)
if save_file:
_ = plt.savefig("plots/" + save_file + ".png", bbox_inches='tight')
if show_plot:
plt.show()
## LLM DIV
# plot multiple subplots in one figure
# distance_matrix passed in is a list of distance_matrix np.arrays
def plot_multi_distance_matrix_from_distance_matrix_list(distance_matrix_lst, title_lst, labels, main_title=None, show_plot=True, title=None, save_file=None, vmin=None, vmax=None):
import seaborn as sns
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform
import pandas as pd
import matplotlib.pyplot as plt
import math
num_rows, num_cols = math.ceil(len(distance_matrix_lst)/2), 2
    figsize = (12, 10)
f, ax = plt.subplots(num_rows, num_cols, figsize=figsize)
i = 0
for row_ind in range(num_rows):
for col_ind in range(num_cols):
if i >= len(distance_matrix_lst):
break
distance_matrix = distance_matrix_lst[i]
distance_matrix = pd.DataFrame(distance_matrix, index=labels[i], columns=labels[i])
if len(distance_matrix_lst) > 2:
ax[row_ind, col_ind].set_aspect('equal')
if vmin is not None and vmax is not None:
sns.heatmap(distance_matrix, cmap='viridis_r', ax=ax[row_ind, col_ind], vmin=vmin, vmax=vmax)
else:
sns.heatmap(distance_matrix, cmap='viridis_r', ax=ax[row_ind, col_ind])
ax[row_ind, col_ind].set_title(title_lst[i])
else:
ax[col_ind].set_aspect('equal')
sns.heatmap(distance_matrix, cmap='viridis_r', ax=ax[col_ind])
ax[col_ind].set_title(title_lst[i])
i += 1
if len(distance_matrix_lst) % 2 == 1:
f.delaxes(ax[num_rows-1,1])
if main_title:
f.suptitle(main_title)
f.subplots_adjust(top=0.5)
if len(distance_matrix_lst) % 2 == 1:
plt.tight_layout(h_pad=2)
else:
plt.tight_layout(h_pad=2, w_pad=5)
if save_file:
_ = plt.savefig("plots/" + save_file + ".png", bbox_inches='tight')
if show_plot:
plt.show()
## LLM DIV
def stats_of_distance_matrix(distance_matrix: np.ndarray,
remove_diagonal: bool = True,
variance_type: str = 'std', # TODO: was ci_0.95. Changed to rid uutils call
get_total: bool = False,
) -> Tuple[float, float]:
if remove_diagonal:
# - remove diagonal: ref https://stackoverflow.com/questions/46736258/deleting-diagonal-elements-of-a-numpy-array
triu: np.ndarray = np.triu(distance_matrix)
tril: np.ndarray = np.tril(distance_matrix)
# distance_matrix = distance_matrix[~np.eye(distance_matrix.shape[0], dtype=bool)].reshape(distance_matrix.shape[0], -1)
# remove diagonal and dummy zeros where the other triangular matrix was artificially placed.
distance_matrix = triu[triu != 0.0]
# - flatten
distance_matrix: np.ndarray = distance_matrix.flatten()
# - compute stats of distance matrix
if variance_type == 'std':
mu, var = distance_matrix.mean(), distance_matrix.std()
# elif variance_type == 'ci_0.95':
# from uutils.torch_uu.metrics.confidence_intervals import mean_confidence_interval
# mu, var = mean_confidence_interval(distance_matrix, confidence=0.95)
else:
raise ValueError(f'Invalid variance type, got: {variance_type=}')
    # - double check that the mean was computed correctly: since the matrix is symmetric, the mean after removing the diagonal should equal the mean of either triangle
if remove_diagonal:
# from uutils.torch_uu import approx_equal
# assert approx_equal(triu.sum(), tril.sum(), tolerance=1e-4), f'Distance matrix is not symmetric, are you sure this is correct?'
# assert approx_equal(distance_matrix.mean(), triu[triu != 0.0].mean(), tolerance=1e-4), f'Mean should be equal to triangular matrix'
# assert approx_equal(mu, triu[triu != 0.0].mean(), tolerance=1e-4)
print('Lower tri sum', tril.sum(), ' / Upper tri sum', triu.sum(), '| These should be approx equal!!')
print('Total mean', distance_matrix.mean(), ' / Upper mean', triu[triu != 0.0].mean(), ' / Lower mean', tril[tril != 0.0].mean(), '| These should all be approx equal!!')
print('mu (div coefficient)', mu, ' / Upper mean', triu[triu != 0.0].mean(), '| These should all be approx equal!!')
if get_total:
total = distance_matrix.sum()
return mu, var, total
else:
return mu, var
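# Illustrative sketch (added, not in the original file): the "div coefficient" printed above is just
# the mean off-diagonal distance, so for a small symmetric toy matrix the helper reduces to the
# mean/std of the upper triangle.
def _example_stats_of_distance_matrix():
    toy = np.array([[0.0, 0.2, 0.4],
                    [0.2, 0.0, 0.6],
                    [0.4, 0.6, 0.0]])
    mu, std = stats_of_distance_matrix(toy, remove_diagonal=True, variance_type='std')
    # mu == mean of {0.2, 0.4, 0.6} == 0.4
    return mu, std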
def stats_cross_distance_matrix(distance_matrix: np.ndarray,
remove_diagonal: bool = False,
variance_type: str = 'std', # TODO: was ci_0.95. Changed to rid uutils call
get_total: bool = False,
) -> Tuple[float, float]:
return stats_of_distance_matrix(distance_matrix, remove_diagonal=remove_diagonal, variance_type=variance_type, get_total=get_total)
def plot_histogram_of_distances(distance_matrix: np.ndarray, title, show_plot=True, save_file=None, bins_width=None, grid=True):
import matplotlib.pyplot as plt
triu = np.triu(distance_matrix)
triu = triu[triu != 0.0]
distance_values = triu.flatten()
if grid:
plt.grid(zorder=0)
plt.axvline(np.mean(distance_values), color='k', linestyle='dashed', linewidth=1, zorder=4)
if bins_width is not None:
plt.hist(distance_values, edgecolor ="black", bins=np.arange(min(distance_values), max(distance_values) + bins_width, bins_width), zorder=3)
else:
plt.hist(distance_values, edgecolor ="black", zorder=3)
plt.title(title)
plt.xlabel("Cosine Distance between Task Pairs")
plt.ylabel("Frequency")
plt.tight_layout()
if save_file:
_ = plt.savefig("plots/" + save_file + ".png", bbox_inches='tight')
if show_plot:
plt.show()
## LLM DIV
# plot multiple subplots in one figure
# distance_matrix passed in is a list of distance_matrix (np.arrays)
def plot_multi_histogram_of_distances(distance_matrix_lst, title_lst, main_title=None, show_plot=True, save_file=None,
xlabel="Cosine Distance between Task Pairs", grid=True, bins_width=None,
num_cols=2, figsize=(12,10)):
import seaborn as sns
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform
import pandas as pd
import matplotlib.pyplot as plt
import math
if num_cols == 2:
num_rows = math.ceil(len(distance_matrix_lst)/2)
else:
num_rows = math.ceil(len(distance_matrix_lst)/num_cols)
f, ax = plt.subplots(num_rows, num_cols, figsize=figsize)
i = 0
for row_ind in range(num_rows):
for col_ind in range(num_cols):
if i >= len(distance_matrix_lst):
break
triu = np.triu(distance_matrix_lst[i])
triu = triu[triu != 0.0]
distance_values = triu.flatten()
if len(distance_matrix_lst) > 2:
if grid:
ax[row_ind, col_ind].grid(zorder=0)
if bins_width is not None:
ax[row_ind, col_ind].hist(distance_values, edgecolor ="black", zorder=3, bins=np.arange(min(distance_values), max(distance_values) + bins_width, bins_width))
else:
ax[row_ind, col_ind].hist(distance_values, edgecolor ="black", zorder=3)
ax[row_ind, col_ind].set_xlabel(xlabel)
ax[row_ind, col_ind].set_ylabel("Frequency")
ax[row_ind, col_ind].axvline(np.mean(distance_values), color='k', linestyle='dashed', linewidth=1, zorder=4)
ax[row_ind, col_ind].set_title(title_lst[i])
            else:
                if grid:
                    ax[col_ind].grid(zorder=0)
                if bins_width is not None:
                    ax[col_ind].hist(distance_values, edgecolor="black", zorder=3, bins=np.arange(min(distance_values), max(distance_values) + bins_width, bins_width))
                else:
                    ax[col_ind].hist(distance_values, edgecolor="black", zorder=3)
ax[col_ind].set_xlabel(xlabel)
ax[col_ind].set_ylabel("Frequency")
ax[col_ind].set_title(title_lst[i])
i += 1
if len(distance_matrix_lst) % 2 == 1 and num_cols == 2:
f.delaxes(ax[num_rows-1,1])
if main_title:
f.suptitle(main_title)
f.subplots_adjust(top=1)
plt.grid(True)
plt.tight_layout()
if save_file:
_ = plt.savefig("plots/" + save_file + ".png", bbox_inches='tight')
if show_plot:
plt.show()
# Copyright 2017-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from collections import defaultdict
import torch
import numpy as np
class AverageMeter(object):
"""Computes and stores the average and current value"""
def __init__(self):
self.reset()
def reset(self):
self.val = defaultdict(int)
self.avg = defaultdict(float)
self.sum = defaultdict(int)
self.count = defaultdict(int)
def update(self, n=1, **val):
for k in val:
self.val[k] = val[k]
self.sum[k] += val[k] * n
self.count[k] += n
self.avg[k] = self.sum[k] / self.count[k]
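# Small usage sketch (added for illustration): AverageMeter keeps running averages keyed by name,
# weighted by the batch size passed as `n`.
def _example_average_meter():
    meter = AverageMeter()
    meter.update(n=4, loss=1.0, error=50.0)   # batch of 4
    meter.update(n=2, loss=0.4, error=25.0)   # batch of 2
    # meter.avg["loss"] == (1.0 * 4 + 0.4 * 2) / 6 ≈ 0.8
    return dict(meter.avg)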
def set_batchnorm_mode(model, train=True):
"""Allows to set batch_norm layer mode to train or eval, independendtly on the mode of the model."""
def _set_batchnorm_mode(module):
if isinstance(module, torch.nn.BatchNorm1d) or isinstance(module, torch.nn.BatchNorm2d):
if train:
module.train()
else:
module.eval()
model.apply(_set_batchnorm_mode)
### LLM DIV
def get_error(output, target, mode='autoregressive', ignore_index=None):
if mode == 'autoregressive': # output = logits here
assert ignore_index is not None
output = output[:,:-1,:]
logits_inds = torch.argmax(output, dim=-1)
target = target[:,1:]
if ignore_index is not None:
acc = torch.eq(logits_inds, target.unsqueeze(0))[:, target != ignore_index]
else:
acc = torch.eq(logits_inds, target.unsqueeze(0))
acc = acc.float().mean()
return 1 - acc
else:
pred = output.argmax(dim=1)
correct = pred.eq(target).float().sum()
return float((1. - correct / output.size(0)) * 100.)
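# Runnable sketch (added for illustration) of the classification branch of `get_error`:
# a 2-sample batch with one wrong prediction yields an error of 50.0 (percent).
def _example_get_error_classification():
    output = torch.tensor([[2.0, 0.1],
                           [0.2, 1.5]])      # argmax per row: [0, 1]
    target = torch.tensor([0, 0])            # second prediction is wrong
    return get_error(output, target, mode='classification')  # == 50.0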
def adjust_learning_rate(optimizer, epoch, optimizer_cfg):
lr = optimizer_cfg.lr * (0.1 ** np.less(optimizer_cfg.schedule, epoch).sum())
for param_group in optimizer.param_groups:
param_group['lr'] = lr
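# Worked example (added for clarity): with optimizer_cfg.lr == 0.1 and optimizer_cfg.schedule == [30, 60],
# epoch 45 gives np.less([30, 60], 45).sum() == 1, so the learning rate becomes 0.1 * 0.1**1 == 0.01.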
def get_device(model: torch.nn.Module):
return next(model.parameters()).device