Commit 97e8278b authored by zzg_666's avatar zzg_666

Adapt the vllm backend

from dataflow.operators.general_text.eval.task2vec.task2vec import Task2Vec
from dataflow.operators.general_text.eval.task2vec import task_similarity
import torch
import random
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from torch.utils.data import Dataset
from dataflow import get_logger
from typing import Optional
# Task2Vec dataset diversity evaluation
# Cited from: Beyond Scale: the Diversity Coefficient as a Data Quality Metric Demonstrates LLMs are Pre-trained on Formally Diverse Data
@OPERATOR_REGISTRY.register()
class Task2VecDatasetEvaluator(OperatorABC):
def __init__(self, device='cuda', sample_nums=10, sample_size=1, method: Optional[str]='montecarlo', model_cache_dir='./dataflow_cache'):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
# evaluate diversity by drawing sample_nums batches of sample_size samples each
self.sample_nums = sample_nums
self.sample_size = sample_size
self.device = device
self.model_cache_dir = model_cache_dir
self.score_name = 'Task2VecScore'
self.method = method
if method not in ['montecarlo', 'variational']:
raise ValueError(f"Invalid method '{method}'. Valid options are 'montecarlo' and 'variational'.")
self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=self.model_cache_dir)
self.probe_network = GPT2LMHeadModel.from_pretrained('gpt2', cache_dir=self.model_cache_dir)
self.device = torch.device(self.device if self.device and torch.cuda.is_available() else "cpu")
self.probe_network = self.probe_network.to(self.device)
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"使用Task2Vec方法评估数据集的多样性,通过计算样本嵌入的余弦距离矩阵来量化多样性。\n"
"输入参数:\n"
"- device:计算设备,默认为'cuda'\n"
"- sample_nums:采样次数,默认为10\n"
"- sample_size:每次采样样本数,默认为1\n"
"- method:嵌入方法,可选'montecarlo'或'variational',默认为'montecarlo'\n"
"- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n"
"- input_key:输入文本字段名\n"
"输出参数:\n"
"- Task2VecDiversityScore:多样性得分\n"
"- ConfidenceInterval:置信区间"
)
elif lang == "en":
return (
"Evaluate dataset diversity using Task2Vec by calculating cosine distance matrix of sample embeddings.\n"
"Input Parameters:\n"
"- device: Computing device, default 'cuda'\n"
"- sample_nums: Number of sampling iterations, default 10\n"
"- sample_size: Number of samples per iteration, default 1\n"
"- method: Embedding method, 'montecarlo' or 'variational', default 'montecarlo'\n"
"- model_cache_dir: Model cache directory, default './dataflow_cache'\n"
"- input_key: Field name for input text\n"
"Output Parameters:\n"
"- Task2VecDiversityScore: Diversity score\n"
"- ConfidenceInterval: Confidence interval"
)
else:
return "Evaluate dataset diversity using Task2Vec method."
def preprocess(self, texts):
self.tokenizer.pad_token = self.tokenizer.eos_token
tokenized_outputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
return {key: value.to(self.device) for key, value in tokenized_outputs.items()}
def get_score(self, sentences):
embeddings = []
data_length = len(sentences)
for sample_num in range(self.sample_nums):
self.logger.info(f'--> Sample {sample_num + 1}/{self.sample_nums}')
indices = random.sample(range(data_length), self.sample_size)
texts = [sentences[i] for i in indices]
tokenized_batch = self.preprocess(texts)
tokenized_dataset = CustomTensorDataset(tokenized_batch)
embedding, _ = Task2Vec(self.probe_network, method=self.method).embed(tokenized_dataset)
embeddings.append(embedding)
distance_matrix = task_similarity.pdist(embeddings, distance='cosine')
div_coeff, conf_interval = task_similarity.stats_of_distance_matrix(distance_matrix)
return {
"Task2VecDiversityScore": div_coeff,
"ConfidenceInterval": conf_interval
}
def run(self, storage: DataFlowStorage, input_key: str):
dataframe = storage.read("dataframe")
samples = dataframe[input_key].to_list()
self.logger.info(f"Evaluating {self.score_name}...")
task2vec_score = self.get_score(samples)
self.logger.info("Evaluation complete!")
self.logger.info(f"Task2Vec Diversity Score: {task2vec_score}")
return task2vec_score
class CustomTensorDataset(Dataset):
def __init__(self, tokenized_batch):
self.tokenized_batch = tokenized_batch
def __getitem__(self, index):
return {key: self.tokenized_batch[key][index] for key in self.tokenized_batch}
def __len__(self):
return len(next(iter(self.tokenized_batch.values())))
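# Usage sketch (illustrative; the sentences and parameter values below are made up):
# get_score works directly on an in-memory list of texts, without going through DataFlowStorage.
#
#   evaluator = Task2VecDatasetEvaluator(device='cuda', sample_nums=5, sample_size=2)
#   result = evaluator.get_score(["first example text", "second example text", "third example text"])
#   print(result["Task2VecDiversityScore"], result["ConfidenceInterval"])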
from vendi_score import text_utils
from dataflow.utils.storage import DataFlowStorage
import pandas as pd
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
# VendiScore dataset diversity evaluation
# Cited from: The Vendi Score: A Diversity Evaluation Metric for Machine Learning
@OPERATOR_REGISTRY.register()
class VendiDatasetEvaluator(OperatorABC):
def __init__(self, device='cuda'):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.bert_model_path = 'bert-base-uncased'
self.simcse_model_path = 'princeton-nlp/unsup-simcse-bert-base-uncased'
self.device = device
self.score_name = 'VendiScore'
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"通过计算VendiScore来评估数据集的多样性,使用BERT和SimCSE模型生成嵌入并计算分数。\n"
"输入参数:\n"
"- device:计算设备,默认为'cuda'\n"
"- input_key:输入文本字段名\n"
"输出参数:\n"
"- BERTVendiScore:基于BERT的多样性得分\n"
"- SimCSEVendiScore:基于SimCSE的多样性得分"
)
elif lang == "en":
return (
"Assess dataset diversity using VendiScore with embeddings from BERT and SimCSE models.\n"
"Input Parameters:\n"
"- device: Computing device, default 'cuda'\n"
"- input_key: Field name for input text\n"
"Output Parameters:\n"
"- BERTVendiScore: Diversity score based on BERT\n"
"- SimCSEVendiScore: Diversity score based on SimCSE"
)
else:
return "Assess dataset diversity using VendiScore."
def get_score(self, sentences):
result = {}
bert_vs = text_utils.embedding_vendi_score(sentences, model_path=self.bert_model_path, device=self.device)
result["BERTVendiScore"] = round(bert_vs, 2)
simcse_vs = text_utils.embedding_vendi_score(sentences, model_path=self.simcse_model_path, device=self.device)
result["SimCSEVendiScore"] = round(simcse_vs, 2)
return result
def run(self, storage: DataFlowStorage, input_key: str):
dataframe = storage.read("dataframe")
samples = dataframe[input_key].to_list()
self.logger.info(f"Evaluating {self.score_name}...")
vendiscore = self.get_score(samples)
self.logger.info("Evaluation complete!")
self.logger.info(f"VendiScore: {vendiscore}")
return vendiscore
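# Usage sketch (illustrative; the sentences are made up): get_score takes a plain list
# of strings and returns both the BERT-based and SimCSE-based Vendi scores.
#
#   evaluator = VendiDatasetEvaluator(device='cuda')
#   print(evaluator.get_score(["a cat sat on the mat", "gradient descent converges slowly"]))
#   # -> {"BERTVendiScore": ..., "SimCSEVendiScore": ...}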
# import sys
# from dataflow.utils.registry import LazyLoader
# cur_path = "dataflow/operators/filter/"
# _import_structure = {
# # Primary filters
# "NgramFilter": (cur_path + "ngram_filter.py", "NgramFilter"),
# "LanguageFilter": (cur_path + "language_filter.py", "LanguageFilter"),
# "DeitaQualityFilter": (cur_path + "deita_quality_filter.py", "DeitaQualityFilter"),
# "DeitaComplexityFilter": (cur_path + "deita_complexity_filter.py", "DeitaComplexityFilter"),
# "InstagFilter": (cur_path + "instag_filter.py", "InstagFilter"),
# "PairQualFilter": (cur_path + "pair_qual_filter.py", "PairQualFilter"),
# "QuratingFilter": (cur_path + "qurating_filter.py", "QuratingFilter"),
# "SuperfilteringFilter": (cur_path + "superfiltering_filter.py", "SuperfilteringFilter"),
# "FineWebEduFilter": (cur_path + "fineweb_edu_filter.py", "FineWebEduFilter"),
# "TextbookFilter": (cur_path + "text_book_filter.py", "TextbookFilter"),
# "AlpagasusFilter": (cur_path + "alpagasus_filter.py", "AlpagasusFilter"),
# "DebertaV3Filter": (cur_path + "debertav3_filter.py", "DebertaV3Filter"),
# "LangkitFilter": (cur_path + "langkit_filter.py", "LangkitFilter"),
# "LexicalDiversityFilter": (cur_path + "lexical_diversity_filter.py", "LexicalDiversityFilter"),
# "PerplexityFilter": (cur_path + "perplexity_filter.py", "PerplexityFilter"),
# "PerspectiveFilter": (cur_path + "perspective_filter.py", "PerspectiveFilter"),
# "PresidioFilter": (cur_path + "presidio_filter.py", "PresidioFilter"),
# "RMFilter": (cur_path + "reward_model_filter.py", "RMFilter"),
# "TreeinstructFilter": (cur_path + "treeinstruct_filter.py", "TreeinstructFilter"),
# # Heuristic filters
# "ColonEndFilter": (cur_path + "heuristics.py", "ColonEndFilter"),
# "WordNumberFilter": (cur_path + "heuristics.py", "WordNumberFilter"),
# "BlocklistFilter": (cur_path + "heuristics.py", "BlocklistFilter"),
# "SentenceNumberFilter": (cur_path + "heuristics.py", "SentenceNumberFilter"),
# "LineEndWithEllipsisFilter": (cur_path + "heuristics.py", "LineEndWithEllipsisFilter"),
# "ContentNullFilter": (cur_path + "heuristics.py", "ContentNullFilter"),
# "MeanWordLengthFilter": (cur_path + "heuristics.py", "MeanWordLengthFilter"),
# "SymbolWordRatioFilter": (cur_path + "heuristics.py", "SymbolWordRatioFilter"),
# "HtmlEntityFilter": (cur_path + "heuristics.py", "HtmlEntityFilter"),
# "IDCardFilter": (cur_path + "heuristics.py", "IDCardFilter"),
# "NoPuncFilter": (cur_path + "heuristics.py", "NoPuncFilter"),
# "SpecialCharacterFilter": (cur_path + "heuristics.py", "SpecialCharacterFilter"),
# "WatermarkFilter": (cur_path + "heuristics.py", "WatermarkFilter"),
# "StopWordFilter": (cur_path + "heuristics.py", "StopWordFilter"),
# "CurlyBracketFilter": (cur_path + "heuristics.py", "CurlyBracketFilter"),
# "CapitalWordsFilter": (cur_path + "heuristics.py", "CapitalWordsFilter"),
# "LoremIpsumFilter": (cur_path + "heuristics.py", "LoremIpsumFilter"),
# "UniqueWordsFilter": (cur_path + "heuristics.py", "UniqueWordsFilter"),
# "CharNumberFilter": (cur_path + "heuristics.py", "CharNumberFilter"),
# "LineStartWithBulletpointFilter": (cur_path + "heuristics.py", "LineStartWithBulletpointFilter"),
# "LineWithJavascriptFilter": (cur_path + "heuristics.py", "LineWithJavascriptFilter"),
# # Deduplicators
# "MinHashDeduplicator": (cur_path + "minhash_deduplicator.py", "MinHashDeduplicator"),
# "CCNetDeduplicator": (cur_path + "ccnet_deduplicator.py", "CCNetDeduplicator"),
# "HashDeduplicator": (cur_path + "hash_deduplicator.py", "HashDeduplicator"),
# "NgramHashDeduplicator": (cur_path + "ngramhash_deduplicator.py", "NgramHashDeduplicator"),
# "SemDeduplicator": (cur_path + "sem_deduplicator.py", "SemDeduplicator"),
# "SimHashDeduplicator": (cur_path + "simhash_deduplicator.py", "SimHashDeduplicator"),
# }
# sys.modules[__name__] = LazyLoader(__name__, cur_path, _import_structure)
2g1c
2 girls 1 cup
acrotomophilia
alabama hot pocket
alaskan pipeline
anal
anilingus
anus
apeshit
arsehole
ass
asshole
assmunch
auto erotic
autoerotic
babeland
baby batter
baby juice
ball gag
ball gravy
ball kicking
ball licking
ball sack
ball sucking
bangbros
bangbus
bareback
barely legal
barenaked
bastard
bastardo
bastinado
bbw
bdsm
beaner
beaners
beaver cleaver
beaver lips
beastiality
bestiality
big black
big breasts
big knockers
big tits
bimbos
birdlock
bitch
bitches
black cock
blonde action
blonde on blonde action
blowjob
blow job
blow your load
blue waffle
blumpkin
bollocks
bondage
boner
boob
boobs
booty call
brown showers
brunette action
bukkake
bulldyke
bullet vibe
bullshit
bung hole
bunghole
busty
butt
buttcheeks
butthole
camel toe
camgirl
camslut
camwhore
carpet muncher
carpetmuncher
chocolate rosebuds
cialis
circlejerk
cleveland steamer
clit
clitoris
clover clamps
clusterfuck
cock
cocks
coprolagnia
coprophilia
cornhole
coon
coons
creampie
cum
cumming
cumshot
cumshots
cunnilingus
cunt
darkie
date rape
daterape
deep throat
deepthroat
dendrophilia
dick
dildo
dingleberry
dingleberries
dirty pillows
dirty sanchez
doggie style
doggiestyle
doggy style
doggystyle
dog style
dolcett
domination
dominatrix
dommes
donkey punch
double dong
double penetration
dp action
dry hump
dvda
eat my ass
ecchi
ejaculation
erotic
erotism
escort
eunuch
fag
faggot
fecal
felch
fellatio
feltch
female squirting
femdom
figging
fingerbang
fingering
fisting
foot fetish
footjob
frotting
fuck
fuck buttons
fuckin
fucking
fucktards
fudge packer
fudgepacker
futanari
gangbang
gang bang
gay sex
genitals
giant cock
girl on
girl on top
girls gone wild
goatcx
goatse
god damn
gokkun
golden shower
goodpoop
goo girl
goregasm
grope
group sex
g-spot
guro
hand job
handjob
hard core
hardcore
hentai
homoerotic
honkey
hooker
horny
hot carl
hot chick
how to kill
how to murder
huge fat
humping
incest
intercourse
jack off
jail bait
jailbait
jelly donut
jerk off
jigaboo
jiggaboo
jiggerboo
jizz
juggs
kike
kinbaku
kinkster
kinky
knobbing
leather restraint
leather straight jacket
lemon party
livesex
lolita
lovemaking
make me come
male squirting
masturbate
masturbating
masturbation
menage a trois
milf
missionary position
mong
motherfucker
mound of venus
mr hands
muff diver
muffdiving
nambla
nawashi
negro
neonazi
nigga
nigger
nig nog
nimphomania
nipple
nipples
nsfw
nsfw images
nude
nudity
nutten
nympho
nymphomania
octopussy
omorashi
one cup two girls
one guy one jar
orgasm
orgy
paedophile
paki
panties
panty
pedobear
pedophile
pegging
penis
phone sex
piece of shit
pikey
pissing
piss pig
pisspig
playboy
pleasure chest
pole smoker
ponyplay
poof
poon
poontang
punany
poop chute
poopchute
porn
porno
pornography
prince albert piercing
pthc
pubes
pussy
queaf
queef
quim
raghead
raging boner
rape
raping
rapist
rectum
reverse cowgirl
rimjob
rimming
rosy palm
rosy palm and her 5 sisters
rusty trombone
sadism
santorum
scat
schlong
scissoring
semen
sex
sexcam
sexo
sexy
sexual
sexually
sexuality
shaved beaver
shaved pussy
shemale
shibari
shit
shitblimp
shitty
shota
shrimping
skeet
slanteye
slut
s&m
smut
snatch
snowballing
sodomize
sodomy
spastic
spic
splooge
splooge moose
spooge
spread legs
spunk
strap on
strapon
strappado
strip club
style doggy
suck
sucks
suicide girls
sultry women
swastika
swinger
tainted love
taste my
tea bagging
threesome
throating
thumbzilla
tied up
tight white
tit
tits
titties
titty
tongue in a
topless
tosser
towelhead
tranny
tribadism
tub girl
tubgirl
tushy
twat
twink
twinkie
two girls one cup
undressing
upskirt
urethra play
urophilia
vagina
venus mound
viagra
vibrator
violet wand
vorarephilia
voyeur
voyeurweb
voyuer
vulva
wank
wetback
wet dream
white power
whore
worldsex
wrapping men
wrinkled starfish
xx
xxx
yaoi
yellow showers
yiffy
zoophilia
🖕
13.
13点
三级片
下三烂
下贱
个老子的
九游
乳交
乳头
乳房
乳波臀浪
交配
仆街
他奶奶
他奶奶的
他奶娘的
他妈
他妈ㄉ王八蛋
他妈地
他妈的
他娘
他马的
你个傻比
你他马的
你全家
你奶奶的
你她马的
你妈
你妈的
你娘
你娘卡好
你娘咧
你它妈的
你它马的
你是鸡
你是鸭
你马的
做爱
傻比
傻逼
册那
军妓
几八
几叭
几巴
几芭
刚度
刚瘪三
包皮
十三点
卖B
卖比
卖淫
卵子
双峰微颤
口交
口肯
叫床
吃屎
后庭
吹箫
塞你公
塞你娘
塞你母
塞你爸
塞你老师
塞你老母
处女
外阴
大卵子
大卵泡
大鸡巴
奶奶的熊
奶子
奸你
她妈地
她妈的
她马的
妈B
妈个B
妈个比
妈个老比
妈妈的
妈比
妈的
妈的B
妈逼
妓女
妓院
妳她妈的
妳妈的
妳娘的
妳老母的
妳马的
姘头
姣西
娘个比
娘的
婊子
婊子养的
嫖娼
嫖客
它妈地
它妈的
密洞
射你
射精
小乳头
小卵子
小卵泡
小瘪三
小肉粒
小骚比
小骚货
小鸡巴
小鸡鸡
屁眼
屁股
巨乳
干x娘
干七八
干你
干你妈
干你娘
干你老母
干你良
干妳妈
干妳娘
干妳老母
干妳马
干您娘
干机掰
干死CS
干死GM
干死你
干死客服
强奸
强奸你
性交
性器
性无能
性爱
情色
想上你
懆您妈
懆您娘
懒8
懒八
懒叫
懒教
成人
我操你祖宗十八代
扒光
打炮
打飞机
抽插
招妓
插你
插死你
撒尿
操你
操你全家
操你奶奶
操你妈
操你娘
操你祖宗
操你老妈
操你老母
操妳
操妳全家
操妳妈
操妳娘
操妳祖宗
操机掰
操比
操逼
放荡
日他娘
日你
日你妈
日你老娘
日你老母
日批
月经
机八
机巴
机机歪歪
杂种
浪叫
淫乱
淫妇
淫棍
淫水
淫秽
淫荡
淫西
湿透的内裤
激情
灨你娘
烂货
烂逼
狗屁
狗日
狗狼养的
玉杵
王八蛋
瓜娃子
瓜婆娘
瓜批
瘪三
白烂
白痴
白癡
祖宗
私服
笨蛋
精子
老二
老味
老母
老瘪三
老骚比
老骚货
肉壁
肉棍子
肉棒
肉缝
肛交
肥西
色情
花柳
荡妇
贝肉
贱B
贱人
贱货
贼你妈
赛你老母
赛妳阿母
赣您娘
轮奸
迷药
逼样
野鸡
阳具
阳萎
阴唇
阴户
阴核
阴毛
阴茎
阴道
阴部
雞巴
靠北
靠母
靠爸
靠背
靠腰
驶你公
驶你娘
驶你母
驶你爸
驶你老师
驶你老母
骚比
骚货
骚逼
鬼公
鸡8
鸡八
鸡叭
鸡吧
鸡奸
鸡巴
鸡芭
鸡鸡
龟儿子
龟头
𨳒
陰莖
𨳊
𡳞
𨶙
𨳍
仆街
咸家鏟
冚家鏟
咸家伶
冚家拎
笨實
粉腸
屎忽
躝癱
你老闆
你老味
你老母
硬膠
from tqdm import tqdm
import numpy as np
import nltk
import os
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.cli_funcs.paths import DataFlowPath
from nltk.tokenize import word_tokenize
@OPERATOR_REGISTRY.register()
class BlocklistFilter(OperatorABC):
def __init__(self, language:str = 'en', threshold:int = 1, use_tokenizer:bool = False):
self.logger = get_logger()
self.language = language
self.threshold = threshold
self.use_tokenizer = use_tokenizer
self.logger.info(f"Initializing {self.__class__.__name__} with language = {self.language}, threshold = {self.threshold}, use_tokenizer = {self.use_tokenizer}...")
# Set the NLTK data path if provided via the NLTK_DATA environment variable
if 'NLTK_DATA' in os.environ:
nltk.data.path.insert(0, os.environ['NLTK_DATA'])
# Look for the punkt tokenizer data and download it if missing
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt')
self.blocklist = self.load_blocklist()
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子使用特定语言的阻止列表进行文本过滤,支持可选的分词器进行单词级匹配。\n"
"输入参数:\n"
"- input_key:输入文本字段名,默认为'text'\n"
"- language:语言代码,默认为'zh'\n"
"- blocklist_dir:阻止列表文件目录,默认为'./blocklists/'\n"
"- threshold:匹配次数阈值,默认为1\n"
"- use_tokenizer:是否使用分词器,默认为True\n"
"- tokenizer:分词器对象,默认为None\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留不包含阻止列表关键词的文本行\n"
"- 返回包含输入字段名的列表,用于后续算子引用"
)
elif lang == "en":
return (
"This operator filters text using language-specific blocklists with optional tokenizer integration for word-level filtering.\n"
"Input Parameters:\n"
"- input_key: Input text field name, default is 'text'\n"
"- language: Language code, default is 'zh'\n"
"- blocklist_dir: Blocklist file directory, default is './blocklists/'\n"
"- threshold: Matching count threshold, default is 1\n"
"- use_tokenizer: Whether to use tokenizer, default is True\n"
"- tokenizer: Tokenizer object, default is None\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only rows without blocklist keywords\n"
"- List containing input field name for subsequent operator reference"
)
else:
return "BlocklistFilter uses language-specific blocklists with optional tokenizer integration."
def load_blocklist(self):
dataflow_dir = DataFlowPath.get_dataflow_dir()
file_path = f"{dataflow_dir}/operators/general_text/filter/blocklist/{self.language}.txt"
self.logger.info(f"Loading blocklist for language '{self.language}' from {file_path}...")
with open(file_path, 'r', encoding='utf-8') as file:
blocklist = set(line.strip().lower() for line in file if line.strip())
self.logger.info(f"Blocklist for '{self.language}' loaded. Total words in blocklist: {len(blocklist)}.")
return blocklist
def run(self, storage: DataFlowStorage, input_key: str, output_key: str = 'blocklist_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
if self.use_tokenizer:
text = word_tokenize(text.lower())
else:
text = text.lower().split()
blocklist_count = sum(1 for word in text if word in self.blocklist)
valid_checks.append(blocklist_count <= self.threshold)
else:
valid_checks.append(0)
valid_checks = np.array(valid_checks, dtype=int)
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
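# Minimal sketch of the matching rule in run() above (standalone, toy blocklist; not
# the shipped blocklist file): a row is kept when the number of blocklisted words in
# the lower-cased, whitespace-split text does not exceed `threshold`.
#
#   toy_blocklist = {"spam", "scam"}
#   text = "This offer is not spam at all"
#   count = sum(1 for word in text.lower().split() if word in toy_blocklist)
#   keep = count <= 1  # threshold = 1 -> keep is True here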
from tqdm import tqdm
from hashlib import md5, sha256
from xxhash import xxh3_128
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
@OPERATOR_REGISTRY.register()
class HashDeduplicateFilter(OperatorABC):
def __init__(self, hash_func: str = 'md5'):
self.logger = get_logger()
self.hash_func = hash_func
self.hash_func_dict = {
'md5': md5,
'sha256': sha256,
'xxh3': xxh3_128
}
if self.hash_func not in self.hash_func_dict:
raise ValueError(f'Invalid hash function: {self.hash_func}')
self.logger.info(f"Initializing {self.__class__.__name__} with hash_func = {self.hash_func}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"使用多种哈希函数对文本进行精确去重,支持md5、sha256或xxh3算法。通过计算文本的哈希值识别重复数据。\n\n"
"初始化参数:\n"
"- hash_func: 哈希函数名称,可选'md5'、'sha256'或'xxh3',默认为'md5'\n\n"
"运行参数:\n"
"- input_keys: 用于计算哈希的多个字段列表(与input_key二选一)\n"
"- input_key: 用于计算哈希的单个字段名(与input_keys二选一)\n"
"- output_key: 去重标记字段名,默认为'minhash_deduplicated_label'\n\n"
"输出说明:标记为1的数据表示首次出现,标记为0的数据表示重复数据\n"
"算法特点:\n"
"- md5: 128位哈希值,平衡速度和唯一性\n"
"- sha256: 256位哈希值,更高安全性,速度较慢\n"
"- xxh3: 128位哈希值,最快的哈希算法"
)
else:
return (
"Exact deduplication using multiple hash functions, chosen from md5, sha256 or xxh3. Identify duplicate data by calculating text hash values.\n\n"
"Initialization Parameters:\n"
"- hash_func: Hash function name, options are 'md5', 'sha256' or 'xxh3', default is 'md5'\n\n"
"Run Parameters:\n"
"- input_keys: List of multiple fields for hash calculation (alternative to input_key)\n"
"- input_key: Single field name for hash calculation (alternative to input_keys)\n"
"- output_key: Deduplication label field name, default is 'minhash_deduplicated_label'\n\n"
"Output Description: Data marked as 1 indicates first occurrence, 0 indicates duplicate\n"
"Algorithm Characteristics:\n"
"- md5: 128-bit hash, balances speed and uniqueness\n"
"- sha256: 256-bit hash, higher security, slower speed\n"
"- xxh3: 128-bit hash, fastest hashing algorithm"
)
def _compute_hash(self, text: str) -> str:
return self.hash_func_dict[self.hash_func](text.encode('utf-8')).hexdigest()
def run(self, storage: DataFlowStorage, input_keys: list = None, input_key: str = None, output_key: str = 'minhash_deduplicated_label'):
if input_keys is None and input_key is None:
self.logger.error(f"Need to specify either input_keys or input_key!")
raise ValueError(f"Need to specify either input_keys or input_key!")
if input_keys is not None and input_key is not None:
self.logger.error(f"{self.__class__.__name__} only need one input args!")
raise ValueError(f"{self.__class__.__name__} only need one input args!")
if input_keys is not None:
self.logger.info(f"Running {self.__class__.__name__} with input_keys = {input_keys} and output_key = {output_key}")
else:
self.logger.info(f"Running {self.__class__.__name__} with input_key = {input_key} and output_key = {output_key}")
self.input_key = input_key
self.input_keys = input_keys
self.output_key = output_key
seen_hashes = set()
dataframe = storage.read("dataframe")
labels = [0] * len(dataframe)
for idx, sample in tqdm(enumerate(dataframe.to_dict(orient='records')), desc=f"Implementing {self.__class__.__name__}", total=len(dataframe)):
if input_keys is not None and len(input_keys) > 1:
text = '\n'.join([f"{k}:\n{sample[k]}" for k in input_keys])
elif input_keys is not None:
# A single-element input_keys list behaves like input_key.
text = sample[input_keys[0]]
else:
text = sample[self.input_key]
hash_value = self._compute_hash(text)
if hash_value not in seen_hashes:
labels[idx] = 1
seen_hashes.add(hash_value)
dataframe[self.output_key] = labels
filtered_dataframe = dataframe[(dataframe[self.output_key] > 0)]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Deduplication completed. Total unique items: {sum(labels)}")
return [self.output_key,]
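# Minimal sketch of the exact-duplicate test used above (standalone hashlib, made-up
# strings): identical texts yield identical digests, so only the first occurrence is
# labeled 1 and kept.
#
#   from hashlib import md5
#   a = md5("same text".encode("utf-8")).hexdigest()
#   b = md5("same text".encode("utf-8")).hexdigest()
#   c = md5("different text".encode("utf-8")).hexdigest()
#   assert a == b and a != c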
import pandas as pd
import numpy as np
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.operators.general_text import LangkitSampleEvaluator
@OPERATOR_REGISTRY.register()
class LangkitFilter(OperatorABC):
def __init__(self,
min_scores = {
"flesch_reading_ease": 0, # max(−144.8, 55.19−18.03)
"automated_readability_index": 0, # max(0.9, 11.77−4.41)
"aggregate_reading_level": 0, # max(0.0, 11.23−3.70)
"syllable_count": 32.0, # max(32, 815.4−1516.6 → clip to 32)
"lexicon_count": 23.0, # max(23, 524.2−1029.8 → clip to 23)
"sentence_count": 1.0, # max(1, 29.0−60.1 → clip to 1)
"character_count": 118.0, # max(118, 2610.2−4856.0 → clip to 118)
"letter_count": 109.0, # max(109, 2513.5−4679.5 → clip to 109)
"polysyllable_count": 0.0, # max(0, 78.9−137.5 → clip to 0)
"monosyllable_count": 13.0, # max(13, 334.7−709.4 → clip to 13)
"difficult_words": 4.0, # max(4, 93.4−120.0 → clip to 4)
},
max_scores = {
"flesch_reading_ease": 100, # min(106.4, 55.19+18.03)
"automated_readability_index": 100, # min(98.2, 11.77+4.41)
"aggregate_reading_level": 100, # min(77.0, 11.23+3.70)
"syllable_count": 2331.9, # min(43237, 815.4+1516.6)
"lexicon_count": 1554.0, # min(33033, 524.2+1029.8)
"sentence_count": 89.1, # min(2193, 29.0+60.1)
"character_count": 7466.3, # min(139807,2610.2+4856.0)
"letter_count": 7193.0, # min(134507,2513.5+4679.5)
"polysyllable_count": 216.4, # min(3261, 78.9+137.5)
"monosyllable_count": 1044.1, # min(25133,334.7+709.4)
"difficult_words": 213.4, # min(2366, 93.4+120.0)
},
metrics_to_keep: list = [
"flesch_reading_ease",
"automated_readability_index",
"aggregate_reading_level",
"syllable_count",
"lexicon_count",
"sentence_count",
"character_count",
"letter_count",
"polysyllable_count",
"monosyllable_count",
"difficult_words",
]):
self.min_scores = min_scores
self.max_scores = max_scores
self.metric_name_map = {
'flesch_reading_ease': 'LangkitFleschReadingEaseScore',
'automated_readability_index': 'LangkitAutomatedReadabilityIndexScore',
'aggregate_reading_level': 'LangkitAggregateReadingLevelScore',
'syllable_count': 'LangkitSyllableCountScore',
'lexicon_count': 'LangkitLexiconCountScore',
'sentence_count': 'LangkitSentenceCountScore',
'character_count': 'LangkitCharacterCountScore',
'letter_count': 'LangkitLetterCountScore',
'polysyllable_count': 'LangkitPolysyllableCountScore',
'monosyllable_count': 'LangkitMonosyllableCountScore',
'difficult_words': 'LangkitDifficultWordsScore'
}
if not self.min_scores.keys() == self.max_scores.keys():
raise ValueError("min_scores and max_scores must have the same keys")
self.logger = get_logger()
self.scorer = LangkitSampleEvaluator()
self.logger.info(f"Initializing {self.__class__.__name__} with min_scores: {self.min_scores} and max_scores: {self.max_scores}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于LangkitScorer打分器的得分对数据进行过滤。使用Langkit工具包计算11种文本统计信息,帮助评估文本结构复杂性和可读性。\n"
"输入参数:\n"
"- min_scores:各指标的最小阈值字典,包含11个语言统计指标\n"
"- max_scores:各指标的最大阈值字典,包含11个语言统计指标\n"
"- metrics_to_keep:需要保留的评估指标列表\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留所有指标都在指定范围内的文本\n"
"- 返回包含各指标标签字段名的列表"
)
else:
return (
"Filter data using scores from the LangkitScorer. Uses Langkit to extract 11 types of text statistics for evaluating text structure complexity and readability.\n"
"Input Parameters:\n"
"- min_scores: Dictionary of minimum thresholds for each metric, containing 11 language statistics\n"
"- max_scores: Dictionary of maximum thresholds for each metric, containing 11 language statistics\n"
"- metrics_to_keep: List of evaluation metrics to keep\n\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only texts with all metrics within specified ranges\n"
"- List containing label field names for each metric"
)
def run(self, storage: DataFlowStorage, input_key: str, output_keys: list = ["flesch_reading_ease", "automated_readability_index", "aggregate_reading_level", "syllable_count", "lexicon_count", "sentence_count", "character_count", "letter_count", "polysyllable_count", "monosyllable_count", "difficult_words"]):
self.input_key = input_key
self.output_keys = output_keys
if not list(self.min_scores.keys()) == output_keys:
raise ValueError("min_scores and output_keys must have the same keys")
self.logger.info("Running {self.__class__.__name__}...")
dataframe = storage.read("dataframe")
scores = self.scorer.eval(dataframe, self.input_key)
results = np.ones(len(dataframe), dtype=int)
for _label in self.output_keys:
label = self.metric_name_map[_label]
min_score = self.min_scores[_label]
max_score = self.max_scores[_label]
dataframe[label] = pd.DataFrame(scores)[label]
metric_scores = np.array(dataframe[label])
metric_filter = (min_score <= metric_scores) & (metric_scores <= max_score)
results = results & metric_filter.astype(int)
self.logger.debug(f"Filtered by {_label}, {np.sum(results)} data remained")
dataframe[f"{label}_label"] = metric_filter.astype(int)
filtered_dataframe = dataframe[results == 1]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [f"{label}_label" for label in self.output_keys]
import fasttext
import numpy as np
from huggingface_hub import hf_hub_download
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from tqdm import tqdm
from dataflow.utils.utils import get_logger
from dataflow.utils.storage import DataFlowStorage
@OPERATOR_REGISTRY.register()
class LanguageFilter(OperatorABC):
def __init__(self, allowed_languages: list, model_cache_dir: str = None):
self.logger = get_logger()
self.filter_name = 'LanguageFilter'
self.logger.info(f"Initializing {self.__class__.__name__} with allowed_languages = {allowed_languages} and model_cache_dir = {model_cache_dir}...")
self.allowed_languages = allowed_languages
self.model_cache_dir = model_cache_dir
# Download and load the FastText language model
try:
self.logger.info("Downloading model from Hugging Face Hub...")
model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin", cache_dir=self.model_cache_dir)
self.model = fasttext.load_model(model_path)
self.logger.info("Model loaded successfully.")
except Exception as e:
self.logger.error(f"Error downloading or loading model: {e}")
raise
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"使用FastText语言识别模型过滤数据。下载并加载预训练的FastText语言识别模型,检查文本的语言是否在允许的语言列表中。\n"
"输入参数:\n"
"- allowed_languages:允许的语言标签列表\n"
"- model_cache_dir:模型缓存目录路径\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留语言在允许列表中的文本\n"
"- 返回包含语言标签字段名的列表"
)
else:
return (
"Filter data using FastText language identification model. Downloads and loads pre-trained FastText language identification model to check if text language is in allowed list.\n"
"Input Parameters:\n"
"- allowed_languages: List of allowed language labels\n"
"- model_cache_dir: Model cache directory path\n\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only texts with language in allowed list\n"
"- List containing language label field name"
)
def eval(self, dataframe, input_key):
self.logger.info(f"Start evaluating {self.filter_name}...")
predictions = []
# Assuming the dataframe contains the text in `input_key`
for text in tqdm(dataframe[input_key], desc=f"Implementing {self.filter_name}"):
labels, scores = self.model.predict(text.replace('\n', ' '), k=5)
label_score_pairs = list(zip(labels, scores))
label_score_pairs.sort(key=lambda x: x[1], reverse=True) # Sort by score
top_labels = [label for label, score in label_score_pairs]
predictions.append(any(label in self.allowed_languages for label in top_labels))
return np.array(predictions).astype(int)
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='language_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.filter_name} with input_key = {self.input_key} and output_key = {self.output_key}...")
predictions = self.eval(dataframe, self.input_key)
dataframe[self.output_key] = predictions
filtered_dataframe = dataframe[dataframe[self.output_key] == 1]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
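# Usage sketch (illustrative; the allowed label below is an assumption about the
# downloaded model's tag format, which uses FastText's '__label__' prefix):
#
#   lang_filter = LanguageFilter(allowed_languages=["__label__eng_Latn"], model_cache_dir="./dataflow_cache")
#   keep = lang_filter.eval(df, "text")  # df: pandas DataFrame with a 'text' column (hypothetical)
#   # keep is an int array: 1 = one of the top-5 predicted labels is allowed, 0 = filtered out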
import pandas as pd
import numpy as np
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.operators.general_text import LexicalDiversitySampleEvaluator
@OPERATOR_REGISTRY.register()
class LexicalDiversityFilter(OperatorABC):
def __init__(self, min_scores: dict = {'mtld': 50, 'hdd': 0.8}, max_scores: dict = {'mtld': 99999, 'hdd': 1.0}):
self.min_scores = min_scores
self.max_scores = max_scores
if not self.min_scores.keys() == self.max_scores.keys():
raise ValueError("min_scores and max_scores must have the same keys")
self.logger = get_logger()
self.logger.info(f"Initializing {self.__class__.__name__} with min_scores: {self.min_scores} and max_scores: {self.max_scores}...")
self.metric_name_map = {
'hdd': 'LexicalDiversityHD-DScore',
'mtld': 'LexicalDiversityMTLDScore',
}
self.scorer = LexicalDiversitySampleEvaluator()
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于LexicalDiversityScorer打分器的得分对数据进行过滤。使用MTLD(移动平均类型-令牌比)和HDD(超几何分布多样性)两种方法计算词汇多样性,高分代表更丰富的词汇使用。\n"
"输入参数:\n"
"- min_scores:各指标的最小阈值字典,包含'mtld'和'hdd'\n"
"- max_scores:各指标的最大阈值字典,包含'mtld'和'hdd'\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留词汇多样性在指定范围内的文本\n"
"- 返回包含各指标标签字段名的列表"
)
else:
return (
"Filter data using scores from the LexicalDiversityScorer. Measure lexical diversity using MTLD (Moving-Average Type-Token Ratio) and HDD (Hypergeometric Distribution Diversity) methods; higher scores indicate more diverse vocabulary usage.\n"
"Input Parameters:\n"
"- min_scores: Dictionary of minimum thresholds for each metric, containing 'mtld' and 'hdd'\n"
"- max_scores: Dictionary of maximum thresholds for each metric, containing 'mtld' and 'hdd'\n\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only texts with lexical diversity within specified range\n"
"- List containing label field names for each metric"
)
def run(self, storage: DataFlowStorage, input_key: str, output_keys = ['mtld', 'hdd']):
self.input_key = input_key
self.output_keys = output_keys
if not list(self.min_scores.keys()) == output_keys:
raise ValueError("min_scores and output_keys must have the same keys")
self.logger.info(f"Running {self.__class__.__name__} with input_key: {self.input_key} and output_keys: {self.output_keys}...")
dataframe = storage.read("dataframe")
scores = self.scorer.eval(dataframe, self.input_key)
results = np.ones(len(dataframe), dtype=int)
for _label in self.output_keys:
min_score = self.min_scores[_label]
max_score = self.max_scores[_label]
label = self.metric_name_map[_label]
dataframe[label] = pd.DataFrame(scores)[label]
metric_scores = np.array(dataframe[label])
metric_filter = (min_score <= metric_scores) & (metric_scores <= max_score)
nan_filter = np.isnan(metric_scores)
metric_filter = metric_filter | nan_filter
results = results & metric_filter.astype(int)
self.logger.debug(f"Filtered by {_label}, {np.sum(results)} data remained")
dataframe[f"{label}_label"] = metric_filter.astype(int)
filtered_dataframe = dataframe[results == 1]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [f"{label}_label" for label in self.output_keys]
import numpy as np
from tqdm import tqdm
from dataflow.core import OperatorABC, LLMServingABC
from dataflow.prompts.general_text import LanguageFilterPrompt
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.logger import get_logger
from dataflow.utils.storage import DataFlowStorage
@OPERATOR_REGISTRY.register()
class LLMLanguageFilter(OperatorABC):
"""
Operator for filtering text based on language using LLM.
Argument allowed_languages is a list of allowed languages, using the ISO 639-1 two-letter language code to specify the language (for example, 'en' for English, 'zh' for Chinese, etc.).
"""
def __init__(self, llm_serving: LLMServingABC = None, allowed_languages: list[str] = ['en']):
self.logger = get_logger()
self.prompt = LanguageFilterPrompt()
self.llm_serving = llm_serving
self.allowed_languages = allowed_languages
self.logger.info(f"Initializing {self.__class__.__name__}...")
@staticmethod
def get_desc(lang: str = "zh"):
return "使用大语言模型识别语言并过滤数据" if lang == "zh" else "Using large language models to identify languages and filter data."
def _reformat_prompt(self, dataframe):
formatted_prompts = [self.prompt.build_prompt(text=item) for item in tqdm(dataframe[self.input_key], desc="Reformatting Prompt...")]
return formatted_prompts
def run(self, storage: DataFlowStorage, input_key: str, output_key: str = 'language_label'):
self.input_key, self.output_key = input_key, output_key
dataframe = storage.read("dataframe")
formatted_prompts = self._reformat_prompt(dataframe)
llm_outputs = self.llm_serving.generate_from_input(formatted_prompts)
dataframe[self.output_key] = llm_outputs
filtered_dataframe = dataframe[dataframe[self.output_key].isin(self.allowed_languages)]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
from tqdm import tqdm
from datasketch import MinHash, MinHashLSH # use datasketch-1.6.5
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
@OPERATOR_REGISTRY.register()
class MinHashDeduplicateFilter(OperatorABC):
def __init__(self, num_perm=128, threshold=0.9, use_n_gram=True, ngram=5):
self.logger = get_logger()
self.num_perm = num_perm
self.threshold = threshold
self.use_n_gram = use_n_gram
self.n_gram = ngram
self.logger.info(f"Initializing {self.__class__.__name__} with num_perm = {self.num_perm}, threshold = {self.threshold}, use_n_gram = {self.use_n_gram}, ngram = {self.n_gram}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"结合MinHash与LSH(局部敏感哈希)实现高效近似去重。将文本转换为MinHash签名,使用LSH快速查找相似文本,实现大规模数据集的近似去重。\n"
"输入参数:\n"
"- num_perm:生成MinHash签名的排列数\n"
"- threshold:相似度阈值,超过此阈值判定为相似文本\n"
"- use_n_gram:是否使用n-gram分词\n"
"- ngram:n-gram的n值\n"
"输出参数:\n"
"- 去重后的DataFrame,仅保留唯一文本\n"
"- 返回包含去重标签字段名的列表"
)
else:
return (
"Efficient near-duplicate detection using MinHash and LSH (Locality-Sensitive Hashing). Converts texts to MinHash signatures and uses LSH to quickly find similar texts, enabling near-deduplication for large-scale datasets.\n"
"Input Parameters:\n"
"- num_perm: Number of permutations for generating MinHash signatures\n"
"- threshold: Similarity threshold above which texts are considered duplicates\n"
"- use_n_gram: Whether to use n-gram tokenization\n"
"- ngram: n value for n-gram\n\n"
"Output Parameters:\n"
"- Deduplicated DataFrame containing only unique texts\n"
"- List containing deduplication label field name"
)
def create_minhash(self, data):
minhash = MinHash(num_perm=self.num_perm)
if self.use_n_gram:
for i in range(len(data) - self.n_gram + 1):
minhash.update(data[i:i + self.n_gram].encode('utf8'))
else:
for d in data:
minhash.update(d.encode('utf8'))
return minhash
def run(self, storage: DataFlowStorage, input_keys: list = None, input_key: str = None, output_key: str = 'minhash_deduplicated_label'):
if input_keys is None and input_key is None:
self.logger.error(f"Need to specify either input_keys or input_key!")
raise ValueError(f"Need to specify either input_keys or input_key!")
if input_keys is not None and input_key is not None:
self.logger.error(f"{self.__class__.__name__} only need one input args!")
raise ValueError(f"{self.__class__.__name__} only need one input args!")
if input_keys is not None:
self.logger.info(f"Running {self.__class__.__name__} with input_keys = {input_keys} and output_key = {output_key}")
else:
self.logger.info(f"Running {self.__class__.__name__} with input_key = {input_key} and output_key = {output_key}")
lsh = MinHashLSH(threshold=self.threshold, num_perm=self.num_perm)
self.input_key = input_key
self.input_keys = input_keys
self.output_key = output_key
dataframe = storage.read("dataframe")
labels = [0] * len(dataframe)
with lsh.insertion_session() as session:
for idx, sample in tqdm(enumerate(dataframe.to_dict(orient='records')), desc=f"Implementing {self.__class__.__name__}", total=len(dataframe)):
if input_keys is not None and len(input_keys) > 1:
text = '\n'.join([f"{k}:\n{sample[k]}" for k in input_keys])
elif input_keys is not None:
# A single-element input_keys list behaves like input_key.
text = sample[input_keys[0]]
else:
text = sample[self.input_key]
minhash = self.create_minhash(text)
result = lsh.query(minhash)
if len(result) == 0:
labels[idx] = 1
session.insert(idx, minhash)
self.logger.debug(f"Inserted item {idx} into LSH with minhash.")
dataframe[self.output_key] = labels
filtered_dataframe = dataframe[(dataframe[self.output_key] > 0)]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Deduplication completed. Total unique items: {sum(labels)}")
return [self.output_key,]
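# Minimal sketch of the MinHash/LSH near-duplicate test (standalone datasketch, toy
# strings): two nearly identical texts collide in the LSH index, so the later one
# would receive label 0 in run().
#
#   from datasketch import MinHash, MinHashLSH
#   def mh(text, num_perm=128, n=5):
#       m = MinHash(num_perm=num_perm)
#       for i in range(len(text) - n + 1):
#           m.update(text[i:i + n].encode('utf8'))
#       return m
#   lsh = MinHashLSH(threshold=0.9, num_perm=128)
#   lsh.insert("doc0", mh("the quick brown fox jumps over the lazy dog"))
#   print(lsh.query(mh("the quick brown fox jumps over the lazy dog!")))  # expected: ['doc0']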
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.operators.general_text import NgramSampleEvaluator
@OPERATOR_REGISTRY.register()
class NgramFilter(OperatorABC):
def __init__(self, min_score=0.8, max_score=1, ngrams=5):
self.logger = get_logger()
self.min_score = min_score
self.max_score = max_score
self.scorer = NgramSampleEvaluator(ngrams)
self.logger.info(f"Initializing {self.__class__.__name__} with min_scores: {self.min_score} and max_scores: {self.max_score}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于NgramScorer打分器的得分对数据进行过滤。计算文本中n-gram的重复比例,得分越高表示重复比例越低,文本冗余度越小。\n"
"输入参数:\n"
"- min_score:最小n-gram得分阈值\n"
"- max_score:最大n-gram得分阈值\n"
"- ngrams:n-gram的n值\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留n-gram得分在指定范围内的文本\n"
"- 返回包含n-gram得分字段名的列表"
)
else:
return (
"Filter data using scores from the NgramScorer. Evaluate text redundancy via n-gram repetition ratio; higher score means lower repetition and less text redundancy.\n"
"Input Parameters:\n"
"- min_score: Minimum n-gram score threshold\n"
"- max_score: Maximum n-gram score threshold\n"
"- ngrams: n value for n-gram\n\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only texts with n-gram score within specified range\n"
"- List containing n-gram score field name"
)
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='NgramScore'):
self.input_key = input_key
self.output_key = output_key
self.logger.info(f"Running {self.__class__.__name__} with input_key: {self.input_key} and output_key: {self.output_key}...")
dataframe = storage.read("dataframe")
scores = self.scorer.eval(dataframe, self.input_key)
dataframe[self.output_key] = scores
filtered_dataframe = dataframe[(dataframe[self.output_key] >= self.min_score) & (dataframe[self.output_key] <= self.max_score)]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
from tqdm import tqdm
from hashlib import md5, sha256
from xxhash import xxh3_128
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
@OPERATOR_REGISTRY.register()
class NgramHashDeduplicateFilter(OperatorABC):
def __init__(self, n_gram: int = 3, hash_func: str = 'md5', diff_size : int = 1):
self.logger = get_logger()
self.n_gram = n_gram
self.hash_func = hash_func
self.diff_size = diff_size
self.hash_func_dict = {
'md5': md5,
'sha256': sha256,
'xxh3': xxh3_128
}
if self.hash_func not in self.hash_func_dict:
raise ValueError(f'Invalid hash function: {self.hash_func}')
self.logger.info(f"Initializing {self.__class__.__name__} with n_gram = {self.n_gram}, hash_func = {self.hash_func}, diff_size = {self.diff_size}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"结合n-gram技术与哈希算法识别相似文本,实现近似去重。将文本分割为多个n-gram片段,计算每个片段的哈希值,通过比较哈希集合的相似度来判断文本相似性。\n"
"输入参数:\n"
"- n_gram:将文本分割的片段数量\n"
"- hash_func:哈希函数类型,支持'md5'、'sha256'和'xxh3'\n"
"- diff_size:哈希集合差异阈值,小于此值判定为相似文本\n"
"输出参数:\n"
"- 去重后的DataFrame,仅保留唯一文本\n"
"- 返回包含去重标签字段名的列表"
)
else:
return (
"Detect similar text using n-gram technology and hashing algorithm for near deduplication. Splits text into multiple n-gram segments, computes hash values for each segment, and judges text similarity by comparing hash set similarity.\n"
"Input Parameters:\n"
"- n_gram: Number of segments to split text into\n"
"- hash_func: Hash function type, supporting 'md5', 'sha256', and 'xxh3'\n"
"- diff_size: Hash set difference threshold below which texts are considered similar\n\n"
"Output Parameters:\n"
"- Deduplicated DataFrame containing only unique texts\n"
"- List containing deduplication label field name"
)
def _compute_hash(self, text: str) -> str:
return self.hash_func_dict[self.hash_func](text.encode('utf-8')).hexdigest()
def run(self, storage: DataFlowStorage, input_keys: list = None, input_key: str = None, output_key: str = 'minhash_deduplicated_label'):
if input_keys is None and input_key is None:
self.logger.error(f"Need to specify either input_keys or input_key!")
raise ValueError(f"Need to specify either input_keys or input_key!")
if input_keys is not None and input_key is not None:
self.logger.error(f"{self.__class__.__name__} only need one input args!")
raise ValueError(f"{self.__class__.__name__} only need one input args!")
if input_keys is not None:
self.logger.info(f"Running {self.__class__.__name__} with input_keys = {input_keys} and output_key = {output_key}")
else:
self.logger.info(f"Running {self.__class__.__name__} with input_key = {input_key} and output_key = {output_key}")
self.input_key = input_key
self.input_keys = input_keys
self.output_key = output_key
seen_hashes = []
dataframe = storage.read("dataframe")
labels = [0] * len(dataframe)
for idx, sample in tqdm(enumerate(dataframe.to_dict(orient='records')), desc=f"Implementing {self.__class__.__name__}", total=len(dataframe)):
if input_keys is not None and len(input_keys) > 1:
text = '\n'.join([f"{k}:\n{sample[k]}" for k in input_keys])
elif input_keys is not None:
# A single-element input_keys list behaves like input_key.
text = sample[input_keys[0]]
else:
text = sample[self.input_key]
gram_length = len(text) // self.n_gram
ngrams = [text[i*gram_length:(i+1)*gram_length] for i in range(self.n_gram)]
hash_value = set(self._compute_hash(ngram) for ngram in ngrams)
if all(len(hash_value & prev_hash_set) < self.diff_size for prev_hash_set in seen_hashes):
labels[idx] = 1
seen_hashes.append(hash_value)
dataframe[self.output_key] = labels
filtered_dataframe = dataframe[(dataframe[self.output_key] > 0)]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Deduplication completed. Total unique items: {sum(labels)}")
return [self.output_key,]
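# Minimal sketch of the chunk-hash overlap test (standalone hashlib, toy strings): the
# text is cut into n_gram equal-length chunks, each chunk is hashed, and two texts are
# treated as near-duplicates when their chunk-hash sets share at least diff_size elements.
#
#   from hashlib import md5
#   def chunk_hashes(text, n_gram=3):
#       size = len(text) // n_gram
#       return {md5(text[i * size:(i + 1) * size].encode('utf-8')).hexdigest() for i in range(n_gram)}
#   overlap = chunk_hashes("abcdefghi") & chunk_hashes("abcdefxyz")
#   # chunks "abc" and "def" match -> len(overlap) == 2 >= diff_size -> near-duplicate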
import numpy as np
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.operators.general_text import PerspectiveSampleEvaluator
from dataflow.serving import PerspectiveAPIServing
@OPERATOR_REGISTRY.register()
class PerspectiveFilter(OperatorABC):
def __init__(self, min_score: float = 0.0, max_score: float = 0.5):
self.logger = get_logger()
self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {min_score} and max_score = {max_score}")
self.min_score = min_score
self.max_score = max_score
self.serving = PerspectiveAPIServing(max_workers=10)
self.scorer = PerspectiveSampleEvaluator(serving=self.serving)
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于PerspectiveScorer打分器的得分对数据进行过滤使用Perspective API评估文本的毒性,返回毒性概率,得分越高表明文本毒性越高。\n"
"输入参数:\n"
"- min_score:最小毒性得分阈值\n"
"- max_score:最大毒性得分阈值\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留毒性得分在指定范围内的文本\n"
"- 返回包含毒性得分字段名的列表"
)
else:
return (
"Filter data using scores from the PerspectiveScorer. Assess text toxicity using Perspective API; higher scores indicate more toxicity.\n"
"Input Parameters:\n"
"- min_score: Minimum toxicity score threshold\n"
"- max_score: Maximum toxicity score threshold\n\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only texts with toxicity score within specified range\n"
"- List containing toxicity score field name"
)
def run(self, storage: DataFlowStorage, input_key: str, output_key: str = 'PerspectiveScore'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
# Get the scores for filtering
scores = np.array(self.scorer.eval(dataframe, self.input_key))
dataframe[self.output_key] = scores
metric_filter = (scores >= self.min_score) & (scores <= self.max_score)
nan_filter = np.isnan(scores)
metric_filter = metric_filter | nan_filter
filtered_dataframe = dataframe[metric_filter]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
import numpy as np
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.operators.general_text import PresidioSampleEvaluator
@OPERATOR_REGISTRY.register()
class PresidioFilter(OperatorABC):
def __init__(self, min_score: int = 0, max_score: int = 5, lang='en', device='cuda', model_cache_dir='./dataflow_cache'):
self.logger = get_logger()
self.min_score = min_score
self.max_score = max_score
self.scorer = PresidioSampleEvaluator(lang=lang, device=device, model_cache_dir=model_cache_dir)
self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {self.min_score} and max_score = {self.max_score}")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于PresidioScorer打分器的得分对数据进行过滤。使用Microsoft Presidio模型识别文本中的私人实体(PII),返回PII信息个数。\n"
"支持识别姓名、邮箱、电话号码、身份证号等多种敏感信息类型,可用于数据隐私保护和合规性检查。\n"
"输入参数:\n"
"- min_score:保留样本的最小PII数量阈值,默认为0\n"
"- max_score:保留样本的最大PII数量阈值,默认为5\n"
"- lang:文本语言,默认为'en'\n"
"- device:模型运行设备,默认为'cuda'\n"
"- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留PII数量在[min_score, max_score]范围内的样本\n"
"- 返回包含输出字段名的列表,用于后续算子引用"
)
elif lang == "en":
return (
"Filter data using scores from the PresidioScorer. Detect personally identifiable information (PII) entities in text using Microsoft Presidio model and return the count of detected PII items.\n"
"Supports recognition of multiple sensitive information types including names, emails, phone numbers, and IDs for data privacy protection and compliance checks.\n"
"Input Parameters:\n"
"- min_score: Minimum PII count threshold for retaining samples, default is 0\n"
"- max_score: Maximum PII count threshold for retaining samples, default is 5\n"
"- lang: Text language, default is 'en'\n"
"- device: Model running device, default is 'cuda'\n"
"- model_cache_dir: Model cache directory, default is './dataflow_cache'\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only samples with PII count within [min_score, max_score] range\n"
"- List containing output field name for subsequent operator reference"
)
else:
return "Filter data based on PII detection results using Microsoft Presidio model."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str = 'PresidioScore'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__}...")
# Get the scores for filtering
scores = np.array(self.scorer.eval(dataframe, self.input_key))
dataframe[self.output_key] = scores
filtered_dataframe = dataframe[(scores >= self.min_score) & (scores <= self.max_score)]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
from dataflow.core import OperatorABC
from typing import Callable, Tuple
import numpy as np
from nltk.tokenize import word_tokenize, WordPunctTokenizer
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.utils import get_logger
from dataflow.utils.storage import DataFlowStorage
from tqdm import tqdm
import re
@OPERATOR_REGISTRY.register()
class ColonEndFilter(OperatorABC):
def __init__(self):
self.logger = get_logger()
self.logger.info(f"Initializing {self.__class__.__name__}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检查文本是否以冒号结尾,常用于判断问题是否为不完整的提问。\n"
"初始化参数:\n"
"- 无\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'{类名小写}_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator checks if text ends with a colon, commonly used to identify incomplete questions.\n"
"Initialization Parameters:\n"
"- None\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is '{classname_lower}_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "ColonEndFilter checks if text ends with a colon and filters out incomplete questions."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str = None):
self.input_key = input_key
self.output_key = output_key or f"{self.__class__.__name__.lower()}_label"
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__}...")
colon_end_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
colon_end_checks.append(not text.endswith(':'))
else:
colon_end_checks.append(0)
colon_end_checks = np.array(colon_end_checks, dtype=int)
dataframe[self.output_key] = colon_end_checks
filtered_dataframe = dataframe[colon_end_checks == 1]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class SentenceNumberFilter(OperatorABC):
def __init__(self, min_sentences: int=3, max_sentences: int=7500):
self.logger = get_logger()
self.min_sentences = min_sentences
self.max_sentences = max_sentences
self.logger.info(f"Initializing {self.__class__.__name__} with min_sentences = {self.min_sentences}, max_sentences = {self.max_sentences}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检查文本中的句子数量是否在指定范围内,使用正则表达式匹配句子结束符号(。!?.!?)进行分割。\n"
"初始化参数:\n"
"- min_sentences:最小句子数量阈值,默认为3\n"
"- max_sentences:最大句子数量阈值,默认为7500\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'sentence_number_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator checks if the number of sentences in text is within specified range, using regex to match sentence-ending punctuation(。!?.!?).\n"
"Initialization Parameters:\n"
"- min_sentences: Minimum sentence count threshold, default is 3\n"
"- max_sentences: Maximum sentence count threshold, default is 7500\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'sentence_number_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "SentenceNumberFilter filters text based on sentence count range using regex pattern matching."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str = 'sentence_number_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_check = []
SENT_PATTERN = re.compile(r'\b[^.!?\n]+[.!?]*', flags=re.UNICODE)
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
num_sentence = len(SENT_PATTERN.findall(text))
valid_check.append(self.min_sentences <= num_sentence <= self.max_sentences)
else:
valid_check.append(0)
valid_check = np.array(valid_check, dtype=int)
dataframe[self.output_key] = valid_check
filtered_dataframe = dataframe[valid_check == 1]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
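# Example of the sentence-counting regex used above (illustrative): the pattern splits
# on ASCII end-of-sentence punctuation only, so Chinese 。!? marks are not counted as
# sentence boundaries.
#
#   import re
#   pattern = re.compile(r'\b[^.!?\n]+[.!?]*', flags=re.UNICODE)
#   len(pattern.findall("One. Two! Three?"))  # -> 3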
class TextSlice:
# A slice of text from a document.
def __init__(self, text: str, start: int, end: int):
self.text = text
self.start = start
self.end = end
def split_paragraphs(
text: str, normalizer: Callable[[str], str], remove_empty: bool = True
) -> Tuple[TextSlice]:
"""
Split a string into paragraphs. A paragraph is defined as a sequence of zero or more characters, followed
by a newline character, or a sequence of one or more characters, followed by the end of the string.
"""
text_slices = tuple(
TextSlice(normalizer(text[match.start():match.end()]), match.start(), match.end())
for match in re.finditer(r"([^\n]*\n|[^\n]+$)", text)
)
if remove_empty is True:
text_slices = tuple(
text_slice for text_slice in text_slices if text_slice.text.strip()
)
return text_slices
def normalize(
text: str,
remove_punct: bool = True,
lowercase: bool = True,
nfd_unicode: bool = True,
white_space: bool = True
) -> str:
import string
import unicodedata
if remove_punct:
text = text.translate(str.maketrans('', '', string.punctuation))
# lowercase
if lowercase:
text = text.lower()
if white_space:
text = text.strip()
text = re.sub(r'\s+', ' ', text)
# NFD unicode normalization
if nfd_unicode:
text = unicodedata.normalize('NFD', text)
return text
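# --- Hedged usage sketch (not part of the original source) -----------------
# Quick illustration of the two helpers above: split_paragraphs() yields one
# TextSlice per line (empty lines dropped), and normalize() strips punctuation,
# lowercases, collapses whitespace and applies NFD normalization. The helper
# name _example_split_and_normalize is hypothetical.
def _example_split_and_normalize():
    sample = "First line...\n\nSecond LINE!\n"
    for s in split_paragraphs(sample, normalizer=normalize, remove_empty=True):
        print(repr(s.text), s.start, s.end)
    # prints roughly: 'first line' 0 14 and 'second line' 15 28
# ---------------------------------------------------------------------------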
@OPERATOR_REGISTRY.register()
class LineEndWithEllipsisFilter(OperatorABC):
def __init__(self, threshold: float=0.3):
self.logger = get_logger()
self.threshold = threshold
self.logger.info(f"Initializing {self.__class__.__name__} with threshold = {self.threshold}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检测并过滤以省略号(...)或(……)结尾的文本行,常用于识别不完整的表述。\n"
"初始化参数:\n"
"- threshold:以省略号结尾的行数比率阈值,默认为0.3\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'line_end_with_ellipsis_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator detects and filters text lines ending with ellipsis (...) or (……), commonly used to identify incomplete statements.\n"
"Initialization Parameters:\n"
"- threshold: Ratio threshold of lines ending with ellipsis, default is 0.3\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'line_end_with_ellipsis_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "LineEndWithEllipsisFilter detects and filters text ending with ellipsis characters."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str = 'line_end_with_ellipsis_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
ellipsis_checks = []
ellipsis = ["...", "…"]
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
raw_lines = split_paragraphs(text=text, normalizer=lambda x: x, remove_empty=True)
num_lines = len(raw_lines)
if num_lines == 0:
ellipsis_checks.append(False)
continue
num_occurrences = sum([line.text.rstrip().endswith(tuple(ellipsis)) for line in raw_lines])
ratio = num_occurrences / num_lines
ellipsis_checks.append(ratio < self.threshold)
else:
ellipsis_checks.append(False)
ellipsis_checks = np.array(ellipsis_checks, dtype=int)
dataframe[self.output_key] = ellipsis_checks
filtered_dataframe = dataframe[ellipsis_checks == 1]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class ContentNullFilter(OperatorABC):
def __init__(self):
self.logger = get_logger()
self.logger.info(f"Initializing {self.__class__.__name__}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于过滤空值、空字符串或仅包含空白字符的文本,确保输入数据的有效性。\n"
"初始化参数:\n"
"- 无\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'content_null_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator filters null values, empty strings, or text containing only whitespace characters to ensure data validity.\n"
"Initialization Parameters:\n"
"- None\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'content_null_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "ContentNullFilter removes null, empty, and whitespace-only text content."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='content_null_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
null_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
null_checks.append(text is not None and text.strip() != '')
null_checks = np.array(null_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = null_checks
filtered_dataframe = dataframe[null_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class SymbolWordRatioFilter(OperatorABC):
def __init__(self, threshold: float=0.4):
self.logger = get_logger()
self.threshold = threshold
self.symbol = ["#", "...", "…"]
self.logger.info(f"Initializing {self.__class__.__name__} with threshold = {self.threshold}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检查文本中特定符号(#, ..., …)与单词数量的比率是否超过阈值,过滤符号使用过多的文本。\n"
"初始化参数:\n"
"- threshold:符号与单词比率阈值,默认为0.4\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'symbol_word_ratio_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator checks if the ratio of specific symbols(#, ..., …) to word count exceeds threshold, filtering text with excessive symbol usage.\n"
"Initialization Parameters:\n"
"- threshold: Symbol-to-word ratio threshold, default is 0.4\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'symbol_word_ratio_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "SymbolWordRatioFilter checks ratio of specified symbols to word count and filters excessive usage."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='symbol_word_ratio_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
raw_words = tuple(WordPunctTokenizer().tokenize(text))
num_words = len(raw_words)
num_symbols = float(sum(text.count(symbol) for symbol in self.symbol))
if num_words == 0:
valid_checks.append(False)
continue
ratio = num_symbols / num_words
valid_checks.append(ratio < self.threshold)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class AlphaWordsFilter(OperatorABC):
def __init__(self, threshold: float, use_tokenizer: bool):
import nltk
import os
        # Set the NLTK data path if one is provided via the NLTK_DATA environment variable
if 'NLTK_DATA' in os.environ:
nltk.data.path.insert(0, os.environ['NLTK_DATA'])
        # Look for the required data and download it if it is missing
try:
nltk.data.find('tokenizers/punkt_tab')
except LookupError:
nltk.download('punkt_tab')
self.logger = get_logger()
self.threshold = threshold
self.use_tokenizer = use_tokenizer
self.logger.info(f"Initializing {self.__class__.__name__} with threshold = {self.threshold}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于验证文本中字母单词的比率是否达到阈值,支持NLTK分词或简单空格分割两种模式。\n"
"初始化参数:\n"
"- threshold:字母单词比率阈值(无默认值,必须提供)\n"
"- use_tokenizer:是否使用NLTK分词器(无默认值,必须提供)\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'alpha_words_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator verifies if the ratio of alphabetic words in text meets threshold, supporting NLTK tokenization or simple space splitting.\n"
"Initialization Parameters:\n"
"- threshold: Alphabetic word ratio threshold (no default, required)\n"
"- use_tokenizer: Whether to use NLTK tokenizer (no default, required)\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'alpha_words_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "AlphaWordsFilter verifies alphabetic word ratio using either NLTK tokenization or space splitting."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='alpha_words_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if self.use_tokenizer:
words = word_tokenize(text)
else:
words = text.split()
alpha_count = sum(1 for word in words if re.search(r'[a-zA-Z]', word))
word_count = len(words)
if word_count > 0:
ratio = alpha_count / word_count
valid_checks.append(ratio > self.threshold)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
dataframe[self.output_key] = valid_checks
# Filter the dataframe based on the result
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class HtmlEntityFilter(OperatorABC):
def __init__(self):
self.logger = get_logger()
self.logger.info(f"Initializing {self.__class__.__name__}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检测并过滤包含HTML实体(如&amp;、&lt;、&gt;等)的文本,确保内容不包含标记语言元素。\n"
"初始化参数:\n"
"- 无\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'html_entity_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator detects and filters text containing HTML entities (e.g., &amp;, &lt;, &gt;) to ensure content has no markup language elements.\n"
"Initialization Parameters:\n"
"- None\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'html_entity_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "HtmlEntityFilter detects and removes text containing HTML entity patterns."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='html_entity_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
# Define the list of HTML entities
html_entity = ["nbsp", "lt", "gt", "amp", "quot", "apos", "hellip", "ndash", "mdash", "lsquo", "rsquo", "ldquo", "rdquo"]
full_entities_1 = [f"&{entity};" for entity in html_entity]
full_entities_2 = [f"&{entity};" for entity in html_entity]
full_entities_3 = [f"&{entity};" for entity in html_entity]
full_entities_4 = [f"&{entity};" for entity in html_entity]
half_entities = [f"&{entity}" for entity in html_entity] + [f"&{entity}" for entity in html_entity]
all_entities = full_entities_1 + full_entities_2 + full_entities_3 + full_entities_4 + half_entities
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
has_html_entity = any(entity in text for entity in all_entities)
valid_checks.append(not has_html_entity)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class IDCardFilter(OperatorABC):
def __init__(self, threshold:int=3):
self.logger = get_logger()
self.threshold = threshold
self.logger.info(f"Initializing {self.__class__.__name__}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检测并过滤包含身份证相关术语的文本,使用正则表达式匹配身份证号码模式以保护敏感信息。\n"
"初始化参数:\n"
"- threshold:身份证相关词汇匹配次数阈值,默认为3\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'id_card_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator detects and filters text containing ID card-related terms using regex patterns to protect sensitive information.\n"
"Initialization Parameters:\n"
"- threshold: ID card-related terms matching count threshold, default is 3\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'id_card_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "IDCardFilter detects and removes text containing ID card numbers and related sensitive information."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='id_card_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
pattern = re.compile(r"(身\s{0,10}份|id\s{0,10}number\s{0,10}|identification|identity|\s{0,10}ID\s{0,10}No\s{0,10}|id\s{0,10}card\s{0,10}|NRIC\s{0,10}number\s{0,10}|IC\s{0,10}number\s{0,10}|resident\s{0,10}registration\s{0,10}|I.D.\s{0,10}Number\s{0,10})", re.I)
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
matches = pattern.findall(text)
has_too_many_id_terms = len(matches) >= self.threshold
valid_checks.append(not has_too_many_id_terms)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
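# --- Hedged usage sketch (not part of the original source) -----------------
# How the IDCardFilter decision works: count regex hits for ID-card related
# terms and drop the record once the count reaches the threshold (default 3).
# The sample sentence and the helper name are made up for illustration.
def _example_id_card_threshold(threshold: int = 3):
    pattern = re.compile(
        r"(身\s{0,10}份|id\s{0,10}number\s{0,10}|identification|identity|"
        r"\s{0,10}ID\s{0,10}No\s{0,10}|id\s{0,10}card\s{0,10}|"
        r"NRIC\s{0,10}number\s{0,10}|IC\s{0,10}number\s{0,10}|"
        r"resident\s{0,10}registration\s{0,10}|I\.D\.\s{0,10}Number\s{0,10})",
        re.I,
    )
    text = "Please provide your ID number, identity card and identification papers."
    hits = pattern.findall(text)
    keep = len(hits) < threshold  # mirrors `not has_too_many_id_terms` in run()
    print(len(hits), keep)  # 3 False -> this record would be filtered out
# ---------------------------------------------------------------------------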
@OPERATOR_REGISTRY.register()
class NoPuncFilter(OperatorABC):
def __init__(self, threshold: int=112):
self.logger = get_logger()
self.threshold = threshold
self.logger.info(f"Initializing {self.__class__.__name__} with threshold = {self.threshold}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于确保文本包含足够的标点符号,通过统计句子间最大单词数量进行过滤。\n"
"初始化参数:\n"
"- threshold:句子间最大单词数量阈值,默认为112\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'no_punc_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator ensures text contains sufficient punctuation by counting maximum word count between sentences.\n"
"Initialization Parameters:\n"
"- threshold: Maximum word count between sentences threshold, default is 112\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'no_punc_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "NoPuncFilter ensures text contains sufficient punctuation marks based on ratio threshold."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='no_punc_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
paragraphs = text.split('\n')
max_word_count = 0
for paragraph in paragraphs:
if len(paragraph.strip()) == 0:
continue
sentences = re.split("[–.!?,;•/|…]", paragraph)
for sentence in sentences:
words = sentence.split()
word_count = len(words)
if word_count > max_word_count:
max_word_count = word_count
valid_checks.append(int(max_word_count) <= self.threshold)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
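# --- Hedged usage sketch (not part of the original source) -----------------
# What NoPuncFilter actually measures: the longest run of words between
# punctuation marks (per paragraph). A paragraph with no punctuation counts all
# of its words as one run. The sample text and helper name are made up.
def _example_no_punc_measure(threshold: int = 112):
    text = ("word " * 10).strip() + ". short tail"
    max_word_count = 0
    for paragraph in text.split('\n'):
        if not paragraph.strip():
            continue
        for sentence in re.split("[–.!?,;•/|…]", paragraph):
            max_word_count = max(max_word_count, len(sentence.split()))
    print(max_word_count, max_word_count <= threshold)  # 10 True
# ---------------------------------------------------------------------------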
@OPERATOR_REGISTRY.register()
class SpecialCharacterFilter(OperatorABC):
def __init__(self):
self.logger = get_logger()
self.logger.info(f"Initializing {self.__class__.__name__}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于移除包含特殊/unicode字符的文本,使用预定义模式检测非标准字符以确保文本规范性。\n"
"初始化参数:\n"
"- 无\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'special_character_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator removes text containing special/unicode characters using predefined patterns to ensure text normalization.\n"
"Initialization Parameters:\n"
"- None\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'special_character_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "SpecialCharacterFilter removes text containing special or non-standard unicode characters."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='special_character_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
        # Regex patterns used to flag special/garbled character debris
        special_characters = [
            r"u200e",
            r"&#247;|\? :",
            r"[�□]|\{\/U\}",
            r"U\+26[0-F][0-D]|U\+273[3-4]|U\+1F[3-6][0-4][0-F]|U\+1F6[8-F][0-F]"
        ]
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
# Check for special characters using regular expressions
                has_special_character = any(re.search(pattern, text) for pattern in special_characters)
valid_checks.append(not has_special_character)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class WatermarkFilter(OperatorABC):
def __init__(self, watermarks: list= ['Copyright', 'Watermark', 'Confidential']):
self.logger = get_logger()
self.watermarks = watermarks
self.logger.info(f"Initializing {self.__class__.__name__} with watermarks={self.watermarks}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检测并移除包含版权/水印内容的文本,使用指定关键词列表识别受保护内容。\n"
"初始化参数:\n"
"- watermarks:水印关键词列表,默认为['Copyright', 'Watermark', 'Confidential']\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'watermark_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator detects and removes copyrighted/watermarked content using specified keyword lists to identify protected material.\n"
"Initialization Parameters:\n"
"- watermarks: List of watermark keywords, default is ['Copyright', 'Watermark', 'Confidential']\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'watermark_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "WatermarkFilter detects and removes text containing copyright or watermark keywords."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='watermark_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
matches = re.search('|'.join(self.watermarks), text)
valid_checks.append(matches is None)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class MeanWordLengthFilter(OperatorABC):
def __init__(self, min_length: float=3, max_length: float=10):
self.logger = get_logger()
self.min_length = min_length
self.max_length = max_length
self.logger.info(f"Initializing {self.__class__.__name__} with min_length={self.min_length}, max_length={self.max_length}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检查文本中单词的平均长度是否在指定范围内,通过字符总数除以单词数量计算平均值。\n"
"初始化参数:\n"
"- min_length:最小平均单词长度,默认为3\n"
"- max_length:最大平均单词长度,默认为10\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'mean_word_length_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator checks if the average word length in text is within specified range, calculated by total characters divided by word count.\n"
"Initialization Parameters:\n"
"- min_length: Minimum average word length, default is 3\n"
"- max_length: Maximum average word length, default is 10\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'mean_word_length_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "MeanWordLengthFilter checks average word length against specified range using character and word counts."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='mean_word_length_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
normalized_words = text.split()
num_words = len(normalized_words)
if num_words == 0:
valid_checks.append(False)
continue
num_chars = sum(len(word) for word in normalized_words)
mean_length = round(num_chars / num_words, 2)
valid_checks.append(self.min_length <= mean_length < self.max_length)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class StopWordFilter(OperatorABC):
def __init__(self, threshold: float, use_tokenizer: bool):
self.logger = get_logger()
self.threshold = threshold
self.use_tokenizer = use_tokenizer
self.logger.info(f"Initializing {self.__class__.__name__} with threshold = {self.threshold}, use_tokenizer = {self.use_tokenizer}...")
import nltk
import os
from nltk.corpus import stopwords
        # Set the NLTK data path if one is provided via the NLTK_DATA environment variable
if 'NLTK_DATA' in os.environ:
nltk.data.path.insert(0, os.environ['NLTK_DATA'])
else:
nltk.data.path.append('./dataflow/operators/filter/GeneralText/nltkdata/')
        # Look for the required data and download it if it is missing
try:
nltk.data.find('corpora/stopwords')
except LookupError:
nltk.download('stopwords', download_dir='./dataflow/operators/filter/GeneralText/nltkdata/')
        # Load the English stop word list
self.stw = set(stopwords.words('english'))
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于验证文本中停用词的比率是否高于阈值,使用NLTK分词器进行单词分割和停用词识别。\n"
"初始化参数:\n"
"- threshold:停用词比率阈值(无默认值,必须提供)\n"
"- use_tokenizer:是否使用NLTK分词器(无默认值,必须提供)\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'stop_word_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator verifies if the ratio of stop words in text is above threshold, using NLTK tokenizer for word splitting and stop word identification.\n"
"Initialization Parameters:\n"
"- threshold: Stop word ratio threshold (no default, required)\n"
"- use_tokenizer: Whether to use NLTK tokenizer (no default, required)\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'stop_word_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "StopWordFilter verifies stop word ratio using NLTK tokenization with configurable threshold."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='stop_word_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
if self.use_tokenizer:
words = word_tokenize(text.lower())
else:
words = text.lower().split()
num_words = len(words)
num_stop_words = sum(map(lambda w: w in self.stw, words))
ratio = num_stop_words / num_words if num_words > 0 else 0
valid_checks.append(ratio > self.threshold and num_stop_words > 2)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class CurlyBracketFilter(OperatorABC):
def __init__(self, threshold: float=0.025):
self.logger = get_logger()
self.threshold = threshold
self.logger.info(f"Initializing {self.__class__.__name__} with threshold={self.threshold}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检测文本中是否存在过多的花括号使用,通过花括号数量与文本长度的比率进行过滤。\n"
"初始化参数:\n"
"- threshold:花括号比率阈值,默认为0.025\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'curly_bracket_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator detects excessive curly bracket usage in text by comparing bracket count to text length ratio.\n"
"Initialization Parameters:\n"
"- threshold: Bracket ratio threshold, default is 0.025\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'curly_bracket_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "CurlyBracketFilter detects excessive curly bracket usage with ratio thresholding."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='curly_bracket_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
num = text.count('{') + text.count('}')
ratio = num / len(text) if len(text) != 0 else 0
valid_checks.append(ratio < self.threshold)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class CapitalWordsFilter(OperatorABC):
def __init__(self, threshold: float=0.2, use_tokenizer: bool=False):
self.logger = get_logger()
self.threshold = threshold
self.use_tokenizer = use_tokenizer
self.logger.info(f"Initializing {self.__class__.__name__} with threshold = {self.threshold}, use_tokenizer = {self.use_tokenizer}...")
        # If the tokenizer is used, configure the NLTK data path
if self.use_tokenizer:
import nltk
import os
            # Set the NLTK data path if one is provided via the NLTK_DATA environment variable
if 'NLTK_DATA' in os.environ:
nltk.data.path.insert(0, os.environ['NLTK_DATA'])
            # Look for the required data and download it if it is missing
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检查文本中大写单词的比率是否超过阈值,支持可选的分词器进行单词识别。\n"
"初始化参数:\n"
"- threshold:大写单词比率阈值,默认为0.2\n"
"- use_tokenizer:是否使用NLTK分词器,默认为False\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'capital_words_filter'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator checks if the ratio of capitalized words in text exceeds threshold, supporting optional tokenizer for word identification.\n"
"Initialization Parameters:\n"
"- threshold: Capitalized word ratio threshold, default is 0.2\n"
"- use_tokenizer: Whether to use NLTK tokenizer, default is False\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'capital_words_filter'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "CapitalWordsFilter checks uppercase word ratio with optional tokenizer usage."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='capital_words_filter'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
if self.use_tokenizer:
words = word_tokenize(text)
else:
words = text.split()
num_words = len(words)
num_caps_words = sum(map(str.isupper, words))
ratio = num_caps_words / num_words if num_words > 0 else 0
valid_checks.append(ratio <= self.threshold)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class LoremIpsumFilter(OperatorABC):
def __init__(self, threshold: float=3e-8):
self.logger = get_logger()
self.threshold = threshold
self.logger.info(f"Initializing {self.__class__.__name__} with threshold = {self.threshold}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检测并过滤包含占位文本(如'lorem ipsum')的文本,使用正则表达式模式匹配并结合阈值过滤。\n"
"初始化参数:\n"
"- threshold:'lorem ipsum'出现次数与文本长度的比率阈值,默认为3e-8\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'loremipsum_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator detects and filters text containing placeholder text (e.g., 'lorem ipsum') using regex pattern matching with threshold filtering.\n"
"Initialization Parameters:\n"
"- threshold: Ratio threshold of 'lorem ipsum' occurrences to text length, default is 3e-8\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'loremipsum_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "LoremIpsumFilter detects and removes text containing placeholder text patterns."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='loremipsum_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
SEARCH_REGEX = re.compile(r"lorem ipsum", re.IGNORECASE)
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
normalized_content = text.lower()
num_occurrences = len(SEARCH_REGEX.findall(normalized_content))
ratio = num_occurrences / len(normalized_content) if len(normalized_content) > 0 else 0
valid_checks.append(ratio <= self.threshold)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class UniqueWordsFilter(OperatorABC):
def __init__(self, threshold: float=0.1):
self.logger = get_logger()
self.threshold = threshold
self.logger.info(f"Initializing {self.__class__.__name__} with threshold = {self.threshold}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检查文本中唯一单词的比率是否达到阈值,通过集合操作计算唯一单词数量与总单词数量的比率。\n"
"初始化参数:\n"
"- threshold:最小唯一单词比率阈值,默认为0.1\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'unique_words_filter'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator checks if the ratio of unique words in text meets threshold, calculating ratio of unique word count to total word count using set operations.\n"
"Initialization Parameters:\n"
"- threshold: Minimum unique word ratio threshold, default is 0.1\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'unique_words_filter'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "UniqueWordsFilter checks unique word ratio using set operations and threshold comparison."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='unique_words_filter'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
normalized_text = text.lower()
normalized_words = tuple(normalized_text.split())
num_normalized_words = len(normalized_words)
if num_normalized_words == 0:
valid_checks.append(False)
continue
num_unique_words = len(set(normalized_words))
ratio = num_unique_words / num_normalized_words
valid_checks.append(ratio > self.threshold)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class CharNumberFilter(OperatorABC):
def __init__(self, threshold: int=100):
self.logger = get_logger()
self.threshold = threshold
self.logger.info(f"Initializing {self.__class__.__name__} with threshold = {self.threshold}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于验证文本在去除空白字符后的字符数量是否达到最小阈值。\n"
"初始化参数:\n"
"- threshold:最小字符数量阈值,默认为100\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'char_number_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator verifies if the character count of text (after whitespace removal) meets minimum threshold.\n"
"Initialization Parameters:\n"
"- threshold: Minimum character count threshold, default is 100\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'char_number_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "CharNumberFilter verifies character count after whitespace removal against specified threshold."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='char_number_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
# Remove whitespace and count the number of characters
text = text.strip().replace(" ", "").replace("\n", "").replace("\t", "")
num_char = len(text)
# Check if the number of characters meets the threshold
valid_checks.append(num_char >= self.threshold)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class LineStartWithBulletpointFilter(OperatorABC):
def __init__(self, threshold: float=0.9):
self.logger = get_logger()
self.threshold = threshold
self.logger.info(f"Initializing {self.__class__.__name__} with threshold={self.threshold}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检测并过滤以各种项目符号符号开头的文本行,使用Unicode字符匹配结合比率阈值进行过滤。\n"
"初始化参数:\n"
"- threshold:以项目符号开头的行数比率阈值,默认为0.9\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'line_start_with_bullet_point_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator detects and filters lines starting with various bullet point symbols using Unicode character matching with ratio thresholding.\n"
"Initialization Parameters:\n"
"- threshold: Ratio threshold of lines starting with bullet points, default is 0.9\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'line_start_with_bullet_point_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "LineStartWithBulletpointFilter detects various bullet point symbols using Unicode character matching."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='line_start_with_bullet_point_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__}...")
valid_checks = []
key_list = [
"\u2022", "\u2023", "\u25B6", "\u25C0", "\u25E6", "\u25A0", "\u25A1", "\u25AA", "\u25AB", "\u2013"
]
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
raw_lines = split_paragraphs(text=text, normalizer=lambda x: x, remove_empty=True)
num_lines = len(raw_lines)
if num_lines == 0:
valid_checks.append(False)
continue
num_occurrences = sum([line.text.lstrip().startswith(tuple(key_list)) for line in raw_lines])
ratio = num_occurrences / num_lines
valid_checks.append(ratio <= self.threshold)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class LineWithJavascriptFilter(OperatorABC):
def __init__(self, threshold: int=3):
self.logger = get_logger()
self.threshold = threshold
self.logger.info(f"Initializing {self.__class__.__name__} with threshold={self.threshold}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于识别并过滤包含'javascript'引用的文本,通过关键词匹配和阈值判断进行内容过滤。\n"
"初始化参数:\n"
"- threshold:不包含'javascript'的最小行数阈值,默认为3\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'line_with_javascript_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator identifies and filters text containing 'javascript' references through keyword matching and threshold judgment.\n"
"Initialization Parameters:\n"
"- threshold: Minimum line count threshold without 'javascript', default is 3\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'line_with_javascript_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "LineWithJavascriptFilter identifies 'javascript' references in text with threshold-based filtering."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='line_with_javascript_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
normalized_lines = split_paragraphs(text=text, normalizer=normalize, remove_empty=True)
num_lines = len(normalized_lines)
if num_lines == 0:
valid_checks.append(False)
continue
num_occurrences = sum(['javascript' in line.text.lower() for line in normalized_lines])
num_not_occur = num_lines - num_occurrences
valid_checks.append(num_lines <= 3 or num_not_occur >= self.threshold)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
import torch
from tqdm import tqdm
from hashlib import md5, sha256
from xxhash import xxh3_128
from transformers import BertModel, BertTokenizer
from torch.nn.functional import normalize
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
def load_model(device, model_path):
"""
Load the pretrained BERT model and tokenizer.
Args:
        device: Device (e.g. 'cuda' or 'cpu') the model should be moved to.
        model_path (str): Path to the pretrained model.
Returns:
model, tokenizer: The loaded BERT model and tokenizer.
"""
model = BertModel.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)
model = model.to(device)
model = model.eval()
return model, tokenizer
def get_text_embedding(texts, tokenizer, model, device):
"""
Compute text embeddings using the provided BERT model.
Args:
texts (list): List of texts to be embedded.
tokenizer: Tokenizer for the model.
        model: The BERT model.
        device: Device on which the tokenized inputs are placed.
Returns:
np.ndarray: Embeddings for the input texts.
"""
inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True).to(device)
with torch.no_grad():
outputs = model(**inputs)
return outputs.last_hidden_state.mean(dim=1).cpu().numpy() # Use mean pooling for sentence embeddings
def compute_cos_sim_matrix(embeddings):
"""
Compute the cosine similarity matrix for the given embeddings.
Args:
embeddings (np.ndarray): Text embeddings.
Returns:
np.ndarray: Cosine similarity matrix.
"""
    embeddings = torch.as_tensor(embeddings)  # accepts numpy arrays and already-built tensors alike
embeddings = normalize(embeddings, dim=1)
return embeddings @ embeddings.T
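# --- Hedged usage sketch (not part of the original source) -----------------
# compute_cos_sim_matrix() works on any (n, d) array of embeddings, so the
# similarity thresholding used below can be checked without loading BERT.
# The toy vectors and the helper name are made up.
def _example_cos_sim_matrix():
    import numpy as np
    toy = np.array([
        [1.00, 0.00],  # A
        [0.99, 0.01],  # A' (near duplicate of A)
        [0.00, 1.00],  # B  (orthogonal to A)
    ], dtype=np.float32)
    sim = compute_cos_sim_matrix(toy)
    print(sim)  # sim[0, 1] is close to 1.0 while sim[0, 2] is 0.0
# ---------------------------------------------------------------------------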
@OPERATOR_REGISTRY.register()
class SemDeduplicateFilter(OperatorABC):
def __init__(self, eps: float = 0.05, model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', model_cache_dir: str = './dataflow_cache', device: str = 'cuda'):
self.logger = get_logger()
self.eps = eps
self.device = device
self.model_name = model_name
self.model_cache_dir = model_cache_dir
self.model = BertModel.from_pretrained(self.model_name, cache_dir=model_cache_dir).to(self.device)
self.tokenizer = BertTokenizer.from_pretrained(self.model_name, cache_dir=model_cache_dir)
self.logger.info(f"Initializing {self.__class__.__name__} with eps = {self.eps}, model_name = {self.model_name}, model_cache_dir = {self.model_cache_dir}, device = {self.device}")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于BERT语义相似度识别语义重复文本,执行近似去重操作。通过计算文本嵌入向量间的余弦相似度,识别语义相似的文本并保留唯一样本。\n"
"支持多字段组合作为去重依据,可有效去除内容相似但表述不同的重复数据,提高数据集多样性。\n"
"输入参数:\n"
"- eps:相似度阈值,值越小表示允许的相似度越低,默认为0.05(即余弦相似度大于0.95视为重复)\n"
"- model_name:预训练模型名称,默认为'sentence-transformers/all-MiniLM-L6-v2'\n"
"- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n"
"- device:模型运行设备,默认为'cuda'\n"
"- input_keys:多个输入字段名列表,与input_key二选一\n"
"- input_key:单个输入字段名,与input_keys二选一\n"
"- output_key:去重结果字段名,默认为'minhash_deduplicated_label'\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留语义不重复的样本(标记为1的样本)\n"
"- 返回包含去重结果字段名的列表,用于后续算子引用"
)
elif lang == "en":
return (
"Identify semantically duplicate text using BERT embeddings for near deduplication. Calculate cosine similarity between text embedding vectors to detect semantically similar texts and retain unique samples.\n"
"Supports multiple field combinations as deduplication criteria, effectively removing duplicate data with similar content but different expressions to improve dataset diversity.\n"
"Input Parameters:\n"
"- eps: Similarity threshold, smaller values allow lower similarity, default is 0.05 (cosine similarity > 0.95 is considered duplicate)\n"
"- model_name: Pretrained model name, default is 'sentence-transformers/all-MiniLM-L6-v2'\n"
"- model_cache_dir: Model cache directory, default is './dataflow_cache'\n"
"- device: Model running device, default is 'cuda'\n"
"- input_keys: List of multiple input field names, alternative to input_key\n"
"- input_key: Single input field name, alternative to input_keys\n"
"- output_key: Deduplication result field name, default is 'minhash_deduplicated_label'\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only semantically unique samples (samples marked as 1)\n"
"- List containing deduplication result field name for subsequent operator reference"
)
else:
return "Near deduplication by identifying semantically similar content using BERT embeddings."
def run(self, storage: DataFlowStorage, input_keys: list = None, input_key: str = None, output_key: str = 'minhash_deduplicated_label'):
if input_keys is None and input_key is None:
self.logger.error(f"Need to specify either input_keys or input_key!")
raise ValueError(f"Need to specify either input_keys or input_key!")
if input_keys is not None and input_key is not None:
self.logger.error(f"{self.__class__.__name__} only need one input args!")
raise ValueError(f"{self.__class__.__name__} only need one input args!")
if input_keys is not None:
self.logger.info(f"Running {self.__class__.__name__} with input_keys = {input_keys} and output_key = {output_key}")
else:
self.logger.info(f"Running {self.__class__.__name__} with input_key = {input_key} and output_key = {output_key}")
self.input_key = input_key
self.input_keys = input_keys
self.output_key = output_key
dataframe = storage.read("dataframe")
texts = []
for idx, sample in tqdm(enumerate(dataframe.to_dict(orient='records')), desc=f"Implementing {self.__class__.__name__}", total=len(dataframe)):
if input_keys is not None and len(input_keys) > 1:
text = '\n'.join([f"{k}:\n{sample[k]}" for k in input_keys])
else:
text = sample[self.input_key]
texts.append(text)
embeddings = get_text_embedding(texts, self.tokenizer, self.model, self.device)
embeddings = normalize(torch.tensor(embeddings), dim=1)
# Compute cosine similarity matrix
cos_sim_matrix = compute_cos_sim_matrix(embeddings)
cos_sim_matrix.fill_diagonal_(0) # Set diagonal to 0 to avoid self-comparison
cos_sim_matrix = torch.triu(cos_sim_matrix, diagonal=1)
# Find pairs with similarity greater than or equal to the threshold
similar_pairs = torch.where(cos_sim_matrix >= (1 - self.eps))
labels = [1] * len(dataframe)
for idx in similar_pairs[1].tolist():
labels[idx] = 0
dataframe[self.output_key] = labels
filtered_dataframe = dataframe[(dataframe[self.output_key] > 0)]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Deduplication completed. Total unique items: {sum(labels)}")
return [self.output_key,]
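# --- Hedged usage sketch (not part of the original source) -----------------
# The marking step inside SemDeduplicateFilter.run(): with eps = 0.05, any pair
# whose cosine similarity is >= 0.95 counts as a duplicate, and the later item
# of the pair (the column index in the upper-triangular matrix) gets label 0.
# The 3x3 similarity matrix and the helper name are made up.
def _example_semantic_dedup_marking(eps: float = 0.05):
    sim = torch.tensor([
        [1.00, 0.97, 0.10],
        [0.97, 1.00, 0.12],
        [0.10, 0.12, 1.00],
    ])
    upper = torch.triu(sim, diagonal=1)
    similar_pairs = torch.where(upper >= (1 - eps))
    labels = [1, 1, 1]
    for idx in similar_pairs[1].tolist():
        labels[idx] = 0
    print(labels)  # [1, 0, 1]: item 1 is dropped as a near duplicate of item 0
# ---------------------------------------------------------------------------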
from tqdm import tqdm
from simhash import Simhash
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
def get_similarity(simhash, another_simhash):
max_hashbit = max(len(bin(simhash.value)), len(bin(another_simhash.value)))
    distance = simhash.distance(another_simhash)
    similar = 1 - distance / max_hashbit
return similar
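# --- Hedged usage sketch (not part of the original source) -----------------
# get_similarity() converts the Hamming distance between two 64-bit SimHash
# fingerprints into a rough similarity score in [0, 1]; near-identical strings
# score close to 1. The sample strings and helper name are made up.
def _example_simhash_similarity():
    a = Simhash("the quick brown fox jumps over the lazy dog", f=64)
    b = Simhash("the quick brown fox jumped over the lazy dog", f=64)
    c = Simhash("completely unrelated text about something else", f=64)
    print(get_similarity(a, b))  # high, typically above 0.9
    print(get_similarity(a, c))  # noticeably lower
# ---------------------------------------------------------------------------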
@OPERATOR_REGISTRY.register()
class SimHashDeduplicateFilter(OperatorABC):
def __init__(self, fingerprint_size: int = 64, bound: float = 0.1):
self.logger = get_logger()
self.fingerprint_size = fingerprint_size
self.bound = bound
self.logger.info(f"Initializing {self.__class__.__name__} with fingerprint_size = {self.fingerprint_size}, bound = {self.bound}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"使用SimHash算法通过汉明距离识别相似文本,执行近似去重操作。将文本转换为固定长度的指纹,通过计算指纹间的汉明距离判断文本相似度。\n"
"相比语义去重速度更快,适合大规模数据集的快速去重预处理,尤其适用于检测字符层面相似的文本。\n"
"输入参数:\n"
"- fingerprint_size:指纹长度,默认为64位\n"
"- bound:相似度阈值,值越小表示允许的相似度越低,默认为0.1(即相似度大于0.9视为重复)\n"
"- input_keys:多个输入字段名列表,与input_key二选一\n"
"- input_key:单个输入字段名,与input_keys二选一\n"
"- output_key:去重结果字段名,默认为'minhash_deduplicated_label'\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留相似性低于阈值的唯一样本(标记为1的样本)\n"
"- 返回包含去重结果字段名的列表,用于后续算子引用"
)
elif lang == "en":
return (
"Detect similar text via SimHash algorithm and Hamming distance for near deduplication. Convert text to fixed-length fingerprints and determine text similarity by calculating Hamming distance between fingerprints.\n"
"Faster than semantic deduplication, suitable for fast deduplication preprocessing of large-scale datasets, especially for detecting character-level similar texts.\n"
"Input Parameters:\n"
"- fingerprint_size: Fingerprint length, default is 64 bits\n"
"- bound: Similarity threshold, smaller values allow lower similarity, default is 0.1 (similarity > 0.9 is considered duplicate)\n"
"- input_keys: List of multiple input field names, alternative to input_key\n"
"- input_key: Single input field name, alternative to input_keys\n"
"- output_key: Deduplication result field name, default is 'minhash_deduplicated_label'\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only unique samples with similarity below threshold (samples marked as 1)\n"
"- List containing deduplication result field name for subsequent operator reference"
)
else:
return "Near deduplication by detecting text similarity using SimHash algorithm and Hamming distance."
def run(self, storage: DataFlowStorage, input_keys: list = None, input_key: str = None, output_key: str = 'minhash_deduplicated_label'):
if input_keys is None and input_key is None:
self.logger.error(f"Need to specify either input_keys or input_key!")
raise ValueError(f"Need to specify either input_keys or input_key!")
if input_keys is not None and input_key is not None:
self.logger.error(f"{self.__class__.__name__} only need one input args!")
raise ValueError(f"{self.__class__.__name__} only need one input args!")
if input_keys is not None:
self.logger.info(f"Running {self.__class__.__name__} with input_keys = {input_keys} and output_key = {output_key}")
else:
self.logger.info(f"Running {self.__class__.__name__} with input_key = {input_key} and output_key = {output_key}")
self.input_key = input_key
self.input_keys = input_keys
self.output_key = output_key
dataframe = storage.read("dataframe")
simhashes = []
labels = [0] * len(dataframe)
for idx, sample in tqdm(enumerate(dataframe.to_dict(orient='records')), desc=f"Implementing {self.__class__.__name__}", total=len(dataframe)):
if input_keys is not None and len(input_keys) > 1:
text = '\n'.join([f"{k}:\n{sample[k]}" for k in input_keys])
else:
text = sample[self.input_key]
simhash = Simhash(text, f=self.fingerprint_size)
if all(get_similarity(simhash, another_simhash) < 1 - self.bound for another_simhash in simhashes):
labels[idx] = 1
simhashes.append(simhash)
dataframe[self.output_key] = labels
filtered_dataframe = dataframe[(dataframe[self.output_key] > 0)]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Deduplication completed. Total unique items: {sum(labels)}")
return [self.output_key,]
from tqdm import tqdm
import numpy as np
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
@OPERATOR_REGISTRY.register()
class WordNumberFilter(OperatorABC):
def __init__(self, min_words: int=20, max_words: int=100000):
self.logger = get_logger()
self.min_words = min_words
self.max_words = max_words
self.logger.info(f"Initializing {self.__class__.__name__} with min_words = {self.min_words}, max_words = {self.max_words}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于过滤单词数量不在指定范围内的文本,通过空格分割计算单词数量。\n"
"输入参数:\n"
"- input_key:输入文本字段名,默认为'text'\n"
"- min_words:最小单词数量阈值,默认为5\n"
"- max_words:最大单词数量阈值,默认为100\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留单词数量在指定范围内的文本行\n"
"- 返回包含输入字段名的列表,用于后续算子引用"
)
elif lang == "en":
return (
"This operator filters text with word count outside the specified range, using space splitting for word counting.\n"
"Input Parameters:\n"
"- input_key: Input text field name, default is 'text'\n"
"- min_words: Minimum word count threshold, default is 5\n"
"- max_words: Maximum word count threshold, default is 100\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only rows with word count within specified range\n"
"- List containing input field name for subsequent operator reference"
)
else:
return "WordNumberFilter filters text based on word count range using space splitting."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='word_number_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
word_counts = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
normalized_words = tuple(text.split())
num_normalized_words = len(normalized_words)
word_counts.append(num_normalized_words)
else:
word_counts.append(0)
word_counts = np.array(word_counts)
metric_filter = (self.min_words <= word_counts) & (word_counts < self.max_words)
dataframe[self.output_key] = word_counts
filtered_dataframe = dataframe[metric_filter]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]