Commit 97e8278b authored by zzg_666's avatar zzg_666

Adapt the vllm backend

from dataflow.operators.general_text.eval.task2vec.task2vec import Task2Vec
from dataflow.operators.general_text.eval.task2vec import task_similarity
import torch
import random
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from dataflow.utils.storage import DataFlowStorage
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from torch.utils.data import Dataset
from dataflow import get_logger
from typing import Optional
# Task2Vec dataset diversity evaluation
# Cited from: Beyond Scale: the Diversity Coefficient as a Data Quality Metric Demonstrates LLMs are Pre-trained on Formally Diverse Data
@OPERATOR_REGISTRY.register()
class Task2VecDatasetEvaluator(OperatorABC):
def __init__(self, device='cuda', sample_nums=10, sample_size=1, method: Optional[str]='montecarlo', model_cache_dir='./dataflow_cache'):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
# evaluate diversity by drawing sample_nums batches of sample_size samples each
self.sample_nums = sample_nums
self.sample_size = sample_size
self.device = device
self.model_cache_dir = model_cache_dir
self.score_name = 'Task2VecScore'
self.method = method
if method not in ['montecarlo', 'variational']:
raise ValueError(f"Invalid method '{method}'. Valid options are 'montecarlo' and 'variational'.")
self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2', cache_dir=self.model_cache_dir)
self.probe_network = GPT2LMHeadModel.from_pretrained('gpt2', cache_dir=self.model_cache_dir)
self.device = torch.device(self.device if self.device and torch.cuda.is_available() else "cpu")
self.probe_network = self.probe_network.to(self.device)
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"使用Task2Vec方法评估数据集的多样性,通过计算样本嵌入的余弦距离矩阵来量化多样性。\n"
"输入参数:\n"
"- device:计算设备,默认为'cuda'\n"
"- sample_nums:采样次数,默认为10\n"
"- sample_size:每次采样样本数,默认为1\n"
"- method:嵌入方法,可选'montecarlo'或'variational',默认为'montecarlo'\n"
"- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n"
"- input_key:输入文本字段名\n"
"输出参数:\n"
"- Task2VecDiversityScore:多样性得分\n"
"- ConfidenceInterval:置信区间"
)
elif lang == "en":
return (
"Evaluate dataset diversity using Task2Vec by calculating cosine distance matrix of sample embeddings.\n"
"Input Parameters:\n"
"- device: Computing device, default 'cuda'\n"
"- sample_nums: Number of sampling iterations, default 10\n"
"- sample_size: Number of samples per iteration, default 1\n"
"- method: Embedding method, 'montecarlo' or 'variational', default 'montecarlo'\n"
"- model_cache_dir: Model cache directory, default './dataflow_cache'\n"
"- input_key: Field name for input text\n"
"Output Parameters:\n"
"- Task2VecDiversityScore: Diversity score\n"
"- ConfidenceInterval: Confidence interval"
)
else:
return "Evaluate dataset diversity using Task2Vec method."
def preprocess(self, texts):
self.tokenizer.pad_token = self.tokenizer.eos_token
tokenized_outputs = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
return {key: value.to(self.device) for key, value in tokenized_outputs.items()}
def get_score(self, sentences):
embeddings = []
data_length = len(sentences)
for sample_num in range(self.sample_nums):
self.logger.info(f'--> Sample {sample_num + 1}/{self.sample_nums}')
indices = random.sample(range(data_length), self.sample_size)
texts = [sentences[i] for i in indices]
tokenized_batch = self.preprocess(texts)
tokenized_dataset = CustomTensorDataset(tokenized_batch)
embedding, _ = Task2Vec(self.probe_network, method=self.method).embed(tokenized_dataset)
embeddings.append(embedding)
distance_matrix = task_similarity.pdist(embeddings, distance='cosine')
div_coeff, conf_interval = task_similarity.stats_of_distance_matrix(distance_matrix)
return {
"Task2VecDiversityScore": div_coeff,
"ConfidenceInterval": conf_interval
}
def run(self, storage: DataFlowStorage, input_key: str):
dataframe = storage.read("dataframe")
samples = dataframe[input_key].to_list()
self.logger.info(f"Evaluating {self.score_name}...")
task2vec_score = self.get_score(samples)
self.logger.info("Evaluation complete!")
self.logger.info(f"Task2Vec Diversity Score: {task2vec_score}")
return task2vec_score
class CustomTensorDataset(Dataset):
def __init__(self, tokenized_batch):
self.tokenized_batch = tokenized_batch
def __getitem__(self, index):
return {key: self.tokenized_batch[key][index] for key in self.tokenized_batch}
def __len__(self):
return len(next(iter(self.tokenized_batch.values())))
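# Usage sketch (illustrative; the sentences and parameter values below are made up):
# get_score works directly on an in-memory list of texts, without going through DataFlowStorage.
#
#   evaluator = Task2VecDatasetEvaluator(device='cuda', sample_nums=5, sample_size=2)
#   result = evaluator.get_score(["first example text", "second example text", "third example text"])
#   print(result["Task2VecDiversityScore"], result["ConfidenceInterval"])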
from vendi_score import text_utils
from dataflow.utils.storage import DataFlowStorage
import pandas as pd
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
# VendiScore dataset diversity evaluation
# Cited from: The Vendi Score: A Diversity Evaluation Metric for Machine Learning
@OPERATOR_REGISTRY.register()
class VendiDatasetEvaluator(OperatorABC):
def __init__(self, device='cuda'):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.bert_model_path = 'bert-base-uncased'
self.simcse_model_path = 'princeton-nlp/unsup-simcse-bert-base-uncased'
self.device = device
self.score_name = 'VendiScore'
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"通过计算VendiScore来评估数据集的多样性,使用BERT和SimCSE模型生成嵌入并计算分数。\n"
"输入参数:\n"
"- device:计算设备,默认为'cuda'\n"
"- input_key:输入文本字段名\n"
"输出参数:\n"
"- BERTVendiScore:基于BERT的多样性得分\n"
"- SimCSEVendiScore:基于SimCSE的多样性得分"
)
elif lang == "en":
return (
"Assess dataset diversity using VendiScore with embeddings from BERT and SimCSE models.\n"
"Input Parameters:\n"
"- device: Computing device, default 'cuda'\n"
"- input_key: Field name for input text\n"
"Output Parameters:\n"
"- BERTVendiScore: Diversity score based on BERT\n"
"- SimCSEVendiScore: Diversity score based on SimCSE"
)
else:
return "Assess dataset diversity using VendiScore."
def get_score(self, sentences):
result = {}
bert_vs = text_utils.embedding_vendi_score(sentences, model_path=self.bert_model_path, device=self.device)
result["BERTVendiScore"] = round(bert_vs, 2)
simcse_vs = text_utils.embedding_vendi_score(sentences, model_path=self.simcse_model_path, device=self.device)
result["SimCSEVendiScore"] = round(simcse_vs, 2)
return result
def run(self, storage: DataFlowStorage, input_key: str):
dataframe = storage.read("dataframe")
samples = dataframe[input_key].to_list()
self.logger.info(f"Evaluating {self.score_name}...")
vendiscore = self.get_score(samples)
self.logger.info("Evaluation complete!")
self.logger.info(f"VendiScore: {vendiscore}")
return vendiscore
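# Usage sketch (illustrative; the sentences are made up): get_score takes a plain list
# of strings and returns both the BERT-based and SimCSE-based Vendi scores.
#
#   evaluator = VendiDatasetEvaluator(device='cuda')
#   print(evaluator.get_score(["a cat sat on the mat", "gradient descent converges slowly"]))
#   # -> {"BERTVendiScore": ..., "SimCSEVendiScore": ...}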
# import sys
# from dataflow.utils.registry import LazyLoader
# cur_path = "dataflow/operators/filter/"
# _import_structure = {
# # Primary filters
# "NgramFilter": (cur_path + "ngram_filter.py", "NgramFilter"),
# "LanguageFilter": (cur_path + "language_filter.py", "LanguageFilter"),
# "DeitaQualityFilter": (cur_path + "deita_quality_filter.py", "DeitaQualityFilter"),
# "DeitaComplexityFilter": (cur_path + "deita_complexity_filter.py", "DeitaComplexityFilter"),
# "InstagFilter": (cur_path + "instag_filter.py", "InstagFilter"),
# "PairQualFilter": (cur_path + "pair_qual_filter.py", "PairQualFilter"),
# "QuratingFilter": (cur_path + "qurating_filter.py", "QuratingFilter"),
# "SuperfilteringFilter": (cur_path + "superfiltering_filter.py", "SuperfilteringFilter"),
# "FineWebEduFilter": (cur_path + "fineweb_edu_filter.py", "FineWebEduFilter"),
# "TextbookFilter": (cur_path + "text_book_filter.py", "TextbookFilter"),
# "AlpagasusFilter": (cur_path + "alpagasus_filter.py", "AlpagasusFilter"),
# "DebertaV3Filter": (cur_path + "debertav3_filter.py", "DebertaV3Filter"),
# "LangkitFilter": (cur_path + "langkit_filter.py", "LangkitFilter"),
# "LexicalDiversityFilter": (cur_path + "lexical_diversity_filter.py", "LexicalDiversityFilter"),
# "PerplexityFilter": (cur_path + "perplexity_filter.py", "PerplexityFilter"),
# "PerspectiveFilter": (cur_path + "perspective_filter.py", "PerspectiveFilter"),
# "PresidioFilter": (cur_path + "presidio_filter.py", "PresidioFilter"),
# "RMFilter": (cur_path + "reward_model_filter.py", "RMFilter"),
# "TreeinstructFilter": (cur_path + "treeinstruct_filter.py", "TreeinstructFilter"),
# # Heuristic filters
# "ColonEndFilter": (cur_path + "heuristics.py", "ColonEndFilter"),
# "WordNumberFilter": (cur_path + "heuristics.py", "WordNumberFilter"),
# "BlocklistFilter": (cur_path + "heuristics.py", "BlocklistFilter"),
# "SentenceNumberFilter": (cur_path + "heuristics.py", "SentenceNumberFilter"),
# "LineEndWithEllipsisFilter": (cur_path + "heuristics.py", "LineEndWithEllipsisFilter"),
# "ContentNullFilter": (cur_path + "heuristics.py", "ContentNullFilter"),
# "MeanWordLengthFilter": (cur_path + "heuristics.py", "MeanWordLengthFilter"),
# "SymbolWordRatioFilter": (cur_path + "heuristics.py", "SymbolWordRatioFilter"),
# "HtmlEntityFilter": (cur_path + "heuristics.py", "HtmlEntityFilter"),
# "IDCardFilter": (cur_path + "heuristics.py", "IDCardFilter"),
# "NoPuncFilter": (cur_path + "heuristics.py", "NoPuncFilter"),
# "SpecialCharacterFilter": (cur_path + "heuristics.py", "SpecialCharacterFilter"),
# "WatermarkFilter": (cur_path + "heuristics.py", "WatermarkFilter"),
# "StopWordFilter": (cur_path + "heuristics.py", "StopWordFilter"),
# "CurlyBracketFilter": (cur_path + "heuristics.py", "CurlyBracketFilter"),
# "CapitalWordsFilter": (cur_path + "heuristics.py", "CapitalWordsFilter"),
# "LoremIpsumFilter": (cur_path + "heuristics.py", "LoremIpsumFilter"),
# "UniqueWordsFilter": (cur_path + "heuristics.py", "UniqueWordsFilter"),
# "CharNumberFilter": (cur_path + "heuristics.py", "CharNumberFilter"),
# "LineStartWithBulletpointFilter": (cur_path + "heuristics.py", "LineStartWithBulletpointFilter"),
# "LineWithJavascriptFilter": (cur_path + "heuristics.py", "LineWithJavascriptFilter"),
# # Deduplicators
# "MinHashDeduplicator": (cur_path + "minhash_deduplicator.py", "MinHashDeduplicator"),
# "CCNetDeduplicator": (cur_path + "ccnet_deduplicator.py", "CCNetDeduplicator"),
# "HashDeduplicator": (cur_path + "hash_deduplicator.py", "HashDeduplicator"),
# "NgramHashDeduplicator": (cur_path + "ngramhash_deduplicator.py", "NgramHashDeduplicator"),
# "SemDeduplicator": (cur_path + "sem_deduplicator.py", "SemDeduplicator"),
# "SimHashDeduplicator": (cur_path + "simhash_deduplicator.py", "SimHashDeduplicator"),
# }
# sys.modules[__name__] = LazyLoader(__name__, cur_path, _import_structure)
2g1c
2 girls 1 cup
acrotomophilia
alabama hot pocket
alaskan pipeline
anal
anilingus
anus
apeshit
arsehole
ass
asshole
assmunch
auto erotic
autoerotic
babeland
baby batter
baby juice
ball gag
ball gravy
ball kicking
ball licking
ball sack
ball sucking
bangbros
bangbus
bareback
barely legal
barenaked
bastard
bastardo
bastinado
bbw
bdsm
beaner
beaners
beaver cleaver
beaver lips
beastiality
bestiality
big black
big breasts
big knockers
big tits
bimbos
birdlock
bitch
bitches
black cock
blonde action
blonde on blonde action
blowjob
blow job
blow your load
blue waffle
blumpkin
bollocks
bondage
boner
boob
boobs
booty call
brown showers
brunette action
bukkake
bulldyke
bullet vibe
bullshit
bung hole
bunghole
busty
butt
buttcheeks
butthole
camel toe
camgirl
camslut
camwhore
carpet muncher
carpetmuncher
chocolate rosebuds
cialis
circlejerk
cleveland steamer
clit
clitoris
clover clamps
clusterfuck
cock
cocks
coprolagnia
coprophilia
cornhole
coon
coons
creampie
cum
cumming
cumshot
cumshots
cunnilingus
cunt
darkie
date rape
daterape
deep throat
deepthroat
dendrophilia
dick
dildo
dingleberry
dingleberries
dirty pillows
dirty sanchez
doggie style
doggiestyle
doggy style
doggystyle
dog style
dolcett
domination
dominatrix
dommes
donkey punch
double dong
double penetration
dp action
dry hump
dvda
eat my ass
ecchi
ejaculation
erotic
erotism
escort
eunuch
fag
faggot
fecal
felch
fellatio
feltch
female squirting
femdom
figging
fingerbang
fingering
fisting
foot fetish
footjob
frotting
fuck
fuck buttons
fuckin
fucking
fucktards
fudge packer
fudgepacker
futanari
gangbang
gang bang
gay sex
genitals
giant cock
girl on
girl on top
girls gone wild
goatcx
goatse
god damn
gokkun
golden shower
goodpoop
goo girl
goregasm
grope
group sex
g-spot
guro
hand job
handjob
hard core
hardcore
hentai
homoerotic
honkey
hooker
horny
hot carl
hot chick
how to kill
how to murder
huge fat
humping
incest
intercourse
jack off
jail bait
jailbait
jelly donut
jerk off
jigaboo
jiggaboo
jiggerboo
jizz
juggs
kike
kinbaku
kinkster
kinky
knobbing
leather restraint
leather straight jacket
lemon party
livesex
lolita
lovemaking
make me come
male squirting
masturbate
masturbating
masturbation
menage a trois
milf
missionary position
mong
motherfucker
mound of venus
mr hands
muff diver
muffdiving
nambla
nawashi
negro
neonazi
nigga
nigger
nig nog
nimphomania
nipple
nipples
nsfw
nsfw images
nude
nudity
nutten
nympho
nymphomania
octopussy
omorashi
one cup two girls
one guy one jar
orgasm
orgy
paedophile
paki
panties
panty
pedobear
pedophile
pegging
penis
phone sex
piece of shit
pikey
pissing
piss pig
pisspig
playboy
pleasure chest
pole smoker
ponyplay
poof
poon
poontang
punany
poop chute
poopchute
porn
porno
pornography
prince albert piercing
pthc
pubes
pussy
queaf
queef
quim
raghead
raging boner
rape
raping
rapist
rectum
reverse cowgirl
rimjob
rimming
rosy palm
rosy palm and her 5 sisters
rusty trombone
sadism
santorum
scat
schlong
scissoring
semen
sex
sexcam
sexo
sexy
sexual
sexually
sexuality
shaved beaver
shaved pussy
shemale
shibari
shit
shitblimp
shitty
shota
shrimping
skeet
slanteye
slut
s&m
smut
snatch
snowballing
sodomize
sodomy
spastic
spic
splooge
splooge moose
spooge
spread legs
spunk
strap on
strapon
strappado
strip club
style doggy
suck
sucks
suicide girls
sultry women
swastika
swinger
tainted love
taste my
tea bagging
threesome
throating
thumbzilla
tied up
tight white
tit
tits
titties
titty
tongue in a
topless
tosser
towelhead
tranny
tribadism
tub girl
tubgirl
tushy
twat
twink
twinkie
two girls one cup
undressing
upskirt
urethra play
urophilia
vagina
venus mound
viagra
vibrator
violet wand
vorarephilia
voyeur
voyeurweb
voyuer
vulva
wank
wetback
wet dream
white power
whore
worldsex
wrapping men
wrinkled starfish
xx
xxx
yaoi
yellow showers
yiffy
zoophilia
🖕
13.
13点
三级片
下三烂
下贱
个老子的
九游
乳交
乳头
乳房
乳波臀浪
交配
仆街
他奶奶
他奶奶的
他奶娘的
他妈
他妈ㄉ王八蛋
他妈地
他妈的
他娘
他马的
你个傻比
你他马的
你全家
你奶奶的
你她马的
你妈
你妈的
你娘
你娘卡好
你娘咧
你它妈的
你它马的
你是鸡
你是鸭
你马的
做爱
傻比
傻逼
册那
军妓
几八
几叭
几巴
几芭
刚度
刚瘪三
包皮
十三点
卖B
卖比
卖淫
卵子
双峰微颤
口交
口肯
叫床
吃屎
后庭
吹箫
塞你公
塞你娘
塞你母
塞你爸
塞你老师
塞你老母
处女
外阴
大卵子
大卵泡
大鸡巴
奶奶的熊
奶子
奸你
她妈地
她妈的
她马的
妈B
妈个B
妈个比
妈个老比
妈妈的
妈比
妈的
妈的B
妈逼
妓女
妓院
妳她妈的
妳妈的
妳娘的
妳老母的
妳马的
姘头
姣西
娘个比
娘的
婊子
婊子养的
嫖娼
嫖客
它妈地
它妈的
密洞
射你
射精
小乳头
小卵子
小卵泡
小瘪三
小肉粒
小骚比
小骚货
小鸡巴
小鸡鸡
屁眼
屁股
巨乳
干x娘
干七八
干你
干你妈
干你娘
干你老母
干你良
干妳妈
干妳娘
干妳老母
干妳马
干您娘
干机掰
干死CS
干死GM
干死你
干死客服
强奸
强奸你
性交
性器
性无能
性爱
情色
想上你
懆您妈
懆您娘
懒8
懒八
懒叫
懒教
成人
我操你祖宗十八代
扒光
打炮
打飞机
抽插
招妓
插你
插死你
撒尿
操你
操你全家
操你奶奶
操你妈
操你娘
操你祖宗
操你老妈
操你老母
操妳
操妳全家
操妳妈
操妳娘
操妳祖宗
操机掰
操比
操逼
放荡
日他娘
日你
日你妈
日你老娘
日你老母
日批
月经
机八
机巴
机机歪歪
杂种
浪叫
淫乱
淫妇
淫棍
淫水
淫秽
淫荡
淫西
湿透的内裤
激情
灨你娘
烂货
烂逼
狗屁
狗日
狗狼养的
玉杵
王八蛋
瓜娃子
瓜婆娘
瓜批
瘪三
白烂
白痴
白癡
祖宗
私服
笨蛋
精子
老二
老味
老母
老瘪三
老骚比
老骚货
肉壁
肉棍子
肉棒
肉缝
肛交
肥西
色情
花柳
荡妇
贝肉
贱B
贱人
贱货
贼你妈
赛你老母
赛妳阿母
赣您娘
轮奸
迷药
逼样
野鸡
阳具
阳萎
阴唇
阴户
阴核
阴毛
阴茎
阴道
阴部
雞巴
靠北
靠母
靠爸
靠背
靠腰
驶你公
驶你娘
驶你母
驶你爸
驶你老师
驶你老母
骚比
骚货
骚逼
鬼公
鸡8
鸡八
鸡叭
鸡吧
鸡奸
鸡巴
鸡芭
鸡鸡
龟儿子
龟头
𨳒
陰莖
𨳊
𡳞
𨶙
𨳍
仆街
咸家鏟
冚家鏟
咸家伶
冚家拎
笨實
粉腸
屎忽
躝癱
你老闆
你老味
你老母
硬膠
from tqdm import tqdm
import numpy as np
import nltk
import os
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.cli_funcs.paths import DataFlowPath
from nltk.tokenize import word_tokenize
@OPERATOR_REGISTRY.register()
class BlocklistFilter(OperatorABC):
def __init__(self, language:str = 'en', threshold:int = 1, use_tokenizer:bool = False):
self.logger = get_logger()
self.language = language
self.threshold = threshold
self.use_tokenizer = use_tokenizer
self.logger.info(f"Initializing {self.__class__.__name__} with language = {self.language}, threshold = {self.threshold}, use_tokenizer = {self.use_tokenizer}...")
# Set the NLTK data path if provided via the NLTK_DATA environment variable
if 'NLTK_DATA' in os.environ:
nltk.data.path.insert(0, os.environ['NLTK_DATA'])
# Look for the punkt tokenizer data and download it if missing
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt')
self.blocklist = self.load_blocklist()
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子使用特定语言的阻止列表进行文本过滤,支持可选的分词器进行单词级匹配。\n"
"输入参数:\n"
"- input_key:输入文本字段名,默认为'text'\n"
"- language:语言代码,默认为'zh'\n"
"- blocklist_dir:阻止列表文件目录,默认为'./blocklists/'\n"
"- threshold:匹配次数阈值,默认为1\n"
"- use_tokenizer:是否使用分词器,默认为True\n"
"- tokenizer:分词器对象,默认为None\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留不包含阻止列表关键词的文本行\n"
"- 返回包含输入字段名的列表,用于后续算子引用"
)
elif lang == "en":
return (
"This operator filters text using language-specific blocklists with optional tokenizer integration for word-level filtering.\n"
"Input Parameters:\n"
"- input_key: Input text field name, default is 'text'\n"
"- language: Language code, default is 'zh'\n"
"- blocklist_dir: Blocklist file directory, default is './blocklists/'\n"
"- threshold: Matching count threshold, default is 1\n"
"- use_tokenizer: Whether to use tokenizer, default is True\n"
"- tokenizer: Tokenizer object, default is None\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only rows without blocklist keywords\n"
"- List containing input field name for subsequent operator reference"
)
else:
return "BlocklistFilter uses language-specific blocklists with optional tokenizer integration."
def load_blocklist(self):
dataflow_dir = DataFlowPath.get_dataflow_dir()
file_path = f"{dataflow_dir}/operators/general_text/filter/blocklist/{self.language}.txt"
self.logger.info(f"Loading blocklist for language '{self.language}' from {file_path}...")
with open(file_path, 'r', encoding='utf-8') as file:
blocklist = set(line.strip().lower() for line in file if line.strip())
self.logger.info(f"Blocklist for '{self.language}' loaded. Total words in blocklist: {len(blocklist)}.")
return blocklist
def run(self, storage: DataFlowStorage, input_key: str, output_key: str = 'blocklist_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
if self.use_tokenizer:
text = word_tokenize(text.lower())
else:
text = text.lower().split()
blocklist_count = sum(1 for word in text if word in self.blocklist)
valid_checks.append(blocklist_count <= self.threshold)
else:
valid_checks.append(0)
valid_checks = np.array(valid_checks, dtype=int)
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
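# Minimal sketch of the matching rule in run() above (standalone, toy blocklist; not
# the shipped blocklist file): a row is kept when the number of blocklisted words in
# the lower-cased, whitespace-split text does not exceed `threshold`.
#
#   toy_blocklist = {"spam", "scam"}
#   text = "This offer is not spam at all"
#   count = sum(1 for word in text.lower().split() if word in toy_blocklist)
#   keep = count <= 1  # threshold = 1 -> keep is True here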
from tqdm import tqdm
from hashlib import md5, sha256
from xxhash import xxh3_128
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
@OPERATOR_REGISTRY.register()
class HashDeduplicateFilter(OperatorABC):
def __init__(self, hash_func: str = 'md5'):
self.logger = get_logger()
self.hash_func = hash_func
self.hash_func_dict = {
'md5': md5,
'sha256': sha256,
'xxh3': xxh3_128
}
if self.hash_func not in self.hash_func_dict:
raise ValueError(f'Invalid hash function: {self.hash_func}')
self.logger.info(f"Initializing {self.__class__.__name__} with hash_func = {self.hash_func}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"使用多种哈希函数对文本进行精确去重,支持md5、sha256或xxh3算法。通过计算文本的哈希值识别重复数据。\n\n"
"初始化参数:\n"
"- hash_func: 哈希函数名称,可选'md5'、'sha256'或'xxh3',默认为'md5'\n\n"
"运行参数:\n"
"- input_keys: 用于计算哈希的多个字段列表(与input_key二选一)\n"
"- input_key: 用于计算哈希的单个字段名(与input_keys二选一)\n"
"- output_key: 去重标记字段名,默认为'minhash_deduplicated_label'\n\n"
"输出说明:标记为1的数据表示首次出现,标记为0的数据表示重复数据\n"
"算法特点:\n"
"- md5: 128位哈希值,平衡速度和唯一性\n"
"- sha256: 256位哈希值,更高安全性,速度较慢\n"
"- xxh3: 128位哈希值,最快的哈希算法"
)
else:
return (
"Exact deduplication using multiple hash functions, chosen from md5, sha256 or xxh3. Identify duplicate data by calculating text hash values.\n\n"
"Initialization Parameters:\n"
"- hash_func: Hash function name, options are 'md5', 'sha256' or 'xxh3', default is 'md5'\n\n"
"Run Parameters:\n"
"- input_keys: List of multiple fields for hash calculation (alternative to input_key)\n"
"- input_key: Single field name for hash calculation (alternative to input_keys)\n"
"- output_key: Deduplication label field name, default is 'minhash_deduplicated_label'\n\n"
"Output Description: Data marked as 1 indicates first occurrence, 0 indicates duplicate\n"
"Algorithm Characteristics:\n"
"- md5: 128-bit hash, balances speed and uniqueness\n"
"- sha256: 256-bit hash, higher security, slower speed\n"
"- xxh3: 128-bit hash, fastest hashing algorithm"
)
def _compute_hash(self, text: str) -> str:
return self.hash_func_dict[self.hash_func](text.encode('utf-8')).hexdigest()
def run(self, storage: DataFlowStorage, input_keys: list = None, input_key: str = None, output_key: str = 'minhash_deduplicated_label'):
if input_keys is None and input_key is None:
self.logger.error(f"Need to specify either input_keys or input_key!")
raise ValueError(f"Need to specify either input_keys or input_key!")
if input_keys is not None and input_key is not None:
self.logger.error(f"{self.__class__.__name__} only need one input args!")
raise ValueError(f"{self.__class__.__name__} only need one input args!")
if input_keys is not None:
self.logger.info(f"Running {self.__class__.__name__} with input_keys = {input_keys} and output_key = {output_key}")
else:
self.logger.info(f"Running {self.__class__.__name__} with input_key = {input_key} and output_key = {output_key}")
self.input_key = input_key
self.input_keys = input_keys
self.output_key = output_key
seen_hashes = set()
dataframe = storage.read("dataframe")
labels = [0] * len(dataframe)
for idx, sample in tqdm(enumerate(dataframe.to_dict(orient='records')), desc=f"Implementing {self.__class__.__name__}", total=len(dataframe)):
if input_keys is not None and len(input_keys) > 1:
text = '\n'.join([f"{k}:\n{sample[k]}" for k in input_keys])
elif input_keys is not None:
# A single-element input_keys list behaves like input_key.
text = sample[input_keys[0]]
else:
text = sample[self.input_key]
hash_value = self._compute_hash(text)
if hash_value not in seen_hashes:
labels[idx] = 1
seen_hashes.add(hash_value)
dataframe[self.output_key] = labels
filtered_dataframe = dataframe[(dataframe[self.output_key] > 0)]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Deduplication completed. Total unique items: {sum(labels)}")
return [self.output_key,]
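# Minimal sketch of the exact-duplicate test used above (standalone hashlib, made-up
# strings): identical texts yield identical digests, so only the first occurrence is
# labeled 1 and kept.
#
#   from hashlib import md5
#   a = md5("same text".encode("utf-8")).hexdigest()
#   b = md5("same text".encode("utf-8")).hexdigest()
#   c = md5("different text".encode("utf-8")).hexdigest()
#   assert a == b and a != c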
import pandas as pd
import numpy as np
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.operators.general_text import LangkitSampleEvaluator
@OPERATOR_REGISTRY.register()
class LangkitFilter(OperatorABC):
def __init__(self,
min_scores = {
"flesch_reading_ease": 0, # max(−144.8, 55.19−18.03)
"automated_readability_index": 0, # max(0.9, 11.77−4.41)
"aggregate_reading_level": 0, # max(0.0, 11.23−3.70)
"syllable_count": 32.0, # max(32, 815.4−1516.6 → clip to 32)
"lexicon_count": 23.0, # max(23, 524.2−1029.8 → clip to 23)
"sentence_count": 1.0, # max(1, 29.0−60.1 → clip to 1)
"character_count": 118.0, # max(118, 2610.2−4856.0 → clip to 118)
"letter_count": 109.0, # max(109, 2513.5−4679.5 → clip to 109)
"polysyllable_count": 0.0, # max(0, 78.9−137.5 → clip to 0)
"monosyllable_count": 13.0, # max(13, 334.7−709.4 → clip to 13)
"difficult_words": 4.0, # max(4, 93.4−120.0 → clip to 4)
},
max_scores = {
"flesch_reading_ease": 100, # min(106.4, 55.19+18.03)
"automated_readability_index": 100, # min(98.2, 11.77+4.41)
"aggregate_reading_level": 100, # min(77.0, 11.23+3.70)
"syllable_count": 2331.9, # min(43237, 815.4+1516.6)
"lexicon_count": 1554.0, # min(33033, 524.2+1029.8)
"sentence_count": 89.1, # min(2193, 29.0+60.1)
"character_count": 7466.3, # min(139807,2610.2+4856.0)
"letter_count": 7193.0, # min(134507,2513.5+4679.5)
"polysyllable_count": 216.4, # min(3261, 78.9+137.5)
"monosyllable_count": 1044.1, # min(25133,334.7+709.4)
"difficult_words": 213.4, # min(2366, 93.4+120.0)
},
metrics_to_keep: list = [
"flesch_reading_ease",
"automated_readability_index",
"aggregate_reading_level",
"syllable_count",
"lexicon_count",
"sentence_count",
"character_count",
"letter_count",
"polysyllable_count",
"monosyllable_count",
"difficult_words",
]):
self.min_scores = min_scores
self.max_scores = max_scores
self.metric_name_map = {
'flesch_reading_ease': 'LangkitFleschReadingEaseScore',
'automated_readability_index': 'LangkitAutomatedReadabilityIndexScore',
'aggregate_reading_level': 'LangkitAggregateReadingLevelScore',
'syllable_count': 'LangkitSyllableCountScore',
'lexicon_count': 'LangkitLexiconCountScore',
'sentence_count': 'LangkitSentenceCountScore',
'character_count': 'LangkitCharacterCountScore',
'letter_count': 'LangkitLetterCountScore',
'polysyllable_count': 'LangkitPolysyllableCountScore',
'monosyllable_count': 'LangkitMonosyllableCountScore',
'difficult_words': 'LangkitDifficultWordsScore'
}
if not self.min_scores.keys() == self.max_scores.keys():
raise ValueError("min_scores and max_scores must have the same keys")
self.logger = get_logger()
self.scorer = LangkitSampleEvaluator()
self.logger.info(f"Initializing {self.__class__.__name__} with min_scores: {self.min_scores} and max_scores: {self.max_scores}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于LangkitScorer打分器的得分对数据进行过滤。使用Langkit工具包计算11种文本统计信息,帮助评估文本结构复杂性和可读性。\n"
"输入参数:\n"
"- min_scores:各指标的最小阈值字典,包含11个语言统计指标\n"
"- max_scores:各指标的最大阈值字典,包含11个语言统计指标\n"
"- metrics_to_keep:需要保留的评估指标列表\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留所有指标都在指定范围内的文本\n"
"- 返回包含各指标标签字段名的列表"
)
else:
return (
"Filter data using scores from the LangkitScorer. Uses Langkit to extract 11 types of text statistics for evaluating text structure complexity and readability.\n"
"Input Parameters:\n"
"- min_scores: Dictionary of minimum thresholds for each metric, containing 11 language statistics\n"
"- max_scores: Dictionary of maximum thresholds for each metric, containing 11 language statistics\n"
"- metrics_to_keep: List of evaluation metrics to keep\n\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only texts with all metrics within specified ranges\n"
"- List containing label field names for each metric"
)
def run(self, storage: DataFlowStorage, input_key: str, output_keys: list = ["flesch_reading_ease", "automated_readability_index", "aggregate_reading_level", "syllable_count", "lexicon_count", "sentence_count", "character_count", "letter_count", "polysyllable_count", "monosyllable_count", "difficult_words"]):
self.input_key = input_key
self.output_keys = output_keys
if not list(self.min_scores.keys()) == output_keys:
raise ValueError("min_scores and output_keys must have the same keys")
self.logger.info("Running {self.__class__.__name__}...")
dataframe = storage.read("dataframe")
scores = self.scorer.eval(dataframe, self.input_key)
results = np.ones(len(dataframe), dtype=int)
for _label in self.output_keys:
label = self.metric_name_map[_label]
min_score = self.min_scores[_label]
max_score = self.max_scores[_label]
dataframe[label] = pd.DataFrame(scores)[label]
metric_scores = np.array(dataframe[label])
metric_filter = (min_score <= metric_scores) & (metric_scores <= max_score)
results = results & metric_filter.astype(int)
self.logger.debug(f"Filtered by {_label}, {np.sum(results)} data remained")
dataframe[f"{label}_label"] = metric_filter.astype(int)
filtered_dataframe = dataframe[results == 1]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [f"{label}_label" for label in self.output_keys]
import fasttext
import numpy as np
from huggingface_hub import hf_hub_download
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from tqdm import tqdm
from dataflow.utils.utils import get_logger
from dataflow.utils.storage import DataFlowStorage
@OPERATOR_REGISTRY.register()
class LanguageFilter(OperatorABC):
def __init__(self, allowed_languages: list, model_cache_dir: str = None):
self.logger = get_logger()
self.filter_name = 'LanguageFilter'
self.logger.info(f"Initializing {self.__class__.__name__} with allowed_languages = {allowed_languages} and model_cache_dir = {model_cache_dir}...")
self.allowed_languages = allowed_languages
self.model_cache_dir = model_cache_dir
# Download and load the FastText language model
try:
self.logger.info("Downloading model from Hugging Face Hub...")
model_path = hf_hub_download(repo_id="facebook/fasttext-language-identification", filename="model.bin", cache_dir=self.model_cache_dir)
self.model = fasttext.load_model(model_path)
self.logger.info("Model loaded successfully.")
except Exception as e:
self.logger.error(f"Error downloading or loading model: {e}")
raise
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"使用FastText语言识别模型过滤数据。下载并加载预训练的FastText语言识别模型,检查文本的语言是否在允许的语言列表中。\n"
"输入参数:\n"
"- allowed_languages:允许的语言标签列表\n"
"- model_cache_dir:模型缓存目录路径\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留语言在允许列表中的文本\n"
"- 返回包含语言标签字段名的列表"
)
else:
return (
"Filter data using FastText language identification model. Downloads and loads pre-trained FastText language identification model to check if text language is in allowed list.\n"
"Input Parameters:\n"
"- allowed_languages: List of allowed language labels\n"
"- model_cache_dir: Model cache directory path\n\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only texts with language in allowed list\n"
"- List containing language label field name"
)
def eval(self, dataframe, input_key):
self.logger.info(f"Start evaluating {self.filter_name}...")
predictions = []
# Assuming the dataframe contains the text in `input_key`
for text in tqdm(dataframe[input_key], desc=f"Implementing {self.filter_name}"):
labels, scores = self.model.predict(text.replace('\n', ' '), k=5)
label_score_pairs = list(zip(labels, scores))
label_score_pairs.sort(key=lambda x: x[1], reverse=True) # Sort by score
top_labels = [label for label, score in label_score_pairs]
predictions.append(any(label in self.allowed_languages for label in top_labels))
return np.array(predictions).astype(int)
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='language_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.filter_name} with input_key = {self.input_key} and output_key = {self.output_key}...")
predictions = self.eval(dataframe, self.input_key)
dataframe[self.output_key] = predictions
filtered_dataframe = dataframe[dataframe[self.output_key] == 1]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
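# Usage sketch (illustrative; the allowed label below is an assumption about the
# downloaded model's tag format, which uses FastText's '__label__' prefix):
#
#   lang_filter = LanguageFilter(allowed_languages=["__label__eng_Latn"], model_cache_dir="./dataflow_cache")
#   keep = lang_filter.eval(df, "text")  # df: pandas DataFrame with a 'text' column (hypothetical)
#   # keep is an int array: 1 = one of the top-5 predicted labels is allowed, 0 = filtered out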
import pandas as pd
import numpy as np
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.operators.general_text import LexicalDiversitySampleEvaluator
@OPERATOR_REGISTRY.register()
class LexicalDiversityFilter(OperatorABC):
def __init__(self, min_scores: dict = {'mtld': 50, 'hdd': 0.8}, max_scores: dict = {'mtld': 99999, 'hdd': 1.0}):
self.min_scores = min_scores
self.max_scores = max_scores
if not self.min_scores.keys() == self.max_scores.keys():
raise ValueError("min_scores and max_scores must have the same keys")
self.logger = get_logger()
self.logger.info(f"Initializing {self.__class__.__name__} with min_scores: {self.min_scores} and max_scores: {self.max_scores}...")
self.metric_name_map = {
'hdd': 'LexicalDiversityHD-DScore',
'mtld': 'LexicalDiversityMTLDScore',
}
self.scorer = LexicalDiversitySampleEvaluator()
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于LexicalDiversityScorer打分器的得分对数据进行过滤。使用MTLD(移动平均类型-令牌比)和HDD(超几何分布多样性)两种方法计算词汇多样性,高分代表更丰富的词汇使用。\n"
"输入参数:\n"
"- min_scores:各指标的最小阈值字典,包含'mtld'和'hdd'\n"
"- max_scores:各指标的最大阈值字典,包含'mtld'和'hdd'\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留词汇多样性在指定范围内的文本\n"
"- 返回包含各指标标签字段名的列表"
)
else:
return (
"Filter data using scores from the LexicalDiversityScorer. Measure lexical diversity using MTLD (Moving-Average Type-Token Ratio) and HDD (Hypergeometric Distribution Diversity) methods; higher scores indicate more diverse vocabulary usage.\n"
"Input Parameters:\n"
"- min_scores: Dictionary of minimum thresholds for each metric, containing 'mtld' and 'hdd'\n"
"- max_scores: Dictionary of maximum thresholds for each metric, containing 'mtld' and 'hdd'\n\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only texts with lexical diversity within specified range\n"
"- List containing label field names for each metric"
)
def run(self, storage: DataFlowStorage, input_key: str, output_keys = ['mtld', 'hdd']):
self.input_key = input_key
self.output_keys = output_keys
if not list(self.min_scores.keys()) == output_keys:
raise ValueError("min_scores and output_keys must have the same keys")
self.logger.info(f"Running {self.__class__.__name__} with input_key: {self.input_key} and output_keys: {self.output_keys}...")
dataframe = storage.read("dataframe")
scores = self.scorer.eval(dataframe, self.input_key)
results = np.ones(len(dataframe), dtype=int)
for _label in self.output_keys:
min_score = self.min_scores[_label]
max_score = self.max_scores[_label]
label = self.metric_name_map[_label]
dataframe[label] = pd.DataFrame(scores)[label]
metric_scores = np.array(dataframe[label])
metric_filter = (min_score <= metric_scores) & (metric_scores <= max_score)
nan_filter = np.isnan(metric_scores)
metric_filter = metric_filter | nan_filter
results = results & metric_filter.astype(int)
self.logger.debug(f"Filtered by {_label}, {np.sum(results)} data remained")
dataframe[f"{label}_label"] = metric_filter.astype(int)
filtered_dataframe = dataframe[results == 1]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [f"{label}_label" for label in self.output_keys]
import numpy as np
from tqdm import tqdm
from dataflow.core import OperatorABC, LLMServingABC
from dataflow.prompts.general_text import LanguageFilterPrompt
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.logger import get_logger
from dataflow.utils.storage import DataFlowStorage
@OPERATOR_REGISTRY.register()
class LLMLanguageFilter(OperatorABC):
"""
Operator for filtering text based on language using LLM.
Argument allowed_languages is a list of allowed languages, using the ISO 639-1 two-letter language code to specify the language (for example, 'en' for English, 'zh' for Chinese, etc.).
"""
def __init__(self, llm_serving: LLMServingABC = None, allowed_languages: list[str] = ['en']):
self.logger = get_logger()
self.prompt = LanguageFilterPrompt()
self.llm_serving = llm_serving
self.allowed_languages = allowed_languages
self.logger.info(f"Initializing {self.__class__.__name__}...")
@staticmethod
def get_desc(lang: str = "zh"):
return "使用大语言模型识别语言并过滤数据" if lang == "zh" else "Using large language models to identify languages and filter data."
def _reformat_prompt(self, dataframe):
formatted_prompts = [self.prompt.build_prompt(text=item) for item in tqdm(dataframe[self.input_key], desc="Reformatting Prompt...")]
return formatted_prompts
def run(self, storage: DataFlowStorage, input_key: str, output_key: str = 'language_label'):
self.input_key, self.output_key = input_key, output_key
dataframe = storage.read("dataframe")
formatted_prompts = self._reformat_prompt(dataframe)
llm_outputs = self.llm_serving.generate_from_input(formatted_prompts)
dataframe[self.output_key] = llm_outputs
filtered_dataframe = dataframe[dataframe[self.output_key].isin(self.allowed_languages)]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
from tqdm import tqdm
from datasketch import MinHash, MinHashLSH # use datasketch-1.6.5
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
@OPERATOR_REGISTRY.register()
class MinHashDeduplicateFilter(OperatorABC):
def __init__(self, num_perm=128, threshold=0.9, use_n_gram=True, ngram=5):
self.logger = get_logger()
self.num_perm = num_perm
self.threshold = threshold
self.use_n_gram = use_n_gram
self.n_gram = ngram
self.logger.info(f"Initializing {self.__class__.__name__} with num_perm = {self.num_perm}, threshold = {self.threshold}, use_n_gram = {self.use_n_gram}, ngram = {self.n_gram}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"结合MinHash与LSH(局部敏感哈希)实现高效近似去重。将文本转换为MinHash签名,使用LSH快速查找相似文本,实现大规模数据集的近似去重。\n"
"输入参数:\n"
"- num_perm:生成MinHash签名的排列数\n"
"- threshold:相似度阈值,超过此阈值判定为相似文本\n"
"- use_n_gram:是否使用n-gram分词\n"
"- ngram:n-gram的n值\n"
"输出参数:\n"
"- 去重后的DataFrame,仅保留唯一文本\n"
"- 返回包含去重标签字段名的列表"
)
else:
return (
"Efficient near-duplicate detection using MinHash and LSH (Locality-Sensitive Hashing). Converts texts to MinHash signatures and uses LSH to quickly find similar texts, enabling near-deduplication for large-scale datasets.\n"
"Input Parameters:\n"
"- num_perm: Number of permutations for generating MinHash signatures\n"
"- threshold: Similarity threshold above which texts are considered duplicates\n"
"- use_n_gram: Whether to use n-gram tokenization\n"
"- ngram: n value for n-gram\n\n"
"Output Parameters:\n"
"- Deduplicated DataFrame containing only unique texts\n"
"- List containing deduplication label field name"
)
def create_minhash(self, data):
minhash = MinHash(num_perm=self.num_perm)
if self.use_n_gram:
for i in range(len(data) - self.n_gram + 1):
minhash.update(data[i:i + self.n_gram].encode('utf8'))
else:
for d in data:
minhash.update(d.encode('utf8'))
return minhash
def run(self, storage: DataFlowStorage, input_keys: list = None, input_key: str = None, output_key: str = 'minhash_deduplicated_label'):
if input_keys is None and input_key is None:
self.logger.error(f"Need to specify either input_keys or input_key!")
raise ValueError(f"Need to specify either input_keys or input_key!")
if input_keys is not None and input_key is not None:
self.logger.error(f"{self.__class__.__name__} only need one input args!")
raise ValueError(f"{self.__class__.__name__} only need one input args!")
if input_keys is not None:
self.logger.info(f"Running {self.__class__.__name__} with input_keys = {input_keys} and output_key = {output_key}")
else:
self.logger.info(f"Running {self.__class__.__name__} with input_key = {input_key} and output_key = {output_key}")
lsh = MinHashLSH(threshold=self.threshold, num_perm=self.num_perm)
self.input_key = input_key
self.input_keys = input_keys
self.output_key = output_key
dataframe = storage.read("dataframe")
labels = [0] * len(dataframe)
with lsh.insertion_session() as session:
for idx, sample in tqdm(enumerate(dataframe.to_dict(orient='records')), desc=f"Implementing {self.__class__.__name__}", total=len(dataframe)):
if input_keys is not None and len(input_keys) > 1:
text = '\n'.join([f"{k}:\n{sample[k]}" for k in input_keys])
elif input_keys is not None:
# A single-element input_keys list behaves like input_key.
text = sample[input_keys[0]]
else:
text = sample[self.input_key]
minhash = self.create_minhash(text)
result = lsh.query(minhash)
if len(result) == 0:
labels[idx] = 1
session.insert(idx, minhash)
self.logger.debug(f"Inserted item {idx} into LSH with minhash.")
dataframe[self.output_key] = labels
filtered_dataframe = dataframe[(dataframe[self.output_key] > 0)]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Deduplication completed. Total unique items: {sum(labels)}")
return [self.output_key,]
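# Minimal sketch of the MinHash/LSH near-duplicate test (standalone datasketch, toy
# strings): two nearly identical texts collide in the LSH index, so the later one
# would receive label 0 in run().
#
#   from datasketch import MinHash, MinHashLSH
#   def mh(text, num_perm=128, n=5):
#       m = MinHash(num_perm=num_perm)
#       for i in range(len(text) - n + 1):
#           m.update(text[i:i + n].encode('utf8'))
#       return m
#   lsh = MinHashLSH(threshold=0.9, num_perm=128)
#   lsh.insert("doc0", mh("the quick brown fox jumps over the lazy dog"))
#   print(lsh.query(mh("the quick brown fox jumps over the lazy dog!")))  # expected: ['doc0']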
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.operators.general_text import NgramSampleEvaluator
@OPERATOR_REGISTRY.register()
class NgramFilter(OperatorABC):
def __init__(self, min_score=0.8, max_score=1, ngrams=5):
self.logger = get_logger()
self.min_score = min_score
self.max_score = max_score
self.scorer = NgramSampleEvaluator(ngrams)
self.logger.info(f"Initializing {self.__class__.__name__} with min_scores: {self.min_score} and max_scores: {self.max_score}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于NgramScorer打分器的得分对数据进行过滤。计算文本中n-gram的重复比例,得分越高表示重复比例越低,文本冗余度越小。\n"
"输入参数:\n"
"- min_score:最小n-gram得分阈值\n"
"- max_score:最大n-gram得分阈值\n"
"- ngrams:n-gram的n值\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留n-gram得分在指定范围内的文本\n"
"- 返回包含n-gram得分字段名的列表"
)
else:
return (
"Filter data using scores from the NgramScorer. Evaluate text redundancy via n-gram repetition ratio; higher score means lower repetition and less text redundancy.\n"
"Input Parameters:\n"
"- min_score: Minimum n-gram score threshold\n"
"- max_score: Maximum n-gram score threshold\n"
"- ngrams: n value for n-gram\n\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only texts with n-gram score within specified range\n"
"- List containing n-gram score field name"
)
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='NgramScore'):
self.input_key = input_key
self.output_key = output_key
self.logger.info(f"Running {self.__class__.__name__} with input_key: {self.input_key} and output_key: {self.output_key}...")
dataframe = storage.read("dataframe")
scores = self.scorer.eval(dataframe, self.input_key)
dataframe[self.output_key] = scores
filtered_dataframe = dataframe[(dataframe[self.output_key] >= self.min_score) & (dataframe[self.output_key] <= self.max_score)]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
from tqdm import tqdm
from hashlib import md5, sha256
from xxhash import xxh3_128
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
@OPERATOR_REGISTRY.register()
class NgramHashDeduplicateFilter(OperatorABC):
def __init__(self, n_gram: int = 3, hash_func: str = 'md5', diff_size : int = 1):
self.logger = get_logger()
self.n_gram = n_gram
self.hash_func = hash_func
self.diff_size = diff_size
self.hash_func_dict = {
'md5': md5,
'sha256': sha256,
'xxh3': xxh3_128
}
if self.hash_func not in self.hash_func_dict:
raise ValueError(f'Invalid hash function: {self.hash_func}')
self.logger.info(f"Initializing {self.__class__.__name__} with n_gram = {self.n_gram}, hash_func = {self.hash_func}, diff_size = {self.diff_size}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"结合n-gram技术与哈希算法识别相似文本,实现近似去重。将文本分割为多个n-gram片段,计算每个片段的哈希值,通过比较哈希集合的相似度来判断文本相似性。\n"
"输入参数:\n"
"- n_gram:将文本分割的片段数量\n"
"- hash_func:哈希函数类型,支持'md5'、'sha256'和'xxh3'\n"
"- diff_size:哈希集合差异阈值,小于此值判定为相似文本\n"
"输出参数:\n"
"- 去重后的DataFrame,仅保留唯一文本\n"
"- 返回包含去重标签字段名的列表"
)
else:
return (
"Detect similar text using n-gram technology and hashing algorithm for near deduplication. Splits text into multiple n-gram segments, computes hash values for each segment, and judges text similarity by comparing hash set similarity.\n"
"Input Parameters:\n"
"- n_gram: Number of segments to split text into\n"
"- hash_func: Hash function type, supporting 'md5', 'sha256', and 'xxh3'\n"
"- diff_size: Hash set difference threshold below which texts are considered similar\n\n"
"Output Parameters:\n"
"- Deduplicated DataFrame containing only unique texts\n"
"- List containing deduplication label field name"
)
def _compute_hash(self, text: str) -> str:
return self.hash_func_dict[self.hash_func](text.encode('utf-8')).hexdigest()
def run(self, storage: DataFlowStorage, input_keys: list = None, input_key: str = None, output_key: str = 'minhash_deduplicated_label'):
if input_keys is None and input_key is None:
self.logger.error(f"Need to specify either input_keys or input_key!")
raise ValueError(f"Need to specify either input_keys or input_key!")
if input_keys is not None and input_key is not None:
self.logger.error(f"{self.__class__.__name__} only need one input args!")
raise ValueError(f"{self.__class__.__name__} only need one input args!")
if input_keys is not None:
self.logger.info(f"Running {self.__class__.__name__} with input_keys = {input_keys} and output_key = {output_key}")
else:
self.logger.info(f"Running {self.__class__.__name__} with input_key = {input_key} and output_key = {output_key}")
self.input_key = input_key
self.input_keys = input_keys
self.output_key = output_key
seen_hashes = []
dataframe = storage.read("dataframe")
labels = [0] * len(dataframe)
for idx, sample in tqdm(enumerate(dataframe.to_dict(orient='records')), desc=f"Implementing {self.__class__.__name__}", total=len(dataframe)):
if input_keys is not None and len(input_keys) > 1:
text = '\n'.join([f"{k}:\n{sample[k]}" for k in input_keys])
elif input_keys is not None:
# A single-element input_keys list behaves like input_key.
text = sample[input_keys[0]]
else:
text = sample[self.input_key]
gram_length = len(text) // self.n_gram
ngrams = [text[i*gram_length:(i+1)*gram_length] for i in range(self.n_gram)]
hash_value = set(self._compute_hash(ngram) for ngram in ngrams)
if all(len(hash_value & prev_hash_set) < self.diff_size for prev_hash_set in seen_hashes):
labels[idx] = 1
seen_hashes.append(hash_value)
dataframe[self.output_key] = labels
filtered_dataframe = dataframe[(dataframe[self.output_key] > 0)]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Deduplication completed. Total unique items: {sum(labels)}")
return [self.output_key,]
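# Minimal sketch of the chunk-hash overlap test (standalone hashlib, toy strings): the
# text is cut into n_gram equal-length chunks, each chunk is hashed, and two texts are
# treated as near-duplicates when their chunk-hash sets share at least diff_size elements.
#
#   from hashlib import md5
#   def chunk_hashes(text, n_gram=3):
#       size = len(text) // n_gram
#       return {md5(text[i * size:(i + 1) * size].encode('utf-8')).hexdigest() for i in range(n_gram)}
#   overlap = chunk_hashes("abcdefghi") & chunk_hashes("abcdefxyz")
#   # chunks "abc" and "def" match -> len(overlap) == 2 >= diff_size -> near-duplicate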
import numpy as np
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.operators.general_text import PerspectiveSampleEvaluator
from dataflow.serving import PerspectiveAPIServing
@OPERATOR_REGISTRY.register()
class PerspectiveFilter(OperatorABC):
def __init__(self, min_score: float = 0.0, max_score: float = 0.5):
self.logger = get_logger()
self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {min_score} and max_score = {max_score}")
self.min_score = min_score
self.max_score = max_score
self.serving = PerspectiveAPIServing(max_workers=10)
self.scorer = PerspectiveSampleEvaluator(serving=self.serving)
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于PerspectiveScorer打分器的得分对数据进行过滤使用Perspective API评估文本的毒性,返回毒性概率,得分越高表明文本毒性越高。\n"
"输入参数:\n"
"- min_score:最小毒性得分阈值\n"
"- max_score:最大毒性得分阈值\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留毒性得分在指定范围内的文本\n"
"- 返回包含毒性得分字段名的列表"
)
else:
return (
"Filter data using scores from the PerspectiveScorer. Assess text toxicity using Perspective API; higher scores indicate more toxicity.\n"
"Input Parameters:\n"
"- min_score: Minimum toxicity score threshold\n"
"- max_score: Maximum toxicity score threshold\n\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only texts with toxicity score within specified range\n"
"- List containing toxicity score field name"
)
def run(self, storage: DataFlowStorage, input_key: str, output_key: str = 'PerspectiveScore'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
# Get the scores for filtering
scores = np.array(self.scorer.eval(dataframe, self.input_key))
dataframe[self.output_key] = scores
metric_filter = (scores >= self.min_score) & (scores <= self.max_score)
nan_filter = np.isnan(scores)
metric_filter = metric_filter | nan_filter
filtered_dataframe = dataframe[metric_filter]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
import numpy as np
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.operators.general_text import PresidioSampleEvaluator
@OPERATOR_REGISTRY.register()
class PresidioFilter(OperatorABC):
def __init__(self, min_score: int = 0, max_score: int = 5, lang='en', device='cuda', model_cache_dir='./dataflow_cache'):
self.logger = get_logger()
self.min_score = min_score
self.max_score = max_score
self.scorer = PresidioSampleEvaluator(lang=lang, device=device, model_cache_dir=model_cache_dir)
self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {self.min_score} and max_score = {self.max_score}")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于PresidioScorer打分器的得分对数据进行过滤。使用Microsoft Presidio模型识别文本中的私人实体(PII),返回PII信息个数。\n"
"支持识别姓名、邮箱、电话号码、身份证号等多种敏感信息类型,可用于数据隐私保护和合规性检查。\n"
"输入参数:\n"
"- min_score:保留样本的最小PII数量阈值,默认为0\n"
"- max_score:保留样本的最大PII数量阈值,默认为5\n"
"- lang:文本语言,默认为'en'\n"
"- device:模型运行设备,默认为'cuda'\n"
"- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留PII数量在[min_score, max_score]范围内的样本\n"
"- 返回包含输出字段名的列表,用于后续算子引用"
)
elif lang == "en":
return (
"Filter data using scores from the PresidioScorer. Detect personally identifiable information (PII) entities in text using Microsoft Presidio model and return the count of detected PII items.\n"
"Supports recognition of multiple sensitive information types including names, emails, phone numbers, and IDs for data privacy protection and compliance checks.\n"
"Input Parameters:\n"
"- min_score: Minimum PII count threshold for retaining samples, default is 0\n"
"- max_score: Maximum PII count threshold for retaining samples, default is 5\n"
"- lang: Text language, default is 'en'\n"
"- device: Model running device, default is 'cuda'\n"
"- model_cache_dir: Model cache directory, default is './dataflow_cache'\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only samples with PII count within [min_score, max_score] range\n"
"- List containing output field name for subsequent operator reference"
)
else:
return "Filter data based on PII detection results using Microsoft Presidio model."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str = 'PresidioScore'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__}...")
# Get the scores for filtering
scores = np.array(self.scorer.eval(dataframe, self.input_key))
dataframe[self.output_key] = scores
filtered_dataframe = dataframe[(scores >= self.min_score) & (scores <= self.max_score)]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
from dataflow.core import OperatorABC
from typing import Callable, Tuple
import numpy as np
from nltk.tokenize import word_tokenize, WordPunctTokenizer
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.utils import get_logger
from dataflow.utils.storage import DataFlowStorage
from tqdm import tqdm
import re
@OPERATOR_REGISTRY.register()
class ColonEndFilter(OperatorABC):
def __init__(self):
self.logger = get_logger()
self.logger.info(f"Initializing {self.__class__.__name__}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检查文本是否以冒号结尾,常用于判断问题是否为不完整的提问。\n"
"初始化参数:\n"
"- 无\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'{类名小写}_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator checks if text ends with a colon, commonly used to identify incomplete questions.\n"
"Initialization Parameters:\n"
"- None\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is '{classname_lower}_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "ColonEndFilter checks if text ends with a colon and filters out incomplete questions."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str = None):
self.input_key = input_key
self.output_key = output_key or f"{self.__class__.__name__.lower()}_label"
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__}...")
colon_end_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
colon_end_checks.append(not text.endswith(':'))
else:
colon_end_checks.append(0)
colon_end_checks = np.array(colon_end_checks, dtype=int)
dataframe[self.output_key] = colon_end_checks
filtered_dataframe = dataframe[colon_end_checks == 1]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class SentenceNumberFilter(OperatorABC):
def __init__(self, min_sentences: int=3, max_sentences: int=7500):
self.logger = get_logger()
self.min_sentences = min_sentences
self.max_sentences = max_sentences
self.logger.info(f"Initializing {self.__class__.__name__} with min_sentences = {self.min_sentences}, max_sentences = {self.max_sentences}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检查文本中的句子数量是否在指定范围内,使用正则表达式匹配句子结束符号(。!?.!?)进行分割。\n"
"初始化参数:\n"
"- min_sentences:最小句子数量阈值,默认为3\n"
"- max_sentences:最大句子数量阈值,默认为7500\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'sentence_number_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator checks if the number of sentences in text is within specified range, using regex to match sentence-ending punctuation(。!?.!?).\n"
"Initialization Parameters:\n"
"- min_sentences: Minimum sentence count threshold, default is 3\n"
"- max_sentences: Maximum sentence count threshold, default is 7500\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'sentence_number_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "SentenceNumberFilter filters text based on sentence count range using regex pattern matching."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str = 'sentence_number_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_check = []
SENT_PATTERN = re.compile(r'\b[^.!?\n]+[.!?]*', flags=re.UNICODE)
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
num_sentence = len(SENT_PATTERN.findall(text))
valid_check.append(self.min_sentences <= num_sentence <= self.max_sentences)
else:
valid_check.append(0)
valid_check = np.array(valid_check, dtype=int)
dataframe[self.output_key] = valid_check
filtered_dataframe = dataframe[valid_check == 1]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
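# Example of the sentence-counting regex used above (illustrative): the pattern splits
# on ASCII end-of-sentence punctuation only, so Chinese 。!? marks are not counted as
# sentence boundaries.
#
#   import re
#   pattern = re.compile(r'\b[^.!?\n]+[.!?]*', flags=re.UNICODE)
#   len(pattern.findall("One. Two! Three?"))  # -> 3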
class TextSlice:
# A slice of text from a document.
def __init__(self, text: str, start: int, end: int):
self.text = text
self.start = start
self.end = end
def split_paragraphs(
text: str, normalizer: Callable[[str], str], remove_empty: bool = True
) -> Tuple[TextSlice]:
"""
Split a string into paragraphs. A paragraph is defined as a sequence of zero or more characters, followed
by a newline character, or a sequence of one or more characters, followed by the end of the string.
"""
text_slices = tuple(
TextSlice(normalizer(text[match.start():match.end()]), match.start(), match.end())
for match in re.finditer(r"([^\n]*\n|[^\n]+$)", text)
)
if remove_empty is True:
text_slices = tuple(
text_slice for text_slice in text_slices if text_slice.text.strip()
)
return text_slices
def normalize(
text: str,
remove_punct: bool = True,
lowercase: bool = True,
nfd_unicode: bool = True,
white_space: bool = True
) -> str:
import string
import unicodedata
if remove_punct:
text = text.translate(str.maketrans('', '', string.punctuation))
# lowercase
if lowercase:
text = text.lower()
if white_space:
text = text.strip()
text = re.sub(r'\s+', ' ', text)
# NFD unicode normalization
if nfd_unicode:
text = unicodedata.normalize('NFD', text)
return text
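# --- Hedged usage sketch (not part of the original source) -----------------
# Quick illustration of the two helpers above: split_paragraphs() yields one
# TextSlice per line (empty lines dropped), and normalize() strips punctuation,
# lowercases, collapses whitespace and applies NFD normalization. The helper
# name _example_split_and_normalize is hypothetical.
def _example_split_and_normalize():
    sample = "First line...\n\nSecond LINE!\n"
    for s in split_paragraphs(sample, normalizer=normalize, remove_empty=True):
        print(repr(s.text), s.start, s.end)
    # prints roughly: 'first line' 0 14 and 'second line' 15 28
# ---------------------------------------------------------------------------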
@OPERATOR_REGISTRY.register()
class LineEndWithEllipsisFilter(OperatorABC):
def __init__(self, threshold: float=0.3):
self.logger = get_logger()
self.threshold = threshold
self.logger.info(f"Initializing {self.__class__.__name__} with threshold = {self.threshold}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检测并过滤以省略号(...)或(……)结尾的文本行,常用于识别不完整的表述。\n"
"初始化参数:\n"
"- threshold:以省略号结尾的行数比率阈值,默认为0.3\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'line_end_with_ellipsis_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator detects and filters text lines ending with ellipsis (...) or (……), commonly used to identify incomplete statements.\n"
"Initialization Parameters:\n"
"- threshold: Ratio threshold of lines ending with ellipsis, default is 0.3\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'line_end_with_ellipsis_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "LineEndWithEllipsisFilter detects and filters text ending with ellipsis characters."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str = 'line_end_with_ellipsis_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
ellipsis_checks = []
ellipsis = ["...", "…"]
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
raw_lines = split_paragraphs(text=text, normalizer=lambda x: x, remove_empty=True)
num_lines = len(raw_lines)
if num_lines == 0:
ellipsis_checks.append(False)
continue
num_occurrences = sum([line.text.rstrip().endswith(tuple(ellipsis)) for line in raw_lines])
ratio = num_occurrences / num_lines
ellipsis_checks.append(ratio < self.threshold)
else:
ellipsis_checks.append(False)
ellipsis_checks = np.array(ellipsis_checks, dtype=int)
dataframe[self.output_key] = ellipsis_checks
filtered_dataframe = dataframe[ellipsis_checks == 1]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class ContentNullFilter(OperatorABC):
def __init__(self):
self.logger = get_logger()
self.logger.info(f"Initializing {self.__class__.__name__}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于过滤空值、空字符串或仅包含空白字符的文本,确保输入数据的有效性。\n"
"初始化参数:\n"
"- 无\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'content_null_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator filters null values, empty strings, or text containing only whitespace characters to ensure data validity.\n"
"Initialization Parameters:\n"
"- None\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'content_null_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "ContentNullFilter removes null, empty, and whitespace-only text content."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='content_null_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
null_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
null_checks.append(text is not None and text.strip() != '')
null_checks = np.array(null_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = null_checks
filtered_dataframe = dataframe[null_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class SymbolWordRatioFilter(OperatorABC):
def __init__(self, threshold: float=0.4):
self.logger = get_logger()
self.threshold = threshold
self.symbol = ["#", "...", "…"]
self.logger.info(f"Initializing {self.__class__.__name__} with threshold = {self.threshold}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检查文本中特定符号(#, ..., …)与单词数量的比率是否超过阈值,过滤符号使用过多的文本。\n"
"初始化参数:\n"
"- threshold:符号与单词比率阈值,默认为0.4\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'symbol_word_ratio_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator checks if the ratio of specific symbols(#, ..., …) to word count exceeds threshold, filtering text with excessive symbol usage.\n"
"Initialization Parameters:\n"
"- threshold: Symbol-to-word ratio threshold, default is 0.4\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'symbol_word_ratio_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "SymbolWordRatioFilter checks ratio of specified symbols to word count and filters excessive usage."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='symbol_word_ratio_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
raw_words = tuple(WordPunctTokenizer().tokenize(text))
num_words = len(raw_words)
num_symbols = float(sum(text.count(symbol) for symbol in self.symbol))
if num_words == 0:
valid_checks.append(False)
continue
ratio = num_symbols / num_words
valid_checks.append(ratio < self.threshold)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class AlphaWordsFilter(OperatorABC):
def __init__(self, threshold: float, use_tokenizer: bool):
import nltk
import os
        # Set the NLTK data path if one is provided via the NLTK_DATA environment variable
if 'NLTK_DATA' in os.environ:
nltk.data.path.insert(0, os.environ['NLTK_DATA'])
        # Look for the required data and download it if it is missing
try:
nltk.data.find('tokenizers/punkt_tab')
except LookupError:
nltk.download('punkt_tab')
self.logger = get_logger()
self.threshold = threshold
self.use_tokenizer = use_tokenizer
self.logger.info(f"Initializing {self.__class__.__name__} with threshold = {self.threshold}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于验证文本中字母单词的比率是否达到阈值,支持NLTK分词或简单空格分割两种模式。\n"
"初始化参数:\n"
"- threshold:字母单词比率阈值(无默认值,必须提供)\n"
"- use_tokenizer:是否使用NLTK分词器(无默认值,必须提供)\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'alpha_words_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator verifies if the ratio of alphabetic words in text meets threshold, supporting NLTK tokenization or simple space splitting.\n"
"Initialization Parameters:\n"
"- threshold: Alphabetic word ratio threshold (no default, required)\n"
"- use_tokenizer: Whether to use NLTK tokenizer (no default, required)\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'alpha_words_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "AlphaWordsFilter verifies alphabetic word ratio using either NLTK tokenization or space splitting."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='alpha_words_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if self.use_tokenizer:
words = word_tokenize(text)
else:
words = text.split()
alpha_count = sum(1 for word in words if re.search(r'[a-zA-Z]', word))
word_count = len(words)
if word_count > 0:
ratio = alpha_count / word_count
valid_checks.append(ratio > self.threshold)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
dataframe[self.output_key] = valid_checks
# Filter the dataframe based on the result
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class HtmlEntityFilter(OperatorABC):
def __init__(self):
self.logger = get_logger()
self.logger.info(f"Initializing {self.__class__.__name__}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检测并过滤包含HTML实体(如&amp;、&lt;、&gt;等)的文本,确保内容不包含标记语言元素。\n"
"初始化参数:\n"
"- 无\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'html_entity_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator detects and filters text containing HTML entities (e.g., &amp;, &lt;, &gt;) to ensure content has no markup language elements.\n"
"Initialization Parameters:\n"
"- None\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'html_entity_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "HtmlEntityFilter detects and removes text containing HTML entity patterns."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='html_entity_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
# Define the list of HTML entities
html_entity = ["nbsp", "lt", "gt", "amp", "quot", "apos", "hellip", "ndash", "mdash", "lsquo", "rsquo", "ldquo", "rdquo"]
full_entities_1 = [f"&{entity};" for entity in html_entity]
full_entities_2 = [f"&{entity};" for entity in html_entity]
full_entities_3 = [f"&{entity};" for entity in html_entity]
full_entities_4 = [f"&{entity};" for entity in html_entity]
half_entities = [f"&{entity}" for entity in html_entity] + [f"&{entity}" for entity in html_entity]
all_entities = full_entities_1 + full_entities_2 + full_entities_3 + full_entities_4 + half_entities
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
has_html_entity = any(entity in text for entity in all_entities)
valid_checks.append(not has_html_entity)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class IDCardFilter(OperatorABC):
def __init__(self, threshold:int=3):
self.logger = get_logger()
self.threshold = threshold
self.logger.info(f"Initializing {self.__class__.__name__}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检测并过滤包含身份证相关术语的文本,使用正则表达式匹配身份证号码模式以保护敏感信息。\n"
"初始化参数:\n"
"- threshold:身份证相关词汇匹配次数阈值,默认为3\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'id_card_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator detects and filters text containing ID card-related terms using regex patterns to protect sensitive information.\n"
"Initialization Parameters:\n"
"- threshold: ID card-related terms matching count threshold, default is 3\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'id_card_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "IDCardFilter detects and removes text containing ID card numbers and related sensitive information."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='id_card_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
pattern = re.compile(r"(身\s{0,10}份|id\s{0,10}number\s{0,10}|identification|identity|\s{0,10}ID\s{0,10}No\s{0,10}|id\s{0,10}card\s{0,10}|NRIC\s{0,10}number\s{0,10}|IC\s{0,10}number\s{0,10}|resident\s{0,10}registration\s{0,10}|I.D.\s{0,10}Number\s{0,10})", re.I)
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
matches = pattern.findall(text)
has_too_many_id_terms = len(matches) >= self.threshold
valid_checks.append(not has_too_many_id_terms)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
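# --- Hedged usage sketch (not part of the original source) -----------------
# How the IDCardFilter decision works: count regex hits for ID-card related
# terms and drop the record once the count reaches the threshold (default 3).
# The sample sentence and the helper name are made up for illustration.
def _example_id_card_threshold(threshold: int = 3):
    pattern = re.compile(
        r"(身\s{0,10}份|id\s{0,10}number\s{0,10}|identification|identity|"
        r"\s{0,10}ID\s{0,10}No\s{0,10}|id\s{0,10}card\s{0,10}|"
        r"NRIC\s{0,10}number\s{0,10}|IC\s{0,10}number\s{0,10}|"
        r"resident\s{0,10}registration\s{0,10}|I\.D\.\s{0,10}Number\s{0,10})",
        re.I,
    )
    text = "Please provide your ID number, identity card and identification papers."
    hits = pattern.findall(text)
    keep = len(hits) < threshold  # mirrors `not has_too_many_id_terms` in run()
    print(len(hits), keep)  # 3 False -> this record would be filtered out
# ---------------------------------------------------------------------------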
@OPERATOR_REGISTRY.register()
class NoPuncFilter(OperatorABC):
def __init__(self, threshold: int=112):
self.logger = get_logger()
self.threshold = threshold
self.logger.info(f"Initializing {self.__class__.__name__} with threshold = {self.threshold}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于确保文本包含足够的标点符号,通过统计句子间最大单词数量进行过滤。\n"
"初始化参数:\n"
"- threshold:句子间最大单词数量阈值,默认为112\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'no_punc_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator ensures text contains sufficient punctuation by counting maximum word count between sentences.\n"
"Initialization Parameters:\n"
"- threshold: Maximum word count between sentences threshold, default is 112\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'no_punc_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "NoPuncFilter ensures text contains sufficient punctuation marks based on ratio threshold."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='no_punc_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
paragraphs = text.split('\n')
max_word_count = 0
for paragraph in paragraphs:
if len(paragraph.strip()) == 0:
continue
sentences = re.split("[–.!?,;•/|…]", paragraph)
for sentence in sentences:
words = sentence.split()
word_count = len(words)
if word_count > max_word_count:
max_word_count = word_count
valid_checks.append(int(max_word_count) <= self.threshold)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
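# --- Hedged usage sketch (not part of the original source) -----------------
# What NoPuncFilter actually measures: the longest run of words between
# punctuation marks (per paragraph). A paragraph with no punctuation counts all
# of its words as one run. The sample text and helper name are made up.
def _example_no_punc_measure(threshold: int = 112):
    text = ("word " * 10).strip() + ". short tail"
    max_word_count = 0
    for paragraph in text.split('\n'):
        if not paragraph.strip():
            continue
        for sentence in re.split("[–.!?,;•/|…]", paragraph):
            max_word_count = max(max_word_count, len(sentence.split()))
    print(max_word_count, max_word_count <= threshold)  # 10 True
# ---------------------------------------------------------------------------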
@OPERATOR_REGISTRY.register()
class SpecialCharacterFilter(OperatorABC):
def __init__(self):
self.logger = get_logger()
self.logger.info(f"Initializing {self.__class__.__name__}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于移除包含特殊/unicode字符的文本,使用预定义模式检测非标准字符以确保文本规范性。\n"
"初始化参数:\n"
"- 无\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'special_character_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator removes text containing special/unicode characters using predefined patterns to ensure text normalization.\n"
"Initialization Parameters:\n"
"- None\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'special_character_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "SpecialCharacterFilter removes text containing special or non-standard unicode characters."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='special_character_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
        # Regex patterns used to flag special/garbled character debris
        special_characters = [
            r"u200e",
            r"&#247;|\? :",
            r"[�□]|\{\/U\}",
            r"U\+26[0-F][0-D]|U\+273[3-4]|U\+1F[3-6][0-4][0-F]|U\+1F6[8-F][0-F]"
        ]
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
# Check for special characters using regular expressions
                has_special_character = any(re.search(pattern, text) for pattern in special_characters)
valid_checks.append(not has_special_character)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class WatermarkFilter(OperatorABC):
def __init__(self, watermarks: list= ['Copyright', 'Watermark', 'Confidential']):
self.logger = get_logger()
self.watermarks = watermarks
self.logger.info(f"Initializing {self.__class__.__name__} with watermarks={self.watermarks}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检测并移除包含版权/水印内容的文本,使用指定关键词列表识别受保护内容。\n"
"初始化参数:\n"
"- watermarks:水印关键词列表,默认为['Copyright', 'Watermark', 'Confidential']\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'watermark_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator detects and removes copyrighted/watermarked content using specified keyword lists to identify protected material.\n"
"Initialization Parameters:\n"
"- watermarks: List of watermark keywords, default is ['Copyright', 'Watermark', 'Confidential']\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'watermark_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "WatermarkFilter detects and removes text containing copyright or watermark keywords."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='watermark_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
matches = re.search('|'.join(self.watermarks), text)
valid_checks.append(matches is None)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class MeanWordLengthFilter(OperatorABC):
def __init__(self, min_length: float=3, max_length: float=10):
self.logger = get_logger()
self.min_length = min_length
self.max_length = max_length
self.logger.info(f"Initializing {self.__class__.__name__} with min_length={self.min_length}, max_length={self.max_length}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检查文本中单词的平均长度是否在指定范围内,通过字符总数除以单词数量计算平均值。\n"
"初始化参数:\n"
"- min_length:最小平均单词长度,默认为3\n"
"- max_length:最大平均单词长度,默认为10\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'mean_word_length_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator checks if the average word length in text is within specified range, calculated by total characters divided by word count.\n"
"Initialization Parameters:\n"
"- min_length: Minimum average word length, default is 3\n"
"- max_length: Maximum average word length, default is 10\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'mean_word_length_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "MeanWordLengthFilter checks average word length against specified range using character and word counts."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='mean_word_length_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
normalized_words = text.split()
num_words = len(normalized_words)
if num_words == 0:
valid_checks.append(False)
continue
num_chars = sum(len(word) for word in normalized_words)
mean_length = round(num_chars / num_words, 2)
valid_checks.append(self.min_length <= mean_length < self.max_length)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class StopWordFilter(OperatorABC):
def __init__(self, threshold: float, use_tokenizer: bool):
self.logger = get_logger()
self.threshold = threshold
self.use_tokenizer = use_tokenizer
self.logger.info(f"Initializing {self.__class__.__name__} with threshold = {self.threshold}, use_tokenizer = {self.use_tokenizer}...")
import nltk
import os
from nltk.corpus import stopwords
        # Set the NLTK data path if one is provided via the NLTK_DATA environment variable
if 'NLTK_DATA' in os.environ:
nltk.data.path.insert(0, os.environ['NLTK_DATA'])
else:
nltk.data.path.append('./dataflow/operators/filter/GeneralText/nltkdata/')
        # Look for the required data and download it if it is missing
try:
nltk.data.find('corpora/stopwords')
except LookupError:
nltk.download('stopwords', download_dir='./dataflow/operators/filter/GeneralText/nltkdata/')
        # Load the English stop word list
self.stw = set(stopwords.words('english'))
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于验证文本中停用词的比率是否高于阈值,使用NLTK分词器进行单词分割和停用词识别。\n"
"初始化参数:\n"
"- threshold:停用词比率阈值(无默认值,必须提供)\n"
"- use_tokenizer:是否使用NLTK分词器(无默认值,必须提供)\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'stop_word_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator verifies if the ratio of stop words in text is above threshold, using NLTK tokenizer for word splitting and stop word identification.\n"
"Initialization Parameters:\n"
"- threshold: Stop word ratio threshold (no default, required)\n"
"- use_tokenizer: Whether to use NLTK tokenizer (no default, required)\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'stop_word_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "StopWordFilter verifies stop word ratio using NLTK tokenization with configurable threshold."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='stop_word_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
if self.use_tokenizer:
words = word_tokenize(text.lower())
else:
words = text.lower().split()
num_words = len(words)
num_stop_words = sum(map(lambda w: w in self.stw, words))
ratio = num_stop_words / num_words if num_words > 0 else 0
valid_checks.append(ratio > self.threshold and num_stop_words > 2)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class CurlyBracketFilter(OperatorABC):
def __init__(self, threshold: float=0.025):
self.logger = get_logger()
self.threshold = threshold
self.logger.info(f"Initializing {self.__class__.__name__} with threshold={self.threshold}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检测文本中是否存在过多的花括号使用,通过花括号数量与文本长度的比率进行过滤。\n"
"初始化参数:\n"
"- threshold:花括号比率阈值,默认为0.025\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'curly_bracket_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator detects excessive curly bracket usage in text by comparing bracket count to text length ratio.\n"
"Initialization Parameters:\n"
"- threshold: Bracket ratio threshold, default is 0.025\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'curly_bracket_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "CurlyBracketFilter detects excessive curly bracket usage with ratio thresholding."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='curly_bracket_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
num = text.count('{') + text.count('}')
ratio = num / len(text) if len(text) != 0 else 0
valid_checks.append(ratio < self.threshold)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class CapitalWordsFilter(OperatorABC):
def __init__(self, threshold: float=0.2, use_tokenizer: bool=False):
self.logger = get_logger()
self.threshold = threshold
self.use_tokenizer = use_tokenizer
self.logger.info(f"Initializing {self.__class__.__name__} with threshold = {self.threshold}, use_tokenizer = {self.use_tokenizer}...")
        # If the tokenizer is used, configure the NLTK data path
if self.use_tokenizer:
import nltk
import os
            # Set the NLTK data path if one is provided via the NLTK_DATA environment variable
if 'NLTK_DATA' in os.environ:
nltk.data.path.insert(0, os.environ['NLTK_DATA'])
            # Look for the required data and download it if it is missing
try:
nltk.data.find('tokenizers/punkt')
except LookupError:
nltk.download('punkt')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检查文本中大写单词的比率是否超过阈值,支持可选的分词器进行单词识别。\n"
"初始化参数:\n"
"- threshold:大写单词比率阈值,默认为0.2\n"
"- use_tokenizer:是否使用NLTK分词器,默认为False\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'capital_words_filter'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator checks if the ratio of capitalized words in text exceeds threshold, supporting optional tokenizer for word identification.\n"
"Initialization Parameters:\n"
"- threshold: Capitalized word ratio threshold, default is 0.2\n"
"- use_tokenizer: Whether to use NLTK tokenizer, default is False\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'capital_words_filter'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "CapitalWordsFilter checks uppercase word ratio with optional tokenizer usage."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='capital_words_filter'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
if self.use_tokenizer:
words = word_tokenize(text)
else:
words = text.split()
num_words = len(words)
num_caps_words = sum(map(str.isupper, words))
ratio = num_caps_words / num_words if num_words > 0 else 0
valid_checks.append(ratio <= self.threshold)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class LoremIpsumFilter(OperatorABC):
def __init__(self, threshold: float=3e-8):
self.logger = get_logger()
self.threshold = threshold
self.logger.info(f"Initializing {self.__class__.__name__} with threshold = {self.threshold}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检测并过滤包含占位文本(如'lorem ipsum')的文本,使用正则表达式模式匹配并结合阈值过滤。\n"
"初始化参数:\n"
"- threshold:'lorem ipsum'出现次数与文本长度的比率阈值,默认为3e-8\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'loremipsum_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator detects and filters text containing placeholder text (e.g., 'lorem ipsum') using regex pattern matching with threshold filtering.\n"
"Initialization Parameters:\n"
"- threshold: Ratio threshold of 'lorem ipsum' occurrences to text length, default is 3e-8\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'loremipsum_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "LoremIpsumFilter detects and removes text containing placeholder text patterns."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='loremipsum_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
SEARCH_REGEX = re.compile(r"lorem ipsum", re.IGNORECASE)
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
normalized_content = text.lower()
num_occurrences = len(SEARCH_REGEX.findall(normalized_content))
ratio = num_occurrences / len(normalized_content) if len(normalized_content) > 0 else 0
valid_checks.append(ratio <= self.threshold)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class UniqueWordsFilter(OperatorABC):
def __init__(self, threshold: float=0.1):
self.logger = get_logger()
self.threshold = threshold
self.logger.info(f"Initializing {self.__class__.__name__} with threshold = {self.threshold}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检查文本中唯一单词的比率是否达到阈值,通过集合操作计算唯一单词数量与总单词数量的比率。\n"
"初始化参数:\n"
"- threshold:最小唯一单词比率阈值,默认为0.1\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'unique_words_filter'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator checks if the ratio of unique words in text meets threshold, calculating ratio of unique word count to total word count using set operations.\n"
"Initialization Parameters:\n"
"- threshold: Minimum unique word ratio threshold, default is 0.1\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'unique_words_filter'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "UniqueWordsFilter checks unique word ratio using set operations and threshold comparison."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='unique_words_filter'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
normalized_text = text.lower()
normalized_words = tuple(normalized_text.split())
num_normalized_words = len(normalized_words)
if num_normalized_words == 0:
valid_checks.append(False)
continue
num_unique_words = len(set(normalized_words))
ratio = num_unique_words / num_normalized_words
valid_checks.append(ratio > self.threshold)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class CharNumberFilter(OperatorABC):
def __init__(self, threshold: int=100):
self.logger = get_logger()
self.threshold = threshold
self.logger.info(f"Initializing {self.__class__.__name__} with threshold = {self.threshold}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于验证文本在去除空白字符后的字符数量是否达到最小阈值。\n"
"初始化参数:\n"
"- threshold:最小字符数量阈值,默认为100\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'char_number_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator verifies if the character count of text (after whitespace removal) meets minimum threshold.\n"
"Initialization Parameters:\n"
"- threshold: Minimum character count threshold, default is 100\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'char_number_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "CharNumberFilter verifies character count after whitespace removal against specified threshold."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='char_number_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
# Remove whitespace and count the number of characters
text = text.strip().replace(" ", "").replace("\n", "").replace("\t", "")
num_char = len(text)
# Check if the number of characters meets the threshold
valid_checks.append(num_char >= self.threshold)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class LineStartWithBulletpointFilter(OperatorABC):
def __init__(self, threshold: float=0.9):
self.logger = get_logger()
self.threshold = threshold
self.logger.info(f"Initializing {self.__class__.__name__} with threshold={self.threshold}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于检测并过滤以各种项目符号符号开头的文本行,使用Unicode字符匹配结合比率阈值进行过滤。\n"
"初始化参数:\n"
"- threshold:以项目符号开头的行数比率阈值,默认为0.9\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'line_start_with_bullet_point_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator detects and filters lines starting with various bullet point symbols using Unicode character matching with ratio thresholding.\n"
"Initialization Parameters:\n"
"- threshold: Ratio threshold of lines starting with bullet points, default is 0.9\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'line_start_with_bullet_point_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "LineStartWithBulletpointFilter detects various bullet point symbols using Unicode character matching."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='line_start_with_bullet_point_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__}...")
valid_checks = []
key_list = [
"\u2022", "\u2023", "\u25B6", "\u25C0", "\u25E6", "\u25A0", "\u25A1", "\u25AA", "\u25AB", "\u2013"
]
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
raw_lines = split_paragraphs(text=text, normalizer=lambda x: x, remove_empty=True)
num_lines = len(raw_lines)
if num_lines == 0:
valid_checks.append(False)
continue
num_occurrences = sum([line.text.lstrip().startswith(tuple(key_list)) for line in raw_lines])
ratio = num_occurrences / num_lines
valid_checks.append(ratio <= self.threshold)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
@OPERATOR_REGISTRY.register()
class LineWithJavascriptFilter(OperatorABC):
def __init__(self, threshold: int=3):
self.logger = get_logger()
self.threshold = threshold
self.logger.info(f"Initializing {self.__class__.__name__} with threshold={self.threshold}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于识别并过滤包含'javascript'引用的文本,通过关键词匹配和阈值判断进行内容过滤。\n"
"初始化参数:\n"
"- threshold:不包含'javascript'的最小行数阈值,默认为3\n"
"运行参数:\n"
"- storage:DataFlowStorage对象\n"
"- input_key:输入文本字段名\n"
"- output_key:输出标签字段名,默认为'line_with_javascript_filter_label'\n"
"返回值:\n"
"- 包含output_key的列表"
)
elif lang == "en":
return (
"This operator identifies and filters text containing 'javascript' references through keyword matching and threshold judgment.\n"
"Initialization Parameters:\n"
"- threshold: Minimum line count threshold without 'javascript', default is 3\n"
"Run Parameters:\n"
"- storage: DataFlowStorage object\n"
"- input_key: Input text field name\n"
"- output_key: Output label field name, default is 'line_with_javascript_filter_label'\n"
"Returns:\n"
"- List containing output_key"
)
else:
return "LineWithJavascriptFilter identifies 'javascript' references in text with threshold-based filtering."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='line_with_javascript_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__}...")
valid_checks = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
normalized_lines = split_paragraphs(text=text, normalizer=normalize, remove_empty=True)
num_lines = len(normalized_lines)
if num_lines == 0:
valid_checks.append(False)
continue
num_occurrences = sum(['javascript' in line.text.lower() for line in normalized_lines])
num_not_occur = num_lines - num_occurrences
valid_checks.append(num_lines <= 3 or num_not_occur >= self.threshold)
else:
valid_checks.append(False)
valid_checks = np.array(valid_checks, dtype=int)
# Filter the dataframe based on the result
dataframe[self.output_key] = valid_checks
filtered_dataframe = dataframe[valid_checks == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
import torch
from tqdm import tqdm
from hashlib import md5, sha256
from xxhash import xxh3_128
from transformers import BertModel, BertTokenizer
from torch.nn.functional import normalize
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
def load_model(device, model_path):
"""
Load the pretrained BERT model and tokenizer.
Args:
        device: Device (e.g. 'cuda' or 'cpu') the model should be moved to.
        model_path (str): Path to the pretrained model.
Returns:
model, tokenizer: The loaded BERT model and tokenizer.
"""
model = BertModel.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)
model = model.to(device)
model = model.eval()
return model, tokenizer
def get_text_embedding(texts, tokenizer, model, device):
"""
Compute text embeddings using the provided BERT model.
Args:
texts (list): List of texts to be embedded.
tokenizer: Tokenizer for the model.
        model: The BERT model.
        device: Device on which the tokenized inputs are placed.
Returns:
np.ndarray: Embeddings for the input texts.
"""
inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True).to(device)
with torch.no_grad():
outputs = model(**inputs)
return outputs.last_hidden_state.mean(dim=1).cpu().numpy() # Use mean pooling for sentence embeddings
def compute_cos_sim_matrix(embeddings):
"""
Compute the cosine similarity matrix for the given embeddings.
Args:
embeddings (np.ndarray): Text embeddings.
Returns:
np.ndarray: Cosine similarity matrix.
"""
    embeddings = torch.as_tensor(embeddings)  # accepts numpy arrays and already-built tensors alike
embeddings = normalize(embeddings, dim=1)
return embeddings @ embeddings.T
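# --- Hedged usage sketch (not part of the original source) -----------------
# compute_cos_sim_matrix() works on any (n, d) array of embeddings, so the
# similarity thresholding used below can be checked without loading BERT.
# The toy vectors and the helper name are made up.
def _example_cos_sim_matrix():
    import numpy as np
    toy = np.array([
        [1.00, 0.00],  # A
        [0.99, 0.01],  # A' (near duplicate of A)
        [0.00, 1.00],  # B  (orthogonal to A)
    ], dtype=np.float32)
    sim = compute_cos_sim_matrix(toy)
    print(sim)  # sim[0, 1] is close to 1.0 while sim[0, 2] is 0.0
# ---------------------------------------------------------------------------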
@OPERATOR_REGISTRY.register()
class SemDeduplicateFilter(OperatorABC):
def __init__(self, eps: float = 0.05, model_name: str = 'sentence-transformers/all-MiniLM-L6-v2', model_cache_dir: str = './dataflow_cache', device: str = 'cuda'):
self.logger = get_logger()
self.eps = eps
self.device = device
self.model_name = model_name
self.model_cache_dir = model_cache_dir
self.model = BertModel.from_pretrained(self.model_name, cache_dir=model_cache_dir).to(self.device)
self.tokenizer = BertTokenizer.from_pretrained(self.model_name, cache_dir=model_cache_dir)
self.logger.info(f"Initializing {self.__class__.__name__} with eps = {self.eps}, model_name = {self.model_name}, model_cache_dir = {self.model_cache_dir}, device = {self.device}")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于BERT语义相似度识别语义重复文本,执行近似去重操作。通过计算文本嵌入向量间的余弦相似度,识别语义相似的文本并保留唯一样本。\n"
"支持多字段组合作为去重依据,可有效去除内容相似但表述不同的重复数据,提高数据集多样性。\n"
"输入参数:\n"
"- eps:相似度阈值,值越小表示允许的相似度越低,默认为0.05(即余弦相似度大于0.95视为重复)\n"
"- model_name:预训练模型名称,默认为'sentence-transformers/all-MiniLM-L6-v2'\n"
"- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n"
"- device:模型运行设备,默认为'cuda'\n"
"- input_keys:多个输入字段名列表,与input_key二选一\n"
"- input_key:单个输入字段名,与input_keys二选一\n"
"- output_key:去重结果字段名,默认为'minhash_deduplicated_label'\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留语义不重复的样本(标记为1的样本)\n"
"- 返回包含去重结果字段名的列表,用于后续算子引用"
)
elif lang == "en":
return (
"Identify semantically duplicate text using BERT embeddings for near deduplication. Calculate cosine similarity between text embedding vectors to detect semantically similar texts and retain unique samples.\n"
"Supports multiple field combinations as deduplication criteria, effectively removing duplicate data with similar content but different expressions to improve dataset diversity.\n"
"Input Parameters:\n"
"- eps: Similarity threshold, smaller values allow lower similarity, default is 0.05 (cosine similarity > 0.95 is considered duplicate)\n"
"- model_name: Pretrained model name, default is 'sentence-transformers/all-MiniLM-L6-v2'\n"
"- model_cache_dir: Model cache directory, default is './dataflow_cache'\n"
"- device: Model running device, default is 'cuda'\n"
"- input_keys: List of multiple input field names, alternative to input_key\n"
"- input_key: Single input field name, alternative to input_keys\n"
"- output_key: Deduplication result field name, default is 'minhash_deduplicated_label'\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only semantically unique samples (samples marked as 1)\n"
"- List containing deduplication result field name for subsequent operator reference"
)
else:
return "Near deduplication by identifying semantically similar content using BERT embeddings."
def run(self, storage: DataFlowStorage, input_keys: list = None, input_key: str = None, output_key: str = 'minhash_deduplicated_label'):
if input_keys is None and input_key is None:
self.logger.error(f"Need to specify either input_keys or input_key!")
raise ValueError(f"Need to specify either input_keys or input_key!")
if input_keys is not None and input_key is not None:
self.logger.error(f"{self.__class__.__name__} only need one input args!")
raise ValueError(f"{self.__class__.__name__} only need one input args!")
if input_keys is not None:
self.logger.info(f"Running {self.__class__.__name__} with input_keys = {input_keys} and output_key = {output_key}")
else:
self.logger.info(f"Running {self.__class__.__name__} with input_key = {input_key} and output_key = {output_key}")
self.input_key = input_key
self.input_keys = input_keys
self.output_key = output_key
dataframe = storage.read("dataframe")
texts = []
for idx, sample in tqdm(enumerate(dataframe.to_dict(orient='records')), desc=f"Implementing {self.__class__.__name__}", total=len(dataframe)):
if input_keys is not None and len(input_keys) > 1:
text = '\n'.join([f"{k}:\n{sample[k]}" for k in input_keys])
else:
text = sample[self.input_key]
texts.append(text)
embeddings = get_text_embedding(texts, self.tokenizer, self.model, self.device)
embeddings = normalize(torch.tensor(embeddings), dim=1)
# Compute cosine similarity matrix
cos_sim_matrix = compute_cos_sim_matrix(embeddings)
cos_sim_matrix.fill_diagonal_(0) # Set diagonal to 0 to avoid self-comparison
cos_sim_matrix = torch.triu(cos_sim_matrix, diagonal=1)
# Find pairs with similarity greater than or equal to the threshold
similar_pairs = torch.where(cos_sim_matrix >= (1 - self.eps))
labels = [1] * len(dataframe)
for idx in similar_pairs[1].tolist():
labels[idx] = 0
dataframe[self.output_key] = labels
filtered_dataframe = dataframe[(dataframe[self.output_key] > 0)]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Deduplication completed. Total unique items: {sum(labels)}")
return [self.output_key,]
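# --- Hedged usage sketch (not part of the original source) -----------------
# The marking step inside SemDeduplicateFilter.run(): with eps = 0.05, any pair
# whose cosine similarity is >= 0.95 counts as a duplicate, and the later item
# of the pair (the column index in the upper-triangular matrix) gets label 0.
# The 3x3 similarity matrix and the helper name are made up.
def _example_semantic_dedup_marking(eps: float = 0.05):
    sim = torch.tensor([
        [1.00, 0.97, 0.10],
        [0.97, 1.00, 0.12],
        [0.10, 0.12, 1.00],
    ])
    upper = torch.triu(sim, diagonal=1)
    similar_pairs = torch.where(upper >= (1 - eps))
    labels = [1, 1, 1]
    for idx in similar_pairs[1].tolist():
        labels[idx] = 0
    print(labels)  # [1, 0, 1]: item 1 is dropped as a near duplicate of item 0
# ---------------------------------------------------------------------------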
from tqdm import tqdm
from simhash import Simhash
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
def get_similarity(simhash, another_simhash):
max_hashbit = max(len(bin(simhash.value)), len(bin(another_simhash.value)))
    distance = simhash.distance(another_simhash)
    similar = 1 - distance / max_hashbit
return similar
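# --- Hedged usage sketch (not part of the original source) -----------------
# get_similarity() converts the Hamming distance between two 64-bit SimHash
# fingerprints into a rough similarity score in [0, 1]; near-identical strings
# score close to 1. The sample strings and helper name are made up.
def _example_simhash_similarity():
    a = Simhash("the quick brown fox jumps over the lazy dog", f=64)
    b = Simhash("the quick brown fox jumped over the lazy dog", f=64)
    c = Simhash("completely unrelated text about something else", f=64)
    print(get_similarity(a, b))  # high, typically above 0.9
    print(get_similarity(a, c))  # noticeably lower
# ---------------------------------------------------------------------------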
@OPERATOR_REGISTRY.register()
class SimHashDeduplicateFilter(OperatorABC):
def __init__(self, fingerprint_size: int = 64, bound: float = 0.1):
self.logger = get_logger()
self.fingerprint_size = fingerprint_size
self.bound = bound
self.logger.info(f"Initializing {self.__class__.__name__} with fingerprint_size = {self.fingerprint_size}, bound = {self.bound}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"使用SimHash算法通过汉明距离识别相似文本,执行近似去重操作。将文本转换为固定长度的指纹,通过计算指纹间的汉明距离判断文本相似度。\n"
"相比语义去重速度更快,适合大规模数据集的快速去重预处理,尤其适用于检测字符层面相似的文本。\n"
"输入参数:\n"
"- fingerprint_size:指纹长度,默认为64位\n"
"- bound:相似度阈值,值越小表示允许的相似度越低,默认为0.1(即相似度大于0.9视为重复)\n"
"- input_keys:多个输入字段名列表,与input_key二选一\n"
"- input_key:单个输入字段名,与input_keys二选一\n"
"- output_key:去重结果字段名,默认为'minhash_deduplicated_label'\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留相似性低于阈值的唯一样本(标记为1的样本)\n"
"- 返回包含去重结果字段名的列表,用于后续算子引用"
)
elif lang == "en":
return (
"Detect similar text via SimHash algorithm and Hamming distance for near deduplication. Convert text to fixed-length fingerprints and determine text similarity by calculating Hamming distance between fingerprints.\n"
"Faster than semantic deduplication, suitable for fast deduplication preprocessing of large-scale datasets, especially for detecting character-level similar texts.\n"
"Input Parameters:\n"
"- fingerprint_size: Fingerprint length, default is 64 bits\n"
"- bound: Similarity threshold, smaller values allow lower similarity, default is 0.1 (similarity > 0.9 is considered duplicate)\n"
"- input_keys: List of multiple input field names, alternative to input_key\n"
"- input_key: Single input field name, alternative to input_keys\n"
"- output_key: Deduplication result field name, default is 'minhash_deduplicated_label'\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only unique samples with similarity below threshold (samples marked as 1)\n"
"- List containing deduplication result field name for subsequent operator reference"
)
else:
return "Near deduplication by detecting text similarity using SimHash algorithm and Hamming distance."
def run(self, storage: DataFlowStorage, input_keys: list = None, input_key: str = None, output_key: str = 'minhash_deduplicated_label'):
if input_keys is None and input_key is None:
self.logger.error(f"Need to specify either input_keys or input_key!")
raise ValueError(f"Need to specify either input_keys or input_key!")
if input_keys is not None and input_key is not None:
self.logger.error(f"{self.__class__.__name__} only need one input args!")
raise ValueError(f"{self.__class__.__name__} only need one input args!")
if input_keys is not None:
self.logger.info(f"Running {self.__class__.__name__} with input_keys = {input_keys} and output_key = {output_key}")
else:
self.logger.info(f"Running {self.__class__.__name__} with input_key = {input_key} and output_key = {output_key}")
self.input_key = input_key
self.input_keys = input_keys
self.output_key = output_key
dataframe = storage.read("dataframe")
simhashes = []
labels = [0] * len(dataframe)
for idx, sample in tqdm(enumerate(dataframe.to_dict(orient='records')), desc=f"Implementing {self.__class__.__name__}", total=len(dataframe)):
if input_keys is not None and len(input_keys) > 1:
text = '\n'.join([f"{k}:\n{sample[k]}" for k in input_keys])
else:
text = sample[self.input_key]
simhash = Simhash(text, f=self.fingerprint_size)
if all(get_similarity(simhash, another_simhash) < 1 - self.bound for another_simhash in simhashes):
labels[idx] = 1
simhashes.append(simhash)
dataframe[self.output_key] = labels
filtered_dataframe = dataframe[(dataframe[self.output_key] > 0)]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Deduplication completed. Total unique items: {sum(labels)}")
return [self.output_key,]
from tqdm import tqdm
import numpy as np
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
@OPERATOR_REGISTRY.register()
class WordNumberFilter(OperatorABC):
def __init__(self, min_words: int=20, max_words: int=100000):
self.logger = get_logger()
self.min_words = min_words
self.max_words = max_words
self.logger.info(f"Initializing {self.__class__.__name__} with min_words = {self.min_words}, max_words = {self.max_words}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"该算子用于过滤单词数量不在指定范围内的文本,通过空格分割计算单词数量。\n"
"输入参数:\n"
"- input_key:输入文本字段名,默认为'text'\n"
"- min_words:最小单词数量阈值,默认为5\n"
"- max_words:最大单词数量阈值,默认为100\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留单词数量在指定范围内的文本行\n"
"- 返回包含输入字段名的列表,用于后续算子引用"
)
elif lang == "en":
return (
"This operator filters text with word count outside the specified range, using space splitting for word counting.\n"
"Input Parameters:\n"
"- input_key: Input text field name, default is 'text'\n"
"- min_words: Minimum word count threshold, default is 5\n"
"- max_words: Maximum word count threshold, default is 100\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only rows with word count within specified range\n"
"- List containing input field name for subsequent operator reference"
)
else:
return "WordNumberFilter filters text based on word count range using space splitting."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='word_number_filter_label'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
word_counts = []
for text in tqdm(dataframe[self.input_key], desc=f"Implementing {self.__class__.__name__}"):
if text:
normalized_words = tuple(text.split())
num_normalized_words = len(normalized_words)
word_counts.append(num_normalized_words)
else:
word_counts.append(0)
word_counts = np.array(word_counts)
metric_filter = (self.min_words <= word_counts) & (word_counts < self.max_words)
dataframe[self.output_key] = word_counts
filtered_dataframe = dataframe[metric_filter]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]