Commit 97e8278b authored by zzg_666

Adapt backend to vLLM

import random
import re
import pandas as pd
from tqdm import tqdm
import numpy as np
from scipy.spatial.distance import cdist
from dataflow.prompts.text2sql import Text2SQLQuestionGeneratorPrompt, Text2VecSQLQuestionGeneratorPrompt
from dataflow.core.prompt import prompt_restrict, DIYPromptABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.core import OperatorABC, LLMServingABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.text2sql.database_manager import DatabaseManager
from typing import Union
@prompt_restrict(Text2SQLQuestionGeneratorPrompt, Text2VecSQLQuestionGeneratorPrompt)
@OPERATOR_REGISTRY.register()
class Text2SQLQuestionGenerator(OperatorABC):
def __init__(self,
llm_serving: LLMServingABC,
embedding_serving: LLMServingABC,
database_manager: DatabaseManager,
question_candidates_num: int = 5,
prompt_template: Union[Text2SQLQuestionGeneratorPrompt, Text2VecSQLQuestionGeneratorPrompt, DIYPromptABC] = None
):
self.llm_serving = llm_serving
self.embedding_serving = embedding_serving
self.database_manager = database_manager
if prompt_template is None:
self.prompt_template = Text2SQLQuestionGeneratorPrompt()
else:
self.prompt_template = prompt_template
self.logger = get_logger()
self.question_candidates_num = question_candidates_num
random.seed(42)
@staticmethod
def get_desc(lang):
if lang == "zh":
return (
"对于每个条目,如果自然语言问题为空,生成SQL对应的自然语言问题。为保证正确,生成多个候选问题,并选择最优的。\n\n"
"输入参数:\n"
"- input_sql_key: 输入SQL列名\n"
"- input_db_id_key: 数据库ID列名\n\n"
"输出参数:\n"
"- output_question_key: 输出问题列名"
)
elif lang == "en":
return (
"This operator generates natural language questions for Text2SQL tasks if the natural language question is empty. Multiple candidate questions are generated to ensure correctness.\n\n"
"Input parameters:\n"
"- input_sql_key: The name of the input SQL column\n"
"- input_db_id_key: The name of the database ID column\n\n"
"Output parameters:\n"
"- output_question_key: The name of the output question column"
)
else:
return "Question generator for Text2SQL tasks."
def extract_column_descriptions(self, create_statements):
column_name2column_desc = dict()
pattern = r'"(\w+)"\s+\w+\s*/\*\s*(.*?)\s*\*/'
for create_statement in create_statements:
matches = re.findall(pattern, create_statement)
for column_name, description in matches:
column_name = column_name.lower()
if column_name not in column_name2column_desc:
column_name2column_desc[column_name] = description
return column_name2column_desc
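# Illustrative sketch with a hypothetical schema: given a CREATE statement such as
#   CREATE TABLE singer (
#       "singer_id" INTEGER /* unique id of the singer */,
#       "name" TEXT /* full name of the singer */
#   )
# the pattern above yields {"singer_id": "unique id of the singer", "name": "full name of the singer"}.
# Column names are lower-cased and the first description encountered wins on duplicates.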
def parse_llm_response(self, response, style):
explanation_pattern = re.compile(r'\[EXPLANATION-START\](.*?)\[EXPLANATION-END\]', re.DOTALL)
question_pattern = re.compile(r'\[QUESTION-START\](.*?)\[QUESTION-END\]', re.DOTALL)
external_knowledge_pattern = re.compile(r'\[EXTERNAL-KNOWLEDGE-START\](.*?)\[EXTERNAL-KNOWLEDGE-END\]', re.DOTALL)
explanation_match = explanation_pattern.search(response)
question_match = question_pattern.search(response)
external_knowledge_match = external_knowledge_pattern.search(response)
explanation_content = explanation_match.group(1).strip() if explanation_match else ""
question_content = question_match.group(1).strip() if question_match else ""
external_knowledge_content = external_knowledge_match.group(1).strip() if external_knowledge_match else ""
if explanation_content == "" or question_content == "":
return None
else:
return {
"question": question_content.strip(),
"external_knowledge": external_knowledge_content.strip()
}
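# Expected response layout (a hedged illustration; the exact wording is set by the prompt template):
#   [EXPLANATION-START] The SQL counts singers grouped by country ... [EXPLANATION-END]
#   [QUESTION-START] How many singers are there in each country? [QUESTION-END]
#   [EXTERNAL-KNOWLEDGE-START] [EXTERNAL-KNOWLEDGE-END]
# parse_llm_response returns {"question": ..., "external_knowledge": ...} and returns None
# whenever the explanation or the question block is missing or empty.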
def select_best_question(self, question_candidates, start_idx, embeddings):
if len(question_candidates) == 0:
return None
elif len(question_candidates) == 1:
return question_candidates[0]
elif len(question_candidates) == 2:
return random.sample(question_candidates, 1)[0]
else:
end_idx = start_idx + len(question_candidates)
candidate_embeddings = embeddings[start_idx:end_idx]
distance_matrix = cdist(candidate_embeddings, candidate_embeddings, metric='cosine')
distance_sums = distance_matrix.sum(axis=1)
min_index = np.argmin(distance_sums)
return question_candidates[min_index]
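# Minimal sketch of the selection rule with assumed toy embeddings (not real model output):
#   import numpy as np
#   from scipy.spatial.distance import cdist
#   embeddings = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]])
#   d = cdist(embeddings, embeddings, metric="cosine")
#   best = int(np.argmin(d.sum(axis=1)))   # -> 1, the candidate closest to all the others
# With exactly two candidates there is no meaningful "most central" one, so one is picked at random.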
def run(self, storage: DataFlowStorage,
input_sql_key: str = "sql",
input_db_id_key: str = "db_id",
output_question_key: str = "question",
output_evidence_key: str = "evidence"
):
self.input_sql_key = input_sql_key
self.input_db_id_key = input_db_id_key
self.output_question_key = output_question_key
self.output_evidence_key = output_evidence_key
raw_dataframe = storage.read("dataframe")
existing_data = []
raw_data = []
if self.output_question_key in raw_dataframe.columns:
for _, row in raw_dataframe.iterrows():
if pd.notna(row.get(self.output_question_key)) and row.get(self.output_question_key) is not None:
existing_data.append(row.to_dict())
else:
raw_data.append(row.to_dict())
else:
raw_data = [row.to_dict() for _, row in raw_dataframe.iterrows()]
db_ids = list(set([data[self.input_db_id_key] for data in raw_data]))
db_id2column_info = dict()
for db_id in tqdm(db_ids, desc="Extracting database schema"):
create_statements, _ = self.database_manager.get_create_statements_and_insert_statements(db_id)
db_id2column_info[db_id] = self.extract_column_descriptions(create_statements)
self.logger.info("Generating question candidates...")
prompts = []
prompt_data_mapping = []
for data in tqdm(raw_data, desc="Preparing prompts"):
prompt = self.prompt_template.build_prompt(
data[self.input_sql_key],
data[self.input_db_id_key],
db_id2column_info,
self.database_manager.db_type
)
for _ in range(self.question_candidates_num):
prompts.append(prompt)
prompt_data_mapping.append({**data})
responses = self.llm_serving.generate_from_input(prompts, system_prompt="You are a helpful assistant.")
self.logger.info("Parsing responses and organizing candidates...")
grouped_responses = [responses[i:i+self.question_candidates_num] for i in range(0, len(responses), self.question_candidates_num)]
all_question_candidates = []
question_groups = []
embedding_texts = []
for data, response_group in zip(raw_data, grouped_responses):
question_candidates = []
for response in response_group:
parsed_response = self.parse_llm_response(response, data.get("style", "Formal"))
if parsed_response:
question_candidates.append(parsed_response)
text = parsed_response["external_knowledge"] + " " + parsed_response["question"]
embedding_texts.append(text.strip())
question_groups.append(question_candidates)
all_question_candidates.extend(question_candidates)
self.logger.info("Generating embeddings for all question candidates...")
if embedding_texts:
embeddings = self.embedding_serving.generate_embedding_from_input(embedding_texts)
else:
embeddings = []
processed_results = []
failed_data = []
embedding_start_idx = 0
for data, question_candidates in zip(raw_data, question_groups):
if question_candidates:
best_question = self.select_best_question(
question_candidates,
embedding_start_idx,
embeddings
)
embedding_start_idx += len(question_candidates)
if best_question:
result = {
**data,
self.output_question_key: best_question["question"],
self.output_evidence_key: best_question["external_knowledge"]
}
processed_results.append(result)
else:
self.logger.warning(f"No valid question generated for data: {data[self.input_db_id_key]}")
failed_data.append(data)
else:
self.logger.warning(f"No question candidates for data: {data[self.input_db_id_key]}")
failed_data.append(data)
if self.output_question_key in raw_dataframe.columns:
all_results = existing_data + processed_results
else:
all_results = processed_results
final_df = pd.DataFrame(all_results)
output_file = storage.write(final_df)
self.logger.info(f"Question generation results saved to {output_file}")
self.logger.info(f"Successfully processed: {len(processed_results)}")
if failed_data:
self.logger.warning(f"Failed to generate questions for: {len(failed_data)} entries")
return [self.output_question_key, self.output_evidence_key]
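# Hedged usage sketch; the objects below are placeholders for whatever LLMServingABC,
# embedding serving, DatabaseManager and DataFlowStorage instances the surrounding
# DataFlow pipeline provides, and their constructor arguments may differ.
#   generator = Text2SQLQuestionGenerator(
#       llm_serving=my_llm_serving,
#       embedding_serving=my_embedding_serving,
#       database_manager=my_database_manager,
#       question_candidates_num=5,
#   )
#   generator.run(storage, input_sql_key="sql", input_db_id_key="db_id",
#                 output_question_key="question", output_evidence_key="evidence")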
from typing import TYPE_CHECKING
if TYPE_CHECKING:
# filter
from .filter.ccnet_deduplicate_filter import CCNetDeduplicateFilter
from .filter.debertav3_filter import DebertaV3Filter
from .filter.fineweb_edu_filter import FineWebEduFilter
from .filter.pair_qual_filter import PairQualFilter
from .filter.perplexity_filter import PerplexityFilter
from .filter.qurating_filter import QuratingFilter
from .filter.text_book_filter import TextbookFilter
# generate
from .generate.phi4qa_generator import Phi4QAGenerator
# eval
from .eval.debertav3_sample_evaluator import DebertaV3SampleEvaluator
from .eval.fineweb_edu_sample_evaluator import FineWebEduSampleEvaluator
from .eval.pair_qual_sample_evaluator import PairQualSampleEvaluator
from .eval.textbook_sample_evaluator import TextbookSampleEvaluator
from .eval.qurating_sample_evaluator import QuratingSampleEvaluator
from .eval.perplexity_sample_evaluator import PerplexitySampleEvaluator
from .eval.meta_sample_evaluator import MetaSampleEvaluator
else:
import sys
from dataflow.utils.registry import LazyLoader, generate_import_structure_from_type_checking
cur_path = "dataflow/operators/text_pt/"
_import_structure = generate_import_structure_from_type_checking(__file__, cur_path)
sys.modules[__name__] = LazyLoader(__name__, "dataflow/operators/text_pt/", _import_structure)
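# A hedged sketch of what the TYPE_CHECKING + LazyLoader pattern above buys (assuming
# LazyLoader resolves attributes to the corresponding submodule imports on first access):
#   import dataflow.operators.text_pt as text_pt   # no heavy filter/eval imports yet
#   FilterCls = text_pt.DebertaV3Filter             # attribute access triggers the real import
# Static type checkers still see the explicit imports listed under TYPE_CHECKING.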
# coding=utf-8
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch LLaMA model."""
from typing import List, Optional, Tuple, Union, Any
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
import torch.distributed as dist
from transformers.activations import ACT2FN
from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
from transformers.modeling_utils import PreTrainedModel
from transformers.utils import logging
from transformers.models.llama.configuration_llama import LlamaConfig
def try_import_flash_attention():
# Import flash-attn lazily so this module can be loaded on CPU-only machines; bind the
# kernels as module-level globals because they are referenced throughout this file.
global flash_attn_kvpacked_func, flash_attn_varlen_kvpacked_func, flash_attn_with_kvcache
global unpad_input, pad_input, apply_rotary_emb_func
try:
from flash_attn import flash_attn_kvpacked_func, flash_attn_varlen_kvpacked_func, flash_attn_with_kvcache
from flash_attn.bert_padding import unpad_input, pad_input
from flash_attn.layers.rotary import apply_rotary_emb_func
except ImportError as e:
if 'flash_attn.layers.rotary' in str(e):
raise ImportError('Please install RoPE kernels: `pip install git+https://github.com/HazyResearch/flash-attention.git#subdirectory=csrc/rotary`')
else:
raise ImportError('Please install the flash_attn dependency in a GPU environment')
from dataflow import get_logger
logger = logging.get_logger(__name__)
# @torch.jit.script
def rmsnorm_func(hidden_states, weight, variance_epsilon):
input_dtype = hidden_states.dtype
hidden_states = hidden_states.to(torch.float32)
variance = hidden_states.pow(2).mean(-1, keepdim=True)
hidden_states = hidden_states * torch.rsqrt(variance + variance_epsilon)
return (weight * hidden_states).to(input_dtype)
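# rmsnorm_func computes y = weight * x / sqrt(mean(x**2, dim=-1) + eps) in fp32 and casts back
# to the input dtype. A quick equivalence check (toy tensors, not part of the model):
#   x = torch.randn(2, 4, dtype=torch.float16)
#   w = torch.ones(4)
#   ref = (w * (x.float() * torch.rsqrt(x.float().pow(2).mean(-1, keepdim=True) + 1e-6))).to(x.dtype)
#   assert torch.allclose(rmsnorm_func(x, w, torch.tensor(1e-6)), ref)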
class LlamaRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
"""
LlamaRMSNorm is equivalent to T5LayerNorm
"""
super().__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.register_buffer(
"variance_epsilon",
torch.tensor(eps),
persistent=False,
)
def forward(self, hidden_states):
return rmsnorm_func(hidden_states, self.weight, self.variance_epsilon)
class FlashRotaryEmbedding(torch.nn.Module):
"""
The rotary position embeddings from RoFormer_ (Su et. al).
A crucial insight from the method is that the query and keys are
transformed by rotation matrices which depend on the relative positions.
Other implementations are available in the Rotary Transformer repo_ and in
GPT-NeoX_; GPT-NeoX was an inspiration for this implementation.
.. _RoFormer: https://arxiv.org/abs/2104.09864
.. _repo: https://github.com/ZhuiyiTechnology/roformer
.. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox
If scale_base is not None, this implements XPos (Sun et al., https://arxiv.org/abs/2212.10554).
A recommended value for scale_base is 512: https://github.com/HazyResearch/flash-attention/issues/96
Reference: https://github.com/sunyt32/torchscale/blob/main/torchscale/component/xpos_relative_position.py
"""
def __init__(self, dim: int, base=10000.0, interleaved=False, scale_base=None,
scaling_factor=1.0, pos_idx_in_fp32=True, device=None):
"""
interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
of 1st half and 2nd half (GPT-NeoX style).
pos_idx_in_fp32: if True, the position indices [0.0, ..., seqlen - 1] are in fp32,
otherwise they might be in lower precision.
This option was added because previously (before 2023-07-02), when we construct
the position indices, we use the dtype of self.inv_freq. In most cases this would
be fp32, but if the model is trained in pure bf16 (not mixed precision), then
self.inv_freq would be bf16, and the position indices are also in bf16.
Because of the limited precision of bf16 (e.g. 1995.0 is rounded to 2000.0), the
embeddings for some positions will coincide.
To maintain compatibility with models previously trained in pure bf16,
we add this option.
scaling_factor: RotaryEmbedding extended with linear scaling.
"""
super().__init__()
self.dim = dim
self.base = float(base)
self.pos_idx_in_fp32 = pos_idx_in_fp32
# Generate and save the inverse frequency buffer (non trainable)
inv_freq = self._compute_inv_freq(device)
self.register_buffer("inv_freq", inv_freq, persistent=False)
self.interleaved = interleaved
self.scale_base = scale_base
self.scaling_factor = scaling_factor
scale = ((torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim)
/ (1.4 * dim) if scale_base is not None else None)
self.register_buffer("scale", scale)
self._seq_len_cached = 0
self._cos_cached = None
self._sin_cached = None
self._cos_k_cached = None
self._sin_k_cached = None
def _compute_inv_freq(self, device=None):
return 1 / (self.base ** (torch.arange(0, self.dim, 2, device=device,
dtype=torch.float32) / self.dim))
def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
# Reset the tables if the sequence length has changed,
# if we're on a new device (possibly due to tracing for instance),
# or if we're switching from inference mode to training
if (seqlen > self._seq_len_cached or self._cos_cached.device != device
or self._cos_cached.dtype != dtype
or (self.training and self._cos_cached.is_inference())):
self._seq_len_cached = seqlen
# We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16
# And the output of arange can be quite large, so bf16 would lose a lot of precision.
# However, for compatibility reason, we add an option to use the dtype of self.inv_freq.
if self.pos_idx_in_fp32:
t = torch.arange(seqlen, device=device, dtype=torch.float32)
t /= self.scaling_factor
# We want fp32 here as well since inv_freq will be multiplied with t, and the output
# will be large. Having it in bf16 will lose a lot of precision and cause the
# cos & sin output to change significantly.
# We want to recompute self.inv_freq if it was not loaded in fp32
if self.inv_freq.dtype != torch.float32:
inv_freq = self.inv_freq.to(torch.float32)
else:
inv_freq = self.inv_freq
else:
t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
t /= self.scaling_factor
inv_freq = self.inv_freq
# Don't do einsum, it converts fp32 to fp16 under AMP
# freqs = torch.einsum("i,j->ij", t, self.inv_freq)
freqs = torch.outer(t, inv_freq)
if self.scale is None:
self._cos_cached = torch.cos(freqs).to(dtype)
self._sin_cached = torch.sin(freqs).to(dtype)
else:
power = ((torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device)
- seqlen // 2) / self.scale_base)
scale = self.scale.to(device=power.device) ** power.unsqueeze(-1)
# We want the multiplication by scale to happen in fp32
self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)
def forward(self,
q: torch.Tensor, k: torch.Tensor,
seqlen_offset: int = 0,
unpadded_lengths: Optional[Tuple[torch.Tensor]] = None) -> Tuple[torch.Tensor, torch.Tensor]:
"""
q: (batch, seqlen, nheads, headdim)
k: (batch, seqlen, nheads, headdim)
seqlen_offset: can be used in generation where the qkv being passed in is only the last
token in the batch.
"""
if unpadded_lengths is not None:
cu_seqlens, max_seqlen = unpadded_lengths
else:
cu_seqlens, max_seqlen = None, q.shape[1]
self._update_cos_sin_cache(max_seqlen + seqlen_offset, device=q.device, dtype=q.dtype)
if self.scale is None:
return apply_rotary_emb_func(
q, self._cos_cached[seqlen_offset:], self._sin_cached[seqlen_offset:],
self.interleaved, True, # inplace=True,
cu_seqlens=cu_seqlens, max_seqlen=max_seqlen
), apply_rotary_emb_func(
k, self._cos_cached[seqlen_offset:], self._sin_cached[seqlen_offset:],
self.interleaved, True, # inplace=True
cu_seqlens=cu_seqlens, max_seqlen=max_seqlen
)
else:
assert False, "XPos (scale_base is not None) is not supported in this rotary embedding path"
class LlamaMLP(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.hidden_size = config.hidden_size
self.intermediate_size = config.intermediate_size
self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
self.act_fn = ACT2FN[config.hidden_act]
def forward(self, x):
return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
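# LlamaMLP above is the standard SwiGLU feed-forward used by LLaMA:
#   down_proj( SiLU(gate_proj(x)) * up_proj(x) )
# config.hidden_act is typically "silu", so ACT2FN[config.hidden_act] resolves to the SiLU activation.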
@torch.jit.script
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
if n_rep == 1:
return hidden_states
final_shape = list(hidden_states.shape[:-2]) + [-1] + [hidden_states.shape[-1]]
expand_shape = [-1] * (len(hidden_states.shape) - 1) + [n_rep] + [-1]
hidden_states = hidden_states.unsqueeze(-2).expand(expand_shape)
return hidden_states.reshape(final_shape)
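# repeat_kv expands the KV heads for grouped-query attention. A shape-level sketch
# (hypothetical sizes): the stacked kv tensor of shape
#   (batch, seqlen, 2, num_key_value_heads=8, head_dim=128)
# with n_rep=4 is unsqueezed, expanded and reshaped to
#   (batch, seqlen, 2, 32, 128),
# i.e. every key/value head is repeated n_rep times to line up with 32 query heads.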
class LlamaAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: LlamaConfig):
super().__init__()
self.config = config
self.hidden_size = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = self.hidden_size // self.num_heads
self.num_key_value_heads = getattr(config, "num_key_value_heads", self.num_heads)
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
self.max_position_embeddings = config.max_position_embeddings
if (self.head_dim * self.num_heads) != self.hidden_size:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {self.num_heads})."
)
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
self.register_buffer(
"norm_factor",
torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to(torch.get_default_dtype()),
persistent=False,
)
if not getattr(self.config, "rope_scaling", None):
scaling_factor = 1
else:
scaling_type = self.config.rope_scaling["type"]
scaling_factor = self.config.rope_scaling["factor"]
assert scaling_type == 'linear'
theta = getattr(self.config, "rope_theta", 10000)
self.rotary_emb = FlashRotaryEmbedding(
self.head_dim, base=theta, interleaved=False, scaling_factor=scaling_factor,
)
self.distributed_attn_func = flash_attn_kvpacked_func
def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: bool = False,
use_cache: bool = False,
unpadded_lengths: Optional[Tuple[torch.Tensor]] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
h_size = hidden_states.size(-1)
has_layer_past = past_key_value is not None
if has_layer_past:
past_kv = past_key_value[0]
past_len = past_key_value[1]
else:
past_len = 0
# NOTE: Hack to include position_ids, assuming they are increasing uniformly per block
if position_ids is not None:
past_len += position_ids.min()
q = self.q_proj(hidden_states)
k = self.k_proj(hidden_states)
v = self.v_proj(hidden_states)
q = q.view(*q.shape[:-1], self.num_heads, self.head_dim)
k = k.view(*k.shape[:-1], self.num_key_value_heads, self.head_dim)
v = v.view(*v.shape[:-1], self.num_key_value_heads, self.head_dim)
q, k = self.rotary_emb(q, k, past_len, unpadded_lengths)
kv = torch.stack([k, v], -3)
kv = repeat_kv(kv, self.num_key_value_groups)
# Cache QKV values
if has_layer_past:
new_len = past_len+q.size(1)
if new_len > past_kv.size(1):
past_kv = torch.cat([past_kv, torch.empty(hidden_states.size(0), 256, 2, kv.size(3), kv.size(4), dtype=kv.dtype, device=kv.device)], 1)
past_kv[:, past_len:new_len] = kv
kv = past_kv[:, :new_len]
else:
past_kv = kv
if unpadded_lengths is not None:
# varlen, ignore padding tokens, efficient for large batch with many paddings
assert attention_mask is not None
cu_seqlens, max_seqlen = unpadded_lengths
attn_outputs = flash_attn_varlen_kvpacked_func(
q, kv,
cu_seqlens, cu_seqlens,
max_seqlen, max_seqlen,
dropout_p=0.0, softmax_scale=1.0/self.norm_factor,
causal=True, return_attn_probs=output_attentions
)
# elif use_cache and past_key_value is not None:
# attn_outputs = flash_attn_with_kvcache(
# q,
# kv[:, :, 0],
# kv[:, :, 1],
# softmax_scale=1.0/self.norm_factor,
# causal=True,
# )
else:
attn_outputs = flash_attn_kvpacked_func(
q, kv,
dropout_p=0.0,
softmax_scale=1.0/self.norm_factor,
causal=True,
return_attn_probs=output_attentions,
)
past_key_value = (past_kv, past_len+q.size(1)) if use_cache else None
attn_output = attn_outputs[0] if output_attentions else attn_outputs
attn_output = attn_output.reshape(*attn_output.shape[:-2], h_size)
attn_weights = attn_outputs[2] if output_attentions else None
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
class LlamaDecoderLayer(nn.Module):
def __init__(self, config: LlamaConfig):
super().__init__()
self.hidden_size = config.hidden_size
self.self_attn = LlamaAttention(config=config)
self.mlp = LlamaMLP(config)
self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self._fsdp_wrap = True
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
unpadded_lengths: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
"""
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
# Self Attention
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
unpadded_lengths=unpadded_lengths,
)
hidden_states = residual + hidden_states
# Fully Connected
residual = hidden_states
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
if use_cache:
outputs += (present_key_value,)
return outputs
class LlamaPreTrainedModel(PreTrainedModel):
config_class = LlamaConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["LlamaDecoderLayer"]
_skip_keys_device_placement = "past_key_values"
def _init_weights(self, module):
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
class LlamaModel(LlamaPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
Args:
config: LlamaConfig
"""
def __init__(self, config: LlamaConfig):
super().__init__(config)
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.gradient_checkpointing = False
# Initialize weights and apply final processing
try_import_flash_attention()
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
elif input_ids is not None:
batch_size, seq_length = input_ids.shape
elif inputs_embeds is not None:
batch_size, seq_length, _ = inputs_embeds.shape
else:
raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
# position_ids = None
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)
hidden_states = inputs_embeds
bsz = hidden_states.size(0)
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
if (
((attention_mask is not None) and (not attention_mask.all().item()))
and not use_cache
):
try: # for flash-attn latest version
hidden_states, unpad_indices, cu_seqlens, max_seqlen, _ = unpad_input(hidden_states, attention_mask)
except ValueError: # for flash-attn 2.3.3 version, where unpad_input returns 4 values
hidden_states, unpad_indices, cu_seqlens, max_seqlen = unpad_input(hidden_states, attention_mask)
unpadded_lengths = (cu_seqlens, max_seqlen)
else:
unpadded_lengths = None
# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
next_decoder_cache = () if use_cache else None
for idx, decoder_layer in enumerate(self.layers):
if output_hidden_states:
if unpadded_lengths is not None:
all_hidden_states += (pad_input(hidden_states, unpad_indices, bsz, max_seqlen),)
else:
all_hidden_states += (hidden_states,)
past_key_value = past_key_values[idx] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = torch.utils.checkpoint.checkpoint(
decoder_layer,
hidden_states,
attention_mask,
position_ids,
None,
unpadded_lengths,
output_attentions,
False,
use_reentrant=False
)
else:
layer_outputs = decoder_layer(
hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
unpadded_lengths=unpadded_lengths,
output_attentions=output_attentions,
use_cache=use_cache,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
if output_attentions:
all_self_attns += (layer_outputs[1],)
if unpadded_lengths is not None:
hidden_states = pad_input(hidden_states, unpad_indices, bsz, max_seqlen)
hidden_states = self.norm(hidden_states)
# add hidden states from the last decoder layer
if output_hidden_states:
all_hidden_states += (hidden_states,)
next_cache = next_decoder_cache if use_cache else None
if not return_dict:
return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=next_cache,
hidden_states=all_hidden_states,
attentions=all_self_attns,
)
class LlamaForCausalLM(LlamaPreTrainedModel):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
super().__init__(config)
self.model = LlamaModel(config)
self.vocab_size = config.vocab_size
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
try_import_flash_attention()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.embed_tokens
def set_input_embeddings(self, value):
self.model.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model = decoder
def get_decoder(self):
return self.model
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
avg_valid_labels_per_chunk: Optional[float] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
r"""
Args:
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
Example:
```python
>>> from transformers import AutoTokenizer, LlamaForCausalLM
>>> model = LlamaForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
>>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")
>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
logits = self.lm_head(hidden_states).float()
loss = None
if labels is not None:
# Shift so that tokens < n predict n
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
loss_fct = CrossEntropyLoss()
shift_logits = shift_logits.view(-1, self.config.vocab_size)
shift_labels = shift_labels.view(-1)
# Enable model parallelism
shift_labels = shift_labels.to(shift_logits.device)
loss = loss_fct(shift_logits, shift_labels)
if not return_dict:
output = (logits,) + outputs[1:]
return (loss,) + output if loss is not None else output
return CausalLMOutputWithPast(
loss=loss,
logits=logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
):
if past_key_values:
input_ids = input_ids[:, -1:]
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
model_inputs.update(
{
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask
}
)
return model_inputs
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
class LlamaForSequenceClassification(LlamaPreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.model = LlamaModel(config)
self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
try_import_flash_attention()
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.embed_tokens
def set_input_embeddings(self, value):
self.model.embed_tokens = value
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, SequenceClassifierOutputWithPast]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
transformer_outputs = self.model(
input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = transformer_outputs[0]
logits = self.score(hidden_states)
if input_ids is not None:
batch_size = input_ids.shape[0]
else:
batch_size = inputs_embeds.shape[0]
if self.config.pad_token_id is None and batch_size != 1:
raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
if self.config.pad_token_id is None:
sequence_lengths = -1
else:
if input_ids is not None:
sequence_lengths = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device)
else:
sequence_lengths = -1
pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
loss = None
if labels is not None:
labels = labels.to(logits.device)
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(pooled_logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(pooled_logits, labels)
if not return_dict:
output = (pooled_logits,) + transformer_outputs[1:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutputWithPast(
loss=loss,
logits=pooled_logits,
past_key_values=transformer_outputs.past_key_values,
hidden_states=transformer_outputs.hidden_states,
attentions=transformer_outputs.attentions,
)
from datasets import load_from_disk, load_dataset, concatenate_datasets
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from .modeling.modeling_flash_llama import LlamaForSequenceClassification
import torch
import argparse
import numpy as np
class TokenizeAndChunk:
def __init__(self, tokenizer_name, text_field, tokens_field, tokens, model_cache_dir=None):
self.tokens = tokens
self.tokenizer_name = tokenizer_name
self.text_field = text_field
self.tokens_field = tokens_field
self.model_cache_dir = model_cache_dir
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_fast=True, cache_dir=model_cache_dir)
self.tokenizer.pad_token_id = 0
def __getstate__(self):
return {
"tokenizer_name": self.tokenizer_name,
"text_field": self.text_field,
"tokens_field": self.tokens_field,
"tokens": self.tokens,
"model_cache_dir": self.model_cache_dir,
}
def __setstate__(self, state):
self.__init__(**state)
def tokenize_and_chunk(self, source_tokens):
chunks_token_ids = []
chunks_token_counts = []
for seq in source_tokens:
chunks = torch.tensor(seq, dtype=torch.long).split(self.tokens)
chunks_token_ids.append([chunk.tolist() for chunk in chunks])
chunks_token_counts.append([len(x) for x in chunks])
return chunks_token_ids, chunks_token_counts
def __call__(self, example):
if self.tokens_field in example:
source_tokens = example[self.tokens_field]
else:
source_tokens = self.tokenizer(example[self.text_field], truncation=False, padding=False, add_special_tokens=False).input_ids
chunks_token_ids, chunks_token_counts = self.tokenize_and_chunk(source_tokens)
assert len(example[self.text_field]) == len(chunks_token_ids)
assert len(example[self.text_field]) == len(chunks_token_counts)
return {
"chunks_token_ids": chunks_token_ids,
"chunks_token_counts": chunks_token_counts,
}
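# Chunking sketch: with tokens=512, a 1300-token document is split into pieces of
# 512, 512 and 276 tokens, so for that document
#   chunks_token_ids    -> [[...512 ids...], [...512 ids...], [...276 ids...]]
#   chunks_token_counts -> [512, 512, 276]
# If the dataset already carries pre-tokenized ids in `tokens_field`, tokenization is skipped.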
class ModelAnnotator:
def __init__(self, model_name, labels, device_batch_size, device=None, model_cache_dir=None):
self.model_name = model_name
self.labels = labels
self.device_batch_size = device_batch_size
self.model_cache_dir = model_cache_dir
self.model = LlamaForSequenceClassification.from_pretrained(
self.model_name,
torch_dtype=torch.bfloat16,
cache_dir=model_cache_dir)
self.model.config.pad_token_id = 0
self.model.eval()
self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device {self.device}")
self.model.to(self.device)
self.num_labels = len(labels)
assert self.num_labels == self.model.config.num_labels, f"Number of labels ({self.num_labels}) does not match model config ({self.model.config.num_labels})"
def __getstate__(self):
return {
"model_name": self.model_name,
"labels": self.labels,
"device_batch_size": self.device_batch_size,
"model_cache_dir": self.model_cache_dir,
}
def __setstate__(self, state):
self.__init__(**state)
@torch.inference_mode()
def score_chunks(self, chunks_token_ids, chunks_token_counts):
sorted_indices = torch.argsort(chunks_token_counts)
scores = torch.zeros(len(chunks_token_ids), self.num_labels, dtype=torch.float32)
for batch_indices in sorted_indices.split(self.device_batch_size):
max_len = chunks_token_counts[batch_indices].max()
input_ids = torch.zeros((len(batch_indices), max_len), dtype=torch.long)
attention_mask = torch.zeros((len(batch_indices), max_len), dtype=torch.long)
for i, j in enumerate(batch_indices):
seq = chunks_token_ids[j]
input_ids[i, :len(seq)] = seq
attention_mask[i, :len(seq)] = 1
outputs = self.model(input_ids.to(self.device), attention_mask=attention_mask.to(self.device), use_cache=False)
scores[batch_indices] = outputs.logits.float().cpu()
return scores
def __call__(self, example, indices):
num_seqs = len(indices)
source_ids = [i for i, counts in enumerate(example["chunks_token_counts"]) for _ in range(len(counts))]
chunks_token_ids = [torch.tensor(chunk, dtype=torch.long) for chunks in example["chunks_token_ids"] for chunk in chunks]
flattened_chunks_token_counts = torch.tensor([chunk for chunks in example["chunks_token_counts"] for chunk in chunks], dtype=torch.long)
flattened_scores = self.score_chunks(chunks_token_ids, flattened_chunks_token_counts)
chunk_token_counts = example["chunks_token_counts"]
chunk_scores = [[[] for _ in range(num_seqs)] for _ in range(self.num_labels)]
for source_id, score in zip(source_ids, flattened_scores):
for label in range(self.num_labels):
chunk_scores[label][source_id].append(score[label].item())
output = {
"index": indices,
"chunk_lengths": chunk_token_counts,
"length": [sum(counts) for counts in chunk_token_counts],
}
for i, label in enumerate(self.labels):
output[f"{label}_chunks"] = chunk_scores[i]
output[f"{label}_average"] = [
np.average(scores, weights=token_counts).item()
for scores, token_counts in zip(chunk_scores[i], chunk_token_counts)
]
return output
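# Per-document aggregation sketch (hypothetical label "educational_value"): a document whose
# chunks have lengths [512, 276] and scores [1.2, -0.4] ends up with
#   educational_value_chunks  = [1.2, -0.4]
#   educational_value_average = np.average([1.2, -0.4], weights=[512, 276])   # ~0.64
# i.e. a token-count-weighted mean of the per-chunk logits.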
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("input", type=str)
parser.add_argument("output", type=str)
parser.add_argument("-F", "--data_files", type=str, nargs="+", default=[])
parser.add_argument("-S", "--shard", type=int, nargs=2, default=[0, 1])
parser.add_argument("-M", "--model", type=str, required=True)
parser.add_argument("-t", "--tokens", type=int, default=512)
parser.add_argument("--map_batch_size", type=int, default=512)
parser.add_argument("-b", "--device_batch_size", type=int, default=16)
parser.add_argument("-w", "--num_workers", type=int, default=1)
parser.add_argument("--text_field", type=str, default="text")
parser.add_argument("--tokens_field", type=str, default="input_ids")
parser.add_argument("--labels", type=str, nargs="+")
args = parser.parse_args()
print(args)
if args.input == "json":
dataset = load_dataset("json", data_files=args.data_files, split="train")
else:
dataset = load_from_disk(args.input)
src_dataset = dataset.shard(args.shard[1], args.shard[0], contiguous=True)
dataset = src_dataset
print(dataset)
print("Total number of examples:", len(dataset))
dataset = dataset.map(
TokenizeAndChunk(args.model, args.text_field, args.tokens_field, args.tokens),
batched=True,
batch_size=args.map_batch_size,
num_proc=args.num_workers,
remove_columns=dataset.column_names)
print("After tokenization: Total number of examples:", len(dataset))
dataset = dataset.map(
ModelAnnotator(args.model, args.labels, args.device_batch_size),
batched=True,
with_indices=True,
batch_size=args.map_batch_size,
remove_columns=dataset.column_names)
dataset = concatenate_datasets([dataset, src_dataset], axis=1)
print("After annotation: Total number of examples:", len(dataset))
print(f"Saving to {args.output}")
dataset.save_to_disk(args.output)
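# Hedged invocation sketch; the script name, model id and label names below are illustrative only:
#   python annotate.py json out_dir \
#       -F data/*.jsonl -M princeton-nlp/QuRater-1.3B \
#       --labels writing_style facts_and_trivia educational_value required_expertise \
#       -t 512 -b 16 -w 4
# The first positional argument is either the literal "json" (load shards via --data_files)
# or a path previously produced by datasets.save_to_disk.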
import torch
from torch import nn
from transformers import AutoModel, AutoTokenizer, AutoConfig
from huggingface_hub import PyTorchModelHubMixin
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.storage import DataFlowStorage
from tqdm import tqdm
from dataflow import get_logger
@OPERATOR_REGISTRY.register()
class DebertaV3SampleEvaluator(OperatorABC):
def __init__(self, model_name, model_cache_dir='./dataflow_cache', device='cuda'):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.model_name = model_name
self.model_cache_dir = model_cache_dir
self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
self.score_name = 'DebertaV3Score'
self.config = AutoConfig.from_pretrained(self.model_name, cache_dir=self.model_cache_dir)
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=self.model_cache_dir)
self.model = QualityModel.from_pretrained(self.model_name, cache_dir=self.model_cache_dir).to(self.device)
self.model.eval()
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于Nvidia Deberta V3模型的质量分类器,用于评估文本质量并返回分类结果。\n"
"输入参数:\n"
"- model_name:预训练模型名称\n"
"- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n"
"- device:计算设备,默认为'cuda'\n"
"- input_key:输入文本字段名\n"
"- output_key:输出分类结果字段名,默认为'Debertav3Score'\n"
"输出参数:\n"
"- 包含文本质量分类结果的DataFrame"
)
elif lang == "en":
return (
"Text quality classifier based on Nvidia Deberta V3 model for quality assessment and classification.\n"
"Input Parameters:\n"
"- model_name: Pretrained model name\n"
"- model_cache_dir: Model cache directory, default './dataflow_cache'\n"
"- device: Computing device, default 'cuda'\n"
"- input_key: Field name for input text\n"
"- output_key: Field name for output classification, default 'Debertav3Score'\n"
"Output Parameters:\n"
"- DataFrame containing text quality classification results"
)
else:
return "Text quality classifier based on Nvidia Deberta V3."
def _score_func(self, sample):
inputs = self.tokenizer(
sample, return_tensors="pt", padding="longest", truncation=True
).to(self.device)
with torch.no_grad():
outputs = self.model(inputs["input_ids"], inputs["attention_mask"])
predicted_classes = torch.argmax(outputs, dim=1)
predicted_domains = [
self.config.id2label[class_idx.item()] for class_idx in predicted_classes.cpu().numpy()
]
return predicted_domains[0] # Assuming one sample per batch
def eval(self, dataframe, input_key):
scores = []
self.logger.info(f"Evaluating {self.score_name}...")
for sample in tqdm(dataframe[input_key], desc="DebertaV3 model evaluating..."):
score = self._score_func(sample)
scores.append(score)
self.logger.info("Evaluation complete!")
return scores
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='Debertav3Score'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, input_key)
dataframe[self.output_key] = scores
storage.write(dataframe)
class QualityModel(nn.Module, PyTorchModelHubMixin):
def __init__(self, config):
super(QualityModel, self).__init__()
self.model = AutoModel.from_pretrained(config["base_model"])
self.dropout = nn.Dropout(config["fc_dropout"])
self.fc = nn.Linear(self.model.config.hidden_size, len(config["id2label"]))
def forward(self, input_ids, attention_mask):
features = self.model(
input_ids=input_ids, attention_mask=attention_mask
).last_hidden_state
dropped = self.dropout(features)
outputs = self.fc(dropped)
return torch.softmax(outputs[:, 0, :], dim=1)
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from dataflow.core import OperatorABC
from dataflow import get_logger
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.storage import DataFlowStorage
from tqdm import tqdm
import numpy as np
@OPERATOR_REGISTRY.register()
class FineWebEduSampleEvaluator(OperatorABC):
def __init__(self, model_cache_dir: str = './dataflow_cache', device: str = 'cuda'):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.model_name = 'HuggingFaceTB/fineweb-edu-classifier'
self.model_cache_dir = model_cache_dir
self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
self.batch_size = 1
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, cache_dir=self.model_cache_dir)
self.model = AutoModelForSequenceClassification.from_pretrained(self.model_name, cache_dir=self.model_cache_dir).to(self.device)
self.model.eval()
self.score_name = 'FineWebEduScore'
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于Fineweb-Edu分类器评估文本的教育价值。该分类器使用预训练的序列分类模型对文本进行评估,返回0-1之间的分数,"
"分数越高表示文本的教育价值越高。适用于筛选具有教育意义的文本内容。\n"
"输入参数:\n"
"- text: 待评估的文本字符串\n"
"输出参数:\n"
"- float: 0-1之间的教育价值分数,越高表示教育价值越大"
)
else:
return (
"Evaluate the educational value of text using the Fineweb-Edu classifier. This classifier uses a pre-trained sequence classification model "
"to assess text and returns a score between 0 and 1, where higher scores indicate greater educational value. Suitable for filtering educational content.\n"
"Input parameters:\n"
"- text: Text string to be evaluated\n"
"Output parameters:\n"
"- float: Educational value score between 0 and 1, higher values indicate greater educational value"
)
def _score_func(self, sample):
tokenized_inputs = self.tokenizer(sample, return_tensors="pt", padding="longest", truncation=True).to(self.device)
with torch.no_grad():
outputs = self.model(**tokenized_inputs)
logits = outputs.logits.squeeze(-1).float().detach().cpu().numpy()
return logits.tolist()[0]  # One sample per call, so return its scalar score
def eval(self, dataframe, input_key):
scores = []
self.logger.info(f"Evaluating {self.score_name}...")
for sample in tqdm(dataframe[input_key], desc="Fineweb-edu model evaluating..."):
score = self._score_func(sample)
scores.append(score)
self.logger.info("Evaluation complete!")
return np.array(scores)
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='FinewebEduScore'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, input_key)
dataframe[self.output_key] = scores
storage.write(dataframe)
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
import pandas as pd
from dataflow.core import LLMServingABC
from dataflow.prompts.general_text import MetaPrompt
import ast
from dataflow.core.prompt import prompt_restrict
example_dimensions = [
{
"dimension_name": "Text Structure",
"description": "Evaluate the surface-level quality of the text, including spelling accuracy, grammar, vocabulary richness, and sentence structure.",
"example_list": [
{
"text": "The experimental procedure was meticulously documented, with each variable clearly defined.",
"score": "5"
},
{
"text": "teh data was wrong and we dont no why it happen like that",
"score": "2"
}
]
},
{
"dimension_name": "Diversity and Complexity",
"description": "Assess how rich and conceptually varied the content is, and whether it requires expert or deep reasoning to understand.",
"example_list": [
{
"text": "This article compares Bayesian inference and frequentist approaches in statistical modeling, highlighting theoretical and practical trade-offs.",
"score": "5"
},
{
"text": "Dogs are pets. They bark. They are friendly.",
"score": "2"
}
]
},
{
"dimension_name": "Fluency and Understandability",
"description": "Evaluate whether the text flows naturally, is easy to follow, and avoids awkward or disjointed phrasing.",
"example_list": [
{
"text": "Despite initial challenges, the team successfully completed the deployment by adhering to a revised strategy.",
"score": "5"
},
{
"text": "The problem was and then fixed by something happens deployment successful maybe.",
"score": "2"
}
]
},
{
"dimension_name": "Safety",
"description": "Identify whether the text contains profanities, hate speech, or excessive personally identifiable information (PII).",
"example_list": [
{
"text": "The software collects anonymous usage data to improve performance.",
"score": "5"
},
{
"text": "You idiot, your address 123 Main St will be posted online.",
"score": "1"
}
]
},
{
"dimension_name": "Educational Value",
"description": "Determine whether the text provides insight, stimulates thinking, or offers meaningful learning potential.",
"example_list": [
{
"text": "Understanding the principles of thermodynamics allows engineers to design more efficient engines.",
"score": "5"
},
{
"text": "The sky is blue. Water is wet. This is how it is.",
"score": "2"
}
]
},
{
"dimension_name": "Content Accuracy and Effectiveness",
"description": "Assess the truthfulness, relevance, and practical usefulness of the content.",
"example_list": [
{
"text": "Newton's second law states that F = ma, which explains the relationship between force, mass, and acceleration.",
"score": "5"
},
{
"text": "The Earth is flat and doesn't rotate around the Sun.",
"score": "1"
}
]
}
]
@prompt_restrict(
MetaPrompt
)
@OPERATOR_REGISTRY.register()
class MetaSampleEvaluator(OperatorABC):
def __init__(self,
llm_serving: LLMServingABC = None,
dimensions: list[dict] = example_dimensions,
):
"""
Operator that evaluates the quality of the text along the given dimensions.
The `dimensions` argument should be a list of dicts, each of the form:
{
"dimension_name": "Dimension Name",
"description": "Description of the dimension",
"example_list": [ // a list of example text and score
{
"text": "example1 text to be evaluated",
"score": "the score of this dimension of the text above"
},
{
"text": "example2 text to be evaluated",
"score": "the score of this dimension of the text above"
}
]
}
"""
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.llm_serving = llm_serving
self.score_name = 'MetaScore'
self.prompt = MetaPrompt(dimensions=dimensions)
self.logger.info(f'{self.__class__.__name__} initialized.')
self.dimensions = dimensions
for item in dimensions:
if 'dimension_name' not in item or 'description' not in item or 'example_list' not in item:
raise ValueError('Invalid dimension format. Refer to the docstring for the correct format.')
for example in item['example_list']:
if 'text' not in example or 'score' not in example:
raise ValueError('Invalid example format. Refer to the docstring for the correct format.')
self.output_columns = [item['dimension_name'] for item in dimensions]
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"通过LLM评估文本的多个元属性,包括文本结构、多样性与复杂性、流畅性与可理解性、安全性、教育价值以及内容准确性与有效性。\n"
"输入参数:\n"
"- llm_serving:LLM服务对象,需实现LLMServingABC接口\n"
"- dimensions:评估维度列表,每个维度对应的字典中包含dimension_name,description,和示例字段:\n"
" * dimension_name:维度名称\n"
" * description:维度的描述\n"
" * example_list:包含示例文本和得分的列表\n"
"- input_key:输入文本字段名\n"
"输出参数:\n"
"- 包含6个评估维度得分的DataFrame,列名为:Text Structure, Diversity & Complexity, Fluency & Understandability, Safety, Educational Value, Content Accuracy & Effectiveness"
)
elif lang == "en":
return (
"Evaluate multiple meta attributes of text using LLM, including Text Structure, Diversity & Complexity, Fluency & Understandability, Safety, Educational Value, and Content Accuracy & Effectiveness.\n"
"Input Parameters:\n"
"- llm_serving: LLM serving object implementing LLMServingABC interface\n"
"- dimensions: List of evaluation dimensions, each dimension corresponding to a dictionary containing dimension_name, description, and example field:\n"
" * dimension_name: Name of the dimension\n"
" * description: Description of the dimension\n"
" * example_list: List containing example texts and scores\n"
"- input_key: Field name for input text\n"
"Output Parameters:\n"
"- DataFrame containing scores for 6 evaluation dimensions with columns: Text Structure, Diversity & Complexity, Fluency & Understandability, Safety, Educational Value, Content Accuracy & Effectiveness"
)
else:
return "Evaluate multiple meta attributes of text using LLM."
def get_score(self, samples, input_key):
system_prompt = self.prompt.build_system_prompt()
user_prompts = []
for sample in samples:
input_text = sample.get(input_key, '')
user_prompt = self.prompt.build_prompt(input_text)
full_prompt = system_prompt + "\n" + user_prompt
user_prompts.append(full_prompt)
responses = self.llm_serving.generate_from_input(user_inputs=user_prompts)
scores = []
for i, response in enumerate(responses):
try:
lines = response.strip().split("\n")
last_line = lines[-1].strip()
parsed_scores = ast.literal_eval(last_line)
if isinstance(parsed_scores, list) and len(parsed_scores) == len(self.output_columns):
scores.append(parsed_scores)
else:
raise ValueError("Score format invalid")
except Exception as e:
self.logger.warning(f"Failed to extract score from response {i}: {e}")
scores.append([float('nan')] * len(self.output_columns))
return scores
def eval(self, dataframe: pd.DataFrame, input_key: str):
samples = dataframe.to_dict(orient='records')
self.logger.info(f"Evaluating {self.score_name}...")
scores = self.get_score(samples, input_key)
self.logger.info("Evaluation complete!")
return scores
def run(self, storage: DataFlowStorage, input_key: str):
self.input_key = input_key
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, self.input_key)
# Expand the score lists into fixed-named columns, one per dimension
score_df = pd.DataFrame(scores, columns=self.output_columns)
dataframe = pd.concat([dataframe, score_df], axis=1)
storage.write(dataframe)
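# ---------------------------------------------------------------------------
# Minimal usage sketch for MetaSampleEvaluator (illustrative only). The llm_serving
# and storage objects are hypothetical placeholders, so those calls are left
# commented out; only the shape of a custom `dimensions` entry is shown here.
if __name__ == "__main__":
    custom_dimensions = [
        {
            "dimension_name": "Clarity",
            "description": "Judge whether the text states its point without ambiguity.",
            "example_list": [
                {"text": "Water boils at 100 degrees Celsius at sea level.", "score": "5"},
                {"text": "It kind of does the thing sometimes, maybe.", "score": "2"},
            ],
        }
    ]
    # evaluator = MetaSampleEvaluator(llm_serving=my_llm_serving, dimensions=custom_dimensions)
    # evaluator.run(storage=my_storage, input_key="raw_content")
    print([d["dimension_name"] for d in custom_dimensions])  # columns that would be appended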
import torch
from torch import nn
from transformers import BertModel, BertConfig, PreTrainedModel, AutoTokenizer
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.storage import DataFlowStorage
from tqdm import tqdm
from dataflow.utils.utils import get_logger
import numpy as np
@OPERATOR_REGISTRY.register()
class PairQualSampleEvaluator(OperatorABC):
def __init__(self, model_cache_dir:str='./dataflow_cache', device="cuda", lang='en', max_length=512):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
self.model_cache_dir = model_cache_dir
self.lang = lang
self.max_length = max_length
self.score_name = 'PairQualScore'
if lang not in ['en', 'zh']:
raise ValueError("Invalid value for 'lang'. Only 'en' or 'zh' are allowed.")
if self.lang == 'en':
model = "zks2856/PairQual-Scorer-en"
config = BertConfig.from_pretrained(model, cache_dir=self.model_cache_dir)
self.model = BertForRegression_en.from_pretrained(model, config=config, trust_remote_code=True, cache_dir=self.model_cache_dir).to(self.device).eval()
self.tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True, cache_dir=self.model_cache_dir)
else:
model = "zks2856/PairQual-Scorer-zh"
config = BertConfig.from_pretrained(model, cache_dir=self.model_cache_dir)
self.model = BertForRegression_zh.from_pretrained(model, config=config, trust_remote_code=True, cache_dir=self.model_cache_dir).to(self.device).eval()
self.tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True, cache_dir=self.model_cache_dir)
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于BGE模型和GPT成对比较数据训练的文本质量评分器,支持中英文输入。通过对文本进行单样本评估,返回0-1之间的质量分数,"
"分数越高表示文本质量越好。模型分为英文版本(zks2856/PairQual-Scorer-en)和中文版本(zks2856/PairQual-Scorer-zh)。\n"
"输入参数:\n"
"- text: 待评估的文本字符串\n"
"- lang: 语言类型,可选'en'或'zh'\n"
"输出参数:\n"
"- float: 0-1之间的质量分数,越高表示质量越好"
)
else:
return (
"Text quality scorer trained on BGE model and GPT pairwise comparison data, supporting bilingual input. Evaluate text through single-sample assessment, "
"returning a quality score between 0 and 1, where higher scores indicate better text quality. Models include English version (zks2856/PairQual-Scorer-en) and Chinese version (zks2856/PairQual-Scorer-zh).\n"
"Input parameters:\n"
"- text: Text string to be evaluated\n"
"- lang: Language type, optional 'en' or 'zh'\n"
"Output parameters:\n"
"- float: Quality score between 0 and 1, higher values indicate better quality"
)
def inference(self, input_text):
inputs = self.tokenizer(input_text, return_tensors='pt', padding=True, truncation=True, max_length=self.max_length).to(self.device)
with torch.no_grad():
_, score = self.model(inputs)
return score.item()
def eval(self, dataframe, input_key):
self.logger.info(f"Evaluating {self.score_name}...")
scores = []
for sample in tqdm(dataframe[input_key], desc="PairQualScorer Evaluating..."):
score = self.inference(sample)
scores.append(score)
self.logger.info("Evaluation complete!")
return np.array(scores)
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='PairQualScore'):
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, input_key)
dataframe[output_key] = scores
storage.write(dataframe)
class BertForRegression_en(PreTrainedModel):
config_class = BertConfig
def __init__(self, config):
super().__init__(config)
self.bert = BertModel(config)
self.regression = nn.Sequential(
nn.Linear(config.hidden_size, 512),
nn.ReLU(),
nn.Dropout(0.2),
nn.Linear(512, 1)
)
self.post_init()
def forward(self, inputs):
encoded = self.bert(**inputs)
score = self.regression(encoded['pooler_output'])
return encoded, score
class BertForRegression_zh(PreTrainedModel):
config_class = BertConfig
def __init__(self, config):
super().__init__(config)
self.bert = BertModel(config)
self.regression = nn.Sequential(
nn.Linear(config.hidden_size, 256),
nn.ReLU(),
nn.Linear(256, 1)
)
self.post_init()
def forward(self, inputs):
encoded = self.bert(**inputs)
score = self.regression(encoded['pooler_output'])
return encoded, score
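# ---------------------------------------------------------------------------
# Minimal usage sketch for PairQualSampleEvaluator (illustrative only). The actual
# calls are commented out because they download model weights from Hugging Face;
# the column name "raw_content" is an assumption, not a fixed requirement.
if __name__ == "__main__":
    import pandas as pd
    demo_df = pd.DataFrame({"raw_content": ["A short passage to be scored."]})
    # evaluator = PairQualSampleEvaluator(lang="en", device="cpu")
    # scores = evaluator.eval(demo_df, "raw_content")  # numpy array, one float per row
    # print(scores)
    print(demo_df.shape)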
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.utils import get_logger
@OPERATOR_REGISTRY.register()
class PerplexitySampleEvaluator(OperatorABC):
def __init__(self, model_name: str = 'gpt2', device='cuda'):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.model_name = model_name
self.score_name = 'PerplexityScore'
self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
# Load Hugging Face model and tokenizer
try:
self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
self.model = AutoModelForCausalLM.from_pretrained(self.model_name).to(self.device)
self.model.eval() # Set the model to evaluation mode
self.logger.info(f'{self.__class__.__name__} initialized with model {self.model_name}.')
except Exception as e:
self.logger.error(f"Error loading model: {e}")
raise RuntimeError(f"Model loading failed. Please ensure the model is available from Hugging Face.")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于Huggingface语言模型计算文本的困惑度(Perplexity),困惑度越低表示文本的流畅性和可理解性越高。"
"输入参数:\n"
"- model_name:Huggingface模型路径或名称\n"
"- device:模型运行设备\n"
"输出参数:\n"
"- float: 困惑度值,越低表示文本流畅性越好"
)
else:
return (
"Calculate text perplexity using a Huggingface language model; lower perplexity indicates better fluency and understandability."
"Input Parameters:\n"
"- model_name: Huggingface model path or name\n"
"- device: Model device\n\n"
"Output Parameters:\n"
"- float: Perplexity score, lower values indicate better fluency and understandability"
)
def eval(self, dataframe, input_key):
input_texts = dataframe[input_key].to_list()
self.logger.info(f"Evaluating {self.score_name}...")
results = []
# Use tqdm to show progress
for text in tqdm(input_texts, desc="Evaluating perplexity", unit="text"):
perplexity = self.calculate_perplexity(text)
results.append(perplexity)
self.logger.info("Evaluation complete!")
return results
def calculate_perplexity(self, text: str) -> float:
""" 使用Hugging Face模型计算困惑度 """
# Encode the input text
inputs = self.tokenizer(text, return_tensors='pt', padding="longest", truncation=True).to(self.device)
# Calculate log probability
with torch.no_grad():
outputs = self.model(**inputs, labels=inputs['input_ids'])
log_likelihood = outputs.loss * inputs['input_ids'].size(1)
# Perplexity = exp(mean negative log-likelihood per token); since outputs.loss is already token-averaged, this equals exp(outputs.loss)
perplexity = torch.exp(log_likelihood / inputs['input_ids'].size(1)).item()
return perplexity
def run(self, storage: DataFlowStorage, input_key: str = 'raw_content', output_key: str = 'PerplexityScore'):
# Read the data, evaluate the score, and save the results
dataframe = storage.read("dataframe")
self.logger.info(f"Perplexity score ready to evaluate.")
scores = self.eval(dataframe, input_key)
dataframe[output_key] = scores
storage.write(dataframe)
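# ---------------------------------------------------------------------------
# Worked note on the formula in calculate_perplexity above: `outputs.loss` is already
# the mean negative log-likelihood per token, so multiplying by the sequence length
# and dividing it back out is a no-op and perplexity reduces to exp(loss). Pure
# arithmetic check with a hypothetical loss value; no model is involved.
if __name__ == "__main__":
    import math
    mean_nll = 3.2      # assumed mean negative log-likelihood per token
    num_tokens = 10
    total_nll = mean_nll * num_tokens
    assert math.isclose(math.exp(total_nll / num_tokens), math.exp(mean_nll))
    print(f"perplexity = {math.exp(mean_nll):.2f}")  # about 24.5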
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.storage import DataFlowStorage
from datasets import Dataset
from tqdm import tqdm
from dataflow import get_logger
from dataflow.operators.text_pt.eval.Qurating.qurater_annotate import ModelAnnotator
from dataflow.operators.text_pt.eval.Qurating.qurater_annotate import TokenizeAndChunk
import torch
@OPERATOR_REGISTRY.register()
class QuratingSampleEvaluator(OperatorABC):
def __init__(self, map_batch_size: int = 512, num_workers: int = 1, device_batch_size: int = 16, device: str = 'cuda',
labels: list = ['writing_style', 'required_expertise', 'facts_and_trivia', 'educational_value'], model_cache_dir: str = './dataflow_cache'):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
self.model = 'princeton-nlp/QuRater-1.3B'
self.tokens_field = 'input_ids'
self.tokens = 512
self.map_batch_size = map_batch_size
self.batch_size = -1
self.num_workers = num_workers
self.model_cache_dir = model_cache_dir
self.labels = labels or []
self.device_batch_size = device_batch_size
self.device = device or ('cuda' if torch.cuda.is_available() else 'cpu')
self.score_type = float
self.data_type = 'text'
self.score_name = 'QuratingScore'
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"通过Qurating模型(princeton-nlp/QuRater-1.3B)从四个维度评估文本质量:写作风格(writing_style)、所需专业程度(required_expertise)、"
"事实与趣闻(facts_and_trivia)和教育价值(educational_value)。每个维度返回0-1之间的分数,综合评估文本的整体质量。\n"
"输入参数:\n"
"- text: 待评估的文本字符串\n"
"- labels: 评估维度列表,默认为['writing_style', 'required_expertise', 'facts_and_trivia', 'educational_value']\n"
"输出参数:\n"
"- dict: 包含各维度分数的字典,键为维度名称,值为0-1之间的分数"
)
else:
return (
"Evaluate text quality across four dimensions using the Qurating model (princeton-nlp/QuRater-1.3B): writing_style, required_expertise, "
"facts_and_trivia, and educational_value. Each dimension returns a score between 0 and 1, providing a comprehensive assessment of overall text quality.\n"
"Input parameters:\n"
"- text: Text string to be evaluated\n"
"- labels: List of evaluation dimensions, default ['writing_style', 'required_expertise', 'facts_and_trivia', 'educational_value']\n"
"Output parameters:\n"
"- dict: Dictionary containing scores for each dimension, with keys as dimension names and values as scores between 0 and 1"
)
def _score_func(self, sample):
"""Process a single sample and return the score."""
batch_dict = {'text': [sample]} # Wrap sample into a list for processing
dataset = Dataset.from_dict(batch_dict)
# Tokenize and chunk
dataset = dataset.map(
TokenizeAndChunk(self.model, 'text', self.tokens_field, self.tokens, self.model_cache_dir),
batched=True,
batch_size=self.map_batch_size,
num_proc=self.num_workers,
remove_columns=dataset.column_names
)
# Annotate the model results
dataset = dataset.map(
ModelAnnotator(self.model, self.labels, self.device_batch_size, self.device, self.model_cache_dir),
batched=True,
with_indices=True,
batch_size=self.map_batch_size,
remove_columns=dataset.column_names
)
results_dict = dataset.to_dict()
result_filtered = {}
for label in self.labels:
average_key = f"{label}_average"
if average_key in results_dict:
new_key = f"Qurating{''.join([word.capitalize() for word in label.split('_')])}Score"
# Single-sample call: take the sole average value for this label
result_filtered[new_key] = results_dict[average_key][0]
return result_filtered
def eval(self, dataframe, input_key):
self.logger.info(f"Evaluating {self.score_name}...")
batch_dict = {'text': dataframe[input_key]} # Build a single-column dataset from the input texts
dataset = Dataset.from_dict(batch_dict)
# Tokenize and chunk
dataset = dataset.map(
TokenizeAndChunk(self.model, 'text', self.tokens_field, self.tokens, self.model_cache_dir),
batched=True,
batch_size=self.map_batch_size,
num_proc=self.num_workers,
remove_columns=dataset.column_names
)
# Annotate the model results
dataset = dataset.map(
ModelAnnotator(self.model, self.labels, self.device_batch_size, self.device, self.model_cache_dir),
batched=True,
with_indices=True,
batch_size=self.map_batch_size,
remove_columns=dataset.column_names
)
results_dict = dataset.to_dict()
result_filtered = {}
for label in self.labels:
average_key = f"{label}_average"
if average_key in results_dict:
new_key = f"Qurating{''.join([word.capitalize() for word in label.split('_')])}Score"
result_filtered[new_key] = results_dict[average_key] # Use the average values
self.logger.info("Evaluation complete!")
return result_filtered
def run(self, storage: DataFlowStorage, input_key: str, output_key: str):
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, input_key)
# eval() returns a dict mapping score-column names to per-sample value lists
for key, value in scores.items():
if key not in dataframe:
dataframe[key] = value
storage.write(dataframe)
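# ---------------------------------------------------------------------------
# Illustration of the score-column naming used in eval/_score_func above: each label
# is CamelCased and wrapped as "Qurating<Label>Score". Pure string manipulation.
if __name__ == "__main__":
    for label in ["writing_style", "educational_value"]:
        key = f"Qurating{''.join(word.capitalize() for word in label.split('_'))}Score"
        print(label, "->", key)  # e.g. writing_style -> QuratingWritingStyleScore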
from typing import List
import re
from huggingface_hub import hf_hub_download
import fasttext
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.storage import DataFlowStorage
from tqdm import tqdm
import numpy as np
@OPERATOR_REGISTRY.register()
class TextbookSampleEvaluator(OperatorABC):
def __init__(self, model_cache_dir='./dataflow_cache'):
self.logger = get_logger()
self.logger.info(f'Initializing {self.__class__.__name__}...')
model_path = hf_hub_download(
repo_id='kenhktsui/llm-data-textbook-quality-fasttext-classifer-v2',
filename='model.bin',
cache_dir=model_cache_dir
)
low_score=1.0
mid_score=3.0
high_score=5.0
self.model = fasttext.load_model(model_path)
self.score_type = float
self.data_type = 'text'
self.score_name = 'TextbookScore'
self.score_dict = {
'__label__Low': low_score,
'__label__Mid': mid_score,
'__label__High': high_score
}
self.logger.info(f'{self.__class__.__name__} initialized.')
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于FastText分类器(kenhktsui/llm-data-textbook-quality-fasttext-classifer-v2)评估文本的教育价值,将文本分为低(Low)、中(Mid)、高(High)三个等级,"
"并映射为1.0、3.0、5.0的分数。适用于筛选适合作为教材的高质量文本内容。\n"
"输入参数:\n"
"- text: 待评估的文本字符串\n"
"输出参数:\n"
"- float: 教育价值分数,可能值为1.0(低)、3.0(中)、5.0(高)"
)
else:
return (
"Assess the educational value of text using a FastText classifier (kenhktsui/llm-data-textbook-quality-fasttext-classifer-v2), categorizing text into Low, Mid, and High levels, "
"mapped to scores of 1.0, 3.0, and 5.0 respectively. Suitable for filtering high-quality text content suitable as teaching materials.\n"
"Input parameters:\n"
"- text: Text string to be evaluated\n"
"Output parameters:\n"
"- float: Educational value score, possible values 1.0 (Low), 3.0 (Mid), 5.0 (High)"
)
@staticmethod
def replace_newlines(text: str) -> str:
return re.sub("\n+", " ", text)
def _score_func(self, text_list: List[str]) -> List[float]:
text_list = [self.replace_newlines(text) for text in text_list]
pred = self.model.predict(text_list, k=-1)
score_list = []
for labels, scores in zip(*pred):
score = 0
for label, score_value in zip(labels, scores):
score += self.score_dict.get(label, 0) * score_value
score_list.append(float(score))
return score_list
def eval(self, dataframe, input_key):
scores = []
text_list = dataframe[input_key]
self.logger.info(f"Evaluating {self.score_name}...")
for sample in tqdm(text_list, desc="TextbookScorer Evaluating..."):
score = self._score_func([sample])
scores.append(score)
self.logger.info("Evaluation complete!")
return np.array(scores)
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='TextbookScore'):
dataframe = storage.read("dataframe")
scores = self.eval(dataframe, input_key)
# eval() returns one single-element score list per sample; flatten to one score per row
dataframe[output_key] = [score_list[0] for score_list in scores]
storage.write(dataframe)
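# ---------------------------------------------------------------------------
# Worked example of the expected-score computation in _score_func above: fastText
# returns a probability per label, and the final score is the probability-weighted
# sum of the mapped values (1.0 / 3.0 / 5.0). The probabilities below are assumed.
if __name__ == "__main__":
    score_dict = {'__label__Low': 1.0, '__label__Mid': 3.0, '__label__High': 5.0}
    labels = ('__label__High', '__label__Mid', '__label__Low')
    probs = (0.7, 0.2, 0.1)  # hypothetical classifier output, sums to 1
    expected = sum(score_dict[label] * p for label, p in zip(labels, probs))
    print(expected)  # 5.0*0.7 + 3.0*0.2 + 1.0*0.1, approximately 4.2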
# import sys
# from dataflow.utils.registry import LazyLoader
# cur_path = "dataflow/operators/filter/"
# _import_structure = {
# # Primary filters
# "NgramFilter": (cur_path + "ngram_filter.py", "NgramFilter"),
# "LanguageFilter": (cur_path + "language_filter.py", "LanguageFilter"),
# "DeitaQualityFilter": (cur_path + "deita_quality_filter.py", "DeitaQualityFilter"),
# "DeitaComplexityFilter": (cur_path + "deita_complexity_filter.py", "DeitaComplexityFilter"),
# "InstagFilter": (cur_path + "instag_filter.py", "InstagFilter"),
# "PairQualFilter": (cur_path + "pair_qual_filter.py", "PairQualFilter"),
# "QuratingFilter": (cur_path + "qurating_filter.py", "QuratingFilter"),
# "SuperfilteringFilter": (cur_path + "superfiltering_filter.py", "SuperfilteringFilter"),
# "FineWebEduFilter": (cur_path + "fineweb_edu_filter.py", "FineWebEduFilter"),
# "TextbookFilter": (cur_path + "text_book_filter.py", "TextbookFilter"),
# "AlpagasusFilter": (cur_path + "alpagasus_filter.py", "AlpagasusFilter"),
# "DebertaV3Filter": (cur_path + "debertav3_filter.py", "DebertaV3Filter"),
# "LangkitFilter": (cur_path + "langkit_filter.py", "LangkitFilter"),
# "LexicalDiversityFilter": (cur_path + "lexical_diversity_filter.py", "LexicalDiversityFilter"),
# "PerplexityFilter": (cur_path + "perplexity_filter.py", "PerplexityFilter"),
# "PerspectiveFilter": (cur_path + "perspective_filter.py", "PerspectiveFilter"),
# "PresidioFilter": (cur_path + "presidio_filter.py", "PresidioFilter"),
# "RMFilter": (cur_path + "reward_model_filter.py", "RMFilter"),
# "TreeinstructFilter": (cur_path + "treeinstruct_filter.py", "TreeinstructFilter"),
# # Heuristic filters
# "ColonEndFilter": (cur_path + "heuristics.py", "ColonEndFilter"),
# "WordNumberFilter": (cur_path + "heuristics.py", "WordNumberFilter"),
# "BlocklistFilter": (cur_path + "heuristics.py", "BlocklistFilter"),
# "SentenceNumberFilter": (cur_path + "heuristics.py", "SentenceNumberFilter"),
# "LineEndWithEllipsisFilter": (cur_path + "heuristics.py", "LineEndWithEllipsisFilter"),
# "ContentNullFilter": (cur_path + "heuristics.py", "ContentNullFilter"),
# "MeanWordLengthFilter": (cur_path + "heuristics.py", "MeanWordLengthFilter"),
# "SymbolWordRatioFilter": (cur_path + "heuristics.py", "SymbolWordRatioFilter"),
# "HtmlEntityFilter": (cur_path + "heuristics.py", "HtmlEntityFilter"),
# "IDCardFilter": (cur_path + "heuristics.py", "IDCardFilter"),
# "NoPuncFilter": (cur_path + "heuristics.py", "NoPuncFilter"),
# "SpecialCharacterFilter": (cur_path + "heuristics.py", "SpecialCharacterFilter"),
# "WatermarkFilter": (cur_path + "heuristics.py", "WatermarkFilter"),
# "StopWordFilter": (cur_path + "heuristics.py", "StopWordFilter"),
# "CurlyBracketFilter": (cur_path + "heuristics.py", "CurlyBracketFilter"),
# "CapitalWordsFilter": (cur_path + "heuristics.py", "CapitalWordsFilter"),
# "LoremIpsumFilter": (cur_path + "heuristics.py", "LoremIpsumFilter"),
# "UniqueWordsFilter": (cur_path + "heuristics.py", "UniqueWordsFilter"),
# "CharNumberFilter": (cur_path + "heuristics.py", "CharNumberFilter"),
# "LineStartWithBulletpointFilter": (cur_path + "heuristics.py", "LineStartWithBulletpointFilter"),
# "LineWithJavascriptFilter": (cur_path + "heuristics.py", "LineWithJavascriptFilter"),
# # Deduplicators
# "MinHashDeduplicator": (cur_path + "minhash_deduplicator.py", "MinHashDeduplicator"),
# "CCNetDeduplicator": (cur_path + "ccnet_deduplicator.py", "CCNetDeduplicator"),
# "HashDeduplicator": (cur_path + "hash_deduplicator.py", "HashDeduplicator"),
# "NgramHashDeduplicator": (cur_path + "ngramhash_deduplicator.py", "NgramHashDeduplicator"),
# "SemDeduplicator": (cur_path + "sem_deduplicator.py", "SemDeduplicator"),
# "SimHashDeduplicator": (cur_path + "simhash_deduplicator.py", "SimHashDeduplicator"),
# }
# sys.modules[__name__] = LazyLoader(__name__, cur_path, _import_structure)
import hashlib
import struct
from tqdm import tqdm
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
def sha1_hash(data: bytes, d: int = 32) -> int:
"""
Generate a d-bit hash value from the given data.
Parameters
----------
data : bytes
The data to be hashed.
d : int
The number of bits of the hash value.
Returns
-------
int
The hash value.
Examples
--------
>>> sha1_hash(b"hello world", 32)
896314922
>>> sha1_hash(b"hello world", 64)
13028719972609469994
>>> sha1_hash(b"hello world", 128)
310522945683037930239412421226792791594
"""
if d == 32:
return struct.unpack("<I", hashlib.sha1(data, usedforsecurity=False).digest()[:4])[0]
if d == 64:
return struct.unpack("<Q", hashlib.sha1(data, usedforsecurity=False).digest()[:8])[0]
# struct is faster but does not support arbitrary bit lengths
return int.from_bytes(hashlib.sha1(data, usedforsecurity=False).digest()[: d // 8], byteorder="little")
@OPERATOR_REGISTRY.register()
class CCNetDeduplicateFilter(OperatorABC):
def __init__(self, bit_length: int = 64):
self.logger = get_logger()
self.bit_length = bit_length
self.logger.info(f"Initializing {self.__class__.__name__} with bit length = {bit_length}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"CCNet去重方法,基于SHA-1哈希算法的前N位进行重复识别,实现精确去重。\n\n"
"初始化参数:\n"
"- bit_length: 哈希值的位数,默认为64位\n\n"
"运行参数:\n"
"- input_keys: 用于计算哈希的多个字段列表(与input_key二选一)\n"
"- input_key: 用于计算哈希的单个字段名(与input_keys二选一)\n"
"- output_key: 去重标记字段名,默认为'minhash_deduplicated_label'\n\n"
"输出说明:标记为1的数据表示首次出现,标记为0的数据表示重复数据"
)
else:
return (
"CCNet deduplication method. Identify duplicates using first N bits of SHA-1 hash for exact deduplication.\n\n"
"Initialization Parameters:\n"
"- bit_length: Number of bits for hash value, default is 64\n\n"
"Run Parameters:\n"
"- input_keys: List of multiple fields for hash calculation (alternative to input_key)\n"
"- input_key: Single field name for hash calculation (alternative to input_keys)\n"
"- output_key: Deduplication label field name, default is 'minhash_deduplicated_label'\n\n"
"Output Description: Data marked as 1 indicates first occurrence, 0 indicates duplicate"
)
def _compute_hash(self, text: bytes) -> int:
return sha1_hash(text, self.bit_length)
def run(self, storage: DataFlowStorage, input_keys: list = None, input_key: str = None, output_key: str = 'minhash_deduplicated_label'):
if input_keys is None and input_key is None:
self.logger.error(f"Need to specify either input_keys or input_key!")
raise ValueError(f"Need to specify either input_keys or input_key!")
if input_keys is not None and input_key is not None:
self.logger.error(f"{self.__class__.__name__} only need one input args!")
raise ValueError(f"{self.__class__.__name__} only need one input args!")
if input_keys is not None:
self.logger.info(f"Running {self.__class__.__name__} with input_keys = {input_keys} and output_key = {output_key}")
else:
self.logger.info(f"Running {self.__class__.__name__} with input_key = {input_key} and output_key = {output_key}")
self.input_key = input_key
self.input_keys = input_keys
self.output_key = output_key
seen_hashes = set()
dataframe = storage.read("dataframe")
labels = [0] * len(dataframe)
for idx, sample in tqdm(enumerate(dataframe.to_dict(orient='records')), desc=f"Implementing {self.__class__.__name__}", total=len(dataframe)):
if input_keys is not None:
text = '\n'.join([f"{k}:\n{sample[k]}" for k in input_keys])
else:
text = sample[self.input_key]
text = text.encode('utf-8')
hash_value = self._compute_hash(text)
if hash_value not in seen_hashes:
labels[idx] = 1
seen_hashes.add(hash_value)
dataframe[self.output_key] = labels
filtered_dataframe = dataframe[(dataframe[self.output_key] > 0)]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Deduplication completed. Total unique items: {sum(labels)}")
return [self.output_key,]
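# ---------------------------------------------------------------------------
# Minimal sketch of the first-occurrence labeling used in run() above: identical
# texts hash to the same value, so only the first copy gets label 1. Safe to run.
if __name__ == "__main__":
    seen, labels = set(), []
    for text in ["hello world", "another line", "hello world"]:
        h = sha1_hash(text.encode("utf-8"), 64)
        labels.append(1 if h not in seen else 0)
        seen.add(h)
    print(labels)  # [1, 1, 0]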
import numpy as np
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.operators.text_pt import DebertaV3SampleEvaluator
@OPERATOR_REGISTRY.register()
class DebertaV3Filter(OperatorABC):
def __init__(self, allowed_scores : list = ['Medium', 'High'], model_name='nvidia/quality-classifier-deberta', model_cache_dir='./dataflow_cache', device='cuda', batch_size=16):
self.logger = get_logger()
self.allowed_scores = allowed_scores
self.scorer = DebertaV3SampleEvaluator(
model_name=model_name,
model_cache_dir=model_cache_dir,
device=device,
batch_size=batch_size,
)
self.logger.info(f"Initializing {self.__class__.__name__} with allowed_scores = {self.allowed_scores}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于DebertaV3Scorer打分器的得分对数据进行过滤。使用Nvidia Deberta V3模型的质量分类器评估文本质量。\n\n"
"初始化参数:\n"
"- allowed_scores: 允许通过的分数列表,默认为['Medium', 'High']\n"
"- model_name: 模型名称,默认为'nvidia/quality-classifier-deberta'\n"
"- model_cache_dir: 模型缓存目录,默认为'./dataflow_cache'\n"
"- device: 运行设备,默认为'cuda'\n"
"- batch_size: 批处理大小,默认为16\n\n"
"运行参数:\n"
"- input_key: 输入文本字段名\n"
"- output_key: 输出分数字段名,默认为'Debertav3Score'\n\n"
"过滤逻辑:保留分类结果在allowed_scores列表中的数据"
)
else:
return (
"Filter data using scores from the DebertaV3Scorer. Evaluate text quality using Nvidia Deberta V3 model-based quality classifier.\n\n"
"Initialization Parameters:\n"
"- allowed_scores: List of allowed scores, default is ['Medium', 'High']\n"
"- model_name: Model name, default is 'nvidia/quality-classifier-deberta'\n"
"- model_cache_dir: Model cache directory, default is './dataflow_cache'\n"
"- device: Running device, default is 'cuda'\n"
"- batch_size: Batch size, default is 16\n\n"
"Run Parameters:\n"
"- input_key: Input text field name\n"
"- output_key: Output score field name, default is 'Debertav3Score'\n\n"
"Filter Logic: Keep data with classification results in allowed_scores list"
)
def run(self, storage: DataFlowStorage, input_key: str, output_key: str = 'Debertav3Score'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
scores = self.scorer.eval(dataframe, self.input_key)
dataframe[self.output_key] = scores
labels = np.array([1 if score in self.allowed_scores else 0 for score in scores])
filtered_dataframe = dataframe[labels == 1]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
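# ---------------------------------------------------------------------------
# Small illustration of the allowed_scores mask built in run() above, using
# hypothetical classifier outputs; only rows labelled Medium or High survive.
if __name__ == "__main__":
    demo_scores = ["High", "Low", "Medium"]
    allowed = ["Medium", "High"]
    mask = np.array([1 if s in allowed else 0 for s in demo_scores])
    print(mask)  # [1 0 1]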
from dataflow.operators.text_pt import FineWebEduSampleEvaluator
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.utils import get_logger
@OPERATOR_REGISTRY.register()
class FineWebEduFilter(OperatorABC):
def __init__(self, min_score: float = 2.5, max_score: float = 10000, model_cache_dir: str = './dataflow_cache', device: str = 'cuda'):
self.min_score = min_score
self.max_score = max_score
self.logger = get_logger()
self.scorer = FineWebEduSampleEvaluator(model_cache_dir=model_cache_dir, device=device)
self.filter_name = 'FineWebEduFilter'
self.logger.info(f"Initializing {self.filter_name} with min_score = {self.min_score}, max_score = {self.max_score}, "
f"device = {device}, model_cache_dir = {model_cache_dir}")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于FineWebEduScorer打分器的得分对数据进行过滤。Fineweb-Edu是一个用于评估文本教育价值的分类器。\n\n"
"初始化参数:\n"
"- min_score: 最低分数阈值,默认为2.5\n"
"- max_score: 最高分数阈值,默认为10000\n"
"- model_cache_dir: 模型缓存目录,默认为'./dataflow_cache'\n"
"- device: 运行设备,默认为'cuda'\n\n"
"运行参数:\n"
"- input_key: 输入文本字段名\n"
"- output_key: 输出分数字段名,默认为'FinewebEduScore'\n\n"
"评分标准:0-5分,分数越高表示文本具有越高的教育价值\n"
"过滤逻辑:保留分数在[min_score, max_score]范围内的数据"
)
else:
return (
"Filter data using scores from the FineWebEduScorer. Fineweb-Edu is a classifier for evaluating educational value of text.\n\n"
"Initialization Parameters:\n"
"- min_score: Minimum score threshold, default is 2.5\n"
"- max_score: Maximum score threshold, default is 10000\n"
"- model_cache_dir: Model cache directory, default is './dataflow_cache'\n"
"- device: Running device, default is 'cuda'\n\n"
"Run Parameters:\n"
"- input_key: Input text field name\n"
"- output_key: Output score field name, default is 'FinewebEduScore'\n\n"
"Scoring Standard: 0-5 points, higher score indicates more educational content\n"
"Filter Logic: Keep data with scores in [min_score, max_score] range"
)
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='FinewebEduScore'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.filter_name}...")
scores = self.scorer.eval(dataframe, input_key)
dataframe[self.output_key] = scores
filtered_dataframe = dataframe[(scores >= self.min_score) & (scores <= self.max_score)]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
from dataflow.operators.text_pt import PairQualSampleEvaluator
import numpy as np
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.utils import get_logger
from dataflow.utils.storage import DataFlowStorage
@OPERATOR_REGISTRY.register()
class PairQualFilter(OperatorABC):
def __init__(self, min_score=0, max_score=10000, model_cache_dir='./dataflow_cache', lang='en'):
self.logger = get_logger()
self.min_score = min_score
self.max_score = max_score
self.scorer = PairQualSampleEvaluator(model_cache_dir=model_cache_dir, lang=lang)
self.filter_name = 'PairQualFilter'
self.logger.info(f"Initializing {self.filter_name} with min_score = {self.min_score}, max_score = {self.max_score}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于PairQualScorer打分器的得分对数据进行过滤。基于BGE模型,使用GPT对文本成对比较打分后训练而成的双语文本质量评分器,得分越高表示质量越高。\n"
"输入参数:\n"
"- min_score:最小质量得分阈值\n"
"- max_score:最大质量得分阈值\n"
"- model_cache_dir:模型缓存目录路径\n"
"- lang:文本语言类型\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留质量得分在指定范围内的文本\n"
"- 返回包含质量得分字段名的列表"
)
else:
return (
"Filter data using scores from the PairQualScorer. Bilingual text quality scorer trained on GPT pairwise comparison annotations using BGE model; higher scores indicate better quality.\n"
"Input Parameters:\n"
"- min_score: Minimum quality score threshold\n"
"- max_score: Maximum quality score threshold\n"
"- model_cache_dir: Model cache directory path\n"
"- lang: Text language type\n\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only texts with quality score within specified range\n"
"- List containing quality score field name"
)
def eval(self, dataframe, input_key):
self.logger.info(f"Start evaluating {self.filter_name}...")
# Get the scores using the scorer
scores = self.scorer.eval(dataframe, input_key)
# Return the scores for filtering
return np.array(scores)
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='PairQualScore'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.filter_name} with input_key = {self.input_key} and output_key = {self.output_key}...")
scores = np.array(self.scorer.eval(dataframe, input_key))
dataframe[self.output_key] = scores
filtered_dataframe = dataframe[(scores >= self.min_score) & (scores <= self.max_score)]
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
import numpy as np
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.operators.text_pt import PerplexitySampleEvaluator
@OPERATOR_REGISTRY.register()
class PerplexityFilter(OperatorABC):
def __init__(self, min_score: float = 10.0, max_score: float = 500.0, model_name: str = 'gpt2', device='cuda'):
self.logger = get_logger()
self.min_score = min_score
self.max_score = max_score
self.scorer = PerplexitySampleEvaluator(model_name=model_name, device=device)
self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {self.min_score} and max_score = {self.max_score}")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于PerplexityScorer打分器的得分对数据进行过滤。基于Huggingface模型计算文本的困惑度,困惑度越低,文本的流畅性和可理解性越高。\n"
"输入参数:\n"
"- min_score:最小困惑度阈值\n"
"- max_score:最大困惑度阈值\n"
"- model_name:Huggingface模型路径或名称\n"
"- device:模型运行设备\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留困惑度在指定范围内的文本\n"
"- 返回包含困惑度得分字段名的列表"
)
else:
return (
"Filter data using scores from the PerplexityScorer. Uses Huggingface model to calculate text perplexity; lower scores indicate better fluency and understandability.\n"
"Input Parameters:\n"
"- min_score: Minimum perplexity threshold\n"
"- max_score: Maximum perplexity threshold\n"
"- model_name: Huggingface model path or name\n"
"- device: Model device\n\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only texts with perplexity within specified range\n"
"- List containing perplexity score field name"
)
def run(self, storage: DataFlowStorage, input_key: str, output_key: str = 'PerplexityScore'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__} with input_key = {self.input_key} and output_key = {self.output_key}...")
# Compute scores and filter
scores = np.array(self.scorer.eval(dataframe, self.input_key))
dataframe[self.output_key] = scores
filtered_dataframe = dataframe[(scores >= self.min_score) & (scores <= self.max_score)]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]
from dataflow.operators.text_pt import QuratingSampleEvaluator
import numpy as np
from dataflow.core import OperatorABC
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.utils.utils import get_logger
from dataflow.utils.storage import DataFlowStorage
@OPERATOR_REGISTRY.register()
class QuratingFilter(OperatorABC):
def __init__(self, min_scores: dict = {'writing_style': 0,'required_expertise': 0,'facts_and_trivia': 0,'educational_value': 0}, max_scores: dict = {'writing_style': 9,'required_expertise': 9,'facts_and_trivia': 9,'educational_value': 9},
map_batch_size: int = 512, num_workers: int = 1, device_batch_size: int = 16, device: str = 'cuda',
labels: list = ['writing_style', 'required_expertise', 'facts_and_trivia', 'educational_value'], model_cache_dir: str = './dataflow_cache'):
self.logger = get_logger()
self.min_scores = min_scores
self.max_scores = max_scores
# Initialize the QuratingScorer with the passed parameters
self.scorer = QuratingSampleEvaluator(map_batch_size=map_batch_size,
num_workers=num_workers, device_batch_size=device_batch_size, device=device,
labels=labels, model_cache_dir=model_cache_dir)
self.logger.info(f"Initializing {self.__class__.__name__} with min_scores = {self.min_scores} and max_scores = {self.max_scores}...")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于QuratingScorer打分器的得分对数据进行过滤。通过Qurating模型从四个维度评估文本质量:写作风格、所需专业知识、事实与 trivia 内容、教育价值。\n"
"每个维度评分范围为0-9分,综合判断文本质量,可用于筛选高质量教育类或知识类内容。\n"
"输入参数:\n"
"- min_scores:各维度保留样本的最小分数阈值,默认为{'writing_style':0,'required_expertise':0,'facts_and_trivia':0,'educational_value':0}\n"
"- max_scores:各维度保留样本的最大分数阈值,默认为{'writing_style':9,'required_expertise':9,'facts_and_trivia':9,'educational_value':9}\n"
"- map_batch_size:映射批次大小,默认为512\n"
"- num_workers:数据加载工作进程数,默认为1\n"
"- device_batch_size:设备批次大小,默认为16\n"
"- device:模型运行设备,默认为'cuda'\n"
"- labels:评估维度列表,默认为['writing_style', 'required_expertise', 'facts_and_trivia', 'educational_value']\n"
"- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留所有维度分数均在对应阈值范围内的样本\n"
"- 返回包含各维度过滤结果字段名的列表,用于后续算子引用"
)
elif lang == "en":
return (
"Filter data using scores from the QuratingScorer. Evaluate text quality across four dimensions using Qurating model: writing style, required expertise, facts and trivia content, and educational value.\n"
"Each dimension is scored from 0-9, providing comprehensive quality assessment for filtering high-quality educational or knowledge-based content.\n"
"Input Parameters:\n"
"- min_scores: Minimum score thresholds for each dimension, default is {'writing_style':0,'required_expertise':0,'facts_and_trivia':0,'educational_value':0}\n"
"- max_scores: Maximum score thresholds for each dimension, default is {'writing_style':9,'required_expertise':9,'facts_and_trivia':9,'educational_value':9}\n"
"- map_batch_size: Mapping batch size, default is 512\n"
"- num_workers: Number of data loading workers, default is 1\n"
"- device_batch_size: Device batch size, default is 16\n"
"- device: Model running device, default is 'cuda'\n"
"- labels: List of evaluation dimensions, default is ['writing_style', 'required_expertise', 'facts_and_trivia', 'educational_value']\n"
"- model_cache_dir: Model cache directory, default is './dataflow_cache'\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only samples with all dimension scores within corresponding threshold ranges\n"
"- List containing field names of each dimension's filtering results for subsequent operator reference"
)
else:
return "Filter data based on multi-dimensional quality assessment using Qurating model."
def run(self, storage: DataFlowStorage, input_key: str):
self.input_key = input_key
dataframe = storage.read("dataframe")
self.logger.info(f"Running {self.__class__.__name__}...")
# Get the scores for filtering
scores = self.scorer.eval(dataframe, self.input_key)
# Initialize results to all valid (1)
results = np.ones(len(dataframe), dtype=int)
# Iterate over each label to apply the filter and add a column
for label in self.min_scores.keys():
min_score = self.min_scores[label]
max_score = self.max_scores[label]
score_key = f"Qurating{''.join([word.capitalize() for word in label.split('_')])}Score"
metric_scores = np.array(scores[score_key])
# Apply score filter for the current label
metric_filter = (min_score <= metric_scores) & (metric_scores <= max_score)
results = results & metric_filter.astype(int)
# Add a new column named '{label}_label' containing 0 or 1 based on the filter
dataframe[f"{label}_label"] = metric_filter.astype(int)
# Filter the dataframe based on the results
filtered_dataframe = dataframe[results == 1]
# Write the filtered dataframe back to storage
storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
result = [f"{label}_label" for label in self.min_scores.keys()]
return result
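# ---------------------------------------------------------------------------
# Small illustration of how the per-dimension masks above are combined: a sample is
# kept only if every dimension's score falls inside its [min, max] range. The score
# arrays below are hypothetical.
if __name__ == "__main__":
    writing_style = np.array([4.0, 8.5])
    educational_value = np.array([6.0, 2.0])
    keep = ((0 <= writing_style) & (writing_style <= 9)).astype(int)
    keep &= ((3 <= educational_value) & (educational_value <= 9)).astype(int)
    print(keep)  # [1 0]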
from dataflow import get_logger
from dataflow.core import OperatorABC
from dataflow.utils.storage import DataFlowStorage
from dataflow.utils.registry import OPERATOR_REGISTRY
from dataflow.operators.text_pt import TextbookSampleEvaluator
@OPERATOR_REGISTRY.register()
class TextbookFilter(OperatorABC):
def __init__(self, min_score=0.99, max_score=1, model_cache_dir:str='./dataflow_cache'):
self.logger = get_logger()
self.min_score = min_score
self.max_score = max_score
self.scorer = TextbookSampleEvaluator(model_cache_dir=model_cache_dir)
self.logger.info(f"Initializing {self.__class__.__name__} with min_score = {min_score} and max_score = {max_score}")
@staticmethod
def get_desc(lang: str = "zh"):
if lang == "zh":
return (
"基于TextbookScorer打分器的得分对数据进行过滤。使用FastText分类器评估文本的教育价值,判断文本是否适合作为教材内容。\n"
"分类器经过训练可识别具有教育意义、结构清晰、知识准确的文本,适用于构建教育类数据集。\n"
"输入参数:\n"
"- min_score:保留样本的最小教育价值分数阈值,默认为0.99\n"
"- max_score:保留样本的最大教育价值分数阈值,默认为1.0\n"
"- model_cache_dir:模型缓存目录,默认为'./dataflow_cache'\n"
"- input_key:输入文本字段名\n"
"- output_key:教育价值分数字段名,默认为'TextbookScore'\n"
"输出参数:\n"
"- 过滤后的DataFrame,仅保留教育价值分数在[min_score, max_score]范围内的样本\n"
"- 返回包含教育价值分数字段名的列表,用于后续算子引用"
)
elif lang == "en":
return (
"Filter data using scores from the TextbookScorer. Assess educational value of text using FastText classifier to determine if text is suitable as educational material.\n"
"Classifier is trained to identify text with educational significance, clear structure, and accurate knowledge, suitable for building educational datasets.\n"
"Input Parameters:\n"
"- min_score: Minimum educational value score threshold for retaining samples, default is 0.99\n"
"- max_score: Maximum educational value score threshold for retaining samples, default is 1.0\n"
"- model_cache_dir: Model cache directory, default is './dataflow_cache'\n"
"- input_key: Input text field name\n"
"- output_key: Educational value score field name, default is 'TextbookScore'\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only samples with educational value scores within [min_score, max_score] range\n"
"- List containing educational value score field name for subsequent operator reference"
)
else:
return "Filter data based on educational value assessment using FastText textbook classifier."
def run(self, storage: DataFlowStorage, input_key: str, output_key: str='TextbookScore'):
self.input_key = input_key
self.output_key = output_key
dataframe = storage.read("dataframe")
scores = self.scorer.eval(dataframe, self.input_key)
dataframe[self.output_key] = scores
filtered_dataframe = dataframe[(dataframe[self.output_key] >= self.min_score) & (dataframe[self.output_key] <= self.max_score)]
output_file = storage.write(filtered_dataframe)
self.logger.info(f"Filtering completed. Total records passing filter: {len(filtered_dataframe)}.")
return [self.output_key]