return"Measure instruction complexity using Llama-based Deita model."
def infer_complexity(self, input_text):
    complexity_template = ("You are a helpful assistant. Please identify the complexity score of the following user query. \n##Query: {instruction}\n##Complexity: ")
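# A minimal sketch (not the actual operator code) of how the complexity template above
# could be turned into a numeric score with a causal LM. The model name and the
# weighting over the score tokens "1".."6" are illustrative assumptions.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def _sketch_deita_complexity(instruction, model_name="hkust-nlp/deita-complexity-scorer"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    template = ("You are a helpful assistant. Please identify the complexity score of the "
                "following user query. \n##Query: {instruction}\n##Complexity: ")
    inputs = tokenizer(template.format(instruction=instruction), return_tensors="pt")
    with torch.no_grad():
        next_token_logits = model(**inputs).logits[0, -1]
    # Restrict the next-token distribution to the digit tokens "1".."6" and take the
    # probability-weighted mean as the expected complexity score.
    score_token_ids = [tokenizer.convert_tokens_to_ids(str(i)) for i in range(1, 7)]
    probs = torch.softmax(next_token_logits[score_token_ids], dim=-1)
    return float((probs * torch.arange(1, 7, dtype=torch.float)).sum())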
"Evaluate instruction content diversity and intention tags using the Instag scorer. Generate relevant tags by analyzing instruction text, "
"with more tags indicating greater content diversity, while returning detailed explanations of tags. Implemented based on OFA-Sys/InsTagger model.\n"
"Input parameters:\n"
"- query: Instruction text to be evaluated\n"
"Output parameters:\n"
"- int: Number of tags (content diversity indicator)\n"
"- list: List of dictionaries containing tags and explanations"
)
def make_prompt(self, query):
    prompt = f"Please identify tags of user intentions in the following user query and provide an explanation for each tag. Please respond in the JSON format {{\"tag\": str, \"explanation\": str}}.\nUser query: {query}"
    messages = [("user", prompt), ("Assistant", None)]
    seps = [" ", "</s>"]
    ret = "system: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions." + seps[0]
    for i, (role, message) in enumerate(messages):
        if message:
            ret += role + ": " + message + seps[i % 2]
        else:
            ret += role + ":"
    return ret
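# A minimal sketch (illustrative, not the operator's exact parsing code) of turning the
# model's JSON-formatted generation back into the outputs described above: the number
# of tags and a list of {"tag": ..., "explanation": ...} dictionaries.
import json

def _sketch_parse_instag_output(generated_text):
    try:
        tags = json.loads(generated_text)
    except json.JSONDecodeError:
        return 0, []
    if isinstance(tags, dict):  # a single tag may come back as one JSON object
        tags = [tags]
    tags = [t for t in tags if isinstance(t, dict) and "tag" in t]
    return len(tags), tags

# Example: _sketch_parse_instag_output('[{"tag": "math", "explanation": "asks for a proof"}]')
# returns (1, [{"tag": "math", "explanation": "asks for a proof"}]).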
def inference_batch(self, queries):
    """Process batch of queries using either local model or API."""
"Score text quality using a reward model trained on human preference data (OpenAssistant/reward-model-deberta-v3-large-v2), where higher scores indicate better quality. "
"The model takes instruction-response text pairs as input and outputs a reward score between 0 and 1, reflecting human preference judgments on text quality.\n"
"Input parameters:\n"
"- instruction: Instruction text string\n"
"- output: Response text string\n"
"Output parameters:\n"
"- float: Reward score between 0 and 1, higher values indicate better quality"
"Evaluate the follow difficulty of instructions using the Superfiltering method, which calculates the ratio of conditional perplexity to independent perplexity based on the GPT-2 model. "
"Higher scores indicate greater difficulty in following the instruction. This method assesses instruction clarity and follow difficulty by comparing response perplexity under instruction conditions with independent response perplexity.\n"
return"Filter operator based on InstagScorer. Uses pre-trained Instag model to analyze instructions, returning the number of tags to evaluate content diversity. Parameters include model cache directory (model_cache_dir), computing device (device), and maximum new tokens (max_new_tokens). Filter range is controlled by min_score and max_score parameters, with more tags indicating greater content diversity."
"Filter data using scores from the RMScorer. Quality scoring using reward model trained with human preference data, where higher scores indicate better quality.\n"
"Reward model evaluates human preference metrics such as relevance, helpfulness, and harmlessness, useful for filtering high-quality text aligned with human values.\n"
"Input Parameters:\n"
"- min_score: Minimum reward score threshold for retaining samples, default is 0.2\n"
"- max_score: Maximum reward score threshold for retaining samples, default is 0.8\n"
"- device: Model running device, default is 'cuda'\n"
"- model_cache_dir: Model cache directory, default is './dataflow_cache'\n"
"- input_instruction_key: Instruction field name, default is 'instruction'\n"
"- input_output_key: Output field name, default is 'output'\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only samples with reward scores within [min_score, max_score] range\n"
"- List containing reward score field name for subsequent operator reference"
)
else:
return"Filter data based on quality scores from human preference-trained reward model."
"Filter out low-quality data using the Superfiltering scorer. Evaluate instruction following difficulty by calculating perplexity ratio with GPT-2 model; lower ratios indicate instructions are easier for models to understand and execute.\n"
"Suitable for selecting instruction data appropriate for specific model capabilities, improving model training efficiency and effectiveness.\n"
"Input Parameters:\n"
"- min_score: Minimum score threshold for retaining samples, default is 0.0\n"
"- max_score: Maximum score threshold for retaining samples, default is 1.0\n"
"- device: Model running device, default is 'cuda'\n"
"- model_cache_dir: Model cache directory, default is './dataflow_cache'\n"
"- max_length: Maximum text length, default is 512\n"
"- input_instruction_key: Instruction field name, default is 'instruction'\n"
"- input_input_key: Input field name, default is 'input'\n"
"- input_output_key: Output field name, default is 'output'\n"
"- output_key: Filter result score field name, default is 'SuperfilteringScore'\n"
"Output Parameters:\n"
"- Filtered DataFrame containing only samples with scores within [min_score, max_score] range\n"
"- List containing filter result score field name for subsequent operator reference"
)
else:
return"Filter low-quality data using perplexity ratio calculated with GPT-2 model."
"Filter data using scores from the TreeinstructScore. Measure instruction complexity by the number of nodes in the generated syntax tree; more nodes indicate more complex instructions.\n"
"Suitable for selecting instruction data within specific complexity ranges, balancing dataset difficulty distribution and optimizing model training effectiveness.\n"
"Input Parameters:\n"
"- min_score: Minimum syntax tree node count threshold for retaining samples, default is 7\n"
"- max_score: Maximum syntax tree node count threshold for retaining samples, default is 100\n"
# Based on the existing topics, it is recommended to keep num_samples below 5000. Otherwise, consider adding your own topics to dataflow.prompts.general_text.CondorPrompt to increase data richness.
"Two-stage generation of SFT-style data from scratch based on predefined knowledge tree tags (for over 5000 samples, consider increasing the number of tags). \n"
"First stage generates questions of varying difficulty levels, second stage generates answers for each question.\n"