Commit 97e8278b authored by zzg_666

Adapt backend to vLLM

from dataflow.operators.code import (
CodeCodeToInstructionGenerator,
CodeInstructionToCodeGenerator,
CodeQualitySampleEvaluator,
CodeQualityScoreFilter,
CodeSandboxSampleEvaluator,
)
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.core import LLMServingABC
class CodeSFTSynthesis_APIPipeline():
def __init__(self, llm_serving: LLMServingABC = None):
self.storage = FileStorage(
first_entry_file_name="../example_data/CodePipeline/code_synthesis_input.jsonl",
cache_path="./cache",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
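# NOTE: step 1 below reads the "raw_code" field from each input record; a hypothetical
# line of code_synthesis_input.jsonl (the snippet value is purely illustrative) could be:
# {"raw_code": "def add(a, b):\n    return a + b"}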
# use API server as LLM serving
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=100
)
# Step 1: Code to Instruction synthesizer
self.instruction_synthesizer_step1 = CodeCodeToInstructionGenerator(
llm_serving=self.llm_serving
)
# Step 2: Instruction to Code generator
self.code_generator_step2 = CodeInstructionToCodeGenerator(
llm_serving=self.llm_serving
)
# Step 3: Quality evaluator for (instruction, code) pairs
self.pair_evaluator_step3 = CodeQualitySampleEvaluator(
llm_serving=self.llm_serving
)
# Step 4: Score-based filter
self.score_filter_step4 = CodeQualityScoreFilter(
llm_serving=self.llm_serving,
min_score=0.0,
max_score=10.0
)
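# NOTE: with min_score=0.0 and max_score=10.0 this filter keeps any sample whose score
# falls in the full 0-10 range; raise min_score (e.g. 7.0, as CodeGenDataset_APIPipeline
# below does) to filter more aggressively.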
# Step 5: Sandbox evaluator
self.sandbox_evaluator_step5 = CodeSandboxSampleEvaluator(
language='python'
)
def forward(self):
# Step 1: Generate instructions from raw code
self.instruction_synthesizer_step1.run(
storage=self.storage.step(),
input_key="raw_code",
output_key="generated_instruction"
)
# Step 2: Generate code from instructions
self.code_generator_step2.run(
storage=self.storage.step(),
input_key="generated_instruction",
output_key="generated_code"
)
# Step 3: Evaluate the generated (instruction, code) pairs
self.pair_evaluator_step3.run(
storage=self.storage.step(),
input_instruction_key="generated_instruction",
input_code_key="generated_code"
)
# Step 4: Filter out low-quality samples
self.score_filter_step4.run(
storage=self.storage.step(),
input_instruction_key = "generated_instruction",
input_code_key = "generated_code",
output_key="quality_score_filter_label"
)
# Step 5: Evaluate high-quality code in sandbox
self.sandbox_evaluator_step5.run(
storage=self.storage.step(),
input_key="generated_code"
)
if __name__ == "__main__":
model = CodeSFTSynthesis_APIPipeline()
model.forward()
\ No newline at end of file
from dataflow.operators.code import (
CodeCodeToInstructionGenerator,
CodeInstructionToCodeGenerator,
CodeQualitySampleEvaluator,
CodeQualityScoreFilter,
CodeSandboxSampleEvaluator,
CodeEnhancementInstructionGenerator,
CodeInstructionGenerator
)
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.core import LLMServingABC
class CodeGenDataset_APIPipeline():
def __init__(self, llm_serving: LLMServingABC = None):
self.storage = FileStorage(
first_entry_file_name="../example_data/CodePipeline/raw_code.jsonl",
cache_path="./cache",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
# use API server as LLM serving
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=10
)
# Step 1: Instruction Enhancement
self.instruction_generator_step1 = CodeEnhancementInstructionGenerator(
llm_serving=self.llm_serving
)
# Step 2: Instruction to Code generator
self.code_generator_step2 = CodeInstructionToCodeGenerator(
llm_serving=self.llm_serving
)
# Step 3: Quality evaluator for (instruction, code) pairs
self.pair_evaluator_step3 = CodeQualitySampleEvaluator(
llm_serving=self.llm_serving
)
# Step 4: Score-based filter
self.score_filter_step4 = CodeQualityScoreFilter(
llm_serving=self.llm_serving,
min_score=7.0,
max_score=10.0
)
# Step 5: Sandbox evaluator
self.sandbox_evaluator_step5 = CodeSandboxSampleEvaluator(
language='python'
)
def forward(self):
# Step 1: Generate instructions from raw data
self.instruction_generator_step1.run(
storage=self.storage.step(),
input_key="messages",
output_key="generated_instruction",
)
# Step 2: Generate code from instructions
self.code_generator_step2.run(
storage=self.storage.step(),
input_key="generated_instruction",
output_key="generated_code"
)
# Step 3: Evaluate the generated (instruction, code) pairs
self.pair_evaluator_step3.run(
storage=self.storage.step(),
input_instruction_key="generated_instruction",
input_code_key="generated_code"
)
# Step 4: Filter out low-quality samples
self.score_filter_step4.run(
storage=self.storage.step(),
input_instruction_key = "generated_instruction",
input_code_key = "generated_code",
output_score_key = "quality_score",
output_feedback_key = "quality_feedback",
output_key="quality_score_filter_label"
)
# Step 5: Evaluate high-quality code in sandbox
self.sandbox_evaluator_step5.run(
storage=self.storage.step(),
input_key="generated_code"
)
if __name__ == "__main__":
model = CodeGenDataset_APIPipeline()
model.forward()
\ No newline at end of file
from dataflow.operators.core_text import BenchDatasetEvaluator
from dataflow.operators.reasoning import ReasoningAnswerGenerator
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request, LocalModelLLMServing_vllm
from dataflow.core import LLMServingABC
from dataflow.prompts.reasoning.diy import (
DiyAnswerGeneratorPrompt,
)
DIY_PROMPT_ANSWER = """Please output the answer."""
class BenchEvalPipeline():
def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None):
self.storage = FileStorage(
first_entry_file_name="../example_data/core_text_data/bench_eval_data_2.jsonl",
cache_path="./cache_local",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
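# NOTE: the evaluator below reads the "model_answer" and "golden_label" keys, so a
# hypothetical line of bench_eval_data_2.jsonl (values are illustrative) could be:
# {"model_answer": "42", "golden_label": "42"}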
# use API server as LLM serving
self.llm_serving_judger = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=30
)
self.evaluator_step = BenchDatasetEvaluator(
eval_result_path="./cache_local/eval_result/eval_result.jsonl",
compare_method="semantic", # "semantic" or "match"
llm_serving=self.llm_serving,
prompt_template = None # you can customize your judge prompt via dataflow.prompts.reasoning.general.AnswerJudgePrompt
)
def forward(self):
self.evaluator_step.run(
storage = self.storage.step(),
input_test_answer_key="model_answer",
input_gt_answer_key="golden_label"
)
if __name__ == "__main__":
pl = BenchEvalPipeline()
pl.forward()
from dataflow.operators.core_text import BenchDatasetEvaluatorQuestion
from dataflow.operators.reasoning import ReasoningAnswerGenerator
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request, LocalModelLLMServing_vllm
from dataflow.core import LLMServingABC
from dataflow.prompts.reasoning.diy import (
DiyAnswerGeneratorPrompt,
)
DIY_PROMPT_ANSWER = """Please output the answer."""
class BenchEvalPipeline():
def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None):
self.storage = FileStorage(
first_entry_file_name="../example_data/core_text_data/bench_eval_data.jsonl",
cache_path="./cache_local",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
self.llm_serving_generator = LocalModelLLMServing_vllm(
hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct", # set to your own model path
vllm_tensor_parallel_size=1,
vllm_max_tokens=2048,
)
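# NOTE: vllm_tensor_parallel_size=1 serves the generator on a single GPU; setting it to
# the number of available GPUs shards the model across them via vLLM tensor parallelism.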
# use API server as LLM serving
self.llm_serving_judger = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=30
)
self.answer_generator_step1 = ReasoningAnswerGenerator(
llm_serving=self.llm_serving_generator,
prompt_template=DiyAnswerGeneratorPrompt(DIY_PROMPT_ANSWER)
)
self.evaluator_step2 = BenchDatasetEvaluatorQuestion(
eval_result_path="./cache_local/eval_result/eval_result.jsonl",
compare_method="semantic", # "semantic" or "match"
llm_serving=self.llm_serving,
prompt_template = None # you can customize your judge prompt via dataflow.prompts.reasoning.general.AnswerJudgePrompt
)
def forward(self):
self.answer_generator_step1.run(
storage = self.storage.step(),
input_key = "instruction",
output_key = "generated_cot"
)
self.evaluator_step2.run(
storage = self.storage.step(),
input_test_answer_key="generated_cot",
input_gt_answer_key="golden_answer",
input_question_key="instruction",
)
if __name__ == "__main__":
pl = BenchEvalPipeline()
pl.forward()
from dataflow.operators.core_text import BenchDatasetEvaluatorQuestion
from dataflow.operators.reasoning import ReasoningAnswerGenerator
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request, LocalModelLLMServing_vllm
from dataflow.core import LLMServingABC
from dataflow.prompts.reasoning.diy import (
DiyAnswerGeneratorPrompt,
)
DIY_PROMPT_ANSWER = """Please output the answer."""
class BenchEvalPipeline():
def __init__(self, llm_serving_generator: LLMServingABC = None, llm_serving_judger: LLMServingABC = None):
self.storage = FileStorage(
first_entry_file_name="../example_data/core_text_data/bench_eval_data_2.jsonl",
cache_path="./cache_local",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
# use API server as LLM serving
self.llm_serving_judger = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=30
)
self.evaluator_step = BenchDatasetEvaluatorQuestion(
eval_result_path="./cache_local/eval_result/eval_result.jsonl",
compare_method="semantic", # "semantic" or "match"
llm_serving=self.llm_serving,
prompt_template = None # you can customize your judge prompt via dataflow.prompts.reasoning.general.AnswerJudgePrompt
)
def forward(self):
self.evaluator_step.run(
storage = self.storage.step(),
input_test_answer_key="model_answer",
input_gt_answer_key="golden_label",
input_question_key="question",
)
if __name__ == "__main__":
pl = BenchEvalPipeline()
pl.forward()
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.operators.conversations import (
ScenarioExtractGenerator,
ScenarioExpandGenerator,
AtomTaskGenerator,
SequentialTaskGenerator,
ParaSeqTaskGenerator,
CompositionTaskFilter,
FunctionGenerator,
MultiTurnConversationGenerator,
FuncCallConversationSampleEvaluator
)
class FuncCall_APIPipeline:
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/FuncCallPipeline/chat_data.jsonl",
cache_path="./cache",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
self.llm_serving = APILLMServing_request(
api_url="http://123.129.219.111:3000/v1/chat/completions",
model_name="gpt-4o",
max_workers=128
)
self.scenario_extractor = ScenarioExtractGenerator(llm_serving=self.llm_serving)
self.scenario_expander = ScenarioExpandGenerator(llm_serving=self.llm_serving)
self.atom_task_generator = AtomTaskGenerator(llm_serving=self.llm_serving)
self.sequential_task_generator = SequentialTaskGenerator(llm_serving=self.llm_serving)
self.parallel_sequential_task_generator = ParaSeqTaskGenerator(llm_serving=self.llm_serving)
self.composition_task_filter = CompositionTaskFilter(llm_serving=self.llm_serving)
self.function_generator = FunctionGenerator(llm_serving=self.llm_serving)
self.multi_turn_conversations_generator = MultiTurnConversationGenerator(llm_serving=self.llm_serving)
self.evaluator = FuncCallConversationSampleEvaluator(llm_serving=self.llm_serving)
def forward(self):
self.scenario_extractor.run(
self.storage.step(),
input_chat_key="chat"
)
self.scenario_expander.run(
self.storage.step(),
input_scenario_key="scenario"
)
self.atom_task_generator.run(
self.storage.step(),
input_scenario_key="scenario"
)
# self.atom_task_generator.run(
# self.storage.step(),
# input_scenario_key="modified_scenario",
# output_key='subsequent_task'
# )
self.sequential_task_generator.run(
self.storage.step(),
input_task_key="atom_task"
)
# self.parallel_sequential_task_generator.run(
# self.storage.step(),
# input_task_key="atom_task"
# )
self.composition_task_filter.run(
self.storage.step(),
input_composition_task_key="composition_task",
input_sub_tasks_keys=["atom_task", "subsequent_task"]
)
self.function_generator.run(
self.storage.step(),
input_composition_task_key="composition_task",
input_sub_tasks_keys=["atom_task", "subsequent_task"]
)
self.multi_turn_conversations_generator.run(
self.storage.step(),
input_task_key="composition_task",
input_sub_tasks_keys=["atom_task", "subsequent_task"],
input_functions_key="functions",
)
self.evaluator.run(
self.storage.step(),
input_conversation_key='conversations'
)
if __name__ == "__main__":
pipeline = FuncCall_APIPipeline()
pipeline.forward()
\ No newline at end of file
from dataflow.operators.knowledge_cleaning import (
KBCChunkGenerator,
FileOrURLToMarkdownConverterBatch,
KBCTextCleaner,
# KBCMultiHopQAGenerator,
)
from dataflow.operators.core_text import Text2MultiHopQAGenerator
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
class KBCleaningPDF_APIPipeline():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/KBCleaningPipeline/kbc_test_1.jsonl",
cache_path="./.cache/api",
file_name_prefix="knowledge_cleaning_step",
cache_type="json",
)
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=100
)
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterBatch(
intermediate_dir="../example_data/KBCleaningPipeline/raw/",
lang="en",
mineru_backend="vlm-vllm-engine",
)
self.knowledge_cleaning_step2 = KBCChunkGenerator(
split_method="token",
chunk_size=512,
tokenizer_name="Qwen/Qwen2.5-7B-Instruct",
)
self.knowledge_cleaning_step3 = KBCTextCleaner(
llm_serving=self.llm_serving,
lang="en"
)
self.knowledge_cleaning_step4 = Text2MultiHopQAGenerator(
llm_serving=self.llm_serving,
lang="en",
num_q = 5
)
def forward(self):
self.knowledge_cleaning_step1.run(
storage=self.storage.step(),
# input_key=,
# output_key=,
)
self.knowledge_cleaning_step2.run(
storage=self.storage.step(),
# input_key=,
# output_key=,
)
self.knowledge_cleaning_step3.run(
storage=self.storage.step(),
# input_key=,
# output_key=,
)
self.knowledge_cleaning_step4.run(
storage=self.storage.step(),
# input_key=,
# output_key=,
)
if __name__ == "__main__":
model = KBCleaningPDF_APIPipeline()
model.forward()
\ No newline at end of file
import os
import sys
from dataflow.serving import APILLMServing_request
from dataflow.utils.storage import FileStorage
from dataflow.operators.pdf2vqa import VQAExtractor
class VQA_extract_optimized_pipeline:
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="./example_data/PDF2VQAPipeline/vqa_extract_test.jsonl",
cache_path="./cache",
file_name_prefix="vqa",
cache_type="jsonl",
)
self.llm_serving = APILLMServing_request(
api_url="https://generativelanguage.googleapis.com/v1beta/openai/chat/completions",
key_name_of_api_key="DF_API_KEY",
model_name="gemini-2.5-pro",
max_workers=100,
)
self.vqa_extractor = VQAExtractor(
llm_serving=self.llm_serving,
mineru_backend='vlm-vllm-engine',
max_chunk_len=128000
)
def forward(self):
# Single operator: covers preprocessing, QA extraction, and post-processing
self.vqa_extractor.run(
storage=self.storage.step(),
input_question_pdf_path_key="question_pdf_path",
input_answer_pdf_path_key="answer_pdf_path",
input_pdf_path_key="pdf_path", # supports interleaved mode
input_subject_key="subject",
output_dir_key="output_dir",
output_jsonl_key="output_jsonl_path",
)
if __name__ == "__main__":
# Each line of the jsonl contains question_pdf_path, answer_pdf_path, subject (math, physics, chemistry, ...), and output_dir
# If the questions and answers are in the same PDF, set question_pdf_path and answer_pdf_path to the same path; the pipeline automatically switches to interleaved mode
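# A hypothetical input line (paths and subject are purely illustrative):
# {"question_pdf_path": "./papers/exam_questions.pdf", "answer_pdf_path": "./papers/exam_answers.pdf", "subject": "math", "output_dir": "./vqa_output"}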
pipeline = VQA_extract_optimized_pipeline()
pipeline.forward()
\ No newline at end of file
from dataflow.operators.reasoning import (
ReasoningQuestionGenerator,
ReasoningAnswerGenerator,
)
from dataflow.operators.reasoning import ReasoningQuestionFilter, ReasoningAnswerNgramFilter
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.core import LLMServingABC
from dataflow.prompts.reasoning.diy import (
DiyQuestionFilterPrompt,
DiyAnswerGeneratorPrompt,
DiyQuestionSynthesisPrompt
)
"""
If 'prompt_template' is not None and 'content_type' is set to 'diy', make sure the input and output formats match those of the default prompt.
"""
DIY_PROMPT_QUESTION = """Please only keep the medical related data (judgement_test is true), for other data the judgement_test is false.
After these steps, output exactly:
{{
"judgement_test": true/false,
"error_type": "<error description or null>"
}}
You may include your chain of thought, but the final output must be the JSON above.
Here is the content to evaluate:
-------------------------------
{question}
-------------------------------
"""
DIY_PROMPT_SYNTHESIS = """
Please construct some new sports-related data from the source problem.
Here is the problem from the user:
{question}
Write another problem inspired by this one.
Not only change the problem scenario, but also try to create a new problem that requires another approach to solve.
Start directly with the problem statement and DO NOT include any phrases such as "Here is a new problem inspired by a given one".
After the problem is generated finish your response right away.
"""
DIY_PROMPT_ANSWER = """Please firstly output a symbol "Yeah, It is the answer:", and then output the answer."""
class DiyReasoning_APIPipeline():
def __init__(self, llm_serving: LLMServingABC = None):
self.storage = FileStorage(
first_entry_file_name="../example_data/ReasoningPipeline/pipeline_general.json",
cache_path="./cache_local",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
# use API server as LLM serving
self.llm_serving = APILLMServing_request(
api_url="http://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=30
)
self.question_filter_step1 = ReasoningQuestionFilter(
system_prompt="You are an expert in evaluating problems. Follow the user's instructions strictly and output your final judgment in the required JSON format.",
llm_serving=self.llm_serving,
prompt_template=DiyQuestionFilterPrompt(DIY_PROMPT_QUESTION)
)
self.question_gen_step2 = ReasoningQuestionGenerator(
num_prompts=1,
llm_serving=self.llm_serving,
prompt_template=DiyQuestionSynthesisPrompt(DIY_PROMPT_SYNTHESIS)
)
self.answer_generator_step3 = ReasoningAnswerGenerator(
llm_serving=self.llm_serving,
prompt_template=DiyAnswerGeneratorPrompt(DIY_PROMPT_ANSWER)
)
self.answer_ngram_filter_step4 = ReasoningAnswerNgramFilter(
min_score = 0.1,
max_score = 1.0,
ngrams = 5
)
def forward(self):
self.question_filter_step1.run(
storage = self.storage.step(),
input_key = "instruction",
)
self.question_gen_step2.run(
storage = self.storage.step(),
input_key = "instruction",
)
self.answer_generator_step3.run(
storage = self.storage.step(),
input_key = "instruction",
output_key = "generated_cot"
)
self.answer_ngram_filter_step4.run(
storage = self.storage.step(),
input_question_key = "instruction",
input_answer_key = "generated_cot"
)
if __name__ == "__main__":
pl = DiyReasoning_APIPipeline()
pl.forward()
from dataflow.operators.reasoning import (
ReasoningQuestionGenerator,
ReasoningAnswerGenerator,
)
from dataflow.operators.reasoning import ReasoningQuestionFilter, ReasoningAnswerNgramFilter, ReasoningAnswerModelJudgeFilter
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.core import LLMServingABC
from dataflow.prompts.reasoning.general import (
GeneralQuestionFilterPrompt,
GeneralAnswerGeneratorPrompt,
GeneralQuestionSynthesisPrompt,
)
from dataflow.prompts.model_evaluation.general import AnswerJudgePrompt
class GeneralReasoning_APIPipeline():
def __init__(self, llm_serving: LLMServingABC = None):
self.storage = FileStorage(
first_entry_file_name="../example_data/ReasoningPipeline/pipeline_general.json",
cache_path="./cache_local",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
# use API server as LLM serving
self.llm_serving = APILLMServing_request(
api_url="http://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=30
)
self.question_filter_step1 = ReasoningQuestionFilter(
system_prompt="You are an expert in evaluating problems. Follow the user's instructions strictly and output your final judgment in the required JSON format.",
llm_serving=self.llm_serving,
prompt_template=GeneralQuestionFilterPrompt()
)
self.question_gen_step2 = ReasoningQuestionGenerator(
num_prompts=1,
llm_serving=self.llm_serving,
prompt_template=GeneralQuestionSynthesisPrompt()
)
self.answer_generator_step3 = ReasoningAnswerGenerator(
llm_serving=self.llm_serving,
prompt_template=GeneralAnswerGeneratorPrompt()
)
self.answer_model_judge_step4 = ReasoningAnswerModelJudgeFilter(
llm_serving=self.llm_serving,
prompt_template=AnswerJudgePrompt(),
keep_all_samples=True
)
self.answer_ngram_filter_step5 = ReasoningAnswerNgramFilter(
min_score = 0.1,
max_score = 1.0,
ngrams = 5
)
def forward(self):
self.question_filter_step1.run(
storage = self.storage.step(),
input_key = "instruction",
)
self.question_gen_step2.run(
storage = self.storage.step(),
input_key = "instruction",
)
self.answer_generator_step3.run(
storage = self.storage.step(),
input_key = "instruction",
output_key = "generated_cot"
)
self.answer_model_judge_step4.run(
storage = self.storage.step(),
input_question_key = "instruction",
input_answer_key = "generated_cot",
input_reference_key = "golden_answer"
)
self.answer_ngram_filter_step5.run(
storage = self.storage.step(),
input_question_key = "instruction",
input_answer_key = "generated_cot"
)
if __name__ == "__main__":
pl = GeneralReasoning_APIPipeline()
pl.forward()
from dataflow.operators.reasoning import (
ReasoningQuestionCategorySampleEvaluator,
ReasoningQuestionDifficultySampleEvaluator,
ReasoningQuestionGenerator,
ReasoningAnswerGenerator,
)
from dataflow.operators.reasoning import (
ReasoningQuestionFilter,
ReasoningAnswerFormatterFilter,
ReasoningAnswerGroundTruthFilter,
ReasoningAnswerTokenLengthFilter,
ReasoningAnswerNgramFilter
)
from dataflow.prompts.reasoning.math import (
MathQuestionFilterPrompt,
MathAnswerGeneratorPrompt,
MathQuestionSynthesisPrompt
)
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.core import LLMServingABC
# Perhaps a shared pipeline base class could be added here in the future
class ReasoningMath_APIPipeline():
def __init__(self, llm_serving: LLMServingABC = None):
self.storage = FileStorage(
first_entry_file_name="../example_data/ReasoningPipeline/pipeline_math_short.json",
cache_path="./cache_local",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
# use API server as LLM serving
self.llm_serving = APILLMServing_request(
api_url="http://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=100
)
# if llm_serving is None:
# # use local model as LLM serving
# llm_serving = LocalModelLLMServing(
# # model_name_or_path="/data0/models/Qwen2.5-7B-Instruct", # set to your own model path
# model_name_or_path="/mnt/public/model/huggingface/Qwen2.5-7B-Instruct",
# tensor_parallel_size=4,
# max_tokens=8192,
# model_source="local"
# )
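# A vLLM-backed local serving sketch, mirroring the LocalModelLLMServing_vllm usage shown
# in the BenchEvalPipeline elsewhere in this commit (model path and token limit are
# placeholders; also requires `from dataflow.serving import LocalModelLLMServing_vllm`):
# self.llm_serving = LocalModelLLMServing_vllm(
# hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct", # set to your own model path
# vllm_tensor_parallel_size=1,
# vllm_max_tokens=8192,
# )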
self.question_filter_step1 = ReasoningQuestionFilter(
system_prompt="You are an expert in evaluating mathematical problems. Follow the user's instructions strictly and output your final judgment in the required JSON format.",
llm_serving=self.llm_serving,
prompt_template=MathQuestionFilterPrompt()
)
self.question_gen_step2 = ReasoningQuestionGenerator(
num_prompts=3,
llm_serving=self.llm_serving,
prompt_template=MathQuestionSynthesisPrompt()
)
self.question_filter_step3 = ReasoningQuestionFilter(
system_prompt="You are an expert in evaluating mathematical problems. Follow the user's instructions strictly and output your final judgment in the required JSON format.",
llm_serving=self.llm_serving,
prompt_template=MathQuestionFilterPrompt()
)
self.question_difficulty_classifier_step4 = ReasoningQuestionDifficultySampleEvaluator(
llm_serving=self.llm_serving
)
self.question_category_classifier_step5 = ReasoningQuestionCategorySampleEvaluator(
llm_serving=self.llm_serving
)
########################## branch ############################
# self.answer_pipeline_root_step6 = AnswerPipelineRoot()
########################## answer ############################
self.answer_generator_step7 = ReasoningAnswerGenerator(
llm_serving=self.llm_serving,
prompt_template=MathAnswerGeneratorPrompt()
)
self.answer_format_filter_step8 = ReasoningAnswerFormatterFilter()
self.answer_token_length_filter_step9 = ReasoningAnswerTokenLengthFilter(
max_answer_token_length = 8192,
tokenizer_dir = "Qwen/Qwen2.5-0.5B-Instruct"
)
self.answer_groundtruth_filter_step10 = ReasoningAnswerGroundTruthFilter()
self.answer_ngram_filter_step11 = ReasoningAnswerNgramFilter(
min_score = 0.1,
max_score = 1.0,
ngrams = 5
)
# In the future we could maintain an nn.Sequential-like container to make it easier to add and instantiate multiple operators
def forward(self):
self.question_filter_step1.run(
storage = self.storage.step(),
input_key = "instruction",
)
self.question_gen_step2.run(
storage = self.storage.step(),
input_key = "instruction",
)
self.question_filter_step3.run(
storage = self.storage.step(),
input_key = "instruction",
)
self.question_difficulty_classifier_step4.run(
storage = self.storage.step(),
input_key = "instruction",
output_key = "question_difficulty"
)
self.question_category_classifier_step5.run(
storage = self.storage.step(),
input_key = "instruction",
output_key = "question_category"
)
############# branch #############
# self.answer_pipeline_root_step6.run(
# storage = self.storage.step(),
# input_answer_key = "output",
# input_gt_key = "golden_answer"
# )
############## answer #############
self.answer_generator_step7.run(
storage = self.storage.step(),
input_key = "instruction",
output_key = "generated_cot"
)
self.answer_format_filter_step8.run(
storage = self.storage.step(),
input_key = "generated_cot",
)
self.answer_token_length_filter_step9.run(
storage = self.storage.step(),
input_key = "generated_cot"
)
self.answer_groundtruth_filter_step10.run(
storage = self.storage.step(),
input_test_answer_key = "generated_cot",
input_gt_answer_key = "golden_answer"
)
self.answer_ngram_filter_step11.run(
storage = self.storage.step(),
input_question_key = "instruction",
input_answer_key = "generated_cot"
)
if __name__ == "__main__":
model = ReasoningMath_APIPipeline()
model.forward()
from dataflow.operators.reasoning import (
ReasoningQuestionFusionGenerator,
ReasoningQuestionSolvableSampleEvaluator,
)
from dataflow.operators.core_text import (
PandasOperator,
EmbeddingGenerator,
)
from dataflow.prompts.reasoning.math import (
MathQuestionParallelFusionGeneratorPrompt,
MathQuestionSequentialFusionGeneratorPrompt,
MathQuestionConditionFusionGeneratorPrompt,
MathQuestionEvaluatorPrompt
)
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.core import LLMServingABC
import torch
import numpy as np
import pandas as pd
import re
# Perhaps a shared pipeline base class could be added here in the future
class ReasoningMath_APIPipeline_Mathfusion():
def __init__(self, llm_serving: LLMServingABC = None):
self.storage = FileStorage(
first_entry_file_name="../example_data/ReasoningPipeline/pipeline_math_short.json",
cache_path="./cache",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
# use API server as LLM serving
llm_serving = APILLMServing_request(
api_url="http://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=10
)
embedding_serving = APILLMServing_request(
api_url="http://api.openai.com/v1/embeddings",
model_name="text-embedding-ada-002",
max_workers=10
)
self.embedding_generator = EmbeddingGenerator(embedding_serving=embedding_serving)
def find_most_similar_questions(df):
df = df.dropna(subset=['embeddings']).reset_index(drop=True)
embeddings = torch.tensor(np.stack(df['embeddings'].values), dtype=torch.float32).cuda() # shape: (n, d); assumes a CUDA GPU is available
sim_matrix = torch.matmul(embeddings, embeddings.T) # shape: (n, n)
sim_matrix.fill_diagonal_(-float('inf'))
most_similar_idx = torch.argmax(sim_matrix, dim=1).cpu().numpy()
df['most_similar_problem'] = df['question'].iloc[most_similar_idx].values
return df
self.most_similar_matcher = PandasOperator([
find_most_similar_questions
])
self.extract_pair = PandasOperator([ # dropping embeddings to decrease file size
lambda df: df.drop(columns=[col for col in df.columns if "embeddings" in col])
])
self.sequential_fusion = ReasoningQuestionFusionGenerator(
num_prompts=1,
llm_serving=llm_serving,
prompt_template=MathQuestionSequentialFusionGeneratorPrompt()
)
self.parallel_fusion = ReasoningQuestionFusionGenerator(
num_prompts=1,
llm_serving=llm_serving,
prompt_template=MathQuestionParallelFusionGeneratorPrompt()
)
self.condition_fusion = ReasoningQuestionFusionGenerator(
num_prompts=2,
llm_serving=llm_serving,
prompt_template=MathQuestionConditionFusionGeneratorPrompt()
)
def combined(df: pd.DataFrame) -> pd.DataFrame:
"""
Combine all question-related columns into a single long-format DataFrame.
Automatically detects columns containing '_question_{i}' patterns.
"""
# Always keep the original 'question' column
question_cols = ["question"] if "question" in df.columns else []
# Match all columns like *_question_0, *_question_1, ...
pattern = re.compile(r".*_question_\d+$")
question_cols.extend([col for col in df.columns if pattern.match(col)])
if not question_cols:
raise ValueError("No question columns found matching pattern '_question_{i}'.")
# Convert to a long-format table
long_df = df.melt(value_vars=question_cols, value_name="questions")[["questions"]]
# Drop empty values and duplicates (the same question often appears more than once)
long_df = long_df.dropna(subset=["questions"]).drop_duplicates().reset_index(drop=True)
return long_df
self.combined_question = PandasOperator([combined])
self.question_evaluation = ReasoningQuestionSolvableSampleEvaluator(llm_serving=llm_serving, prompt_template=MathQuestionEvaluatorPrompt())
def extract_new_problem(df: pd.DataFrame) -> pd.DataFrame:
"""
Extract the content after '#New Problem#' from the 'questions' column
and store it in a new column 'new_problem'.
"""
if "questions" not in df.columns:
raise ValueError("Input DataFrame must contain a 'questions' column.")
def _extract(text: str) -> str:
if not isinstance(text, str):
return None
match = re.search(r"#New Problem#[:\s]*(.*)", text, re.DOTALL)
return match.group(1).strip() if match else None
df = df.copy()
df["refined_question"] = df["questions"].apply(_extract)
df = df.dropna(subset=["refined_question"]).reset_index(drop=True)
return df
self.extract_new_problem = PandasOperator([extract_new_problem])
def forward(self):
# self.first10.run(
# storage = self.storage.step(),
# )
self.embedding_generator.run(
storage = self.storage.step(),
input_key = "question",
output_key = "embeddings",
)
self.most_similar_matcher.run(
storage = self.storage.step(),
)
self.extract_pair.run(
storage = self.storage.step(),
)
self.sequential_fusion.run(
storage = self.storage.step(),
input_problem_1= "question",
input_problem_2= "most_similar_problem",
output_key="sequential_fusion",
)
self.parallel_fusion.run(
storage = self.storage.step(),
input_problem_1= "question",
input_problem_2= "most_similar_problem",
output_key="parallel_fusion"
)
self.condition_fusion.run(
storage = self.storage.step(),
input_problem_1= "question",
input_problem_2= "most_similar_problem",
output_key="condition_fusion"
)
self.combined_question.run(
storage = self.storage.step()
)
self.question_evaluation.run(
storage = self.storage.step(),
input_key = "questions",
output_key= "question_solvability"
)
self.extract_new_problem.run(
storage = self.storage.step()
)
if __name__ == "__main__":
pl = ReasoningMath_APIPipeline_Mathfusion()
pl.forward()
from dataflow.operators.reasoning import (
ReasoningQuestionGenerator,
ReasoningAnswerGenerator,
ReasoningPretrainFormatConvertGenerator
)
from dataflow.prompts.reasoning.math import (
MathQuestionFilterPrompt,
MathQuestionSynthesisPrompt,
MathAnswerGeneratorPrompt
)
from dataflow.operators.reasoning import ReasoningQuestionFilter, ReasoningAnswerNgramFilter, ReasoningAnswerPipelineRootFilter
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
# Perhaps a shared pipeline base class could be added here in the future
class Reasoning_APIPipeline_Pretrain():
def __init__(self, llm_serving=None):
self.storage = FileStorage(
first_entry_file_name="../example_data/ReasoningPipeline/pipeline_math_short.json",
cache_path="./cache_local",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
if llm_serving is None:
# use API server as LLM serving
self.llm_serving = APILLMServing_request(
api_url="http://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=100
)
## use local model as LLM serving
# llm_serving = LocalModelLLMServing(
# # model_name_or_path="/data0/models/Qwen2.5-7B-Instruct", # set to your own model path
# model_name_or_path="/mnt/public/model/huggingface/Qwen2.5-7B-Instruct",
# tensor_parallel_size=4,
# max_tokens=1024,
# model_source="local"
# )
else:
self.llm_serving = llm_serving
self.question_filter_step1 = ReasoningQuestionFilter(
system_prompt="You are an expert in evaluating mathematical problems. Follow the user's instructions strictly and output your final judgment in the required JSON format.",
llm_serving=self.llm_serving,
prompt_template=MathQuestionFilterPrompt()
)
self.question_gen_step2 = ReasoningQuestionGenerator(
num_prompts=3,
llm_serving=self.llm_serving,
prompt_template=MathQuestionSynthesisPrompt()
)
########################## branch ############################
self.answer_pipeline_root_step3 = ReasoningAnswerPipelineRootFilter()
########################## answer ############################
self.answer_generator_step4 = ReasoningAnswerGenerator(
llm_serving=self.llm_serving,
prompt_template=MathAnswerGeneratorPrompt()
)
self.answer_ngram_filter_step5 = ReasoningAnswerNgramFilter(
min_score = 0.1,
max_score = 1.0,
ngrams = 5
)
self.sft_to_pretrain_step6 = ReasoningPretrainFormatConvertGenerator()
# In the future we could maintain an nn.Sequential-like container to make it easier to add and instantiate multiple operators
def forward(self):
self.question_filter_step1.run(
storage = self.storage.step(),
input_key = "instruction",
)
self.question_gen_step2.run(
storage = self.storage.step(),
input_key = "instruction",
)
############# branch #############
self.answer_pipeline_root_step3.run(
storage = self.storage.step(),
input_answer_key = "output",
input_gt_key = "golden_answer"
)
############## answer #############
self.answer_generator_step4.run(
storage = self.storage.step(),
input_key = "instruction",
output_key = "generated_cot"
)
self.answer_ngram_filter_step5.run(
storage = self.storage.step(),
input_question_key = "instruction",
input_answer_key = "generated_cot"
)
self.sft_to_pretrain_step6.run(
storage = self.storage.step(),
input_read_key_question="instruction",
input_read_key_answer="generated_cot",
output_key="text",
)
if __name__ == "__main__":
pipeline = Reasoning_APIPipeline_Pretrain()
pipeline.forward()
from dataflow.operators.core_text import (
Text2QASampleEvaluator,
Text2QAGenerator,
KCenterGreedyFilter
)
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.serving import LocalModelLLMServing_vllm
class AgenticRAG_APIPipeline():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/core_text_data/pipeline_small_chunk.json",
cache_path="./cache_local",
file_name_prefix="dataflow_cache_step",
cache_type="json",
)
# use API server as LLM serving
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=1
)
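# NOTE: max_workers=1 issues chat-completion requests serially; increase it (as the other
# pipelines in this commit do) for higher throughput when your API rate limits allow.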
embedding_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/embeddings",
model_name="text-embedding-ada-002",
max_workers=100
)
self.content_chooser_step1 = KCenterGreedyFilter(embedding_serving=embedding_serving, num_samples=5)
self.text2qa_generator_step3 = Text2QAGenerator(self.llm_serving)
self.text2qa_scorer_step4 = Text2QASampleEvaluator(self.llm_serving)
def forward(self):
self.content_chooser_step1.run(
storage = self.storage.step(),
input_key = "text"
)
self.text2qa_generator_step3.run(
storage = self.storage.step(),
input_key="text",
input_question_num= 3,
output_prompt_key="generated_prompt",
output_quesion_key="generated_question",
output_answer_key="generated_answer"
)
self.text2qa_scorer_step4.run(
storage = self.storage.step(),
input_question_key="generated_question",
input_answer_key="generated_answer",
output_question_quality_key="question_quality_grades",
output_question_quality_feedback_key="question_quality_feedbacks",
output_answer_alignment_key="answer_alignment_grades",
output_answer_alignment_feedback_key="answer_alignment_feedbacks",
output_answer_verifiability_key="answer_verifiability_grades",
)
if __name__ == "__main__":
model = AgenticRAG_APIPipeline()
model.forward()
import os
from dataflow import get_logger
import zipfile
from huggingface_hub import snapshot_download
from dataflow.operators.text2sql import (
SQLGenerator,
SQLByColumnGenerator,
Text2SQLQuestionGenerator,
Text2SQLPromptGenerator,
Text2SQLCoTGenerator
)
from dataflow.operators.text2sql import (
SQLExecutionFilter
)
from dataflow.operators.text2sql import (
SQLComponentClassifier,
SQLExecutionClassifier
)
from dataflow.prompts.text2sql import (
Text2SQLCotGeneratorPrompt,
SelectSQLGeneratorPrompt,
Text2SQLQuestionGeneratorPrompt,
Text2SQLPromptGeneratorPrompt
)
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.utils.text2sql.database_manager import DatabaseManager
def download_and_extract_database(logger):
dataset_repo_id = "Open-Dataflow/dataflow-Text2SQL-database-example"
local_dir = "./hf_cache"
extract_to = "./downloaded_databases"
logger.info(f"Downloading and extracting database from {dataset_repo_id}...")
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
db_exp_folder_path = os.path.join(extract_to, "databases")
if os.path.exists(db_exp_folder_path):
return db_exp_folder_path
os.makedirs(local_dir, exist_ok=True)
os.makedirs(extract_to, exist_ok=True)
downloaded_path = snapshot_download(
repo_id=dataset_repo_id,
repo_type="dataset",
local_dir=local_dir,
resume_download=True
)
logger.info(f"Files downloaded to: {downloaded_path}")
zip_path = os.path.join(downloaded_path, "databases.zip")
if os.path.exists(zip_path):
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
zip_ref.extractall(extract_to)
logger.info(f"Database files extracted to {extract_to}")
return db_exp_folder_path
else:
raise FileNotFoundError(f"Database zip file not found at {zip_path}")
class Text2SQLGeneration_APIPipeline():
def __init__(self, db_root_path=""):
self.logger = get_logger()
self.db_root_path = db_root_path
if not db_root_path:
try:
self.db_root_path = download_and_extract_database(self.logger)
self.logger.info(f"Using automatically downloaded database at: {self.db_root_path}")
except Exception as e:
self.logger.error(f"Failed to auto-download database: {e}")
raise
else:
self.logger.info(f"Using manually specified database path: {self.db_root_path}")
if not os.path.exists(self.db_root_path):
raise FileNotFoundError(f"Database path does not exist: {self.db_root_path}")
self.storage = FileStorage(
first_entry_file_name="",
cache_path="./cache",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=100
)
# It is recommended to use better LLMs for the generation of Chain-of-Thought (CoT) reasoning process.
cot_generation_api_llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o", # You can change to a more powerful model for CoT generation
max_workers=100
)
embedding_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/embeddings",
model_name="text-embedding-ada-002",
max_workers=100
)
# SQLite and MySQL are currently supported
# db_type can be sqlite or mysql, which must match your database type
# If sqlite is selected, root_path must be provided; this path must exist and contain database files
# If mysql is selected, host, user, and password must be provided; these credentials must be correct and have access permissions
# MySQL example:
# database_manager = DatabaseManager(
# db_type="mysql",
# config={
# "host": "localhost",
# "user": "root",
# "password": "your_password",
# "database": "your_database_name"
# }
# )
# SQLite example:
database_manager = DatabaseManager(
db_type="sqlite",
config={
"root_path": self.db_root_path
}
)
self.sql_generator_step1 = SQLGenerator(
llm_serving=self.llm_serving,
database_manager=database_manager,
generate_num=2,
prompt_template=SelectSQLGeneratorPrompt()
)
self.sql_execution_filter_step2 = SQLExecutionFilter(
database_manager=database_manager
)
self.text2sql_question_generator_step3 = Text2SQLQuestionGenerator(
llm_serving=self.llm_serving,
embedding_serving=embedding_serving,
database_manager=database_manager,
question_candidates_num=5,
prompt_template=Text2SQLQuestionGeneratorPrompt()
)
self.text2sql_prompt_generator_step4 = Text2SQLPromptGenerator(
database_manager=database_manager,
prompt_template=Text2SQLPromptGeneratorPrompt()
)
self.sql_cot_generator_step5 = Text2SQLCoTGenerator(
llm_serving=cot_generation_api_llm_serving,
database_manager=database_manager,
prompt_template=Text2SQLCotGeneratorPrompt()
)
self.sql_component_classifier_step6 = SQLComponentClassifier(
difficulty_thresholds=[2, 4, 6],
difficulty_labels=['easy', 'medium', 'hard', 'extra']
)
self.sql_execution_classifier_step7 = SQLExecutionClassifier(
llm_serving=self.llm_serving,
database_manager=database_manager,
num_generations=10,
difficulty_thresholds=[2, 5, 9],
difficulty_labels=['extra', 'hard', 'medium', 'easy']
)
def forward(self):
sql_key = "SQL"
db_id_key = "db_id"
question_key = "question"
evidence_key = "evidence"
self.sql_generator_step1.run(
storage=self.storage.step(),
output_sql_key=sql_key,
output_db_id_key=db_id_key
)
self.sql_execution_filter_step2.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key
)
self.text2sql_question_generator_step3.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key,
output_question_key=question_key,
output_evidence_key=evidence_key
)
self.text2sql_prompt_generator_step4.run(
storage=self.storage.step(),
input_question_key=question_key,
input_db_id_key=db_id_key,
input_evidence_key=evidence_key,
output_prompt_key="prompt"
)
self.sql_cot_generator_step5.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_question_key=question_key,
input_db_id_key=db_id_key,
input_evidence_key=evidence_key,
output_cot_key="cot_reasoning"
)
self.sql_component_classifier_step6.run(
storage=self.storage.step(),
input_sql_key=sql_key,
output_difficulty_key="sql_component_difficulty"
)
self.sql_execution_classifier_step7.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key,
input_prompt_key="prompt",
output_difficulty_key="sql_execution_difficulty"
)
if __name__ == "__main__":
# If you have your own database files, you can set the db_root_path to the path of your database files
# If not, please set the db_root_path "", and we will download the example database files automatically
db_root_path = ""
model = Text2SQLGeneration_APIPipeline(db_root_path=db_root_path)
model.forward()
import os
from dataflow import get_logger
import zipfile
from pathlib import Path
from huggingface_hub import snapshot_download
from dataflow.operators.text2sql import (
SQLVariationGenerator,
Text2SQLQuestionGenerator,
Text2SQLPromptGenerator,
Text2SQLCoTGenerator
)
from dataflow.operators.text2sql import (
SQLExecutionFilter,
SQLConsistencyFilter
)
from dataflow.operators.text2sql import (
SQLComponentClassifier,
SQLExecutionClassifier
)
from dataflow.prompts.text2sql import (
SQLConsistencyFilterPrompt,
Text2SQLCotGeneratorPrompt,
Text2SQLQuestionGeneratorPrompt,
SQLVariationGeneratorPrompt,
Text2SQLPromptGeneratorPrompt
)
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.utils.text2sql.database_manager import DatabaseManager
def download_and_extract_database(logger):
dataset_repo_id = "Open-Dataflow/dataflow-Text2SQL-database-example"
local_dir = "./hf_cache"
extract_to = "./downloaded_databases"
logger.info(f"Downloading and extracting database from {dataset_repo_id}...")
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'
db_exp_folder_path = os.path.join(extract_to, "databases")
if os.path.exists(db_exp_folder_path):
return db_exp_folder_path
os.makedirs(local_dir, exist_ok=True)
os.makedirs(extract_to, exist_ok=True)
downloaded_path = snapshot_download(
repo_id=dataset_repo_id,
repo_type="dataset",
local_dir=local_dir,
resume_download=True
)
logger.info(f"Files downloaded to: {downloaded_path}")
zip_path = os.path.join(downloaded_path, "databases.zip")
if os.path.exists(zip_path):
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
zip_ref.extractall(extract_to)
logger.info(f"Database files extracted to {extract_to}")
return db_exp_folder_path
else:
raise FileNotFoundError(f"Database zip file not found at {zip_path}")
class Text2SQLRefine_APIPipeline():
def __init__(self, db_root_path=""):
self.logger = get_logger()
self.db_root_path = db_root_path
if not db_root_path:
try:
self.db_root_path = download_and_extract_database(self.logger)
self.logger.info(f"Using automatically downloaded database at: {self.db_root_path}")
except Exception as e:
self.logger.error(f"Failed to auto-download database: {e}")
raise
else:
self.logger.info(f"Using manually specified database path: {self.db_root_path}")
if not os.path.exists(self.db_root_path):
raise FileNotFoundError(f"Database path does not exist: {self.db_root_path}")
self.storage = FileStorage(
first_entry_file_name="../example_data/Text2SQLPipeline/pipeline_refine.jsonl",
cache_path="./cache",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl"
)
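# NOTE: the refine pipeline expects existing SQL/question pairs plus their database ids;
# a hypothetical line of pipeline_refine.jsonl (values are illustrative) could be:
# {"db_id": "example_db", "question": "How many orders are there?", "SQL": "SELECT COUNT(*) FROM orders"}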
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=100
)
# It is recommended to use better LLMs for the generation of Chain-of-Thought (CoT) reasoning process.
cot_generation_api_llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o", # You can change to a more powerful model for CoT generation
max_workers=100
)
embedding_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/embeddings",
model_name="text-embedding-ada-002",
max_workers=100
)
# SQLite and MySQL are currently supported
# db_type can be sqlite or mysql, which must match your database type
# If sqlite is selected, root_path must be provided; this path must exist and contain database files
# If mysql is selected, host, user, and password must be provided; these credentials must be correct and have access permissions
# MySQL example:
# database_manager = DatabaseManager(
# db_type="mysql",
# config={
# "host": "localhost",
# "user": "root",
# "password": "your_password",
# "database": "your_database_name"
# }
# )
# SQLite example:
database_manager = DatabaseManager(
db_type="sqlite",
config={
"root_path": self.db_root_path
}
)
self.sql_execution_filter_step1 = SQLExecutionFilter(
database_manager=database_manager
)
self.sql_consistency_filter_step2 = SQLConsistencyFilter(
llm_serving=self.llm_serving,
database_manager=database_manager,
prompt_template=SQLConsistencyFilterPrompt()
)
self.sql_variation_generator_step3 = SQLVariationGenerator(
llm_serving=self.llm_serving,
database_manager=database_manager,
num_variations=1, # Number of variations to generate for each SQL
prompt_template=SQLVariationGeneratorPrompt()
)
self.sql_execution_filter_step4 = SQLExecutionFilter(
database_manager=database_manager
)
self.text2sql_question_generator_step5 = Text2SQLQuestionGenerator(
llm_serving=self.llm_serving,
embedding_serving=embedding_serving,
database_manager=database_manager,
question_candidates_num=5,
prompt_template=Text2SQLQuestionGeneratorPrompt()
)
self.text2sql_prompt_generator_step6 = Text2SQLPromptGenerator(
database_manager=database_manager,
prompt_template=Text2SQLPromptGeneratorPrompt()
)
self.sql_cot_generator_step7 = Text2SQLCoTGenerator(
llm_serving=cot_generation_api_llm_serving,
database_manager=database_manager,
prompt_template=Text2SQLCotGeneratorPrompt()
)
self.sql_component_classifier_step8 = SQLComponentClassifier(
difficulty_thresholds=[2, 4, 6],
difficulty_labels=['easy', 'medium', 'hard', 'extra']
)
self.sql_execution_classifier_step9 = SQLExecutionClassifier(
llm_serving=self.llm_serving,
database_manager=database_manager,
num_generations=10,
difficulty_thresholds=[2, 5, 9],
difficulty_labels=['extra', 'hard', 'medium', 'easy']
)
def forward(self):
sql_key = "SQL"
db_id_key = "db_id"
question_key = "question"
evidence_key = "evidence"
self.sql_execution_filter_step1.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key
)
self.sql_consistency_filter_step2.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key,
input_question_key=question_key
)
self.sql_variation_generator_step3.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key
)
self.sql_execution_filter_step4.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key
)
self.text2sql_question_generator_step5.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key,
output_question_key=question_key,
output_evidence_key=evidence_key
)
self.text2sql_prompt_generator_step6.run(
storage=self.storage.step(),
input_question_key=question_key,
input_db_id_key=db_id_key,
input_evidence_key=evidence_key,
output_prompt_key="prompt"
)
self.sql_cot_generator_step7.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_question_key=question_key,
input_db_id_key=db_id_key,
input_evidence_key=evidence_key,
output_cot_key="cot_reasoning"
)
self.sql_component_classifier_step8.run(
storage=self.storage.step(),
input_sql_key=sql_key,
output_difficulty_key="sql_component_difficulty"
)
self.sql_execution_classifier_step9.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key,
input_prompt_key="prompt",
output_difficulty_key="sql_execution_difficulty"
)
if __name__ == "__main__":
# If you have your own database files, you can set the db_root_path to the path of your database files
# If not, please set the db_root_path "", and we will download the example database files automatically
db_root_path = ""
model = Text2SQLRefine_APIPipeline(db_root_path=db_root_path)
model.forward()
import os
from dataflow import get_logger
import zipfile
from pathlib import Path
from huggingface_hub import snapshot_download
from dataflow.operators.text2sql import (
SQLByColumnGenerator,
Text2SQLQuestionGenerator,
Text2SQLPromptGenerator
)
from dataflow.operators.text2sql import (
SQLExecutionFilter
)
from dataflow.operators.text2sql import (
SQLComponentClassifier,
SQLExecutionClassifier
)
from dataflow.prompts.text2sql import (
SelectVecSQLGeneratorPrompt,
Text2VecSQLQuestionGeneratorPrompt,
Text2VecSQLPromptGeneratorPrompt
)
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
from dataflow.serving import LocalEmbeddingServing
from dataflow.utils.text2sql.database_manager import DatabaseManager
def download_and_extract_database(logger):
dataset_repo_id = "Open-Dataflow/dataflow-Text2SQL-vector-database-example"
subfolder = "databases_vec"
local_dir = "./hf_cache"
extract_to = "./downloaded_databases_vec"
logger.info(f"Downloading and extracting database from {dataset_repo_id}...")
# os.environ['HF_ENDPOINT'] = 'https://alpha.hf-mirror.com'
os.makedirs(local_dir, exist_ok=True)
os.makedirs(extract_to, exist_ok=True)
downloaded_path = snapshot_download(
repo_id=dataset_repo_id,
repo_type="dataset",
local_dir=local_dir,
resume_download=True
)
logger.info(f"Database files downloaded to {downloaded_path}")
zip_path = os.path.join(downloaded_path, "vector_databases.zip")
if os.path.exists(zip_path):
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
zip_ref.extractall(extract_to)
logger.info(f"Database files extracted to {extract_to}")
return extract_to
else:
raise FileNotFoundError(f"Database zip file not found at {zip_path}")
class Text2VecSQLGeneration_APIPipeline():
def __init__(self, db_root_path=""):
self.logger = get_logger()
self.db_root_path = db_root_path
if not db_root_path:
try:
self.db_root_path = download_and_extract_database(self.logger)
self.logger.info(f"Using automatically downloaded database at: {self.db_root_path}")
except Exception as e:
self.logger.error(f"Failed to auto-download database: {e}")
raise
else:
self.logger.info(f"Using manually specified database path: {self.db_root_path}")
if not os.path.exists(self.db_root_path):
raise FileNotFoundError(f"Database path does not exist: {self.db_root_path}")
self.storage = FileStorage(
first_entry_file_name="",
cache_path="./cache",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=100
)
# It is recommended to use better LLMs for the generation of Chain-of-Thought (CoT) reasoning process.
# cot_generation_api_llm_serving = APILLMServing_request(
# api_url="http://api.openai.com/v1/chat/completions",
# model_name="gpt-4o", # You can change to a more powerful model for CoT generation
# max_workers=100
# )
embedding_serving = LocalEmbeddingServing(
model_name='sentence-transformers/all-MiniLM-L6-v2'
)
# SQLite and MySQL are currently supported
# db_type can be sqlite or mysql, which must match your database type
# If sqlite is selected, root_path must be provided; this path must exist and contain database files
# If mysql is selected, host, user, and password must be provided; these credentials must be correct and have access permissions
# MySQL example:
# database_manager = DatabaseManager(
# db_type="mysql",
# config={
# "host": "localhost",
# "user": "root",
# "password": "your_password",
# "database": "your_database_name"
# }
# )
# SQLite example:
database_manager = DatabaseManager(
db_type="sqlite-vec",
config={
"root_path": self.db_root_path,
"model_name": "all-MiniLM-L6-v2",
"model_path": "./hf_cache/all-MiniLM-L6-v2.e4ce9877.q8_0.gguf",
"enable_lembed": False
},
embedding_serving=embedding_serving
)
self.sql_generator_step1 = SQLByColumnGenerator(
llm_serving=self.llm_serving,
database_manager=database_manager,
generate_num=2,
prompt_template=SelectVecSQLGeneratorPrompt()
)
self.sql_execution_filter_step2 = SQLExecutionFilter(
database_manager=database_manager,
)
self.text2sql_question_generator_step3 = Text2SQLQuestionGenerator(
llm_serving=self.llm_serving,
embedding_serving=embedding_serving,
database_manager=database_manager,
question_candidates_num=5,
prompt_template=Text2VecSQLQuestionGeneratorPrompt()
)
self.text2sql_prompt_generator_step4 = Text2SQLPromptGenerator(
database_manager=database_manager,
prompt_template=Text2VecSQLPromptGeneratorPrompt()
)
self.sql_component_classifier_step5 = SQLComponentClassifier(
difficulty_thresholds=[2, 4, 6],
difficulty_labels=['easy', 'medium', 'hard', 'extra']
)
self.sql_execution_classifier_step6 = SQLExecutionClassifier(
llm_serving=self.llm_serving,
database_manager=database_manager,
num_generations=10,
difficulty_thresholds=[2, 5, 9],
difficulty_labels=['extra', 'hard', 'medium', 'easy']
)
def forward(self):
sql_key = "SQL"
db_id_key = "db_id"
question_key = "question"
self.sql_generator_step1.run(
storage=self.storage.step(),
output_sql_key=sql_key,
output_db_id_key=db_id_key
)
self.sql_execution_filter_step2.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key
)
self.text2sql_question_generator_step3.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key,
output_question_key=question_key
)
self.text2sql_prompt_generator_step4.run(
storage=self.storage.step(),
input_question_key=question_key,
input_db_id_key=db_id_key,
output_prompt_key="prompt"
)
# self.sql_cot_generator_step5.run(
# storage=self.storage.step(),
# input_sql_key=sql_key,
# input_question_key=question_key,
# input_db_id_key=db_id_key,
# output_cot_key="cot_reasoning"
# )
self.sql_component_classifier_step5.run(
storage=self.storage.step(),
input_sql_key=sql_key,
output_difficulty_key="sql_component_difficulty"
)
self.sql_execution_classifier_step6.run(
storage=self.storage.step(),
input_sql_key=sql_key,
input_db_id_key=db_id_key,
input_prompt_key="prompt",
output_difficulty_key="sql_execution_difficulty"
)
if __name__ == "__main__":
# If you have your own database files, you can set the db_root_path to the path of your database files
# If not, please set the db_root_path "", and we will download the example database files automatically
db_root_path = ""
model = Text2VecSQLGeneration_APIPipeline(db_root_path)
model.forward()
from dataflow.operators.conversations import ConsistentChatGenerator
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
class TextConversationSynthesis_APIPipeline():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="",
cache_path="./cache",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=100
)
self.model_cache_dir = './dataflow_cache'
self.processor = ConsistentChatGenerator(llm_serving=self.llm_serving, num_dialogs_per_intent=5)
def forward(self):
self.processor.run(
storage=self.storage.step()
)
if __name__ == "__main__":
# This is a test entry point for the TextPipeline
# It will run the forward method of the TextPipeline class
# to process the data and generate the output.
print("Running TextPipeline...")
model = TextConversationSynthesis_APIPipeline()
model.forward()
\ No newline at end of file
from dataflow.operators.text_sft import AlpagasusFilter
from dataflow.operators.text_sft import CondorGenerator
from dataflow.operators.text_sft import CondorRefiner
from dataflow.utils.storage import FileStorage
from dataflow.serving import APILLMServing_request
class TextSFTSynthesis_APIPipeline():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="",
cache_path="./cache",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.num_generated_samples = 3
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=100
)
self.generator = CondorGenerator(llm_serving=self.llm_serving, num_samples=self.num_generated_samples)
self.refiner = CondorRefiner(llm_serving=self.llm_serving)
self.alpagasus_filter = AlpagasusFilter(min_score=3, max_score=5, llm_serving=self.llm_serving)
def forward(self):
self.generator.run(
storage=self.storage.step()
)
self.refiner.run(
storage=self.storage.step(),
input_instruction_key='instruction',
input_output_key='output'
)
self.alpagasus_filter.run(
storage=self.storage.step(),
input_instruction_key='instruction',
input_input_key="input",
input_output_key='output'
)
if __name__ == "__main__":
model = TextSFTSynthesis_APIPipeline()
model.forward()
from dataflow.operators.core_text import PromptedEvaluator
from dataflow.serving import APILLMServing_request
from dataflow.utils.storage import FileStorage
class GPT_evaluator():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/core_text_data/eval_data.json",
cache_path="./cache_1",
file_name_prefix="math_QA",
cache_type="json",
)
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=10
)
self.prompt_evaluator = PromptedEvaluator(
llm_serving = self.llm_serving,
)
def forward(self):
# Initial filters
self.prompt_evaluator.run(
storage = self.storage.step(),
input_key = "conversations",
output_key = "eval_dim_1",
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = GPT_evaluator()
model.forward()