Commit 97e8278b authored by zzg_666

Adapt backend to vLLM

from dataflow.operators.general_text import (
WordNumberFilter,
)
from dataflow.operators.text_sft import (
SuperfilteringFilter,
DeitaQualityFilter
)
from dataflow.utils.storage import FileStorage
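# SFT text filtering pipeline: keep SFT samples that pass a word-count check,
# a Superfiltering score threshold, and a Deita quality score threshold
# (thresholds are configured below). No LLM serving backend is needed here.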
class SFTTextFilter_GPUPipeline():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/GeneralTextPipeline/sft_input.jsonl",
cache_path="./cache",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.word_number_filter_step1 = WordNumberFilter(
min_words=20,
max_words=1000
)
self.super_filtering_filter_step2 = SuperfilteringFilter(
min_score=0.5,
max_score=1.0,
model_cache_dir=self.model_cache_dir
)
self.deita_quality_filter_step3 = DeitaQualityFilter(
min_score=2.5,
max_score=10000,
max_length=512,
model_cache_dir=self.model_cache_dir
)
def forward(self):
self.word_number_filter_step1.run(
storage=self.storage.step(),
input_key="output",
)
self.super_filtering_filter_step2.run(
storage=self.storage.step(),
input_instruction_key='instruction',
input_input_key=None,
input_output_key='output'
)
self.deita_quality_filter_step3.run(
storage=self.storage.step(),
input_instruction_key='instruction',
input_output_key='output'
)
if __name__ == "__main__":
# This is the entry point for the pipeline
pipeline = SFTTextFilter_GPUPipeline()
pipeline.forward()
from dataflow.operators.general_text import (
MinHashDeduplicateFilter,
LanguageFilter,
WordNumberFilter,
BlocklistFilter,
HtmlUrlRemoverRefiner,
RemoveEmojiRefiner,
RemoveExtraSpacesRefiner,
ColonEndFilter,
SentenceNumberFilter,
LineEndWithEllipsisFilter,
ContentNullFilter,
MeanWordLengthFilter,
SymbolWordRatioFilter,
HtmlEntityFilter,
NoPuncFilter,
SpecialCharacterFilter,
WatermarkFilter,
CurlyBracketFilter,
CapitalWordsFilter,
LoremIpsumFilter,
UniqueWordsFilter,
CharNumberFilter,
LineStartWithBulletpointFilter,
LineWithJavascriptFilter,
)
from dataflow.operators.text_pt import (
PairQualFilter,
)
from dataflow.operators.text_sft import (
SuperfilteringFilter,
DeitaQualityFilter
)
from dataflow.operators.text_sft import SFTGeneratorSeed
from dataflow.serving import LocalModelLLMServing_vllm, LocalModelLLMServing_sglang
from dataflow.utils.storage import FileStorage
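# SFT synthesis pipeline: refine and heuristically filter raw text (raw_content),
# deduplicate it with MinHash, synthesize SFT instruction/output pairs from the
# surviving documents with a locally served vLLM model, and finally filter the
# synthesized pairs with word-count, Superfiltering, and Deita quality checks.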
class SFTTextSynthetic_GPUPipeline():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/GeneralTextPipeline/pt_input.jsonl",
cache_path="./cache",
file_name_prefix="dataflow_cache",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
# use vllm as LLM serving
self.llm_serving = LocalModelLLMServing_vllm(
hf_model_name_or_path='Qwen/Qwen2.5-7B-Instruct',
vllm_tensor_parallel_size=1,
vllm_max_tokens=8192,
)
# use SGLang as LLM serving
# self.llm_serving = LocalModelLLMServing_sglang(
# hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct",
# sgl_dp_size=1, # data parallel size
# sgl_tp_size=1, # tensor parallel size
# sgl_max_tokens=1024,
# sgl_tensor_parallel_size=4
# )
self.language_filter = LanguageFilter(allowed_languages = '__label__eng_Latn', model_cache_dir = self.model_cache_dir)
self.remove_extra_spaces_refiner = RemoveExtraSpacesRefiner()
self.remove_emoji_refiner = RemoveEmojiRefiner()
self.html_remove_refiner = HtmlUrlRemoverRefiner()
self.minhash_deduplicator = MinHashDeduplicateFilter(num_perm=128, threshold=0.9, use_n_gram=True, ngram=5)
self.blocklist_filter = BlocklistFilter()
self.word_number_filter = WordNumberFilter(min_words=20, max_words=100000)
self.colon_end_filter = ColonEndFilter()
self.sentence_number_filter = SentenceNumberFilter(min_sentences=3, max_sentences=7500)
self.line_end_with_ellipsis_filter = LineEndWithEllipsisFilter(threshold=0.3)
self.content_null_filter = ContentNullFilter()
self.mean_word_length_filter = MeanWordLengthFilter(min_length=3, max_length=10)
self.symbol_word_ratio_filter = SymbolWordRatioFilter(threshold=0.4)
self.html_entity_filter = HtmlEntityFilter()
self.no_punc_filter = NoPuncFilter(threshold=112)
self.special_character_filter = SpecialCharacterFilter()
self.watermark_filter = WatermarkFilter(watermarks=['Copyright', 'Watermark', 'Confidential'])
self.curly_bracket_filter = CurlyBracketFilter(threshold=0.025)
self.capital_words_filter = CapitalWordsFilter(threshold=0.2, use_tokenizer=False)
self.lorem_ipsum_filter = LoremIpsumFilter(threshold=3e-8)
self.unique_words_filter = UniqueWordsFilter(threshold=0.1)
self.char_number_filter = CharNumberFilter(threshold=100)
self.line_start_with_bulletpoint_filter = LineStartWithBulletpointFilter(threshold=0.9)
self.line_with_javascript_filter = LineWithJavascriptFilter(threshold=3)
self.quality_filter = PairQualFilter(min_score=-2, max_score=10000, lang='en')
self.sft_generator = SFTGeneratorSeed(
llm_serving=self.llm_serving
)
self.word_number_filter_syn = WordNumberFilter(
min_words=20,
max_words=5000
)
self.super_filtering_filter = SuperfilteringFilter(
min_score=0.5,
max_score=10000,
model_cache_dir=self.model_cache_dir
)
self.deita_quality_filter = DeitaQualityFilter(
min_score=2.5,
max_score=10000,
max_length=512,
model_cache_dir=self.model_cache_dir
)
def forward(self):
self.remove_emoji_refiner.run(
storage=self.storage.step(),
input_key="raw_content"
)
self.html_remove_refiner.run(
storage=self.storage.step(),
input_key="raw_content"
)
self.remove_extra_spaces_refiner.run(
storage=self.storage.step(),
input_key="raw_content"
)
self.blocklist_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.word_number_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.colon_end_filter.run(
storage = self.storage.step(),
input_key = 'raw_content'
)
self.sentence_number_filter.run(
storage = self.storage.step(),
input_key = 'raw_content'
)
self.line_end_with_ellipsis_filter.run(
storage = self.storage.step(),
input_key = 'raw_content'
)
# Add the additional filters here
self.content_null_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.mean_word_length_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.symbol_word_ratio_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.html_entity_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.no_punc_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.special_character_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.watermark_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.curly_bracket_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.capital_words_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.lorem_ipsum_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.unique_words_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.char_number_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.line_start_with_bulletpoint_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.line_with_javascript_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.quality_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.language_filter.run(
storage = self.storage.step(),
input_key = "raw_content"
)
self.minhash_deduplicator.run(
storage = self.storage.step(),
input_key='raw_content',
output_key='minhash_deduplicated_label',
)
self.sft_generator.run(
storage=self.storage.step(),
input_key='raw_content',
)
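# Free the vLLM serving engine once generation is done, before the SFT quality
# filters below load their own models.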
self.llm_serving.cleanup()
self.word_number_filter_syn.run(
storage=self.storage.step(),
input_key="output",
)
self.super_filtering_filter.run(
storage=self.storage.step(),
input_instruction_key='instruction',
input_input_key=None,
input_output_key='output'
)
self.deita_quality_filter.run(
storage=self.storage.step(),
input_instruction_key='instruction',
input_output_key='output'
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = SFTTextSynthetic_GPUPipeline()
model.forward()
from dataflow.operators.knowledge_cleaning import (
KBCChunkGenerator,
FileOrURLToMarkdownConverterBatch,
KBCTextCleaner,
# KBCMultiHopQAGenerator,
)
from dataflow.operators.core_text import Text2MultiHopQAGenerator
from dataflow.utils.storage import FileStorage
from dataflow.serving import LocalModelLLMServing_vllm
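# Knowledge-base cleaning pipeline: convert PDFs/URLs to Markdown with MinerU's
# vlm-vllm-engine backend, split the Markdown into token-sized chunks, clean the
# chunks with a vLLM-served Qwen model, and generate multi-hop QA pairs from the
# cleaned text.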
class KBCleaning_PDFvllm_GPUPipeline():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/KBCleaningPipeline/kbc_test.jsonl",
cache_path="./.cache/gpu",
file_name_prefix="knowledge_cleaning_step_vllm_engine_playground",
cache_type="json",
)
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterBatch(
intermediate_dir="../../example_data/KBCleaningPipeline/raw/",
lang="en",
mineru_backend="vlm-vllm-engine",
)
self.knowledge_cleaning_step2 = KBCChunkGenerator(
split_method="token",
chunk_size=512,
tokenizer_name="Qwen/Qwen2.5-7B-Instruct",
)
def forward(self):
self.knowledge_cleaning_step1.run(
storage=self.storage.step(),
# input_key=
# output_key=
)
self.knowledge_cleaning_step2.run(
storage=self.storage.step(),
# input_key=
# output_key=
)
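# Note: the vLLM serving engine for the cleaning/QA steps is constructed only after
# the MinerU conversion step has finished, presumably so that two vLLM engines do
# not need to hold GPU memory at the same time.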
self.llm_serving = LocalModelLLMServing_vllm(
hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct",
vllm_max_tokens=2048,
vllm_tensor_parallel_size=4,
vllm_gpu_memory_utilization=0.6,
vllm_repetition_penalty=1.2
)
self.knowledge_cleaning_step3 = KBCTextCleaner(
llm_serving=self.llm_serving,
lang="en"
)
self.knowledge_cleaning_step4 = Text2MultiHopQAGenerator(
llm_serving=self.llm_serving,
lang="en",
num_q = 5
)
self.knowledge_cleaning_step3.run(
storage=self.storage.step(),
# input_key=
# output_key=
)
self.knowledge_cleaning_step4.run(
storage=self.storage.step(),
# input_key=
# output_key=
)
if __name__ == "__main__":
model = KBCleaning_PDFvllm_GPUPipeline()
model.forward()
from dataflow.operators.core_text import PromptedGenerator
from dataflow.operators.chemistry import EvaluateSmilesEquivalence
from dataflow.serving import APILLMServing_request
from dataflow.utils.storage import FileStorage
from dataflow.prompts.chemistry import ExtractSmilesFromTextPrompt
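# Materials-extraction pipeline: send each record's "text" column to GPT-4o with the
# system prompt below and store the model's analysis in a new "synth" column.
# The EvaluateSmilesEquivalence operator is constructed but not invoked in forward().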
prompt = """You are an expert in materials science. You need analyze the composition and synthesis/process of the materials and extract the material names from the text.
Follow these rules strictly:
1. For composition and synthesis process analysis:
- Describe all materials and their contents, synthesis process, etc. that you find in the text, for example:
Base materials: Material1, Material2, Material3
Modifiers identified: Additive with contents of 1 mass%, 3 mass%, and 5 mass%
Additional components: Curing agent, etc.
Synthesis: The materials were synthesized by mixing the base materials and modifiers.
Process: The materials were annealed at 1000°C for 1 hour.
2. For material names:
- List all materials that were actually tested, following these rules strictly; the names are used only to distinguish different materials:
- Include only materials with measured properties; this rule has priority over the other rules.
- Use the sample abbreviations provided by the author if possible, such as P1, P2, P3, etc.
- If no name is provided, generate a unique name based on the composition and synthesis process. The process is not required; it is only needed when different materials with different processes are tested.
- The name should be as concise as possible and should not be too long.
- If the text is not related to materials, return an empty string.
3. For material types:
- List the material type corresponding to each material listed above in material names.
- Use "linear" or "cross_linked" to indicate the structure of polymers, instead of "polymer".
- If the polymer structure ("linear" or "cross_linked") is not specified, default to "cross_linked".
- Ensure the number of material types matches the number of material names.
Do not include explanations, markdown formatting, or code fences (```).
"""
class ExtractSmiles():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/chemistry/matched_sample_10.json",
#first_entry_file_name="/Users/lianghao/Desktop/dataflow_code/test_dataflow/test/matched_sample_10.json",
cache_path="./cache_all_1",
file_name_prefix="math_QA",
cache_type="json",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=200,
response_format = ""
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt=prompt
)
self.smile_eval = EvaluateSmilesEquivalence()
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "text",
output_key = "synth"
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = ExtractSmiles()
model.forward()
from dataflow.operators.core_text import PromptedGenerator
from dataflow.operators.chemistry import EvaluateSmilesEquivalence
from dataflow.serving import APILLMServing_request
from dataflow.utils.storage import FileStorage
from dataflow.prompts.chemistry import ExtractSmilesFromTextPrompt
base_prompt = """You are an expert in polymer materials.
Several polymers are mentioned in this article.
Your task is to extract the following glass transition temperature (Tg)–related information for each polymer as explicitly stated in the article only (do not use general knowledge or cited references).
If any information is not present in the article, leave the corresponding cell empty.
Field descriptions:
- name: material name.
- tg: The glass transition temperature of the polymer, including its unit (e.g. "250 °C").
- method: The method used to measure Tg (e.g. DSC, DMA).
- parameter: Experimental parameters of the Tg measurement, such as scan cycle, scan range, heating rate, frequency, etc. Do not include parameters unrelated to Tg measurements (e.g. cure process parameters).
- instrument: The model of the device used to measure Tg.
- tg_note: Supplementary information for the Tg data, such as the state of the material, synthesis process, test process, cycle number, or other remarks.
Output format:
Return the result as a raw CSV table with the header: name,tg,method,parameter,instrument,tg_note
Requirements:
- List each polymer as one separate row under the header.
- If multiple variants or versions of the same polymer are tested under different conditions, list all combinations as separate rows (e.g. Cartesian product if applicable).
- Do not output JSON, Markdown, or any extra explanatory text.
- Do not add any text before or after the CSV table.
- For each material, if there is no Tg data, do not output the material.
Example output:
name,tg,method,parameter,instrument,tg_note
Polymer A,250 °C,DSC,5 °C/min to 400 °C,TA Q200,as-cured
Polymer B,180 °C,DMA,1 Hz; 3 °C/min,TA Q800,cured 2 h at 180 °C
"""
system_prompt = base_prompt + f"\nThis rule has the highest priority: Only extract information for the following materials:\n \"meta-meta-linked BPDA-ODA PAE\", \"meta-para-linked BPDA-ODA PAE\", \"para-para-linked BPDA-ODA PAE\", \"mixed BPDA-ODA PAE\" "
class ExtractSmiles():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/chemistry/matched_sample_10.json",
#first_entry_file_name="/Users/lianghao/Desktop/dataflow_code/test_dataflow/test/matched_sample_10.json",
cache_path="./cache_all_2",
file_name_prefix="math_QA",
cache_type="json",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=200,
response_format = ""
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt=system_prompt
)
self.smile_eval = EvaluateSmilesEquivalence()
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "text",
output_key = "synth"
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = ExtractSmiles()
model.forward()
from dataflow.operators.core_text import PromptedGenerator
from dataflow.operators.chemistry import EvaluateSmilesEquivalence
from dataflow.serving import APILLMServing_request
from dataflow.utils.storage import FileStorage
from dataflow.prompts.chemistry import ExtractSmilesFromTextPrompt
prompt = """Extract the polyimide recipe information from the text and format it as a structured JSON object.
Follow these rules strictly:
Only use names from: "meta-meta-linked BPDA-ODA PAE", "meta-para-linked BPDA-ODA PAE", "para-para-linked BPDA-ODA PAE", "mixed BPDA-ODA PAE"
1. For name:
- Use the sample name if provided, otherwise generate a name based on the dianhydride and diamine
- Example: "PI-1" or "PI-2"
2. For dianhydride and diamine:
- Use ComponentInfo structure with components array and ratio array
- All ratios are molar ratios
- List all monomers in components array
- List corresponding molar ratios in ratio array
- Use abbreviation of the monomer name if possible
- Example for dianhydride: {{
"components": ["4-ODA", "6-FDA"],
"ratio": [1, 1]
}}
- Example for single monomer: {{
"components": ["6-FDA"],
"ratio": [1]
}}
3. General rules:
- Extract ALL recipes mentioned in the text
- Each recipe should have a unique name
- If multiple recipes are mentioned, return them as a list in the "recipes" field
- Do not include any polyimide name from references
- If the text is not related to polyimide, return an empty string
Example input texts and their corresponding outputs:
1. Simple case:
Input: "The polyimide PI-1 was synthesized from 4-ODA and 6-FDA (1:1) with HPMDA and 3,4'-ODA (1:1)."
Output:
[
{{
"name": "PI-1",
"dianhydride": {{
"components": ["4-ODA", "6-FDA"],
"ratio": [1, 1]
}},
"diamine": {{
"components": ["HPMDA", "3,4'-ODA"],
"ratio": [1, 1]
}}
}}
]
2. Multiple recipes case:
Input: "Two polyimides were prepared: (1) PI-1 from 6-FDA and ODA (1:1), and (2) PI-2 from PMDA and PDA (1:1)."
Output:
[
{{
"name": "PI-1",
"dianhydride": {{
"components": ["6-FDA"],
"ratio": [1]
}},
"diamine": {{
"components": ["ODA"],
"ratio": [1]
}}
}},
{{
"name": "PI-2",
"dianhydride": {{
"components": ["PMDA"],
"ratio": [1]
}},
"diamine": {{
"components": ["PDA"],
"ratio": [1]
}}
}}
]
"""
class ExtractSmiles():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/chemistry/matched_sample_10.json",
#first_entry_file_name="/Users/lianghao/Desktop/dataflow_code/test_dataflow/test/matched_sample_10.json",
cache_path="./cache_all_3",
file_name_prefix="math_QA",
cache_type="json",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gemini-2.5-flash",
max_workers=200,
response_format = ""
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt=prompt
)
self.smile_eval = EvaluateSmilesEquivalence()
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "text",
output_key = "synth"
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = ExtractSmiles()
model.forward()
from dataflow.operators.knowledge_cleaning import MathBookQuestionExtract
from dataflow.serving import APIVLMServing_openai
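# Question-extraction pipeline: use a vision-language model served through the OpenAI
# API (o4-mini by default) to extract questions from a math-book PDF and write the
# results to the given output folder.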
class QuestionExtractPipeline:
def __init__(self,
llm_serving: APIVLMServing_openai,
api_url: str = "https://api.openai.com/v1", # end with /v1
key_name_of_api_key: str = "DF_API_KEY", # set in environment first: export DF_API_KEY="your_openai_api_key"
model_name: str = "o4-mini",
max_workers: int = 20
):
self.extractor = MathBookQuestionExtract(
llm_serving=llm_serving,
key_name_of_api_key=key_name_of_api_key,
model_name=model_name,
max_workers=max_workers
)
self.test_pdf = "../example_data/PDF2VQAPipeline/questionextract_test.pdf"
def forward(
self,
pdf_path: str,
output_name: str,
output_dir: str,
):
self.extractor.run(
storage=None,
input_pdf_file_path=pdf_path,
output_file_name=output_name,
output_folder=output_dir
)
if __name__ == "__main__":
llm_serving = APIVLMServing_openai(
api_url="https://api.openai.com/v1",
model_name="o4-mini",
max_workers=20
)
pipeline = QuestionExtractPipeline(llm_serving=llm_serving)
pipeline.forward(
pdf_path=pipeline.test_pdf,
output_name="test_question_extract",
output_dir="./output"
)
from dataflow.operators.core_text import PromptedGenerator
from dataflow.serving import APILLMServing_request
from dataflow.utils.storage import FileStorage
class GPT_generator():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/GeneralTextPipeline/abbreviation.jsonl",
cache_path="./cache",
file_name_prefix="math_QA",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=2
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt = "Please rewrite the following paragraph into a concise summary that preserves the core meaning and key information:", # System prompt for math problem solving
)
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "raw_content",
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = GPT_generator()
model.forward()
from dataflow.operators.core_text import PromptedGenerator
from dataflow.serving import LocalModelLLMServing_vllm
from dataflow.utils.storage import FileStorage
class Qwen_generator():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/GeneralTextPipeline/abbreviation.jsonl",
cache_path="./cache",
file_name_prefix="math_QA",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = LocalModelLLMServing_vllm(
hf_model_name_or_path="Qwen2.5-7B-Instruct", # set to your own model path
vllm_tensor_parallel_size=1,
vllm_max_tokens=8192,
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt = "Please rewrite the following paragraph into a concise summary that preserves the core meaning and key information:", # System prompt for math problem solving
)
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "raw_content",
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = Qwen_generator()
model.forward()
from dataflow.operators.core_text import PromptedGenerator
from dataflow.serving import APILLMServing_request
from dataflow.utils.storage import FileStorage
class GPT_generator():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/GeneralTextPipeline/math_100.jsonl",
cache_path="./cache",
file_name_prefix="math_QA",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=2
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt = "Please solve this math problem.", # System prompt for math problem solving
)
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "problem",
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = GPT_generator()
model.forward()
from dataflow.operators.core_text import PromptedGenerator
from dataflow.serving import LocalModelLLMServing_vllm
from dataflow.utils.storage import FileStorage
class Qwen_generator():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/GeneralTextPipeline/math_100.jsonl",
cache_path="./cache",
file_name_prefix="math_QA",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = LocalModelLLMServing_vllm(
hf_model_name_or_path="Qwen2.5-7B-Instruct", # set to your own model path
vllm_tensor_parallel_size=1,
vllm_max_tokens=8192,
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt = "Please solve this math problem.", # System prompt for math problem solving
)
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "problem",
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = Qwen_generator()
model.forward()
from dataflow.logger import get_logger
from dataflow.operators.core_text import RetrievalGenerator
from dataflow.utils.storage import FileStorage
from dataflow.serving import LightRAGServing
import asyncio
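# Retrieval-augmented generation pipeline: build a LightRAG index over the documents
# in `docs`, then answer the questions in paperquestion.jsonl with the retrieval
# generator. Serving is created asynchronously, so initialize() must be awaited
# before forward().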
class RAG():
def __init__(self, docs):
self.storage = FileStorage(
first_entry_file_name="./paperquestion.jsonl",
cache_path="./cache",
file_name_prefix="paper_question",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = None
self.retrieval_generator = None
self.docs = docs
async def initialize(self):
self.llm_serving = await LightRAGServing.create(api_url="https://api.openai.com/v1", document_list=self.docs)
self.retrieval_generator = RetrievalGenerator(
llm_serving = self.llm_serving,
system_prompt="Answer the question based on the text."
)
async def forward(self):
await self.retrieval_generator.run(
storage=self.storage.step()
)
async def main():
doc = ["./text1.txt"]
model = RAG(doc)
await model.initialize()
await model.forward()
if __name__ == "__main__":
asyncio.run(main())
from dataflow.operators.core_text import PromptedGenerator
from dataflow.serving import APILLMServing_request
from dataflow.utils.storage import FileStorage
class GPT_generator():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/GeneralTextPipeline/translation.jsonl",
cache_path="./cache",
file_name_prefix="translation",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=2
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt = "Please translate to Chinese.", # System prompt for translation
)
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "raw_content",
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = GPT_generator()
model.forward()
from dataflow.operators.core_text import PromptedGenerator
from dataflow.serving import APILLMServing_request
from dataflow.utils.storage import FileStorage
class GPT_generator():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/GeneralTextPipeline/translation.jsonl",
cache_path="./cache",
file_name_prefix="translation",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=2
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt = "Please translate to Chinese.Please answer in JSON format.",
json_schema = {
"type": "object",
"properties": {
"original": {"type": "string"},
"translation": {"type": "string"}
},
"required": ["original", "translation"],
"additionalProperties": False
}
)
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "raw_content",
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = GPT_generator()
model.forward()
from dataflow.operators.core_text import PromptedGenerator
from dataflow.serving import APILLMServing_request, LiteLLMServing
from dataflow.utils.storage import FileStorage
class GPT_generator():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name= "../example_data/GeneralTextPipeline/translation.jsonl",
cache_path="./cache",
file_name_prefix="translation",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=10,
# custom_llm_provider="openai", # if you are using a custom LLM provider's API
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt = "Please translate to Chinese.",
)
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "raw_content",
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = GPT_generator()
model.forward()
from dataflow.operators.core_text import PromptedGenerator
from dataflow.serving import APILLMServing_request, LiteLLMServing
from dataflow.utils.storage import FileStorage
class GPT_generator():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name= "../example_data/GeneralTextPipeline/translation.jsonl",
cache_path="./cache",
file_name_prefix="translation",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=10,
# custom_llm_provider="openai", # if you are using a custom LLM provider's API
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt = "Please translate to Chinese. Please answer in JSON format.",
json_schema={
"type": "object",
"properties": {
"original": {"type": "string"},
"translation": {"type": "string"}
},
"required": ["original", "translation"],
"additionalProperties": False,
},
)
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "raw_content",
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = GPT_generator()
model.forward()
from dataflow.operators.core_text import PromptedGenerator
from dataflow.serving import LocalModelLLMServing_vllm
from dataflow.utils.storage import FileStorage
class Qwen_generator():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/GeneralTextPipeline/translation.jsonl",
cache_path="./cache",
file_name_prefix="translation",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = LocalModelLLMServing_vllm(
hf_model_name_or_path="Qwen2.5-7B-Instruct", # set to your own model path
vllm_tensor_parallel_size=1,
vllm_max_tokens=8192,
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt = "Please translate to Chinese.", # System prompt for translation
)
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "raw_content",
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = Qwen_generator()
model.forward()
from dataflow.operators.core_text import PromptedGenerator
from dataflow.serving import LocalModelLLMServing_vllm
from dataflow.utils.storage import FileStorage
class Qwen_generator():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/GeneralTextPipeline/translation.jsonl",
cache_path="./cache",
file_name_prefix="translation",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = LocalModelLLMServing_vllm(
hf_model_name_or_path="Qwen2.5-7B-Instruct", # set to your own model path
vllm_tensor_parallel_size=1,
vllm_max_tokens=8192,
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt = "Please translate to Chinese.", # System prompt for translation
json_schema={
"type": "object",
"properties": {
"original": {"type": "string"},
"translation": {"type": "string"}
},
"required": ["original", "translation"],
},
)
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "raw_content",
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = Qwen_generator()
model.forward()
from dataflow.operators.core_text import PromptTemplatedGenerator
from dataflow.serving import APILLMServing_request
from dataflow.utils.storage import FileStorage
from dataflow.prompts.core_text import StrFormatPrompt
class DoubleColumnInputTestCase():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/core_text_data/double_column_input.json",
file_name_prefix="double_column_input",
cache_path="./cache",
cache_type="jsonl",
)
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o"
)
self.prompt_template = StrFormatPrompt(
f_str_template="What does a {input_roll} like to {input_term}?"
)
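# The {input_roll} and {input_term} placeholders in the template are filled from the
# dataframe columns named by the input_roll= and input_term= arguments passed to
# run() below ("roll" and "term" in this example).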
self.operator = PromptTemplatedGenerator(
llm_serving=self.llm_serving,
prompt_template=self.prompt_template
)
def forward(self):
self.operator.run(
storage=self.storage.step(),
input_roll="roll",
input_term="term",
output_key="answer",
)
if __name__ == "__main__":
model = DoubleColumnInputTestCase()
model.forward()
from dataflow.operators.text_pt import MetaSampleEvaluator
from dataflow.serving import APILLMServing_request
from dataflow.utils.storage import FileStorage
class QuickEvaluatePipeline():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/GeneralTextPipeline/pt_input.jsonl",
cache_path="./cache",
file_name_prefix="dataflow_cache_step_langc",
cache_type="jsonl",
)
self.llm_serving = APILLMServing_request(
api_url="http://123.129.219.111:3000/v1/chat/completions",
model_name="gpt-4o"
)
self.meta_scorer = MetaSampleEvaluator(llm_serving=self.llm_serving)
def forward(self):
self.meta_scorer.run(
self.storage.step(),
input_key='raw_content'
)
if __name__ == "__main__":
pipeline = QuickEvaluatePipeline()
pipeline.forward()