Commit 97e8278b authored by zzg_666

Adapt backend to vLLM

from dataflow.operators.general_text import (
WordNumberFilter,
)
from dataflow.operators.text_sft import (
SuperfilteringFilter,
DeitaQualityFilter
)
from dataflow.utils.storage import FileStorage
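# SFT text filtering pipeline: keep SFT samples that pass a word-count check,
# a Superfiltering score threshold, and a Deita quality score threshold
# (thresholds are configured below). No LLM serving backend is needed here.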
class SFTTextFilter_GPUPipeline():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/GeneralTextPipeline/sft_input.jsonl",
cache_path="./cache",
file_name_prefix="dataflow_cache_step",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.word_number_filter_step1 = WordNumberFilter(
min_words=20,
max_words=1000
)
self.super_filtering_filter_step2 = SuperfilteringFilter(
min_score=0.5,
max_score=1.0,
model_cache_dir=self.model_cache_dir
)
self.deita_quality_filter_step3 = DeitaQualityFilter(
min_score=2.5,
max_score=10000,
max_length=512,
model_cache_dir=self.model_cache_dir
)
def forward(self):
self.word_number_filter_step1.run(
storage=self.storage.step(),
input_key="output",
)
self.super_filtering_filter_step2.run(
storage=self.storage.step(),
input_instruction_key='instruction',
input_input_key=None,
input_output_key='output'
)
self.deita_quality_filter_step3.run(
storage=self.storage.step(),
input_instruction_key='instruction',
input_output_key='output'
)
if __name__ == "__main__":
# This is the entry point for the pipeline
pipeline = SFTTextFilter_GPUPipeline()
pipeline.forward()
from dataflow.operators.general_text import (
MinHashDeduplicateFilter,
LanguageFilter,
WordNumberFilter,
BlocklistFilter,
HtmlUrlRemoverRefiner,
RemoveEmojiRefiner,
RemoveExtraSpacesRefiner,
ColonEndFilter,
SentenceNumberFilter,
LineEndWithEllipsisFilter,
ContentNullFilter,
MeanWordLengthFilter,
SymbolWordRatioFilter,
HtmlEntityFilter,
NoPuncFilter,
SpecialCharacterFilter,
WatermarkFilter,
CurlyBracketFilter,
CapitalWordsFilter,
LoremIpsumFilter,
UniqueWordsFilter,
CharNumberFilter,
LineStartWithBulletpointFilter,
LineWithJavascriptFilter,
)
from dataflow.operators.text_pt import (
PairQualFilter,
)
from dataflow.operators.text_sft import (
SuperfilteringFilter,
DeitaQualityFilter
)
from dataflow.operators.text_sft import SFTGeneratorSeed
from dataflow.serving import LocalModelLLMServing_vllm, LocalModelLLMServing_sglang
from dataflow.utils.storage import FileStorage
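# SFT synthesis pipeline: refine and heuristically filter raw text (raw_content),
# deduplicate it with MinHash, synthesize SFT instruction/output pairs from the
# surviving documents with a locally served vLLM model, and finally filter the
# synthesized pairs with word-count, Superfiltering, and Deita quality checks.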
class SFTTextSynthetic_GPUPipeline():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/GeneralTextPipeline/pt_input.jsonl",
cache_path="./cache",
file_name_prefix="dataflow_cache",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
# use vllm as LLM serving
self.llm_serving = LocalModelLLMServing_vllm(
hf_model_name_or_path='Qwen/Qwen2.5-7B-Instruct',
vllm_tensor_parallel_size=1,
vllm_max_tokens=8192,
)
# use SGLang as LLM serving
# self.llm_serving = LocalModelLLMServing_sglang(
# hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct",
# sgl_dp_size=1, # data parallel size
# sgl_tp_size=1, # tensor parallel size
# sgl_max_tokens=1024,
# sgl_tensor_parallel_size=4
# )
self.language_filter = LanguageFilter(allowed_languages = '__label__eng_Latn', model_cache_dir = self.model_cache_dir)
self.remove_extra_spaces_refiner = RemoveExtraSpacesRefiner()
self.remove_emoji_refiner = RemoveEmojiRefiner()
self.html_remove_refiner = HtmlUrlRemoverRefiner()
self.minhash_deduplicator = MinHashDeduplicateFilter(num_perm=128, threshold=0.9, use_n_gram=True, ngram=5)
self.blocklist_filter = BlocklistFilter()
self.word_number_filter = WordNumberFilter(min_words=20, max_words=100000)
self.colon_end_filter = ColonEndFilter()
self.sentence_number_filter = SentenceNumberFilter(min_sentences=3, max_sentences=7500)
self.line_end_with_ellipsis_filter = LineEndWithEllipsisFilter(threshold=0.3)
self.content_null_filter = ContentNullFilter()
self.mean_word_length_filter = MeanWordLengthFilter(min_length=3, max_length=10)
self.symbol_word_ratio_filter = SymbolWordRatioFilter(threshold=0.4)
self.html_entity_filter = HtmlEntityFilter()
self.no_punc_filter = NoPuncFilter(threshold=112)
self.special_character_filter = SpecialCharacterFilter()
self.watermark_filter = WatermarkFilter(watermarks=['Copyright', 'Watermark', 'Confidential'])
self.curly_bracket_filter = CurlyBracketFilter(threshold=0.025)
self.capital_words_filter = CapitalWordsFilter(threshold=0.2, use_tokenizer=False)
self.lorem_ipsum_filter = LoremIpsumFilter(threshold=3e-8)
self.unique_words_filter = UniqueWordsFilter(threshold=0.1)
self.char_number_filter = CharNumberFilter(threshold=100)
self.line_start_with_bulletpoint_filter = LineStartWithBulletpointFilter(threshold=0.9)
self.line_with_javascript_filter = LineWithJavascriptFilter(threshold=3)
self.quality_filter = PairQualFilter(min_score=-2, max_score=10000, lang='en')
self.sft_generator = SFTGeneratorSeed(
llm_serving=self.llm_serving
)
self.word_number_filter_syn = WordNumberFilter(
min_words=20,
max_words=5000
)
self.super_filtering_filter = SuperfilteringFilter(
min_score=0.5,
max_score=10000,
model_cache_dir=self.model_cache_dir
)
self.deita_quality_filter = DeitaQualityFilter(
min_score=2.5,
max_score=10000,
max_length=512,
model_cache_dir=self.model_cache_dir
)
def forward(self):
self.remove_emoji_refiner.run(
storage=self.storage.step(),
input_key="raw_content"
)
self.html_remove_refiner.run(
storage=self.storage.step(),
input_key="raw_content"
)
self.remove_extra_spaces_refiner.run(
storage=self.storage.step(),
input_key="raw_content"
)
self.blocklist_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.word_number_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.colon_end_filter.run(
storage = self.storage.step(),
input_key = 'raw_content'
)
self.sentence_number_filter.run(
storage = self.storage.step(),
input_key = 'raw_content'
)
self.line_end_with_ellipsis_filter.run(
storage = self.storage.step(),
input_key = 'raw_content'
)
# Add the additional filters here
self.content_null_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.mean_word_length_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.symbol_word_ratio_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.html_entity_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.no_punc_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.special_character_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.watermark_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.curly_bracket_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.capital_words_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.lorem_ipsum_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.unique_words_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.char_number_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.line_start_with_bulletpoint_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.line_with_javascript_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.quality_filter.run(
storage = self.storage.step(),
input_key='raw_content',
)
self.language_filter.run(
storage = self.storage.step(),
input_key = "raw_content"
)
self.minhash_deduplicator.run(
storage = self.storage.step(),
input_key='raw_content',
output_key='minhash_deduplicated_label',
)
self.sft_generator.run(
storage=self.storage.step(),
input_key='raw_content',
)
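# Free the vLLM serving engine once generation is done, before the SFT quality
# filters below load their own models.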
self.llm_serving.cleanup()
self.word_number_filter_syn.run(
storage=self.storage.step(),
input_key="output",
)
self.super_filtering_filter.run(
storage=self.storage.step(),
input_instruction_key='instruction',
input_input_key=None,
input_output_key='output'
)
self.deita_quality_filter.run(
storage=self.storage.step(),
input_instruction_key='instruction',
input_output_key='output'
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = SFTTextSynthetic_GPUPipeline()
model.forward()
from dataflow.operators.knowledge_cleaning import (
KBCChunkGenerator,
FileOrURLToMarkdownConverterBatch,
KBCTextCleaner,
# KBCMultiHopQAGenerator,
)
from dataflow.operators.core_text import Text2MultiHopQAGenerator
from dataflow.utils.storage import FileStorage
from dataflow.serving import LocalModelLLMServing_vllm
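# Knowledge-base cleaning pipeline: convert PDFs/URLs to Markdown with MinerU's
# vlm-vllm-engine backend, split the Markdown into token-sized chunks, clean the
# chunks with a vLLM-served Qwen model, and generate multi-hop QA pairs from the
# cleaned text.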
class KBCleaning_PDFvllm_GPUPipeline():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/KBCleaningPipeline/kbc_test.jsonl",
cache_path="./.cache/gpu",
file_name_prefix="knowledge_cleaning_step_vllm_engine_playground",
cache_type="json",
)
self.knowledge_cleaning_step1 = FileOrURLToMarkdownConverterBatch(
intermediate_dir="../../example_data/KBCleaningPipeline/raw/",
lang="en",
mineru_backend="vlm-vllm-engine",
)
self.knowledge_cleaning_step2 = KBCChunkGenerator(
split_method="token",
chunk_size=512,
tokenizer_name="Qwen/Qwen2.5-7B-Instruct",
)
def forward(self):
self.knowledge_cleaning_step1.run(
storage=self.storage.step(),
# input_key=
# output_key=
)
self.knowledge_cleaning_step2.run(
storage=self.storage.step(),
# input_key=
# output_key=
)
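# Note: the vLLM serving engine for the cleaning/QA steps is constructed only after
# the MinerU conversion step has finished, presumably so that two vLLM engines do
# not need to hold GPU memory at the same time.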
self.llm_serving = LocalModelLLMServing_vllm(
hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct",
vllm_max_tokens=2048,
vllm_tensor_parallel_size=4,
vllm_gpu_memory_utilization=0.6,
vllm_repetition_penalty=1.2
)
self.knowledge_cleaning_step3 = KBCTextCleaner(
llm_serving=self.llm_serving,
lang="en"
)
self.knowledge_cleaning_step4 = Text2MultiHopQAGenerator(
llm_serving=self.llm_serving,
lang="en",
num_q = 5
)
self.knowledge_cleaning_step3.run(
storage=self.storage.step(),
# input_key=
# output_key=
)
self.knowledge_cleaning_step4.run(
storage=self.storage.step(),
# input_key=
# output_key=
)
if __name__ == "__main__":
model = KBCleaning_PDFvllm_GPUPipeline()
model.forward()
from dataflow.operators.core_text import PromptedGenerator
from dataflow.operators.chemistry import EvaluateSmilesEquivalence
from dataflow.serving import APILLMServing_request
from dataflow.utils.storage import FileStorage
from dataflow.prompts.chemistry import ExtractSmilesFromTextPrompt
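# Materials-extraction pipeline: send each record's "text" column to GPT-4o with the
# system prompt below and store the model's analysis in a new "synth" column.
# The EvaluateSmilesEquivalence operator is constructed but not invoked in forward().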
prompt = """You are an expert in materials science. You need analyze the composition and synthesis/process of the materials and extract the material names from the text.
Follow these rules strictly:
1. For composition and synthesis process analysis:
- Describe all materials and their contents, synthesis process, etc. that you find in the text, for example:
Base materials: Material1, Material2, Material3
Modifiers identified: Additive with contents of 1 mass%, 3 mass%, and 5 mass%
Additional components: Curing agent, etc.
Synthesis: The materials were synthesized by mixing the base materials and modifiers.
Process: The materials were annealed at 1000°C for 1 hour.
2. For material names:
- List all materials that were actually tested, following these rules strictly; the names are used only to distinguish different materials:
- Include only materials with measured properties; this rule has priority over the other rules.
- Use the sample abbreviations provided by the author if possible, such as P1, P2, P3, etc.
- If no name is provided, generate a unique name based on the composition and synthesis process. The process is not required; it is only needed when different materials with different processes are tested.
- The name should be as concise as possible and should not be too long.
- If the text is not related to materials, return an empty string.
3. For material types:
- List the material type corresponding to each material listed above in material names.
- Use "linear" or "cross_linked" to indicate the structure of polymers, instead of "polymer".
- If the polymer structure ("linear" or "cross_linked") is not specified, default to "cross_linked".
- Ensure the number of material types matches the number of material names.
Do not include explanations, markdown formatting, or code fences (```).
"""
class ExtractSmiles():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/chemistry/matched_sample_10.json",
#first_entry_file_name="/Users/lianghao/Desktop/dataflow_code/test_dataflow/test/matched_sample_10.json",
cache_path="./cache_all_1",
file_name_prefix="math_QA",
cache_type="json",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=200,
response_format = ""
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt=prompt
)
self.smile_eval = EvaluateSmilesEquivalence()
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "text",
output_key = "synth"
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = ExtractSmiles()
model.forward()
from dataflow.operators.core_text import PromptedGenerator
from dataflow.operators.chemistry import EvaluateSmilesEquivalence
from dataflow.serving import APILLMServing_request
from dataflow.utils.storage import FileStorage
from dataflow.prompts.chemistry import ExtractSmilesFromTextPrompt
base_prompt = """You are an expert in polymer materials.
Several polymers are mentioned in this article.
Your task is to extract the following glass transition temperature (Tg)–related information for each polymer as explicitly stated in the article only (do not use general knowledge or cited references).
If any information is not present in the article, leave the corresponding cell empty.
Field descriptions:
- name: material name.
- tg: The glass transition temperature of the polymer, including its unit (e.g. "250 °C").
- method: The method used to measure Tg (e.g. DSC, DMA).
- parameter: Experimental parameters of the Tg measurement, such as scan cycle, scan range, heating rate, frequency, etc. Do not include parameters unrelated to Tg measurements (e.g. cure process parameters).
- instrument: The model of the device used to measure Tg.
- tg_note: Supplementary information for the Tg data, such as the state of the material, synthesis process, test process, cycle number, or other remarks.
Output format:
Return the result as a raw CSV table with the header: name,tg,method,parameter,instrument,tg_note
Requirements:
- List each polymer as one separate row under the header.
- If multiple variants or versions of the same polymer are tested under different conditions, list all combinations as separate rows (e.g. Cartesian product if applicable).
- Do not output JSON, Markdown, or any extra explanatory text.
- Do not add any text before or after the CSV table.
- For each material, if there is no Tg data, do not output the material.
Example output:
name,tg,method,parameter,instrument,tg_note
Polymer A,250 °C,DSC,5 °C/min to 400 °C,TA Q200,as-cured
Polymer B,180 °C,DMA,1 Hz; 3 °C/min,TA Q800,cured 2 h at 180 °C
"""
system_prompt = base_prompt + f"\nThis rule has the highest priority: Only extract information for the following materials:\n \"meta-meta-linked BPDA-ODA PAE\", \"meta-para-linked BPDA-ODA PAE\", \"para-para-linked BPDA-ODA PAE\", \"mixed BPDA-ODA PAE\" "
class ExtractSmiles():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/chemistry/matched_sample_10.json",
#first_entry_file_name="/Users/lianghao/Desktop/dataflow_code/test_dataflow/test/matched_sample_10.json",
cache_path="./cache_all_2",
file_name_prefix="math_QA",
cache_type="json",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=200,
response_format = ""
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt=system_prompt
)
self.smile_eval = EvaluateSmilesEquivalence()
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "text",
output_key = "synth"
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = ExtractSmiles()
model.forward()
from dataflow.operators.core_text import PromptedGenerator
from dataflow.operators.chemistry import EvaluateSmilesEquivalence
from dataflow.serving import APILLMServing_request
from dataflow.utils.storage import FileStorage
from dataflow.prompts.chemistry import ExtractSmilesFromTextPrompt
prompt = """Extract the polyimide recipe information from the text and format it as a structured JSON object.
Follow these rules strictly:
Only use names from: "meta-meta-linked BPDA-ODA PAE", "meta-para-linked BPDA-ODA PAE", "para-para-linked BPDA-ODA PAE", "mixed BPDA-ODA PAE"
1. For name:
- Use the sample name if provided, otherwise generate a name based on the dianhydride and diamine
- Example: "PI-1" or "PI-2"
2. For dianhydride and diamine:
- Use ComponentInfo structure with components array and ratio array
- All ratios are molar ratios
- List all monomers in components array
- List corresponding molar ratios in ratio array
- Use abbreviation of the monomer name if possible
- Example for dianhydride: {{
"components": ["4-ODA", "6-FDA"],
"ratio": [1, 1]
}}
- Example for single monomer: {{
"components": ["6-FDA"],
"ratio": [1]
}}
3. General rules:
- Extract ALL recipes mentioned in the text
- Each recipe should have a unique name
- If multiple recipes are mentioned, return them as a list in the "recipes" field
- Do not include any polyimide name from references
- If the text is not related to polyimide, return an empty string
Example input texts and their corresponding outputs:
1. Simple case:
Input: "The polyimide PI-1 was synthesized from 4-ODA and 6-FDA (1:1) with HPMDA and 3,4'-ODA (1:1)."
Output:
[
{{
"name": "PI-1",
"dianhydride": {{
"components": ["4-ODA", "6-FDA"],
"ratio": [1, 1]
}},
"diamine": {{
"components": ["HPMDA", "3,4'-ODA"],
"ratio": [1, 1]
}}
}}
]
2. Multiple recipes case:
Input: "Two polyimides were prepared: (1) PI-1 from 6-FDA and ODA (1:1), and (2) PI-2 from PMDA and PDA (1:1)."
Output:
[
{{
"name": "PI-1",
"dianhydride": {{
"components": ["6-FDA"],
"ratio": [1]
}},
"diamine": {{
"components": ["ODA"],
"ratio": [1]
}}
}},
{{
"name": "PI-2",
"dianhydride": {{
"components": ["PMDA"],
"ratio": [1]
}},
"diamine": {{
"components": ["PDA"],
"ratio": [1]
}}
}}
]
"""
class ExtractSmiles():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/chemistry/matched_sample_10.json",
#first_entry_file_name="/Users/lianghao/Desktop/dataflow_code/test_dataflow/test/matched_sample_10.json",
cache_path="./cache_all_3",
file_name_prefix="math_QA",
cache_type="json",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gemini-2.5-flash",
max_workers=200,
response_format = ""
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt=prompt
)
self.smile_eval = EvaluateSmilesEquivalence()
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "text",
output_key = "synth"
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = ExtractSmiles()
model.forward()
from dataflow.operators.knowledge_cleaning import MathBookQuestionExtract
from dataflow.serving import APIVLMServing_openai
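# Question-extraction pipeline: use a vision-language model served through the OpenAI
# API (o4-mini by default) to extract questions from a math-book PDF and write the
# results to the given output folder.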
class QuestionExtractPipeline:
def __init__(self,
llm_serving: APIVLMServing_openai,
api_url: str = "https://api.openai.com/v1", # end with /v1
key_name_of_api_key: str = "DF_API_KEY", # set in environment first: export DF_API_KEY="your_openai_api_key"
model_name: str = "o4-mini",
max_workers: int = 20
):
self.extractor = MathBookQuestionExtract(
llm_serving=llm_serving,
key_name_of_api_key=key_name_of_api_key,
model_name=model_name,
max_workers=max_workers
)
self.test_pdf = "../example_data/PDF2VQAPipeline/questionextract_test.pdf"
def forward(
self,
pdf_path: str,
output_name: str,
output_dir: str,
):
self.extractor.run(
storage=None,
input_pdf_file_path=pdf_path,
output_file_name=output_name,
output_folder=output_dir
)
if __name__ == "__main__":
llm_serving = APIVLMServing_openai(
api_url="https://api.openai.com/v1",
model_name="o4-mini",
max_workers=20
)
pipeline = QuestionExtractPipeline(llm_serving=llm_serving)
pipeline.forward(
pdf_path=pipeline.test_pdf,
output_name="test_question_extract",
output_dir="./output"
)
from dataflow.operators.core_text import PromptedGenerator
from dataflow.serving import APILLMServing_request
from dataflow.utils.storage import FileStorage
class GPT_generator():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/GeneralTextPipeline/abbreviation.jsonl",
cache_path="./cache",
file_name_prefix="math_QA",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=2
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt = "Please rewrite the following paragraph into a concise summary that preserves the core meaning and key information:", # System prompt for math problem solving
)
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "raw_content",
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = GPT_generator()
model.forward()
from dataflow.operators.core_text import PromptedGenerator
from dataflow.serving import LocalModelLLMServing_vllm
from dataflow.utils.storage import FileStorage
class Qwen_generator():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/GeneralTextPipeline/abbreviation.jsonl",
cache_path="./cache",
file_name_prefix="math_QA",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = LocalModelLLMServing_vllm(
hf_model_name_or_path="Qwen2.5-7B-Instruct", # set to your own model path
vllm_tensor_parallel_size=1,
vllm_max_tokens=8192,
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt = "Please rewrite the following paragraph into a concise summary that preserves the core meaning and key information:", # System prompt for math problem solving
)
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "raw_content",
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = Qwen_generator()
model.forward()
from dataflow.operators.core_text import PromptedGenerator
from dataflow.serving import APILLMServing_request
from dataflow.utils.storage import FileStorage
class GPT_generator():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/GeneralTextPipeline/math_100.jsonl",
cache_path="./cache",
file_name_prefix="math_QA",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=2
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt = "Please solve this math problem.", # System prompt for math problem solving
)
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "problem",
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = GPT_generator()
model.forward()
from dataflow.operators.core_text import PromptedGenerator
from dataflow.serving import LocalModelLLMServing_vllm
from dataflow.utils.storage import FileStorage
class Qwen_generator():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/GeneralTextPipeline/math_100.jsonl",
cache_path="./cache",
file_name_prefix="math_QA",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = LocalModelLLMServing_vllm(
hf_model_name_or_path="Qwen2.5-7B-Instruct", # set to your own model path
vllm_tensor_parallel_size=1,
vllm_max_tokens=8192,
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt = "Please solve this math problem.", # System prompt for math problem solving
)
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "problem",
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = Qwen_generator()
model.forward()
from dataflow.logger import get_logger
from dataflow.operators.core_text import RetrievalGenerator
from dataflow.utils.storage import FileStorage
from dataflow.serving import LightRAGServing
import asyncio
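# Retrieval-augmented generation pipeline: build a LightRAG index over the documents
# in `docs`, then answer the questions in paperquestion.jsonl with the retrieval
# generator. Serving is created asynchronously, so initialize() must be awaited
# before forward().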
class RAG():
def __init__(self, docs):
self.storage = FileStorage(
first_entry_file_name="./paperquestion.jsonl",
cache_path="./cache",
file_name_prefix="paper_question",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = None
self.retrieval_generator = None
self.docs = docs
async def initialize(self):
self.llm_serving = await LightRAGServing.create(api_url="https://api.openai.com/v1", document_list=self.docs)
self.retrieval_generator = RetrievalGenerator(
llm_serving = self.llm_serving,
system_prompt="Answer the question based on the text."
)
async def forward(self):
await self.retrieval_generator.run(
storage=self.storage.step()
)
async def main():
doc = ["./text1.txt"]
model = RAG(doc)
await model.initialize()
await model.forward()
if __name__ == "__main__":
asyncio.run(main())
from dataflow.operators.core_text import PromptedGenerator
from dataflow.serving import APILLMServing_request
from dataflow.utils.storage import FileStorage
class GPT_generator():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/GeneralTextPipeline/translation.jsonl",
cache_path="./cache",
file_name_prefix="translation",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=2
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt = "Please translate to Chinese.", # System prompt for translation
)
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "raw_content",
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = GPT_generator()
model.forward()
from dataflow.operators.core_text import PromptedGenerator
from dataflow.serving import APILLMServing_request
from dataflow.utils.storage import FileStorage
class GPT_generator():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/GeneralTextPipeline/translation.jsonl",
cache_path="./cache",
file_name_prefix="translation",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=2
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt = "Please translate to Chinese.Please answer in JSON format.",
json_schema = {
"type": "object",
"properties": {
"original": {"type": "string"},
"translation": {"type": "string"}
},
"required": ["original", "translation"],
"additionalProperties": False
}
)
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "raw_content",
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = GPT_generator()
model.forward()
from dataflow.operators.core_text import PromptedGenerator
from dataflow.serving import APILLMServing_request, LiteLLMServing
from dataflow.utils.storage import FileStorage
class GPT_generator():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name= "../example_data/GeneralTextPipeline/translation.jsonl",
cache_path="./cache",
file_name_prefix="translation",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=10,
# custom_llm_provider="openai", # if you are using a custom LLM provider's API
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt = "Please translate to Chinese.",
)
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "raw_content",
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = GPT_generator()
model.forward()
from dataflow.operators.core_text import PromptedGenerator
from dataflow.serving import APILLMServing_request, LiteLLMServing
from dataflow.utils.storage import FileStorage
class GPT_generator():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name= "../example_data/GeneralTextPipeline/translation.jsonl",
cache_path="./cache",
file_name_prefix="translation",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o",
max_workers=10,
# custom_llm_provider="openai", # if you are using a custom LLM provider's API
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt = "Please translate to Chinese. Please answer in JSON format.",
json_schema={
"type": "object",
"properties": {
"original": {"type": "string"},
"translation": {"type": "string"}
},
"required": ["original", "translation"],
"additionalProperties": False,
},
)
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "raw_content",
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = GPT_generator()
model.forward()
from dataflow.operators.core_text import PromptedGenerator
from dataflow.serving import LocalModelLLMServing_vllm
from dataflow.utils.storage import FileStorage
class Qwen_generator():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/GeneralTextPipeline/translation.jsonl",
cache_path="./cache",
file_name_prefix="translation",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = LocalModelLLMServing_vllm(
hf_model_name_or_path="Qwen2.5-7B-Instruct", # set to your own model path
vllm_tensor_parallel_size=1,
vllm_max_tokens=8192,
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt = "Please translate to Chinese.", # System prompt for translation
)
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "raw_content",
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = Qwen_generator()
model.forward()
from dataflow.operators.core_text import PromptedGenerator
from dataflow.serving import LocalModelLLMServing_vllm
from dataflow.utils.storage import FileStorage
class Qwen_generator():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/GeneralTextPipeline/translation.jsonl",
cache_path="./cache",
file_name_prefix="translation",
cache_type="jsonl",
)
self.model_cache_dir = './dataflow_cache'
self.llm_serving = LocalModelLLMServing_vllm(
hf_model_name_or_path="Qwen2.5-7B-Instruct", # set to your own model path
vllm_tensor_parallel_size=1,
vllm_max_tokens=8192,
)
self.prompt_generator = PromptedGenerator(
llm_serving = self.llm_serving,
system_prompt = "Please translate to Chinese.", # System prompt for translation
json_schema={
"type": "object",
"properties": {
"original": {"type": "string"},
"translation": {"type": "string"}
},
"required": ["original", "translation"],
},
)
def forward(self):
# Initial filters
self.prompt_generator.run(
storage = self.storage.step(),
input_key = "raw_content",
)
if __name__ == "__main__":
# This is the entry point for the pipeline
model = Qwen_generator()
model.forward()
from dataflow.operators.core_text import PromptTemplatedGenerator
from dataflow.serving import APILLMServing_request
from dataflow.utils.storage import FileStorage
from dataflow.prompts.core_text import StrFormatPrompt
class DoubleColumnInputTestCase():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/core_text_data/double_column_input.json",
file_name_prefix="double_column_input",
cache_path="./cache",
cache_type="jsonl",
)
self.llm_serving = APILLMServing_request(
api_url="https://api.openai.com/v1/chat/completions",
model_name="gpt-4o"
)
self.prompt_template = StrFormatPrompt(
f_str_template="What does a {input_roll} like to {input_term}?"
)
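# The {input_roll} and {input_term} placeholders in the template are filled from the
# dataframe columns named by the input_roll= and input_term= arguments passed to
# run() below ("roll" and "term" in this example).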
self.operator = PromptTemplatedGenerator(
llm_serving=self.llm_serving,
prompt_template=self.prompt_template
)
def forward(self):
self.operator.run(
storage=self.storage.step(),
input_roll="roll",
input_term="term",
output_key="answer",
)
if __name__ == "__main__":
model = DoubleColumnInputTestCase()
model.forward()
from dataflow.operators.text_pt import MetaSampleEvaluator
from dataflow.serving import APILLMServing_request
from dataflow.utils.storage import FileStorage
class QuickEvaluatePipeline():
def __init__(self):
self.storage = FileStorage(
first_entry_file_name="../example_data/GeneralTextPipeline/pt_input.jsonl",
cache_path="./cache",
file_name_prefix="dataflow_cache_step_langc",
cache_type="jsonl",
)
self.llm_serving = APILLMServing_request(
api_url="http://123.129.219.111:3000/v1/chat/completions",
model_name="gpt-4o"
)
self.meta_scorer = MetaSampleEvaluator(llm_serving=self.llm_serving)
def forward(self):
self.meta_scorer.run(
self.storage.step(),
input_key='raw_content'
)
if __name__ == "__main__":
pipeline = QuickEvaluatePipeline()
pipeline.forward()